indeedcrawler 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/indeedcrawler.rb +101 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0764adb113470fac8d62ee9eda673ae63d52d1cd
4
+ data.tar.gz: 6666aba0baff119ed76720feb85fa7a3cd16f9ca
5
+ SHA512:
6
+ metadata.gz: 3032cd536c0a38062ebeaaa27e1470aa58319a2cd612d7428d62572d3498c8625ba744b80b70cbda7d769f5b0f06a0776c310dc178baa3f50edfc5d90c98b66b
7
+ data.tar.gz: da91a5282da3e4d55407edba9fe4c2c181d56b36b4c6640476893da531a440b8e32f8838232c9fc097ee4c47391286889cf39966809c9308c5565e2356ada1f0
@@ -0,0 +1,101 @@
1
+ require 'json'
2
+ require 'uri'
3
+ require 'requestmanager'
4
+ require 'nokogiri'
5
+ require 'indeedparser'
6
+
7
# Crawls Indeed resume search results: builds a search URL, walks every
# result page, downloads each resume, parses it with IndeedParser, and
# returns the aggregated results as pretty-printed JSON.
class IndeedCrawler
  # search_query - term(s) to filter resumes by (String or nil)
  # location     - location to filter resumes by (String or nil)
  # proxy_list, wait_time, browser_num - passed through to RequestManager;
  #   semantics defined by that gem (not visible here).
  def initialize(search_query, location, proxy_list, wait_time, browser_num)
    # Info for query
    @search_query = search_query
    @location = location

    # Settings for request manager
    @requests = RequestManager.new(proxy_list, wait_time, browser_num)

    # Result tracking
    @all_resume_links = Array.new
    @output = Array.new
  end

  # Append the search query parameter to url and return the new URL.
  # Only called when @search_query is set (see collect_it_all).
  def add_query(url)
    url + "&q=" + URI.encode_www_form_component(@search_query)
  end

  # Append the location parameter to url and return the new URL.
  # FIX: the original only inserted the "&" separator when a search query
  # was present, so a location-only crawl produced "...?co=USl=..." and
  # the location filter was silently ignored. The base URL always carries
  # "?co=US", so a separator is always required.
  def add_location(url)
    separator = url.include?("?") ? "&" : "?"
    url + separator + "l=" + URI.encode_www_form_component(@location)
  end

  # Collect the resume profile links on one result page, then follow
  # pagination recursively until there is no "next" link.
  def get_page_links(html)
    # Get list of people
    profiles = html.xpath("//li[@itemtype='http://schema.org/Person']")

    # Get each profile link; skip entries with no app_link anchor
    # (the original raised NoMethodError on nil for such entries)
    profiles.each do |profile|
      anchor = profile.xpath(".//a[@class='app_link']")[0]
      @all_resume_links.push("http://indeed.com/" + anchor['href']) if anchor
    end

    # Navigate to next page if there's a link to do that
    load_next_page(html) if !html.css("a.next").empty?
  end

  # Load the next page of search results and continue collecting links.
  def load_next_page(html)
    next_html = load_restart_page("http://indeed.com/resumes" + html.css("a.next").first['href'], 0)
    get_page_links(Nokogiri::HTML(next_html))
  end

  # Fetch a page body via the request manager. On failure, restart the
  # browser and retry up to two more times; returns nil when every
  # attempt failed (Nokogiri::HTML(nil) yields an empty document, so
  # callers degrade gracefully).
  def load_restart_page(url, count)
    begin
      return @requests.get_page(url)
    rescue StandardError
      if count < 2
        @requests.restart_browser
        load_restart_page(url, count + 1)
      end
    end
  end

  # Download and parse all collected resumes, appending each parsed
  # result to @output.
  def parse_resumes
    @all_resume_links.each do |link|
      resume = load_restart_page(link, 0)

      begin
        # Parse resume and add to results
        i = IndeedParser.new(resume, link, {time_scraped: Time.now})
        results = JSON.parse(i.get_results_by_job)

        results.each do |result|
          @output.push(result)
        end
      rescue StandardError
        # Best-effort: a resume that fails to download or parse is
        # skipped rather than aborting the whole crawl.
      end
    end
  end

  # Run the full crawl and return the results as a pretty-printed
  # JSON string. Closes all browsers before returning.
  def collect_it_all
    # Generate URL
    url = "http://indeed.com/resumes?co=US"
    url = add_query(url) if @search_query
    url = add_location(url) if @location

    # Get first page and navigate the rest
    page_body = load_restart_page(url, 0)
    html = Nokogiri::HTML(page_body)
    get_page_links(html)

    # Get and parse all results
    parse_resumes

    # Close browsers when done and return results
    @requests.close_all_browsers
    return JSON.pretty_generate(@output)
  end
end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: indeedcrawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-23 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Crawls Indeed resumes
14
+ email: shidash@transparencytoolkit.org
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/indeedcrawler.rb
20
+ homepage: https://github.com/TransparencyToolkit/indeedcrawler
21
+ licenses:
22
+ - GPL
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.4.8
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Crawls Indeed resumes
44
+ test_files: []
45
+ has_rdoc: