indeedcrawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/indeedcrawler.rb +101 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0764adb113470fac8d62ee9eda673ae63d52d1cd
4
+ data.tar.gz: 6666aba0baff119ed76720feb85fa7a3cd16f9ca
5
+ SHA512:
6
+ metadata.gz: 3032cd536c0a38062ebeaaa27e1470aa58319a2cd612d7428d62572d3498c8625ba744b80b70cbda7d769f5b0f06a0776c310dc178baa3f50edfc5d90c98b66b
7
+ data.tar.gz: da91a5282da3e4d55407edba9fe4c2c181d56b36b4c6640476893da531a440b8e32f8838232c9fc097ee4c47391286889cf39966809c9308c5565e2356ada1f0
@@ -0,0 +1,101 @@
1
+ require 'json'
2
+ require 'uri'
3
+ require 'requestmanager'
4
+ require 'nokogiri'
5
+ require 'indeedparser'
6
+
7
# Crawls Indeed resume search results and parses every resume found.
#
# Collaborators (project-local gems — semantics assumed from call sites,
# TODO confirm against their docs):
#   * RequestManager — fetches pages, optionally through proxies; expected to
#     respond to get_page, restart_browser and close_all_browsers.
#   * IndeedParser  — turns a resume page into a JSON string of per-job records.
class IndeedCrawler
  # Number of additional attempts load_restart_page makes before giving up.
  MAX_RETRIES = 2

  # search_query - term for the q= URL parameter (nil/false to omit)
  # location     - term for the l= URL parameter (nil/false to omit)
  # proxy_list, wait_time, browser_num - forwarded verbatim to RequestManager
  def initialize(search_query, location, proxy_list, wait_time, browser_num)
    # Info for query
    @search_query = search_query
    @location = location

    # Settings for request manager
    @requests = RequestManager.new(proxy_list, wait_time, browser_num)

    # Result tracking: resume profile links found, parsed resume records
    @all_resume_links = []
    @output = []
  end

  # Return url with the search query appended as "&q=...".
  def add_query(url)
    url + "&q=" + URI.encode_www_form([@search_query])
  end

  # Return url with the location appended as "&l=...".
  # The "&" is always required: the base URL already carries "?co=US"
  # (previously the separator was skipped when no search query was set,
  # yielding a malformed "...co=USl=..." URL).
  def add_location(url)
    url + "&l=" + URI.encode_www_form([@location])
  end

  # Collect every resume link on this results page into @all_resume_links,
  # then follow pagination recursively while a "next" link exists.
  def get_page_links(html)
    # Each candidate is an <li> marked up as a schema.org Person
    profiles = html.xpath("//li[@itemtype='http://schema.org/Person']")

    profiles.each do |profile|
      anchor = profile.xpath(".//a[@class='app_link']")[0]
      next unless anchor # skip malformed entries instead of raising

      @all_resume_links.push("http://indeed.com/" + anchor['href'])
    end

    # Navigate to next page if there's a link to do that
    load_next_page(html) unless html.css("a.next").empty?
  end

  # Fetch the next results page and continue link collection there.
  def load_next_page(html)
    next_html = load_restart_page("http://indeed.com/resumes" + html.css("a.next").first['href'], 0)
    get_page_links(Nokogiri::HTML(next_html))
  end

  # Load url, restarting the browser and retrying on failure.
  # Returns the page body, or nil once MAX_RETRIES extra attempts have
  # failed — callers must tolerate nil.
  def load_restart_page(url, count = 0)
    @requests.get_page(url)
  rescue StandardError
    return nil if count >= MAX_RETRIES

    @requests.restart_browser
    load_restart_page(url, count + 1)
  end

  # Download every collected resume, parse it, and append the per-job
  # records to @output. Individual failures are logged and skipped so one
  # bad resume cannot abort the whole crawl.
  def parse_resumes
    @all_resume_links.each do |link|
      resume = load_restart_page(link, 0)
      next unless resume # page never loaded; nothing to parse

      begin
        # Parse resume and add to results
        parser = IndeedParser.new(resume, link, {time_scraped: Time.now})
        results = JSON.parse(parser.get_results_by_job)

        results.each do |result|
          @output.push(result)
        end
      rescue StandardError => e
        # Best-effort crawl: record the failure instead of swallowing it silently
        warn "indeedcrawler: failed to parse #{link}: #{e.class}: #{e.message}"
      end
    end
  end

  # Run the full crawl: build the search URL, walk every results page,
  # parse every resume, then shut the browsers down.
  # Returns the parsed records as a pretty-printed JSON string.
  def collect_it_all
    # Generate URL
    url = "http://indeed.com/resumes?co=US"
    url = add_query(url) if @search_query
    url = add_location(url) if @location

    # Get first page and navigate the rest
    page_body = load_restart_page(url, 0)
    html = Nokogiri::HTML(page_body)
    get_page_links(html)

    # Get and parse all results
    parse_resumes

    # Close browsers when done and return results
    @requests.close_all_browsers
    JSON.pretty_generate(@output)
  end
end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: indeedcrawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-23 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Crawls Indeed resumes
14
+ email: shidash@transparencytoolkit.org
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/indeedcrawler.rb
20
+ homepage: https://github.com/TransparencyToolkit/indeedcrawler
21
+ licenses:
22
+ - GPL
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.4.8
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Crawls Indeed resumes
44
+ test_files: []
45
+ has_rdoc: