indeedcrawler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/indeedcrawler.rb +101 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 0764adb113470fac8d62ee9eda673ae63d52d1cd
|
4
|
+
data.tar.gz: 6666aba0baff119ed76720feb85fa7a3cd16f9ca
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3032cd536c0a38062ebeaaa27e1470aa58319a2cd612d7428d62572d3498c8625ba744b80b70cbda7d769f5b0f06a0776c310dc178baa3f50edfc5d90c98b66b
|
7
|
+
data.tar.gz: da91a5282da3e4d55407edba9fe4c2c181d56b36b4c6640476893da531a440b8e32f8838232c9fc097ee4c47391286889cf39966809c9308c5565e2356ada1f0
|
@@ -0,0 +1,101 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'uri'
|
3
|
+
require 'requestmanager'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'indeedparser'
|
6
|
+
|
7
|
+
class IndeedCrawler
  # Crawls Indeed resume search results for a query and/or location,
  # fetches each resume page, and parses it into structured records via
  # IndeedParser. Fetching goes through a RequestManager browser pool.
  def initialize(search_query, location, proxy_list, wait_time, browser_num)
    # Info for query
    @search_query = search_query
    @location = location

    # Settings for request manager (proxy rotation, throttling, pool size)
    @requests = RequestManager.new(proxy_list, wait_time, browser_num)

    # Result tracking
    @all_resume_links = []
    @output = []
  end

  # Append the search query as a q= parameter and return the new URL.
  # Assumes url already contains a query string (the base URL carries
  # "?co=US"), so "&" is the correct separator.
  def add_query(url)
    url + "&q=" + URI.encode_www_form_component(@search_query)
  end

  # Append the location as an l= parameter and return the new URL.
  # Fix: the original inserted the "&" separator only when a search query
  # was present, so a location-only search produced "...?co=USl=..." —
  # now the separator is added whenever the URL does not already end with
  # "?" or "&".
  def add_location(url)
    url += "&" unless url.end_with?("?", "&")
    url + "l=" + URI.encode_www_form_component(@location)
  end

  # Collect resume links from one results page, then recurse through
  # pagination while a "next" link exists.
  def get_page_links(html)
    # Each result is an <li> tagged with the schema.org Person itemtype
    profiles = html.xpath("//li[@itemtype='http://schema.org/Person']")

    profiles.each do |profile|
      anchor = profile.at_xpath(".//a[@class='app_link']")
      # Guard: skip results with no profile anchor instead of crashing
      @all_resume_links.push("http://indeed.com/" + anchor['href']) if anchor
    end

    # Navigate to next page if there's a link to do that
    load_next_page(html) unless html.css("a.next").empty?
  end

  # Fetch the next results page and continue link collection on it.
  def load_next_page(html)
    next_url = "http://indeed.com/resumes" + html.css("a.next").first['href']
    get_page_links(Nokogiri::HTML(load_restart_page(next_url, 0)))
  end

  # Fetch a page, restarting the browser and retrying (up to two retries)
  # on failure. Returns the page body, or nil when all attempts fail.
  def load_restart_page(url, count)
    @requests.get_page(url)
  rescue StandardError
    return nil if count >= 2
    @requests.restart_browser
    load_restart_page(url, count + 1)
  end

  # Download and parse every collected resume, accumulating the parsed
  # job records into @output.
  def parse_resumes
    @all_resume_links.each do |link|
      resume = load_restart_page(link, 0)

      begin
        # Parse resume and add its job records to the results
        parser = IndeedParser.new(resume, link, {time_scraped: Time.now})
        @output.concat(JSON.parse(parser.get_results_by_job))
      rescue StandardError
        # Deliberate best-effort: a malformed or missing resume page
        # should not abort the whole crawl.
      end
    end
  end

  # Run the full crawl: build the search URL, walk all result pages,
  # parse every resume, close the browser pool, and return the results
  # as pretty-printed JSON.
  def collect_it_all
    # Generate URL
    url = "http://indeed.com/resumes?co=US"
    url = add_query(url) if @search_query
    url = add_location(url) if @location

    # Get first page and navigate the rest
    page_body = load_restart_page(url, 0)
    get_page_links(Nokogiri::HTML(page_body))

    # Get and parse all results
    parse_resumes

    # Close browsers when done and return results
    @requests.close_all_browsers
    JSON.pretty_generate(@output)
  end
end
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: indeedcrawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- M. C. McGrath
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-12-23 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Crawls Indeed resumes
|
14
|
+
email: shidash@transparencytoolkit.org
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/indeedcrawler.rb
|
20
|
+
homepage: https://github.com/TransparencyToolkit/indeedcrawler
|
21
|
+
licenses:
|
22
|
+
- GPL
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">="
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.4.8
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Crawls Indeed resumes
|
44
|
+
test_files: []
|
45
|
+
has_rdoc:
|