indeedscraper 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/indeedscraper.rb +226 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a9642f26b946a935986e9d6f18d1836c839438d5
|
4
|
+
data.tar.gz: 5b3ef9d2f9e4bd7f4addabbbe08bb888b8819827
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c0519e44a8d29abd01776563480e8daec51a7639180d54a09e5bfeb466f47188abaa27f81cc4754676c9a483c353a2ac9f03de59f05b797b25a0571dca6baaad
|
7
|
+
data.tar.gz: fa1db21f31c5313b9077ce8c2ae902a5d7fe31f2084a08ee1efca48609defc5e628a90c4906da4ea289787e2b3eff140e0b83dac27218dd98845c2e7ba8ba0b0
|
@@ -0,0 +1,226 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'json'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'date'
|
5
|
+
require 'open_uri_redirections'
|
6
|
+
|
7
|
+
# Scrapes resumes and job listings from indeed.com search results and
# accumulates them as an array of hashes, exposed as pretty-printed JSON.
class IndeedScraper
  # searchterm - String search query (words separated by spaces).
  # location   - String location such as "Boston, MA", or nil to search
  #              all locations.
  def initialize(searchterm, location)
    @searchterm = searchterm
    @location = location
    @output = Array.new
  end

  # Fetches every page of resume search results for the configured term and
  # location, scraping each resume link via getResume into @output.
  def searchResumes
    # Build the search URL from non-destructive copies: the original used
    # gsub! on @searchterm/@location, which made the instance single-use
    # (a later searchJobs call would see the already-hyphenated term).
    term = @searchterm.gsub(" ", "-")
    if @location != nil
      loc = @location.gsub(", ", "-").gsub(" ", "-")
      url = "http://www.indeed.com/resumes/" + term + "/in-" + loc
    else
      url = "http://www.indeed.com/resumes/" + term
    end
    html = Nokogiri::HTML(URI.open(url))

    # Indeed shows 50 resumes per page; round the page count up.
    numresults = html.css("div#result_count").text.split(" ")
    count = (numresults[0].to_i / 50.0).ceil

    # Walk the result pages, scraping each resume link on each page.
    i = 1
    while i <= count
      results = html.css("ol#results")
      results.css("li").each do |l|
        getResume("http://indeed.com" + l.css("a")[0]["href"].gsub("?sp=0", ""))
      end
      i += 1
      nextstart = (i - 1) * 50
      html = Nokogiri::HTML(URI.open(url + "?start=" + nextstart.to_s))
    end
  end

  # Scrapes a single resume page, appending one hash per work-experience,
  # education, and military-service section to @output. Each hash repeats
  # the resume-level fields (name, skills, summary, ...) so records are
  # self-contained.
  def getResume(url)
    page = Nokogiri::HTML(URI.open(url))
    name = page.css('h1[itemprop="name"]').text
    location = page.css('p.locality').text
    currtitle = page.css('h2[itemprop="jobTitle"]').text
    summary = page.css('p#res_summary').text
    additionalinfo = page.css('div#additionalinfo-section').text
    skills = page.css("div#skills-section").text

    # Work history: one record per position.
    page.css("div.work-experience-section").each do |w|
      positionhash = Hash.new
      positionhash[:name] = name
      positionhash[:url] = url
      positionhash[:title] = w.css("p.work_title").text
      company_spans = w.css("div.work_company").css("span")
      if company_spans[0]
        positionhash[:company] = company_spans[0].text
      end
      if company_spans[1]
        positionhash[:company_location] = company_spans[1].text
      end

      # dateParse returns nil when no complete range could be parsed.
      dates = dateParse(w.css("p.work_dates"))
      if dates
        positionhash[:start_date] = dates[0]
        positionhash[:end_date] = dates[1]
      end

      positionhash[:description] = w.css("p.work_description").text

      # Resume-level info repeated on every record.
      positionhash[:skills] = skills
      positionhash[:current_location] = location
      positionhash[:current_title] = currtitle
      positionhash[:summary] = summary
      positionhash[:additional_info] = additionalinfo
      @output.push(positionhash)
    end

    # Education: one record per degree.
    page.css("div.education-section").each do |e|
      eduhash = Hash.new
      eduhash[:name] = name
      eduhash[:url] = url
      eduhash[:degree] = e.css("p.edu_title").text
      eduhash[:school] = e.css('span[itemprop="name"]').text
      eduhash[:dates] = e.css("p.edu_dates").text

      # Resume-level info repeated on every record.
      eduhash[:skills] = skills
      eduhash[:current_location] = location
      eduhash[:current_title] = currtitle
      eduhash[:summary] = summary
      eduhash[:additional_info] = additionalinfo
      @output.push(eduhash)
    end

    # Military service: one record per tour.
    page.css("div.military-section").each do |m|
      milhash = Hash.new
      milhash[:name] = name
      milhash[:url] = url
      milhash[:service_country] = m.css("p.military_country").text.gsub("Service Country: ", "")
      milhash[:branch] = m.css("p.military_branch").text.gsub("Branch: ", "")
      milhash[:rank] = m.css("p.military_rank").text.gsub("Rank: ", "")

      # FIX: dateParse can return nil; the original indexed the result
      # unconditionally here and crashed on undated service entries.
      # Guard exactly like the work-experience section above.
      dates = dateParse(m.css("p.military_date"))
      if dates
        milhash[:start_date] = dates[0]
        milhash[:end_date] = dates[1]
      end

      milhash[:military_description] = m.css("p.military_description").text
      milhash[:military_commendations] = m.css("p.military_commendations").text.split("\n")

      # Resume-level info repeated on every record.
      milhash[:skills] = skills
      milhash[:current_location] = location
      milhash[:current_title] = currtitle
      milhash[:summary] = summary
      milhash[:additional_info] = additionalinfo
      @output.push(milhash)
    end
  end

  # Parses a "<start> to <end>" date-range node (anything responding to
  # #text). Returns [start DateTime, end DateTime or the literal string
  # "Present"], or nil when no end portion exists — callers must guard.
  def dateParse(date)
    datearray = Array.new
    daterange = date.text.split(" to ")
    if daterange[0] != nil
      datearray[0] = DateTime.parse(dateCheck(daterange[0]))
    else
      datearray[0] = nil
    end

    if daterange[1] == "Present"
      datearray[1] = "Present"
    elsif daterange[1] != nil
      datearray[1] = DateTime.parse(dateCheck(daterange[1]))
    else
      # No end date: whole range is unusable (original contract preserved).
      datearray = nil
    end

    return datearray
  end

  # Indeed sometimes shows a bare year; prefix a month so DateTime.parse
  # yields January 1 of that year instead of raising.
  def dateCheck(date)
    if date.length == 4
      return "January " + date
    else
      return date
    end
  end

  # Fetches every page of job-listing search results, appending one hash
  # per listing (including the full listing text when reachable) to @output.
  def searchJobs
    # Non-destructive, URL-encoded copies (see searchResumes note).
    term = @searchterm.gsub(" ", "+")
    if @location != nil
      loc = @location.gsub(", ", "%2C+").gsub(" ", "+")
      url = "http://www.indeed.com/jobs?q=" + term + "&l=" + loc
    else
      loc = nil
      url = "http://www.indeed.com/jobs?q=" + term + "&l="
    end
    html = Nokogiri::HTML(URI.open(url))

    # Indeed shows 10 listings per page; round the page count up.
    numresults = html.css("div#searchCount").text.split(" of ")
    count = (numresults[1].to_i / 10.0).ceil

    # Walk the result pages, parsing each listing row.
    i = 1
    while i <= count
      html.css("div.row").each do |r|
        jobhash = Hash.new
        jobhash[:position] = r.css("h2.jobtitle").text.strip
        jobhash[:company] = r.css("span.company").text.strip
        jobhash[:location] = r.css('span[itemprop="jobLocation"]').text.strip
        link = r.css("h2.jobtitle").css("a")[0]
        if link
          jobhash[:url] = "http://indeed.com" + link["href"]
          begin
            jobhash[:text] = Nokogiri::HTML(URI.open(jobhash[:url])).text
          rescue
            begin
              # Some postings redirect off-site; retry following redirects
              # (option provided by the open_uri_redirections gem).
              jobhash[:text] = Nokogiri::HTML(URI.open(jobhash[:url], :allow_redirections => :all)).text
            rescue
              # Unreachable listing: keep the record without full text.
            end
          end
        end
        @output.push(jobhash)
      end

      # Build the next page's URL.
      i += 1
      nextstart = (i - 1) * 10
      if loc
        url = "http://www.indeed.com/jobs?q=" + term + "&l=" + loc + "&start=" + nextstart.to_s
      else
        url = "http://www.indeed.com/jobs?q=" + term + "&start=" + nextstart.to_s
      end
      html = Nokogiri::HTML(URI.open(url))
    end
  end

  # Returns everything scraped so far as pretty-printed JSON.
  def getOutput
    JSON.pretty_generate(@output)
  end
end
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: indeedscraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- M. C. McGrath
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-06-01 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Get resumes and job listings from indeed based on search terms and locations.
|
14
|
+
email: shidash@shidash.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/indeedscraper.rb
|
20
|
+
homepage: https://github.com/TransparencyToolkit/IndeedScraper
|
21
|
+
licenses:
|
22
|
+
- GPL
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.0.14
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Get resumes and job listings from indeed.
|
44
|
+
test_files: []
|
45
|
+
has_rdoc:
|