indeedscraper 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/indeedscraper.rb +226 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a9642f26b946a935986e9d6f18d1836c839438d5
|
4
|
+
data.tar.gz: 5b3ef9d2f9e4bd7f4addabbbe08bb888b8819827
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c0519e44a8d29abd01776563480e8daec51a7639180d54a09e5bfeb466f47188abaa27f81cc4754676c9a483c353a2ac9f03de59f05b797b25a0571dca6baaad
|
7
|
+
data.tar.gz: fa1db21f31c5313b9077ce8c2ae902a5d7fe31f2084a08ee1efca48609defc5e628a90c4906da4ea289787e2b3eff140e0b83dac27218dd98845c2e7ba8ba0b0
|
@@ -0,0 +1,226 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'json'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'date'
|
5
|
+
require 'open_uri_redirections'
|
6
|
+
|
7
|
+
# Scrapes resume and job-listing data from indeed.com search results.
#
# Typical use:
#   scraper = IndeedScraper.new("systems administrator", "Boston, MA")
#   scraper.searchResumes            # or scraper.searchJobs
#   json = scraper.getOutput
#
# Network access happens only inside searchResumes / searchJobs / getResume;
# construction and getOutput are side-effect free.
class IndeedScraper
  # searchterm - search query, e.g. "systems administrator"
  # location   - "City, ST" string, or nil to search all locations
  def initialize(searchterm, location)
    @searchterm = searchterm
    @location = location
    @output = Array.new
  end

  # Fetches every page of resume search results and scrapes each linked
  # resume via getResume. One hash per resume entry accumulates in @output.
  def searchResumes
    # Work on copies: the original used gsub!, which destructively mutated
    # the strings the caller passed in (and corrupted a second search).
    term = @searchterm.gsub(" ", "-")
    if @location != nil
      loc = @location.gsub(", ", "-").gsub(" ", "-")
      url = "http://www.indeed.com/resumes/" + term + "/in-" + loc
    else
      url = "http://www.indeed.com/resumes/" + term
    end
    # URI.open: Kernel#open no longer accepts URLs in Ruby 3.
    html = Nokogiri::HTML(URI.open(url))

    # Indeed paginates resumes 50 per page; strip thousands separators
    # ("1,234".to_i is 1) and round up for the page count.
    numresults = html.css("div#result_count").text.split(" ")
    total = numresults[0].to_s.delete(",").to_i
    count = (total / 50.0).ceil

    # Walk each page and scrape every linked resume.
    i = 1
    while i <= count
      html.css("ol#results").css("li").each do |item|
        link = item.css("a")[0]
        next if link.nil? || link["href"].nil? # skip malformed entries
        getResume("http://indeed.com" + link["href"].gsub("?sp=0", ""))
      end
      i += 1
      nextstart = (i - 1) * 50
      html = Nokogiri::HTML(URI.open(url + "?start=" + nextstart.to_s))
    end
  end

  # Downloads a single resume page and pushes one hash per work-experience,
  # education, and military-service entry onto @output. Each hash repeats
  # the resume-level fields (name, skills, summary, ...) for flat output.
  def getResume(url)
    page = Nokogiri::HTML(URI.open(url))
    name = page.css('h1[itemprop="name"]').text
    location = page.css('p.locality').text
    currtitle = page.css('h2[itemprop="jobTitle"]').text
    summary = page.css('p#res_summary').text
    additionalinfo = page.css('div#additionalinfo-section').text
    skills = page.css("div#skills-section").text

    # Work experience entries.
    page.css("div.work-experience-section").each do |w|
      positionhash = Hash.new
      positionhash[:name] = name
      positionhash[:url] = url
      positionhash[:title] = w.css("p.work_title").text
      company_spans = w.css("div.work_company").css("span")
      positionhash[:company] = company_spans[0].text if company_spans[0]
      positionhash[:company_location] = company_spans[1].text if company_spans[1]

      # dateParse returns nil when no usable range was found.
      dates = dateParse(w.css("p.work_dates"))
      if dates
        positionhash[:start_date] = dates[0]
        positionhash[:end_date] = dates[1]
      end

      positionhash[:description] = w.css("p.work_description").text

      # Resume-level fields repeated on every entry.
      positionhash[:skills] = skills
      positionhash[:current_location] = location
      positionhash[:current_title] = currtitle
      positionhash[:summary] = summary
      positionhash[:additional_info] = additionalinfo
      @output.push(positionhash)
    end

    # Education entries.
    page.css("div.education-section").each do |e|
      eduhash = Hash.new
      eduhash[:name] = name
      eduhash[:url] = url
      eduhash[:degree] = e.css("p.edu_title").text
      eduhash[:school] = e.css('span[itemprop="name"]').text
      eduhash[:dates] = e.css("p.edu_dates").text

      eduhash[:skills] = skills
      eduhash[:current_location] = location
      eduhash[:current_title] = currtitle
      eduhash[:summary] = summary
      eduhash[:additional_info] = additionalinfo
      @output.push(eduhash)
    end

    # Military service entries.
    page.css("div.military-section").each do |m|
      milhash = Hash.new
      milhash[:name] = name
      milhash[:url] = url
      milhash[:service_country] = m.css("p.military_country").text.gsub("Service Country: ", "")
      milhash[:branch] = m.css("p.military_branch").text.gsub("Branch: ", "")
      milhash[:rank] = m.css("p.military_rank").text.gsub("Rank: ", "")

      # BUG FIX: the original indexed dates[0]/dates[1] without a nil
      # check, raising NoMethodError whenever the service dates were
      # absent or had no end date (dateParse returns nil in that case).
      dates = dateParse(m.css("p.military_date"))
      if dates
        milhash[:start_date] = dates[0]
        milhash[:end_date] = dates[1]
      end

      milhash[:military_description] = m.css("p.military_description").text
      milhash[:military_commendations] = m.css("p.military_commendations").text.split("\n")

      milhash[:skills] = skills
      milhash[:current_location] = location
      milhash[:current_title] = currtitle
      milhash[:summary] = summary
      milhash[:additional_info] = additionalinfo
      @output.push(milhash)
    end
  end

  # Parses a "<start> to <end>" date range out of a node's text.
  #
  # date - any object responding to #text (e.g. a Nokogiri node set).
  #
  # Returns [DateTime, DateTime], [DateTime, "Present"], or nil when the
  # text lacks a complete range. Callers must nil-check the result.
  # NOTE(review): DateTime.parse raises ArgumentError on unparseable
  # text; preserved as-is since changing it would alter the contract.
  def dateParse(date)
    datearray = Array.new
    daterange = date.text.split(" to ")
    if daterange[0] != nil
      datearray[0] = DateTime.parse(dateCheck(daterange[0]))
    else
      datearray[0] = nil
    end

    if daterange[1] == "Present"
      datearray[1] = "Present"
    else
      if daterange[1] != nil
        datearray[1] = DateTime.parse(dateCheck(daterange[1]))
      else
        # No end date: the whole range is treated as unusable.
        datearray = nil
      end
    end

    return datearray
  end

  # Expands a bare four-character year ("2014") to "January 2014" so that
  # DateTime.parse accepts it; any other string is returned untouched.
  def dateCheck(date)
    if date.length == 4
      return "January " + date
    else
      return date
    end
  end

  # Fetches every page of job-listing search results and appends one hash
  # per listing (including the listing page's full text when reachable)
  # onto @output.
  def searchJobs
    # Copies again: no destructive mutation of the caller's strings.
    term = @searchterm.gsub(" ", "+")
    if @location != nil
      loc = @location.gsub(", ", "%2C+").gsub(" ", "+")
      url = "http://www.indeed.com/jobs?q=" + term + "&l=" + loc
    else
      loc = nil
      url = "http://www.indeed.com/jobs?q=" + term + "&l="
    end
    html = Nokogiri::HTML(URI.open(url))

    # Jobs are paginated 10 per page; strip thousands separators and
    # round up for the page count.
    numresults = html.css("div#searchCount").text.split(" of ")
    total = numresults[1].to_s.delete(",").to_i
    count = (total / 10.0).ceil

    i = 1
    while i <= count
      # Parse each listing row on the current page.
      html.css("div.row").each do |r|
        jobhash = Hash.new
        jobhash[:position] = r.css("h2.jobtitle").text.strip
        jobhash[:company] = r.css("span.company").text.strip
        jobhash[:location] = r.css('span[itemprop="jobLocation"]').text.strip
        link = r.css("h2.jobtitle").css("a")[0]
        if link
          jobhash[:url] = "http://indeed.com" + link["href"]
          begin
            jobhash[:text] = Nokogiri::HTML(URI.open(jobhash[:url])).text
          rescue StandardError
            begin
              # Some listings redirect off-site; retry allowing redirects
              # (open_uri_redirections adds :allow_redirections).
              jobhash[:text] = Nokogiri::HTML(URI.open(jobhash[:url], :allow_redirections => :all)).text
            rescue StandardError
              # Listing page unreachable; keep the row metadata sans text.
            end
          end
        end
        @output.push(jobhash)
      end

      # Advance to the next results page.
      i += 1
      nextstart = (i - 1) * 10
      if loc != nil
        url = "http://www.indeed.com/jobs?q=" + term + "&l=" + loc + "&start=" + nextstart.to_s
      else
        url = "http://www.indeed.com/jobs?q=" + term + "&start=" + nextstart.to_s
      end
      html = Nokogiri::HTML(URI.open(url))
    end
  end

  # Returns everything collected so far as a pretty-printed JSON string.
  def getOutput
    JSON.pretty_generate(@output)
  end
end
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: indeedscraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- M. C. McGrath
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-06-01 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Get resumes and job listings from indeed based on search terms and locations.
|
14
|
+
email: shidash@shidash.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/indeedscraper.rb
|
20
|
+
homepage: https://github.com/TransparencyToolkit/IndeedScraper
|
21
|
+
licenses:
|
22
|
+
- GPL
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.0.14
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Get resumes and job listings from indeed.
|
44
|
+
test_files: []
|
45
|
+
has_rdoc:
|