indeedscraper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/indeedscraper.rb +226 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a9642f26b946a935986e9d6f18d1836c839438d5
4
+ data.tar.gz: 5b3ef9d2f9e4bd7f4addabbbe08bb888b8819827
5
+ SHA512:
6
+ metadata.gz: c0519e44a8d29abd01776563480e8daec51a7639180d54a09e5bfeb466f47188abaa27f81cc4754676c9a483c353a2ac9f03de59f05b797b25a0571dca6baaad
7
+ data.tar.gz: fa1db21f31c5313b9077ce8c2ae902a5d7fe31f2084a08ee1efca48609defc5e628a90c4906da4ea289787e2b3eff140e0b83dac27218dd98845c2e7ba8ba0b0
@@ -0,0 +1,226 @@
1
+ require 'open-uri'
2
+ require 'json'
3
+ require 'nokogiri'
4
+ require 'date'
5
+ require 'open_uri_redirections'
6
+
7
+ class IndeedScraper
8
+ def initialize(searchterm, location)
9
+ @searchterm = searchterm
10
+ @location = location
11
+ @output = Array.new
12
+ end
13
+
14
+ # Get all results
15
+ def searchResumes
16
+ @searchterm.gsub!(" ", "-")
17
+ if @location != nil
18
+ @location.gsub!(", ", "-")
19
+ @location.gsub!(" ", "-")
20
+ url = "http://www.indeed.com/resumes/" + @searchterm + "/in-" + @location
21
+ else
22
+ url = "http://www.indeed.com/resumes/" + @searchterm
23
+ end
24
+ html = Nokogiri::HTML(open(url))
25
+
26
+ # Handle multiple pages
27
+ numresults = html.css("div#result_count").text.split(" ")
28
+ fresult = numresults[0].to_i/50.0
29
+ if fresult != numresults[0].to_i/50
30
+ count = fresult +1
31
+ else
32
+ count = numresults[0].to_i/50
33
+ end
34
+
35
+ # Loop through pages and get results
36
+ i = 1
37
+ while i <= count
38
+ results = html.css("ol#results")
39
+ results.css("li").each do |l|
40
+ getResume("http://indeed.com"+l.css("a")[0]["href"].gsub("?sp=0",""))
41
+ end
42
+ i += 1
43
+ nextstart = (i-1)*50
44
+ html = Nokogiri::HTML(open(url+"?start="+nextstart.to_s))
45
+ end
46
+ end
47
+
48
+ # Process and save resume data
49
+ def getResume(url)
50
+ page = Nokogiri::HTML(open(url))
51
+ name = page.css('h1[itemprop="name"]').text
52
+ location = page.css('p.locality').text
53
+ currtitle = page.css('h2[itemprop="jobTitle"]').text
54
+ summary = page.css('p#res_summary').text
55
+ additionalinfo = page.css('div#additionalinfo-section').text
56
+ skills = page.css("div#skills-section").text
57
+
58
+ # Get work info
59
+ page.css("div.work-experience-section").each do |w|
60
+ positionhash = Hash.new
61
+ positionhash[:name] = name
62
+ positionhash[:url] = url
63
+ positionhash[:title] = w.css("p.work_title").text
64
+ if w.css("div.work_company").css("span")[0]
65
+ positionhash[:company] = w.css("div.work_company").css("span")[0].text
66
+ end
67
+ if w.css("div.work_company").css("span")[1]
68
+ positionhash[:company_location] = w.css("div.work_company").css("span")[1].text
69
+ end
70
+
71
+ # Process date info
72
+ dates = dateParse(w.css("p.work_dates"))
73
+ if dates
74
+ positionhash[:start_date] = dates[0]
75
+ positionhash[:end_date] = dates[1]
76
+ end
77
+
78
+ positionhash[:description] = w.css("p.work_description").text
79
+
80
+ # Info for all positions
81
+ positionhash[:skills] = skills
82
+ positionhash[:current_location] = location
83
+ positionhash[:current_title] = currtitle
84
+ positionhash[:summary] = summary
85
+ positionhash[:additional_info] = additionalinfo
86
+ @output.push(positionhash)
87
+ end
88
+
89
+ # Get education info
90
+ page.css("div.education-section").each do |e|
91
+ eduhash = Hash.new
92
+ eduhash[:name] = name
93
+ eduhash[:url] = url
94
+ eduhash[:degree] = e.css("p.edu_title").text
95
+ eduhash[:school] = e.css('span[itemprop="name"]').text
96
+ eduhash[:dates] = e.css("p.edu_dates").text
97
+
98
+ # Info for all degrees
99
+ eduhash[:skills] = skills
100
+ eduhash[:current_location] = location
101
+ eduhash[:current_title] = currtitle
102
+ eduhash[:summary] = summary
103
+ eduhash[:additional_info] = additionalinfo
104
+ @output.push(eduhash)
105
+ end
106
+
107
+ # Get military service info
108
+ page.css("div.military-section").each do |m|
109
+ milhash = Hash.new
110
+ milhash[:name] = name
111
+ milhash[:url] = url
112
+ milhash[:service_country] = m.css("p.military_country").text.gsub("Service Country: ", "")
113
+ milhash[:branch] = m.css("p.military_branch").text.gsub("Branch: ", "")
114
+ milhash[:rank] = m.css("p.military_rank").text.gsub("Rank: ", "")
115
+
116
+ # Parse dates
117
+ dates = dateParse(m.css("p.military_date"))
118
+ milhash[:start_date] = dates[0]
119
+ milhash[:end_date] = dates[1]
120
+
121
+ milhash[:military_description] = m.css("p.military_description").text
122
+ milhash[:military_commendations] = m.css("p.military_commendations").text.split("\n")
123
+
124
+ # Info for all items
125
+ milhash[:skills] = skills
126
+ milhash[:current_location] = location
127
+ milhash[:current_title] = currtitle
128
+ milhash[:summary] = summary
129
+ milhash[:additional_info] = additionalinfo
130
+ @output.push(milhash)
131
+ end
132
+ end
133
+
134
+ # Process dates
135
+ def dateParse(date)
136
+ datearray = Array.new
137
+ daterange = date.text.split(" to ")
138
+ if daterange[0] != nil
139
+ datearray[0] = DateTime.parse(dateCheck(daterange[0]))
140
+ else
141
+ datearray[0] = nil
142
+ end
143
+
144
+ if daterange[1] == "Present"
145
+ datearray[1] = "Present"
146
+ else
147
+ if daterange[1] != nil
148
+ datearray[1] = DateTime.parse(dateCheck(daterange[1]))
149
+ else
150
+ datearray = nil
151
+ end
152
+ end
153
+
154
+ return datearray
155
+ end
156
+
157
+ # Handle year only dates
158
+ def dateCheck(date)
159
+ if date.length == 4
160
+ return "January " + date
161
+ else
162
+ return date
163
+ end
164
+ end
165
+
166
+ # Search for jobs
167
+ def searchJobs
168
+ @searchterm.gsub!(" ", "+")
169
+ if @location != nil
170
+ @location.gsub!(", ", "%2C+")
171
+ @location.gsub!(" ", "+")
172
+ url = "http://www.indeed.com/jobs?q=" + @searchterm + "&l=" + @location
173
+ else
174
+ url = "http://www.indeed.com/jobs?q=" + @searchterm + "&l="
175
+ end
176
+ html = Nokogiri::HTML(open(url))
177
+
178
+ # Handle multiple pages
179
+ numresults = html.css("div#searchCount").text.split(" of ")
180
+ fresult = numresults[1].to_i/10.0
181
+ if fresult != numresults[1].to_i/10
182
+ count = fresult +1
183
+ else
184
+ count = numresults[1].to_i/10
185
+ end
186
+
187
+ # Loop through pages and get results
188
+ i = 1
189
+ while i <= count
190
+ # Parse each listing
191
+ html.css("div.row").each do |r|
192
+ jobhash = Hash.new
193
+ jobhash[:position] = r.css("h2.jobtitle").text.strip.lstrip
194
+ jobhash[:company] = r.css("span.company").text.strip.lstrip
195
+ jobhash[:location] = r.css('span[itemprop="jobLocation"]').text.strip.lstrip
196
+ if r.css("h2.jobtitle").css("a")[0]
197
+ jobhash[:url] = "http://indeed.com" + r.css("h2.jobtitle").css("a")[0]["href"]
198
+ begin
199
+ jobhash[:text] = Nokogiri::HTML(open(jobhash[:url])).text
200
+ rescue
201
+ begin
202
+ jobhash[:text] = Nokogiri::HTML(open(jobhash[:url], :allow_redirections => :all)).text
203
+ rescue
204
+ end
205
+ end
206
+ end
207
+ @output.push(jobhash)
208
+ end
209
+
210
+ # Get next page
211
+ i += 1
212
+ nextstart = (i-1)*10
213
+ if @location != nil
214
+ url = "http://www.indeed.com/jobs?q=" + @searchterm + "&l=" + @location + "&start=" + nextstart.to_s
215
+ else
216
+ url = "http://www.indeed.com/jobs?q=" + @searchterm + "&start=" + nextstart.to_s
217
+ end
218
+ html = Nokogiri::HTML(open(url))
219
+ end
220
+ end
221
+
222
+ # Generates JSON output
223
+ def getOutput
224
+ JSON.pretty_generate(@output)
225
+ end
226
+ end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: indeedscraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-01 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Get resumes and job listings from indeed based on search terms and locations.
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/indeedscraper.rb
20
+ homepage: https://github.com/TransparencyToolkit/IndeedScraper
21
+ licenses:
22
+ - GPL
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.0.14
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Get resumes and job listings from indeed.
44
+ test_files: []
45
+ has_rdoc: