indeedscraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/indeedscraper.rb +226 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a9642f26b946a935986e9d6f18d1836c839438d5
4
+ data.tar.gz: 5b3ef9d2f9e4bd7f4addabbbe08bb888b8819827
5
+ SHA512:
6
+ metadata.gz: c0519e44a8d29abd01776563480e8daec51a7639180d54a09e5bfeb466f47188abaa27f81cc4754676c9a483c353a2ac9f03de59f05b797b25a0571dca6baaad
7
+ data.tar.gz: fa1db21f31c5313b9077ce8c2ae902a5d7fe31f2084a08ee1efca48609defc5e628a90c4906da4ea289787e2b3eff140e0b83dac27218dd98845c2e7ba8ba0b0
data/lib/indeedscraper.rb ADDED
@@ -0,0 +1,226 @@
1
+ require 'open-uri'
2
+ require 'json'
3
+ require 'nokogiri'
4
+ require 'date'
5
+ require 'open_uri_redirections'
6
+
7
# Scrapes resumes and job listings from indeed.com for a search term and an
# optional location, accumulating one hash per record in memory. Call
# #searchResumes and/or #searchJobs, then #getOutput to obtain the JSON.
#
# Relies on Nokogiri and open-uri (plus open_uri_redirections for the
# :allow_redirections option) being loaded by the enclosing file.
class IndeedScraper
  # Page sizes used when computing page counts and "start" offsets.
  RESUMES_PER_PAGE = 50
  JOBS_PER_PAGE = 10

  # @param searchterm [String] free-text search term
  # @param location [String, nil] location such as "Austin, TX"; nil for none
  def initialize(searchterm, location)
    @searchterm = searchterm
    @location = location
    @output = []
  end

  # Fetches every page of resume search results and records each resume
  # found via #getResume.
  #
  # @return [nil]
  def searchResumes
    url = resumes_url
    html = fetch_page(url)
    total = parse_count(html.css("div#result_count").text.split(" ")[0])

    1.upto(page_count(total, RESUMES_PER_PAGE)) do |page|
      # The first page is already loaded; later pages are fetched on demand
      # so no request is made past the last page.
      if page > 1
        html = fetch_page(url + "?start=" + ((page - 1) * RESUMES_PER_PAGE).to_s)
      end

      html.css("ol#results").css("li").each do |item|
        link = item.css("a")[0]
        # Skip list items that carry no resume link instead of raising.
        next unless link && link["href"]
        getResume("http://indeed.com" + link["href"].gsub("?sp=0", ""))
      end
    end
    nil
  end

  # Downloads one resume page and appends one record per work-experience,
  # education and military section to the output. Every record also carries
  # the resume-wide fields (skills, current location/title, summary,
  # additional info).
  #
  # @param url [String] absolute URL of the resume page
  def getResume(url)
    page = fetch_page(url)
    identity = { :name => page.css('h1[itemprop="name"]').text, :url => url }
    shared = {
      :skills => page.css("div#skills-section").text,
      :current_location => page.css('p.locality').text,
      :current_title => page.css('h2[itemprop="jobTitle"]').text,
      :summary => page.css('p#res_summary').text,
      :additional_info => page.css('div#additionalinfo-section').text
    }

    # Work experience
    page.css("div.work-experience-section").each do |work|
      record = identity.dup
      record[:title] = work.css("p.work_title").text
      spans = work.css("div.work_company").css("span")
      record[:company] = spans[0].text if spans[0]
      record[:company_location] = spans[1].text if spans[1]
      add_dates(record, work.css("p.work_dates"))
      record[:description] = work.css("p.work_description").text
      @output.push(record.merge(shared))
    end

    # Education
    page.css("div.education-section").each do |edu|
      record = identity.dup
      record[:degree] = edu.css("p.edu_title").text
      record[:school] = edu.css('span[itemprop="name"]').text
      record[:dates] = edu.css("p.edu_dates").text
      @output.push(record.merge(shared))
    end

    # Military service
    page.css("div.military-section").each do |mil|
      record = identity.dup
      record[:service_country] = mil.css("p.military_country").text.gsub("Service Country: ", "")
      record[:branch] = mil.css("p.military_branch").text.gsub("Branch: ", "")
      record[:rank] = mil.css("p.military_rank").text.gsub("Rank: ", "")
      # Same nil-safe date handling as the work section; dateParse returns
      # nil when the section has no usable date range.
      add_dates(record, mil.css("p.military_date"))
      record[:military_description] = mil.css("p.military_description").text
      record[:military_commendations] = mil.css("p.military_commendations").text.split("\n")
      @output.push(record.merge(shared))
    end
  end

  # Splits a "<start> to <end>" date range into parsed bounds.
  #
  # @param date [#text] node (or node set) whose text holds the range
  # @return [Array, nil] [start, end] where start is a DateTime or nil and
  #   end is a DateTime or the string "Present"; nil when no end bound exists
  # @raise [ArgumentError] if a bound cannot be parsed by DateTime.parse
  #
  # NOTE(review): a text holding a single date (no " to ") yields nil and the
  # start date is dropped. Kept as-is because callers rely on the nil
  # contract - confirm whether [start, nil] was intended.
  def dateParse(date)
    first, last = date.text.split(" to ").map(&:strip)
    first = nil if first && first.empty?
    last = nil if last && last.empty?
    return nil unless last

    start_date = first ? DateTime.parse(dateCheck(first)) : nil
    end_date = last == "Present" ? "Present" : DateTime.parse(dateCheck(last))
    [start_date, end_date]
  end

  # Expands a bare four-digit year into "January <year>" before it is handed
  # to DateTime.parse; any other string is returned unchanged.
  #
  # @param date [String]
  # @return [String]
  def dateCheck(date)
    year = date.strip
    # Match digits explicitly: a length check alone would also rewrite
    # four-letter month names such as "June".
    year =~ /\A\d{4}\z/ ? "January " + year : date
  end

  # Fetches every page of job search results and appends one record per
  # listing (position, company, location and, when a link exists, the URL
  # and the text of the linked page).
  #
  # @return [nil]
  def searchJobs
    url = jobs_url
    html = fetch_page(url)
    total = parse_count(html.css("div#searchCount").text.split(" of ")[1])

    1.upto(page_count(total, JOBS_PER_PAGE)) do |page|
      if page > 1
        html = fetch_page(url + "&start=" + ((page - 1) * JOBS_PER_PAGE).to_s)
      end
      html.css("div.row").each { |row| @output.push(parse_job(row)) }
    end
    nil
  end

  # @return [String] all collected records as pretty-printed JSON
  def getOutput
    JSON.pretty_generate(@output)
  end

  private

  # Builds the resume search URL; spaces and ", " become "-" in the path.
  # Works on copies so the strings given to #initialize are never mutated.
  def resumes_url
    url = "http://www.indeed.com/resumes/" + @searchterm.to_s.gsub(" ", "-")
    return url unless @location
    url + "/in-" + @location.gsub(", ", "-").gsub(" ", "-")
  end

  # Builds the job search URL with form-encoded query values
  # (space -> "+", "," -> "%2C", and other reserved characters escaped).
  def jobs_url
    "http://www.indeed.com/jobs?q=" + URI.encode_www_form_component(@searchterm.to_s) +
      "&l=" + URI.encode_www_form_component(@location.to_s)
  end

  # Downloads and parses an HTML page. Uses URI#open rather than Kernel#open
  # so a string such as "|cmd" can never spawn a process, and so the call
  # keeps working on Ruby versions where Kernel#open no longer opens URLs.
  def fetch_page(url, options = nil)
    uri = URI.parse(url)
    Nokogiri::HTML(options ? uri.open(options) : uri.open)
  end

  # Extracts an integer from text such as "1,234", ignoring separators.
  def parse_count(text)
    text.to_s.delete("^0-9").to_i
  end

  # Number of pages needed to show +total+ results at +per_page+ per page.
  def page_count(total, per_page)
    (total.to_f / per_page).ceil
  end

  # Stores parsed start/end dates on +record+ when the node has a usable range.
  def add_dates(record, node)
    dates = dateParse(node)
    return unless dates
    record[:start_date] = dates[0]
    record[:end_date] = dates[1]
  end

  # Builds the record for one job listing row.
  def parse_job(row)
    job = {}
    job[:position] = row.css("h2.jobtitle").text.strip
    job[:company] = row.css("span.company").text.strip
    job[:location] = row.css('span[itemprop="jobLocation"]').text.strip

    link = row.css("h2.jobtitle").css("a")[0]
    if link && link["href"]
      job[:url] = "http://indeed.com" + link["href"]
      text = fetch_job_text(job[:url])
      # Best effort: the :text key is simply left out when the page
      # could not be downloaded.
      job[:text] = text if text
    end
    job
  end

  # Text of the linked job page; retries once allowing cross-scheme
  # redirects and returns nil if both attempts fail.
  def fetch_job_text(url)
    fetch_page(url).text
  rescue StandardError
    begin
      fetch_page(url, :allow_redirections => :all).text
    rescue StandardError
      nil
    end
  end
end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: indeedscraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-01 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Get resumes and job listings from indeed based on search terms and locations.
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/indeedscraper.rb
20
+ homepage: https://github.com/TransparencyToolkit/IndeedScraper
21
+ licenses:
22
+ - GPL
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.0.14
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Get resumes and job listings from indeed.
44
+ test_files: []
45
+ has_rdoc: