indeed_scraper2022 0.2.1 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +120 -4
- data.tar.gz.sig +0 -0
- metadata +6 -6
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 88f80a06ef0ab435c144d3b4ec53f1c98f2da7c427224c31dbef44f62fdafee3
|
4
|
+
data.tar.gz: d0f549053bb225e7c8ebb2492715c6c470f689e191e9ae747e8f97317a61c02c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 90d23c6c35a87cdcf763dc15a072f35ede166aafe2f1f5eed9294de28e916cdaca3eb043a034fb50750c9444af9e5042dc50d01390816efcdde360d2d01c4e55
|
7
|
+
data.tar.gz: 30153c9c5aafdb5e89d56632e223edfe0196a71c4d9294cc3f963f2039238cd27b836ac9ac42b49cad27988ada868d0d002246699399f7d175216b97689d46ed
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -4,11 +4,20 @@
|
|
4
4
|
|
5
5
|
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
|
+
require 'yaml'
|
7
8
|
|
8
9
|
# Given the nature of changes to jobsearch websites,
|
9
10
|
# don't rely upon this gem working in the near future.
|
10
11
|
|
11
12
|
|
13
|
+
|
14
|
+
# this gem consists of 3 main classes:
|
15
|
+
#
|
16
|
+
# * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
|
17
|
+
# * IS22Plus - Archives the scraped vacancies to a local file
|
18
|
+
# * IS22Archive - Allows viewing of archived vacancies offline
|
19
|
+
#
|
20
|
+
|
12
21
|
# Error raised by this gem's scraper classes.
#
# Inherits from StandardError (rather than Exception) so that a bare
# `rescue` / `rescue StandardError` in calling code can handle it, while
# `rescue IndeedScraper2022Err` and `rescue Exception` keep working as before.
#
class IndeedScraper2022Err < StandardError
end
|
14
23
|
|
@@ -99,7 +108,9 @@ class IndeedScraper2022
|
|
99
108
|
div1 = td.element("div[@class='companyInfo']")
|
100
109
|
|
101
110
|
# company name (e.g. Coda Octopus Products Ltd)
|
102
|
-
|
111
|
+
coname = div1.element("span[@class='companyName']")
|
112
|
+
puts 'coname: ' + coname.text.inspect if @debug
|
113
|
+
company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
|
103
114
|
|
104
115
|
# company location (e.g. Edinburgh)
|
105
116
|
location = div1.element("div[@class='companyLocation']")&.text
|
@@ -144,52 +155,65 @@ class IndeedScraper2022
|
|
144
155
|
def fetchjob(url)
|
145
156
|
|
146
157
|
doc = Nokorexi.new(url).to_doc
|
158
|
+
puts 'before e0' if @debug
|
147
159
|
e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
|
148
160
|
|
149
161
|
#div = e0.element("//div[@class='jobsearch-JobComponent']")
|
162
|
+
puts 'before div1' if @debug
|
150
163
|
div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
|
164
|
+
puts 'before div2' if @debug
|
151
165
|
div2 = div1.element("div")
|
152
166
|
|
153
167
|
# jobsearch (e.g. Full Stack Website Developer (Wordpress))
|
168
|
+
puts 'before jobtitle' if @debug
|
154
169
|
jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
|
155
170
|
"er-title-container']/h1[@class='jobsearch-JobInfoHead" \
|
156
171
|
"er-title']")&.text
|
157
172
|
|
173
|
+
puts 'before div3' if @debug
|
158
174
|
div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
|
159
175
|
"tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
|
160
176
|
"erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
|
161
177
|
"ainer-subtitle']")
|
162
178
|
|
163
179
|
# icl (e.g. Lyles Sutherland)
|
180
|
+
puts 'before cname' if @debug
|
164
181
|
cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
|
165
182
|
"ickyContainer-companyrating']/div/div[@class='icl-u-x" \
|
166
183
|
"s-mr--xs']")[1]
|
184
|
+
puts 'before clink' if @debug
|
167
185
|
clink = div3.element('//a')
|
168
186
|
company = cname.text ? cname.text : clink.text
|
169
187
|
companylink = clink.attributes[:href] if clink
|
170
188
|
|
189
|
+
puts 'before salary' if @debug
|
171
190
|
salary = div1.element("//span[@class='attribute_snippet']")&.text
|
191
|
+
puts 'before type' if @debug
|
172
192
|
type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
|
173
193
|
div5 = div3.xpath("div/div")
|
174
194
|
location, worklocation = div5.map(&:text).compact
|
175
195
|
|
176
196
|
# icl (e.g. Full-time, Permanent)
|
197
|
+
puts 'before jobtype' if @debug
|
177
198
|
jobtype = div1.element("div/div/div[@class='jobsearch-J" \
|
178
199
|
"obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
|
179
200
|
jobtype = jobtype&.texts.join if jobtype
|
180
201
|
|
181
202
|
# jobsearch (e.g. Urgently needed)
|
203
|
+
puts 'before jobnote1' if @debug
|
182
204
|
jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
|
183
205
|
"']/div[@class='urgently-hiring']/div[@class='jobsearc" \
|
184
206
|
"h-DesktopTag-text']")&.text
|
185
207
|
|
186
208
|
# jobsearch (e.g. 10 days ago)
|
209
|
+
puts 'before days' if @debug
|
187
210
|
days = e0.element("//div[@class='jobsearch-JobTab-con" \
|
188
211
|
"tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
|
189
212
|
d = Date.today - days.to_i
|
190
213
|
datepost = d.strftime("%Y-%m-%d")
|
191
214
|
|
192
215
|
|
216
|
+
puts 'before jobdesc' if @debug
|
193
217
|
jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
|
194
218
|
"ass='jobsearch-jobDescriptionText']").xml
|
195
219
|
|
@@ -218,14 +242,70 @@ class IS22Plus < IndeedScraper2022
|
|
218
242
|
debug: debug)
|
219
243
|
end
|
220
244
|
|
221
|
-
|
245
|
+
# note: The most efficient method to accumulate vacancy articles is to
|
246
|
+
# execute archive() daily
|
247
|
+
#
|
248
|
+
# Saves every not-yet-archived vacancy from @results into +filepath+.
#
# For each result the full vacancy page is written to "j<id>.txt" and a
# summary record is added to "index.yml" (a Hash keyed by the vacancy id
# as a Symbol). Vacancies whose id is already in the index are skipped.
#
# @param filepath [String] archive directory; created when missing
# @return [Integer, nil] the result of writing the index file, or nil when
#   there are no results to archive
#
def archive(filepath='/tmp/indeed')

  search() if @results.nil?

  return unless @results

  FileUtils.mkdir_p filepath

  idxfile = File.join(filepath, 'index.yml')

  # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
  # safe_load is used because the default directory lives under /tmp; Symbol
  # must be permitted since the index is keyed by symbols. An empty file
  # parses to false/nil, hence the fallback to an empty Hash.
  index = if File.exist? idxfile then
    YAML.safe_load(File.read(idxfile),
                   permitted_classes: [Symbol, Date, Time],
                   aliases: true) || {}
  else
    {}
  end

  @results.each_with_index do |item, i|

    if @debug then
      puts 'saving ' + item[:title].to_s
      puts 'link: ' + item[:link].inspect
    end

    # NOTE(review): RXFReader.reveal presumably follows redirects and returns
    # the list of URLs visited; the last one is used as the vacancy URL.
    links = RXFReader.reveal(item[:link])
    puts 'links: ' + links.inspect if @debug

    url = links.last
    puts 'url: ' + url.inspect if @debug

    # the vacancy id is the value of the jk= query parameter
    id = url.to_s[/(?<=jk=)[^&]+/]

    # skip links without a job key, and vacancies previously saved
    next if id.nil? || index[id.to_sym]

    # write the full page vacancy article to file
    # NOTE(review): page() is defined elsewhere; it is called with the
    # 1-based position of the result - confirm against its definition.
    File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)

    # add the vacancy snippet to the index
    index[id.to_sym] = {
      link: url[/^[^&]+/],
      title: item[:title].to_s,
      salary: item[:salary].to_s,
      company: item[:company].to_s.strip,
      location: item[:location].to_s,
      jobsnippet: item[:jobsnippet],
      date: item[:date],
      added: Time.now.strftime("%Y-%m-%d")
    }

  end

  # save the vacancy index file
  File.write idxfile, index.to_yaml

end
|
230
310
|
|
231
311
|
def list()
|
@@ -238,3 +318,39 @@ class IS22Plus < IndeedScraper2022
|
|
238
318
|
|
239
319
|
|
240
320
|
end
|
321
|
+
|
322
|
+
|
323
|
+
# Offline viewer for the vacancy index stored as "index.yml" inside an
# archive directory.
#
# The index is expected to be a Hash keyed by vacancy id (Symbol), each value
# being a Hash with at least :title, :company and :added ("YYYY-MM-DD").
#
class IS22Archive

  # @return [Hash] the loaded vacancy index (empty when no index file exists)
  attr_reader :index

  # @param filepath [String] archive directory; created when missing
  # @param debug [Boolean] when true, #list prints each record it formats
  def initialize(filepath='/tmp/indeed', debug: false)

    @debug = debug

    FileUtils.mkdir_p filepath
    @idxfile = File.join(filepath, 'index.yml')
    @index = load_index(@idxfile)

  end

  # Formats the index as a numbered listing, most recently added entry first.
  #
  # Each line looks like " 1. 16 Apr: Job title (Company)"; the company part
  # is omitted when the company name is missing or shorter than 2 characters.
  #
  # @return [String] one line per vacancy, joined with newlines
  def list()

    @index.to_a.reverse.map.with_index do |(_id, h), i|

      puts 'h: ' + h.inspect if @debug

      # to_s guards against records saved without a company
      company = h[:company].to_s
      co = company.length > 1 ? " (%s)" % company : ''
      added = Date.parse(h[:added].to_s).strftime("%d %b")

      "%2d. %s: %s%s" % [i+1, added, h[:title], co]

    end.join("\n")

  end

  private

  # Reads the YAML index, returning an empty Hash when the file is absent
  # or empty.
  #
  # File.exists? was removed in Ruby 3.2, hence File.exist?. safe_load is
  # used with Symbol permitted because the index is keyed by symbols, which
  # a plain YAML.load rejects under Psych 4.
  def load_index(file)

    return {} unless File.exist? file

    YAML.safe_load(File.read(file),
                   permitted_classes: [Symbol, Date, Time],
                   aliases: true) || {}

  end

end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-04-16 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -63,20 +63,20 @@ dependencies:
|
|
63
63
|
requirements:
|
64
64
|
- - "~>"
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: '0.
|
66
|
+
version: '0.3'
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0.
|
69
|
+
version: 0.3.1
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '0.
|
76
|
+
version: '0.3'
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 0.
|
79
|
+
version: 0.3.1
|
80
80
|
description:
|
81
81
|
email: digital.robertson@gmail.com
|
82
82
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|