indeed_scraper2022 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +148 -15
- data.tar.gz.sig +0 -0
- metadata +6 -6
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7f98a83b7ed582d1b2973882701833688aec2d6d2bd132241a26c01a32915f93
|
4
|
+
data.tar.gz: dc5c34a5af19cdffbd244e15416914e91c8a06f365f0fff28bcd537a30ec468e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4ce40e021339f6b1c24faed495ebbd4b257200f62d1466ffa54cd05e654ccef29b23dbfcf4af64b9426f4d2dbde6bb778f6b920a7656af8289e3f81a269ba54a
|
7
|
+
data.tar.gz: 634325fed61c7888b08fd72bfc47f4d64f98f5514110169152389e49940f15a080c02f62811aae2b5f34c6a604c17d2512d2219dba0f4473dc1034900bdb7ec6
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -4,23 +4,36 @@
|
|
4
4
|
|
5
5
|
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
|
+
require 'yaml'
|
7
8
|
|
8
9
|
# Given the nature of changes to jobsearch websites,
|
9
10
|
# don't rely upon this gem working in the near future.
|
10
11
|
|
11
12
|
|
13
|
+
|
14
|
+
# this gem consists of 3 main classes:
|
15
|
+
#
|
16
|
+
# * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
|
17
|
+
# * IS22Plus - Archives the scraped vacancies to local file
|
18
|
+
# * IS22Archive - Allows viewing of archived vacancies offline
|
19
|
+
#
|
20
|
+
|
12
21
|
# Error class for this gem.
#
# Derives from StandardError (rather than Exception) so that a plain
# `rescue` clause can catch it; `rescue IndeedScraper2022Err` and
# `rescue Exception` continue to work as before.
#
class IndeedScraper2022Err < StandardError
end
|
14
23
|
|
15
24
|
class IndeedScraper2022
|
16
25
|
|
26
|
+
attr_reader :browser
|
27
|
+
|
17
28
|
# Stores the search settings and opens the browser session used by search().
#
# url::      base URL that search() navigates to
# q::        default search keywords
# location:: default search location
# headless:: passed to FerrumWizard
# cookies::  passed to FerrumWizard
# debug::    passed to FerrumWizard; also enables this class's debug output
#
def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
               headless: true, cookies: nil, debug: false)

  @debug    = debug
  @url_base = url
  @q        = q
  @location = location
  @headless = headless
  @cookies  = cookies

  # the browser is created once here and reused by every later request
  wizard = FerrumWizard.new(headless: @headless, cookies: @cookies,
                            debug: @debug)
  @browser = wizard.browser

end
|
26
39
|
|
@@ -32,35 +45,49 @@ class IndeedScraper2022
|
|
32
45
|
|
33
46
|
def search(q: @q, location: @location, start: nil)
|
34
47
|
|
35
|
-
fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
|
36
|
-
|
37
48
|
url = @url_base
|
38
49
|
url += 'start=' + start if start
|
39
50
|
|
40
|
-
browser
|
41
|
-
browser.
|
51
|
+
@browser.goto(url)
|
52
|
+
#@browser.network.wait_for_idle
|
53
|
+
puts 'sleeping for 4 seconds' if @debug
|
54
|
+
sleep 4
|
42
55
|
|
43
56
|
if q.length > 1 then
|
44
|
-
|
45
|
-
input.
|
57
|
+
|
58
|
+
input = @browser.at_xpath("//input[@name='q']")
|
59
|
+
|
60
|
+
# select any existing text and overwrite it
|
61
|
+
input.focus.type(:home); sleep 0.2
|
62
|
+
input.focus.type(:shift, :end); sleep 0.2
|
63
|
+
input.focus.type(q); sleep 0.2
|
46
64
|
end
|
47
65
|
|
48
66
|
if location.length > 1 then
|
49
|
-
|
50
|
-
input2.
|
67
|
+
|
68
|
+
input2 = @browser.at_xpath("//input[@name='l']")
|
69
|
+
|
70
|
+
# select any existing text and overwrite it
|
71
|
+
input2.focus.type(:home); sleep 0.2
|
72
|
+
input2.focus.type(:shift, :end); sleep 0.2
|
73
|
+
input2.focus.type(location); sleep 0.2
|
74
|
+
|
51
75
|
end
|
52
76
|
|
53
|
-
button = browser.at_xpath("//button[@type='submit']")
|
77
|
+
button = @browser.at_xpath("//button[@type='submit']")
|
54
78
|
button.click
|
79
|
+
#@browser.network.wait_for_idle
|
80
|
+
puts 'sleeping for 2 seconds' if @debug
|
81
|
+
sleep 2
|
55
82
|
|
56
|
-
doc2 = Nokogiri::XML(browser.body)
|
83
|
+
doc2 = Nokogiri::XML(@browser.body)
|
57
84
|
|
58
85
|
a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
|
59
86
|
puts 'a2: ' + a2.length.inspect if @debug
|
60
87
|
|
61
88
|
@a2 = a2.map {|x| Rexle.new x.to_s }
|
62
89
|
|
63
|
-
@a2.map do |doc|
|
90
|
+
@results = @a2.map do |doc|
|
64
91
|
|
65
92
|
div = doc.element("a[@class='desktop']/div[@class='slider" \
|
66
93
|
"_container']/div[@class='slider_list']/div[@class='sl" \
|
@@ -126,52 +153,65 @@ class IndeedScraper2022
|
|
126
153
|
def fetchjob(url)
|
127
154
|
|
128
155
|
doc = Nokorexi.new(url).to_doc
|
156
|
+
puts 'before e0' if @debug
|
129
157
|
e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
|
130
158
|
|
131
159
|
#div = e0.element("//div[@class='jobsearch-JobComponent']")
|
160
|
+
puts 'before div1' if @debug
|
132
161
|
div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
|
162
|
+
puts 'before div2' if @debug
|
133
163
|
div2 = div1.element("div")
|
134
164
|
|
135
165
|
# jobsearch (e.g. Full Stack Website Developer (Wordpress))
|
166
|
+
puts 'before jobtitle' if @debug
|
136
167
|
jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
|
137
168
|
"er-title-container']/h1[@class='jobsearch-JobInfoHead" \
|
138
169
|
"er-title']")&.text
|
139
170
|
|
171
|
+
puts 'before div3' if @debug
|
140
172
|
div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
|
141
173
|
"tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
|
142
174
|
"erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
|
143
175
|
"ainer-subtitle']")
|
144
176
|
|
145
177
|
# icl (e.g. Lyles Sutherland)
|
178
|
+
puts 'before cname' if @debug
|
146
179
|
cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
|
147
180
|
"ickyContainer-companyrating']/div/div[@class='icl-u-x" \
|
148
181
|
"s-mr--xs']")[1]
|
182
|
+
puts 'before clink' if @debug
|
149
183
|
clink = div3.element('//a')
|
150
184
|
company = cname.text ? cname.text : clink.text
|
151
185
|
companylink = clink.attributes[:href] if clink
|
152
186
|
|
187
|
+
puts 'before salary' if @debug
|
153
188
|
salary = div1.element("//span[@class='attribute_snippet']")&.text
|
189
|
+
puts 'before type' if @debug
|
154
190
|
type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
|
155
191
|
div5 = div3.xpath("div/div")
|
156
192
|
location, worklocation = div5.map(&:text).compact
|
157
193
|
|
158
194
|
# icl (e.g. Full-time, Permanent)
|
195
|
+
puts 'before jobtype' if @debug
|
159
196
|
jobtype = div1.element("div/div/div[@class='jobsearch-J" \
|
160
197
|
"obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
|
161
198
|
jobtype = jobtype&.texts.join if jobtype
|
162
199
|
|
163
200
|
# jobsearch (e.g. Urgently needed)
|
201
|
+
puts 'before jobnote1' if @debug
|
164
202
|
jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
|
165
203
|
"']/div[@class='urgently-hiring']/div[@class='jobsearc" \
|
166
204
|
"h-DesktopTag-text']")&.text
|
167
205
|
|
168
206
|
# jobsearch (e.g. 10 days ago)
|
207
|
+
puts 'before days' if @debug
|
169
208
|
days = e0.element("//div[@class='jobsearch-JobTab-con" \
|
170
209
|
"tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
|
171
210
|
d = Date.today - days.to_i
|
172
211
|
datepost = d.strftime("%Y-%m-%d")
|
173
212
|
|
174
213
|
|
214
|
+
puts 'before jobdesc' if @debug
|
175
215
|
jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
|
176
216
|
"ass='jobsearch-jobDescriptionText']").xml
|
177
217
|
|
@@ -200,12 +240,70 @@ class IS22Plus < IndeedScraper2022
|
|
200
240
|
debug: debug)
|
201
241
|
end
|
202
242
|
|
203
|
-
|
243
|
+
# Archives the current search results beneath +filepath+.
#
# note: The most efficient method to accumulate vacancy articles is to
#       execute archive() daily
#
# Runs search() first when no results are held. For every vacancy whose id
# (the +jk+ query parameter of the resolved link) is not yet listed in
# <filepath>/index.yml, the full page is written to j<id>.txt and a summary
# entry is added to the index, which is then saved back to disk.
#
# Returns nil, without touching the filesystem, when there are no results.
#
def archive(filepath='/tmp/indeed')

  search() if @results.nil?

  return unless @results

  FileUtils.mkdir_p filepath

  idxfile = File.join(filepath, 'index.yml')

  # File.exists? was removed in Ruby 3.2. YAML.load is a safe load from
  # Psych 4 onwards and would reject the Symbol keys written below, so the
  # permitted classes are spelled out. An empty file loads as nil/false.
  index = if File.exist? idxfile then
    YAML.safe_load(File.read(idxfile),
                   permitted_classes: [Symbol, Date, Time],
                   aliases: true) || {}
  else
    {}
  end

  @results.each.with_index do |item, i|

    puts 'saving ' + item[:title] if @debug
    puts 'link: ' + item[:link].inspect if @debug
    links = RXFReader.reveal(item[:link])
    puts 'links: ' + links.inspect if @debug

    url = links.last
    puts 'url: ' + url.inspect if @debug
    id = url.to_s[/(?<=jk=)[^&]+/]

    # without a job key there is nothing to file the vacancy under
    unless id
      puts 'no job id found, skipping' if @debug
      next
    end

    # the vacancy record has previously been saved
    #
    next if index[id.to_sym]

    # write the full page vacancy article to file
    #
    File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)

    # add the vacancy snippet to the index file
    #
    index[id.to_sym] = {
      link: url[/^[^&]+/],
      title: item[:title].to_s,
      salary: item[:salary].to_s,
      company: item[:company].to_s.strip,
      location: item[:location].to_s,
      jobsnippet: item[:jobsnippet],
      date: item[:date],
      added: Time.now.strftime("%Y-%m-%d")
    }

  end

  # save the vacancy index file
  #
  File.write idxfile, index.to_yaml

end
|
210
308
|
|
211
309
|
def list()
|
@@ -218,3 +316,38 @@ class IS22Plus < IndeedScraper2022
|
|
218
316
|
|
219
317
|
|
220
318
|
end
|
319
|
+
|
320
|
+
|
321
|
+
# Offline viewer for vacancies previously saved to a local directory.
#
# Reads <filepath>/index.yml, a hash of vacancy id => summary hash. The
# summary keys used by this class are :added and :title.
#
class IS22Archive

  # the loaded index hash (empty when no index file exists)
  attr_reader :index

  # filepath:: directory holding index.yml (created if it does not exist)
  # debug::    when true, list() prints each summary hash as it is formatted
  #
  def initialize(filepath='/tmp/indeed', debug: false)

    @debug = debug

    FileUtils.mkdir_p filepath
    @idxfile = File.join(filepath, 'index.yml')
    @index = load_index

  end

  # Returns one numbered line per archived vacancy, in the form
  # " 1. 14 Apr: Job title", joined with newlines ("" for an empty index).
  #
  def list()

    @index.each_with_index.map do |(_id, h), i|

      puts 'h: ' + h.inspect if @debug

      # :added is normally a "YYYY-MM-DD" string; to_s also copes with a
      # value that was loaded as a Date
      added = Date.parse(h[:added].to_s).strftime("%d %b")
      "%2d. %s: %s" % [i+1, added, h[:title]]

    end.join("\n")

  end

  private

  # Loads the index file, or returns {} when it is missing or empty.
  # File.exists? was removed in Ruby 3.2, and YAML.load is a safe load from
  # Psych 4 onwards, which would reject the index's Symbol keys.
  #
  def load_index

    return {} unless File.exist? @idxfile

    YAML.safe_load(File.read(@idxfile),
                   permitted_classes: [Symbol, Date, Time],
                   aliases: true) || {}

  end

end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-04-14 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -63,20 +63,20 @@ dependencies:
|
|
63
63
|
requirements:
|
64
64
|
- - "~>"
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: '0.
|
66
|
+
version: '0.3'
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0.
|
69
|
+
version: 0.3.1
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '0.
|
76
|
+
version: '0.3'
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 0.
|
79
|
+
version: 0.3.1
|
80
80
|
description:
|
81
81
|
email: digital.robertson@gmail.com
|
82
82
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|