indeed_scraper2022 0.2.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +148 -15
- data.tar.gz.sig +0 -0
- metadata +6 -6
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7f98a83b7ed582d1b2973882701833688aec2d6d2bd132241a26c01a32915f93
|
4
|
+
data.tar.gz: dc5c34a5af19cdffbd244e15416914e91c8a06f365f0fff28bcd537a30ec468e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4ce40e021339f6b1c24faed495ebbd4b257200f62d1466ffa54cd05e654ccef29b23dbfcf4af64b9426f4d2dbde6bb778f6b920a7656af8289e3f81a269ba54a
|
7
|
+
data.tar.gz: 634325fed61c7888b08fd72bfc47f4d64f98f5514110169152389e49940f15a080c02f62811aae2b5f34c6a604c17d2512d2219dba0f4473dc1034900bdb7ec6
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -4,23 +4,36 @@
|
|
4
4
|
|
5
5
|
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
|
+
require 'yaml'
|
7
8
|
|
8
9
|
# Given the nature of changes to jobsearch websites,
|
9
10
|
# don't rely upon this gem working in the near future.
|
10
11
|
|
11
12
|
|
13
|
+
|
14
|
+
# this gem consists of 3 main classes:
|
15
|
+
#
|
16
|
+
# * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
|
17
|
+
# * IS22Plus - Archives the scraped vacancies to local file
|
18
|
+
# * IS22Archive - Allows viewing of archived vacancies offline
|
19
|
+
#
|
20
|
+
|
12
21
|
class IndeedScraper2022Err < Exception
|
13
22
|
end
|
14
23
|
|
15
24
|
class IndeedScraper2022
|
16
25
|
|
26
|
+
attr_reader :browser
|
27
|
+
|
17
28
|
def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
|
18
29
|
headless: true, cookies: nil, debug: false)
|
19
30
|
|
20
31
|
@debug = debug
|
21
32
|
@url_base, @q, @location = url, q, location
|
22
33
|
@headless, @cookies = headless, cookies
|
23
|
-
|
34
|
+
|
35
|
+
fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
|
36
|
+
@browser = fw.browser
|
24
37
|
|
25
38
|
end
|
26
39
|
|
@@ -32,35 +45,49 @@ class IndeedScraper2022
|
|
32
45
|
|
33
46
|
def search(q: @q, location: @location, start: nil)
|
34
47
|
|
35
|
-
fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
|
36
|
-
|
37
48
|
url = @url_base
|
38
49
|
url += 'start=' + start if start
|
39
50
|
|
40
|
-
browser
|
41
|
-
browser.
|
51
|
+
@browser.goto(url)
|
52
|
+
#@browser.network.wait_for_idle
|
53
|
+
puts 'sleeping for 4 seconds' if @debug
|
54
|
+
sleep 4
|
42
55
|
|
43
56
|
if q.length > 1 then
|
44
|
-
|
45
|
-
input.
|
57
|
+
|
58
|
+
input = @browser.at_xpath("//input[@name='q']")
|
59
|
+
|
60
|
+
# select any existing text and overwrite it
|
61
|
+
input.focus.type(:home); sleep 0.2
|
62
|
+
input.focus.type(:shift, :end); sleep 0.2
|
63
|
+
input.focus.type(q); sleep 0.2
|
46
64
|
end
|
47
65
|
|
48
66
|
if location.length > 1 then
|
49
|
-
|
50
|
-
input2.
|
67
|
+
|
68
|
+
input2 = @browser.at_xpath("//input[@name='l']")
|
69
|
+
|
70
|
+
# select any existing text and overwrite it
|
71
|
+
input2.focus.type(:home); sleep 0.2
|
72
|
+
input2.focus.type(:shift, :end); sleep 0.2
|
73
|
+
input2.focus.type(location); sleep 0.2
|
74
|
+
|
51
75
|
end
|
52
76
|
|
53
|
-
button = browser.at_xpath("//button[@type='submit']")
|
77
|
+
button = @browser.at_xpath("//button[@type='submit']")
|
54
78
|
button.click
|
79
|
+
#@browser.network.wait_for_idle
|
80
|
+
puts 'sleeping for 2 seconds' if @debug
|
81
|
+
sleep 2
|
55
82
|
|
56
|
-
doc2 = Nokogiri::XML(browser.body)
|
83
|
+
doc2 = Nokogiri::XML(@browser.body)
|
57
84
|
|
58
85
|
a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
|
59
86
|
puts 'a2: ' + a2.length.inspect if @debug
|
60
87
|
|
61
88
|
@a2 = a2.map {|x| Rexle.new x.to_s }
|
62
89
|
|
63
|
-
@a2.map do |doc|
|
90
|
+
@results = @a2.map do |doc|
|
64
91
|
|
65
92
|
div = doc.element("a[@class='desktop']/div[@class='slider" \
|
66
93
|
"_container']/div[@class='slider_list']/div[@class='sl" \
|
@@ -126,52 +153,65 @@ class IndeedScraper2022
|
|
126
153
|
def fetchjob(url)
|
127
154
|
|
128
155
|
doc = Nokorexi.new(url).to_doc
|
156
|
+
puts 'before e0' if @debug
|
129
157
|
e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
|
130
158
|
|
131
159
|
#div = e0.element("//div[@class='jobsearch-JobComponent']")
|
160
|
+
puts 'before div1' if @debug
|
132
161
|
div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
|
162
|
+
puts 'before div2' if @debug
|
133
163
|
div2 = div1.element("div")
|
134
164
|
|
135
165
|
# jobsearch (e.g. Full Stack Website Developer (Wordpress))
|
166
|
+
puts 'before jobtitle' if @debug
|
136
167
|
jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
|
137
168
|
"er-title-container']/h1[@class='jobsearch-JobInfoHead" \
|
138
169
|
"er-title']")&.text
|
139
170
|
|
171
|
+
puts 'before div3' if @debug
|
140
172
|
div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
|
141
173
|
"tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
|
142
174
|
"erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
|
143
175
|
"ainer-subtitle']")
|
144
176
|
|
145
177
|
# icl (e.g. Lyles Sutherland)
|
178
|
+
puts 'before cname' if @debug
|
146
179
|
cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
|
147
180
|
"ickyContainer-companyrating']/div/div[@class='icl-u-x" \
|
148
181
|
"s-mr--xs']")[1]
|
182
|
+
puts 'before clink' if @debug
|
149
183
|
clink = div3.element('//a')
|
150
184
|
company = cname.text ? cname.text : clink.text
|
151
185
|
companylink = clink.attributes[:href] if clink
|
152
186
|
|
187
|
+
puts 'before salary' if @debug
|
153
188
|
salary = div1.element("//span[@class='attribute_snippet']")&.text
|
189
|
+
puts 'before type' if @debug
|
154
190
|
type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
|
155
191
|
div5 = div3.xpath("div/div")
|
156
192
|
location, worklocation = div5.map(&:text).compact
|
157
193
|
|
158
194
|
# icl (e.g. Full-time, Permanent)
|
195
|
+
puts 'before jobtype' if @debug
|
159
196
|
jobtype = div1.element("div/div/div[@class='jobsearch-J" \
|
160
197
|
"obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
|
161
198
|
jobtype = jobtype&.texts.join if jobtype
|
162
199
|
|
163
200
|
# jobsearch (e.g. Urgently needed)
|
201
|
+
puts 'before jobnote1' if @debug
|
164
202
|
jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
|
165
203
|
"']/div[@class='urgently-hiring']/div[@class='jobsearc" \
|
166
204
|
"h-DesktopTag-text']")&.text
|
167
205
|
|
168
206
|
# jobsearch (e.g. 10 days ago)
|
207
|
+
puts 'before days' if @debug
|
169
208
|
days = e0.element("//div[@class='jobsearch-JobTab-con" \
|
170
209
|
"tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
|
171
210
|
d = Date.today - days.to_i
|
172
211
|
datepost = d.strftime("%Y-%m-%d")
|
173
212
|
|
174
213
|
|
214
|
+
puts 'before jobdesc' if @debug
|
175
215
|
jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
|
176
216
|
"ass='jobsearch-jobDescriptionText']").xml
|
177
217
|
|
@@ -200,12 +240,70 @@ class IS22Plus < IndeedScraper2022
|
|
200
240
|
debug: debug)
|
201
241
|
end
|
202
242
|
|
203
|
-
|
243
|
+
# note: The most efficient method to accumulate vacancy articles is to
|
244
|
+
# execute archive() daily
|
245
|
+
#
|
246
|
+
def archive(filepath='/tmp/indeed')
|
247
|
+
|
248
|
+
search() if @results.nil?
|
249
|
+
|
250
|
+
return unless @results
|
251
|
+
|
252
|
+
FileUtils.mkdir_p filepath
|
253
|
+
|
254
|
+
idxfile = File.join(filepath, 'index.yml')
|
204
255
|
|
205
|
-
|
206
|
-
|
256
|
+
index = if File.exists? idxfile then
|
257
|
+
YAML.load(File.read(idxfile))
|
258
|
+
else
|
259
|
+
{}
|
207
260
|
end
|
208
261
|
|
262
|
+
@results.each.with_index do |item, i|
|
263
|
+
|
264
|
+
puts 'saving ' + item[:title] if @debug
|
265
|
+
puts 'link: ' + item[:link].inspect
|
266
|
+
links = RXFReader.reveal(item[:link])
|
267
|
+
puts 'links: ' + links.inspect if @debug
|
268
|
+
|
269
|
+
url = links.last
|
270
|
+
puts 'url: ' + url.inspect if @debug
|
271
|
+
id = url[/(?<=jk=)[^&]+/]
|
272
|
+
|
273
|
+
if index[id.to_sym] then
|
274
|
+
|
275
|
+
# the vacancy record has previously been saved
|
276
|
+
#
|
277
|
+
next
|
278
|
+
|
279
|
+
else
|
280
|
+
|
281
|
+
# write the full page vacancy article to file
|
282
|
+
#
|
283
|
+
File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
|
284
|
+
|
285
|
+
h = {
|
286
|
+
link: url[/^[^&]+/],
|
287
|
+
title: item[:title].to_s,
|
288
|
+
salary: item[:salary].to_s,
|
289
|
+
company: item[:company].to_s.strip,
|
290
|
+
location: item[:location].to_s,
|
291
|
+
jobsnippet: item[:jobsnippet],
|
292
|
+
date: item[:date],
|
293
|
+
added: Time.now.strftime("%Y-%m-%d")
|
294
|
+
}
|
295
|
+
|
296
|
+
# add the vacancy snippet to the index file
|
297
|
+
#
|
298
|
+
index[id.to_sym] = h
|
299
|
+
end
|
300
|
+
|
301
|
+
end
|
302
|
+
|
303
|
+
# save the vacancy index file
|
304
|
+
#
|
305
|
+
File.write idxfile, index.to_yaml
|
306
|
+
|
209
307
|
end
|
210
308
|
|
211
309
|
def list()
|
@@ -218,3 +316,38 @@ class IS22Plus < IndeedScraper2022
|
|
218
316
|
|
219
317
|
|
220
318
|
end
|
319
|
+
|
320
|
+
|
321
|
+
class IS22Archive
|
322
|
+
|
323
|
+
attr_reader :index
|
324
|
+
|
325
|
+
def initialize(filepath='/tmp/indeed', debug: false)
|
326
|
+
|
327
|
+
@debug = debug
|
328
|
+
|
329
|
+
FileUtils.mkdir_p filepath
|
330
|
+
@idxfile = File.join(filepath, 'index.yml')
|
331
|
+
|
332
|
+
@index = if File.exists? @idxfile then
|
333
|
+
YAML.load(File.read(@idxfile))
|
334
|
+
else
|
335
|
+
{}
|
336
|
+
end
|
337
|
+
|
338
|
+
end
|
339
|
+
|
340
|
+
def list()
|
341
|
+
|
342
|
+
@index.map.with_index do |x,i|
|
343
|
+
|
344
|
+
id, h = x
|
345
|
+
|
346
|
+
puts 'h: ' + h.inspect if @debug
|
347
|
+
"%2d. %s: %s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title]]
|
348
|
+
|
349
|
+
end.join("\n")
|
350
|
+
|
351
|
+
end
|
352
|
+
|
353
|
+
end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-04-14 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -63,20 +63,20 @@ dependencies:
|
|
63
63
|
requirements:
|
64
64
|
- - "~>"
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: '0.
|
66
|
+
version: '0.3'
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0.
|
69
|
+
version: 0.3.1
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '0.
|
76
|
+
version: '0.3'
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 0.
|
79
|
+
version: 0.3.1
|
80
80
|
description:
|
81
81
|
email: digital.robertson@gmail.com
|
82
82
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|