indeed_scraper2022 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +112 -16
- data.tar.gz.sig +0 -0
- metadata +26 -6
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 833f3e77c7771f39e3eccbd5f277a8ca73fbd34d55d5efc7c2509b7a4dbf61bd
|
4
|
+
data.tar.gz: 707cf360d0ca30102e59bc0e5ead4111199db76a090d734729c327eaefbd6cdd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d3c8d5eacb62503e4b29634836b74a8b6c9636d9127fc345d79b9f177d75b41f2558c351ec7028d9a74887f964f440022b1d25e416f14752c16ded16055dcd2c
|
7
|
+
data.tar.gz: aea4011eea3c4f37f3537626e3ca2179bee215854a975d2b7618d09737395961fe88648460a8f58ce966acadc9a1fe8c21263319b07ff12147d4264b9374ae39
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -5,12 +5,20 @@
|
|
5
5
|
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
7
|
require 'yaml'
|
8
|
+
require 'reveal_url22'
|
8
9
|
|
9
10
|
# Given the nature of changes to jobsearch websites,
|
10
11
|
# don't rely upon this gem working in the near future.
|
11
12
|
|
12
13
|
|
13
14
|
|
15
|
+
# this gem consists of 3 main classes:
|
16
|
+
#
|
17
|
+
# * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
|
18
|
+
# * IS22Plus - Archives the scraped vacancies to local file
|
19
|
+
# * IS22Archive - Allows viewing of archived vacancies offline
|
20
|
+
#
|
21
|
+
|
14
22
|
class IndeedScraper2022Err < Exception
|
15
23
|
end
|
16
24
|
|
@@ -37,9 +45,10 @@ class IndeedScraper2022
|
|
37
45
|
end
|
38
46
|
|
39
47
|
def search(q: @q, location: @location, start: nil)
|
40
|
-
|
48
|
+
puts 'inside search' if @debug
|
41
49
|
url = @url_base
|
42
50
|
url += 'start=' + start if start
|
51
|
+
puts 'url: ' + url.inspect if @debug
|
43
52
|
|
44
53
|
@browser.goto(url)
|
45
54
|
#@browser.network.wait_for_idle
|
@@ -74,34 +83,52 @@ class IndeedScraper2022
|
|
74
83
|
sleep 2
|
75
84
|
|
76
85
|
doc2 = Nokogiri::XML(@browser.body)
|
86
|
+
File.write '/tmp/body.txt', doc2.to_s if @debug
|
77
87
|
|
78
|
-
a2 = doc2.xpath "//
|
88
|
+
a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
|
79
89
|
puts 'a2: ' + a2.length.inspect if @debug
|
80
90
|
|
81
91
|
@a2 = a2.map {|x| Rexle.new x.to_s }
|
82
92
|
|
83
93
|
@results = @a2.map do |doc|
|
84
94
|
|
85
|
-
div = doc.element("
|
95
|
+
div = doc.element("div[@class='cardOutline']/div[@class='slider" \
|
86
96
|
"_container']/div[@class='slider_list']/div[@class='sl" \
|
87
97
|
"ider_item']/div[@class='job_seen_beacon']")
|
98
|
+
|
88
99
|
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
89
100
|
"dy/tr/td[@class='resultContent']")
|
90
101
|
|
91
102
|
# job title (e.g. Software Developer)
|
92
|
-
|
93
|
-
"class='jobTitle-color-purple']/
|
103
|
+
job = td.element("div[@class='tapItem-gutter']/h2[@" \
|
104
|
+
"class='jobTitle-color-purple']/a")
|
105
|
+
href = job.attributes[:href]
|
106
|
+
jobtitle = job.element("span")&.text
|
107
|
+
|
94
108
|
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
95
109
|
|
96
|
-
|
97
|
-
"div[@class='salary-snippet-container']
|
98
|
-
|
110
|
+
sal = td.element("div[@class='metadataContainer']/" \
|
111
|
+
"div[@class='salary-snippet-container']")
|
112
|
+
|
113
|
+
salary = if sal then
|
114
|
+
sal_e = sal.element("div[@class='attribute_snippet']")
|
115
|
+
if sal_e then
|
116
|
+
sal_e.texts[0]
|
117
|
+
else
|
118
|
+
sal_e2 = sal.element("div[@class='salary-snippet']/span")
|
119
|
+
sal_e2 ? sal_e2.text : ''
|
120
|
+
end
|
121
|
+
else
|
122
|
+
''
|
123
|
+
end
|
99
124
|
|
100
125
|
puts 'salary: ' + salary.inspect if @debug
|
101
126
|
div1 = td.element("div[@class='companyInfo']")
|
102
127
|
|
103
128
|
# company name (e.g. Coda Octopus Products Ltd)
|
104
|
-
|
129
|
+
coname = div1.element("span[@class='companyName']")
|
130
|
+
puts 'coname: ' + coname.text.inspect if @debug
|
131
|
+
company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
|
105
132
|
|
106
133
|
# company location (e.g. Edinburgh)
|
107
134
|
location = div1.element("div[@class='companyLocation']")&.text
|
@@ -111,7 +138,12 @@ class IndeedScraper2022
|
|
111
138
|
"v[@class='result-footer']")
|
112
139
|
|
113
140
|
# job (e.g. Our products are primarily written in C#, using...)
|
114
|
-
|
141
|
+
advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
|
142
|
+
jobsnippet = if advert_items.any? then
|
143
|
+
advert_items.join("\n")
|
144
|
+
else
|
145
|
+
div3.element("div[@class='job-snippet']").text
|
146
|
+
end
|
115
147
|
|
116
148
|
# visually (e.g. Posted 14 days ago)
|
117
149
|
dateposted = div3.element("span[@class='date']")&.texts
|
@@ -119,7 +151,7 @@ class IndeedScraper2022
|
|
119
151
|
|
120
152
|
{
|
121
153
|
link: @url_base.sub(/\/[^\/]+$/,'') \
|
122
|
-
+
|
154
|
+
+ href.gsub(/&amp;/,'&'),
|
123
155
|
title: jobtitle,
|
124
156
|
salary: salary,
|
125
157
|
company: company_name,
|
@@ -146,52 +178,65 @@ class IndeedScraper2022
|
|
146
178
|
def fetchjob(url)
|
147
179
|
|
148
180
|
doc = Nokorexi.new(url).to_doc
|
181
|
+
puts 'before e0' if @debug
|
149
182
|
e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
|
150
183
|
|
151
184
|
#div = e0.element("//div[@class='jobsearch-JobComponent']")
|
185
|
+
puts 'before div1' if @debug
|
152
186
|
div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
|
187
|
+
puts 'before div2' if @debug
|
153
188
|
div2 = div1.element("div")
|
154
189
|
|
155
190
|
# jobsearch (e.g. Full Stack Website Developer (Wordpress))
|
191
|
+
puts 'before jobtitle' if @debug
|
156
192
|
jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
|
157
193
|
"er-title-container']/h1[@class='jobsearch-JobInfoHead" \
|
158
194
|
"er-title']")&.text
|
159
195
|
|
196
|
+
puts 'before div3' if @debug
|
160
197
|
div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
|
161
198
|
"tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
|
162
199
|
"erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
|
163
200
|
"ainer-subtitle']")
|
164
201
|
|
165
202
|
# icl (e.g. Lyles Sutherland)
|
203
|
+
puts 'before cname' if @debug
|
166
204
|
cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
|
167
205
|
"ickyContainer-companyrating']/div/div[@class='icl-u-x" \
|
168
206
|
"s-mr--xs']")[1]
|
207
|
+
puts 'before clink' if @debug
|
169
208
|
clink = div3.element('//a')
|
170
209
|
company = cname.text ? cname.text : clink.text
|
171
210
|
companylink = clink.attributes[:href] if clink
|
172
211
|
|
212
|
+
puts 'before salary' if @debug
|
173
213
|
salary = div1.element("//span[@class='attribute_snippet']")&.text
|
214
|
+
puts 'before type' if @debug
|
174
215
|
type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
|
175
216
|
div5 = div3.xpath("div/div")
|
176
217
|
location, worklocation = div5.map(&:text).compact
|
177
218
|
|
178
219
|
# icl (e.g. Full-time, Permanent)
|
220
|
+
puts 'before jobtype' if @debug
|
179
221
|
jobtype = div1.element("div/div/div[@class='jobsearch-J" \
|
180
222
|
"obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
|
181
223
|
jobtype = jobtype&.texts.join if jobtype
|
182
224
|
|
183
225
|
# jobsearch (e.g. Urgently needed)
|
226
|
+
puts 'before jobnote1' if @debug
|
184
227
|
jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
|
185
228
|
"']/div[@class='urgently-hiring']/div[@class='jobsearc" \
|
186
229
|
"h-DesktopTag-text']")&.text
|
187
230
|
|
188
231
|
# jobsearch (e.g. 10 days ago)
|
232
|
+
puts 'before days' if @debug
|
189
233
|
days = e0.element("//div[@class='jobsearch-JobTab-con" \
|
190
234
|
"tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
|
191
235
|
d = Date.today - days.to_i
|
192
236
|
datepost = d.strftime("%Y-%m-%d")
|
193
237
|
|
194
238
|
|
239
|
+
puts 'before jobdesc' if @debug
|
195
240
|
jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
|
196
241
|
"ass='jobsearch-jobDescriptionText']").xml
|
197
242
|
|
@@ -217,11 +262,16 @@ class IS22Plus < IndeedScraper2022
|
|
217
262
|
|
218
263
|
def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
|
219
264
|
super(q: q, location: location, headless: headless, cookies: cookies,
|
220
|
-
debug:
|
265
|
+
debug: true)
|
221
266
|
end
|
222
267
|
|
268
|
+
# note: The most efficient method to accumulate vacancy articles is to
|
269
|
+
# execute archive() daily
|
270
|
+
#
|
223
271
|
def archive(filepath='/tmp/indeed')
|
224
272
|
|
273
|
+
search() if @results.nil?
|
274
|
+
|
225
275
|
return unless @results
|
226
276
|
|
227
277
|
FileUtils.mkdir_p filepath
|
@@ -238,16 +288,23 @@ class IS22Plus < IndeedScraper2022
|
|
238
288
|
|
239
289
|
puts 'saving ' + item[:title] if @debug
|
240
290
|
puts 'link: ' + item[:link].inspect
|
241
|
-
links =
|
242
|
-
puts 'links: ' + links.inspect
|
291
|
+
links = URL.reveal(item[:link])
|
292
|
+
puts 'links: ' + links.inspect if @debug
|
243
293
|
|
244
294
|
url = links.last
|
245
|
-
|
295
|
+
puts 'url: ' + url.inspect if @debug
|
296
|
+
id = url[/(?<=jk=)[^&]+/]
|
246
297
|
|
247
298
|
if index[id.to_sym] then
|
299
|
+
|
300
|
+
# the vacancy record has previously been saved
|
301
|
+
#
|
248
302
|
next
|
303
|
+
|
249
304
|
else
|
250
305
|
|
306
|
+
# write the full page vacancy article to file
|
307
|
+
#
|
251
308
|
File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
|
252
309
|
|
253
310
|
h = {
|
@@ -257,14 +314,19 @@ class IS22Plus < IndeedScraper2022
|
|
257
314
|
company: item[:company].to_s.strip,
|
258
315
|
location: item[:location].to_s,
|
259
316
|
jobsnippet: item[:jobsnippet],
|
260
|
-
date: item[:date]
|
317
|
+
date: item[:date],
|
318
|
+
added: Time.now.strftime("%Y-%m-%d")
|
261
319
|
}
|
262
320
|
|
321
|
+
# add the vacancy snippet to the index file
|
322
|
+
#
|
263
323
|
index[id.to_sym] = h
|
264
324
|
end
|
265
325
|
|
266
326
|
end
|
267
327
|
|
328
|
+
# save the vacancy index file
|
329
|
+
#
|
268
330
|
File.write idxfile, index.to_yaml
|
269
331
|
|
270
332
|
end
|
@@ -279,3 +341,37 @@ class IS22Plus < IndeedScraper2022
|
|
279
341
|
|
280
342
|
|
281
343
|
end
|
344
|
+
|
345
|
+
|
346
|
+
class IS22Archive
|
347
|
+
|
348
|
+
attr_reader :index
|
349
|
+
|
350
|
+
def initialize(filepath='/tmp/indeed', debug: false)
|
351
|
+
|
352
|
+
FileUtils.mkdir_p filepath
|
353
|
+
@idxfile = File.join(filepath, 'index.yml')
|
354
|
+
|
355
|
+
@index = if File.exists? @idxfile then
|
356
|
+
YAML.load(File.read(@idxfile))
|
357
|
+
else
|
358
|
+
{}
|
359
|
+
end
|
360
|
+
|
361
|
+
end
|
362
|
+
|
363
|
+
def list()
|
364
|
+
|
365
|
+
@index.to_a.reverse.map.with_index do |x,i|
|
366
|
+
|
367
|
+
id, h = x
|
368
|
+
|
369
|
+
puts 'h: ' + h.inspect if @debug
|
370
|
+
co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
|
371
|
+
"%2d. %s: %s%s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title], co]
|
372
|
+
|
373
|
+
end.join("\n")
|
374
|
+
|
375
|
+
end
|
376
|
+
|
377
|
+
end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-05-12 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -63,20 +63,40 @@ dependencies:
|
|
63
63
|
requirements:
|
64
64
|
- - "~>"
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: '0.
|
66
|
+
version: '0.3'
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0.
|
69
|
+
version: 0.3.1
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '0.
|
76
|
+
version: '0.3'
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 0.
|
79
|
+
version: 0.3.1
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: reveal_url22
|
82
|
+
requirement: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - "~>"
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0.1'
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.0
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.1'
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: 0.1.0
|
80
100
|
description:
|
81
101
|
email: digital.robertson@gmail.com
|
82
102
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|