indeed_scraper2022 0.3.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +112 -16
- data.tar.gz.sig +0 -0
- metadata +26 -6
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 833f3e77c7771f39e3eccbd5f277a8ca73fbd34d55d5efc7c2509b7a4dbf61bd
|
4
|
+
data.tar.gz: 707cf360d0ca30102e59bc0e5ead4111199db76a090d734729c327eaefbd6cdd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d3c8d5eacb62503e4b29634836b74a8b6c9636d9127fc345d79b9f177d75b41f2558c351ec7028d9a74887f964f440022b1d25e416f14752c16ded16055dcd2c
|
7
|
+
data.tar.gz: aea4011eea3c4f37f3537626e3ca2179bee215854a975d2b7618d09737395961fe88648460a8f58ce966acadc9a1fe8c21263319b07ff12147d4264b9374ae39
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -5,12 +5,20 @@
|
|
5
5
|
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
7
|
require 'yaml'
|
8
|
+
require 'reveal_url22'
|
8
9
|
|
9
10
|
# Given the nature of changes to jobsearch websites,
|
10
11
|
# don't rely upon this gem working in the near future.
|
11
12
|
|
12
13
|
|
13
14
|
|
15
|
+
# this gem consists of 3 main classes:
|
16
|
+
#
|
17
|
+
# * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
|
18
|
+
# * IS22Plus - Archives the scraped vacancies to local file
|
19
|
+
# * IS22Archive - Allows viewing of archived vacancies offline
|
20
|
+
#
|
21
|
+
|
14
22
|
class IndeedScraper2022Err < Exception
|
15
23
|
end
|
16
24
|
|
@@ -37,9 +45,10 @@ class IndeedScraper2022
|
|
37
45
|
end
|
38
46
|
|
39
47
|
def search(q: @q, location: @location, start: nil)
|
40
|
-
|
48
|
+
puts 'inside search' if @debug
|
41
49
|
url = @url_base
|
42
50
|
url += 'start=' + start if start
|
51
|
+
puts 'url: ' + url.inspect if @debug
|
43
52
|
|
44
53
|
@browser.goto(url)
|
45
54
|
#@browser.network.wait_for_idle
|
@@ -74,34 +83,52 @@ class IndeedScraper2022
|
|
74
83
|
sleep 2
|
75
84
|
|
76
85
|
doc2 = Nokogiri::XML(@browser.body)
|
86
|
+
File.write '/tmp/body.txt', doc2.to_s if @debug
|
77
87
|
|
78
|
-
a2 = doc2.xpath "//
|
88
|
+
a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
|
79
89
|
puts 'a2: ' + a2.length.inspect if @debug
|
80
90
|
|
81
91
|
@a2 = a2.map {|x| Rexle.new x.to_s }
|
82
92
|
|
83
93
|
@results = @a2.map do |doc|
|
84
94
|
|
85
|
-
div = doc.element("
|
95
|
+
div = doc.element("div[@class='cardOutline']/div[@class='slider" \
|
86
96
|
"_container']/div[@class='slider_list']/div[@class='sl" \
|
87
97
|
"ider_item']/div[@class='job_seen_beacon']")
|
98
|
+
|
88
99
|
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
89
100
|
"dy/tr/td[@class='resultContent']")
|
90
101
|
|
91
102
|
# job title (e.g. Software Developer)
|
92
|
-
|
93
|
-
"class='jobTitle-color-purple']/
|
103
|
+
job = td.element("div[@class='tapItem-gutter']/h2[@" \
|
104
|
+
"class='jobTitle-color-purple']/a")
|
105
|
+
href = job.attributes[:href]
|
106
|
+
jobtitle = job.element("span")&.text
|
107
|
+
|
94
108
|
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
95
109
|
|
96
|
-
|
97
|
-
"div[@class='salary-snippet-container']
|
98
|
-
|
110
|
+
sal = td.element("div[@class='metadataContainer']/" \
|
111
|
+
"div[@class='salary-snippet-container']")
|
112
|
+
|
113
|
+
salary = if sal then
|
114
|
+
sal_e = sal.element("div[@class='attribute_snippet']")
|
115
|
+
if sal_e then
|
116
|
+
sal_e.texts[0]
|
117
|
+
else
|
118
|
+
sal_e2 = sal.element("div[@class='salary-snippet']/span")
|
119
|
+
sal_e2 ? sal_e2.text : ''
|
120
|
+
end
|
121
|
+
else
|
122
|
+
''
|
123
|
+
end
|
99
124
|
|
100
125
|
puts 'salary: ' + salary.inspect if @debug
|
101
126
|
div1 = td.element("div[@class='companyInfo']")
|
102
127
|
|
103
128
|
# company name (e.g. Coda Octopus Products Ltd)
|
104
|
-
|
129
|
+
coname = div1.element("span[@class='companyName']")
|
130
|
+
puts 'coname: ' + coname.text.inspect if @debug
|
131
|
+
company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
|
105
132
|
|
106
133
|
# company location (e.g. Edinburgh)
|
107
134
|
location = div1.element("div[@class='companyLocation']")&.text
|
@@ -111,7 +138,12 @@ class IndeedScraper2022
|
|
111
138
|
"v[@class='result-footer']")
|
112
139
|
|
113
140
|
# job (e.g. Our products are primarily written in C#, using...)
|
114
|
-
|
141
|
+
advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
|
142
|
+
jobsnippet = if advert_items.any? then
|
143
|
+
advert_items.join("\n")
|
144
|
+
else
|
145
|
+
div3.element("div[@class='job-snippet']").text
|
146
|
+
end
|
115
147
|
|
116
148
|
# visually (e.g. Posted 14 days ago)
|
117
149
|
dateposted = div3.element("span[@class='date']")&.texts
|
@@ -119,7 +151,7 @@ class IndeedScraper2022
|
|
119
151
|
|
120
152
|
{
|
121
153
|
link: @url_base.sub(/\/[^\/]+$/,'') \
|
122
|
-
+
|
154
|
+
+ href.gsub(/&amp;/,'&'),
|
123
155
|
title: jobtitle,
|
124
156
|
salary: salary,
|
125
157
|
company: company_name,
|
@@ -146,52 +178,65 @@ class IndeedScraper2022
|
|
146
178
|
def fetchjob(url)
|
147
179
|
|
148
180
|
doc = Nokorexi.new(url).to_doc
|
181
|
+
puts 'before e0' if @debug
|
149
182
|
e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
|
150
183
|
|
151
184
|
#div = e0.element("//div[@class='jobsearch-JobComponent']")
|
185
|
+
puts 'before div1' if @debug
|
152
186
|
div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
|
187
|
+
puts 'before div2' if @debug
|
153
188
|
div2 = div1.element("div")
|
154
189
|
|
155
190
|
# jobsearch (e.g. Full Stack Website Developer (Wordpress))
|
191
|
+
puts 'before jobtitle' if @debug
|
156
192
|
jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
|
157
193
|
"er-title-container']/h1[@class='jobsearch-JobInfoHead" \
|
158
194
|
"er-title']")&.text
|
159
195
|
|
196
|
+
puts 'before div3' if @debug
|
160
197
|
div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
|
161
198
|
"tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
|
162
199
|
"erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
|
163
200
|
"ainer-subtitle']")
|
164
201
|
|
165
202
|
# icl (e.g. Lyles Sutherland)
|
203
|
+
puts 'before cname' if @debug
|
166
204
|
cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
|
167
205
|
"ickyContainer-companyrating']/div/div[@class='icl-u-x" \
|
168
206
|
"s-mr--xs']")[1]
|
207
|
+
puts 'before clink' if @debug
|
169
208
|
clink = div3.element('//a')
|
170
209
|
company = cname.text ? cname.text : clink.text
|
171
210
|
companylink = clink.attributes[:href] if clink
|
172
211
|
|
212
|
+
puts 'before salary' if @debug
|
173
213
|
salary = div1.element("//span[@class='attribute_snippet']")&.text
|
214
|
+
puts 'before type' if @debug
|
174
215
|
type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
|
175
216
|
div5 = div3.xpath("div/div")
|
176
217
|
location, worklocation = div5.map(&:text).compact
|
177
218
|
|
178
219
|
# icl (e.g. Full-time, Permanent)
|
220
|
+
puts 'before jobtype' if @debug
|
179
221
|
jobtype = div1.element("div/div/div[@class='jobsearch-J" \
|
180
222
|
"obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
|
181
223
|
jobtype = jobtype&.texts.join if jobtype
|
182
224
|
|
183
225
|
# jobsearch (e.g. Urgently needed)
|
226
|
+
puts 'before jobnote1' if @debug
|
184
227
|
jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
|
185
228
|
"']/div[@class='urgently-hiring']/div[@class='jobsearc" \
|
186
229
|
"h-DesktopTag-text']")&.text
|
187
230
|
|
188
231
|
# jobsearch (e.g. 10 days ago)
|
232
|
+
puts 'before days' if @debug
|
189
233
|
days = e0.element("//div[@class='jobsearch-JobTab-con" \
|
190
234
|
"tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
|
191
235
|
d = Date.today - days.to_i
|
192
236
|
datepost = d.strftime("%Y-%m-%d")
|
193
237
|
|
194
238
|
|
239
|
+
puts 'before jobdesc' if @debug
|
195
240
|
jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
|
196
241
|
"ass='jobsearch-jobDescriptionText']").xml
|
197
242
|
|
@@ -217,11 +262,16 @@ class IS22Plus < IndeedScraper2022
|
|
217
262
|
|
218
263
|
def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
|
219
264
|
super(q: q, location: location, headless: headless, cookies: cookies,
|
220
|
-
debug:
|
265
|
+
debug: true)
|
221
266
|
end
|
222
267
|
|
268
|
+
# note: The most efficient method to accumulate vacancy articles is to
|
269
|
+
# execute archive() daily
|
270
|
+
#
|
223
271
|
def archive(filepath='/tmp/indeed')
|
224
272
|
|
273
|
+
search() if @results.nil?
|
274
|
+
|
225
275
|
return unless @results
|
226
276
|
|
227
277
|
FileUtils.mkdir_p filepath
|
@@ -238,16 +288,23 @@ class IS22Plus < IndeedScraper2022
|
|
238
288
|
|
239
289
|
puts 'saving ' + item[:title] if @debug
|
240
290
|
puts 'link: ' + item[:link].inspect
|
241
|
-
links =
|
242
|
-
puts 'links: ' + links.inspect
|
291
|
+
links = URL.reveal(item[:link])
|
292
|
+
puts 'links: ' + links.inspect if @debug
|
243
293
|
|
244
294
|
url = links.last
|
245
|
-
|
295
|
+
puts 'url: ' + url.inspect if @debug
|
296
|
+
id = url[/(?<=jk=)[^&]+/]
|
246
297
|
|
247
298
|
if index[id.to_sym] then
|
299
|
+
|
300
|
+
# the vacancy record has previously been saved
|
301
|
+
#
|
248
302
|
next
|
303
|
+
|
249
304
|
else
|
250
305
|
|
306
|
+
# write the full page vacancy article to file
|
307
|
+
#
|
251
308
|
File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
|
252
309
|
|
253
310
|
h = {
|
@@ -257,14 +314,19 @@ class IS22Plus < IndeedScraper2022
|
|
257
314
|
company: item[:company].to_s.strip,
|
258
315
|
location: item[:location].to_s,
|
259
316
|
jobsnippet: item[:jobsnippet],
|
260
|
-
date: item[:date]
|
317
|
+
date: item[:date],
|
318
|
+
added: Time.now.strftime("%Y-%m-%d")
|
261
319
|
}
|
262
320
|
|
321
|
+
# add the vacancy snippet to the index file
|
322
|
+
#
|
263
323
|
index[id.to_sym] = h
|
264
324
|
end
|
265
325
|
|
266
326
|
end
|
267
327
|
|
328
|
+
# save the vacancy index file
|
329
|
+
#
|
268
330
|
File.write idxfile, index.to_yaml
|
269
331
|
|
270
332
|
end
|
@@ -279,3 +341,37 @@ class IS22Plus < IndeedScraper2022
|
|
279
341
|
|
280
342
|
|
281
343
|
end
|
344
|
+
|
345
|
+
|
346
|
+
class IS22Archive
|
347
|
+
|
348
|
+
attr_reader :index
|
349
|
+
|
350
|
+
def initialize(filepath='/tmp/indeed', debug: false)
|
351
|
+
|
352
|
+
FileUtils.mkdir_p filepath
|
353
|
+
@idxfile = File.join(filepath, 'index.yml')
|
354
|
+
|
355
|
+
@index = if File.exists? @idxfile then
|
356
|
+
YAML.load(File.read(@idxfile))
|
357
|
+
else
|
358
|
+
{}
|
359
|
+
end
|
360
|
+
|
361
|
+
end
|
362
|
+
|
363
|
+
def list()
|
364
|
+
|
365
|
+
@index.to_a.reverse.map.with_index do |x,i|
|
366
|
+
|
367
|
+
id, h = x
|
368
|
+
|
369
|
+
puts 'h: ' + h.inspect if @debug
|
370
|
+
co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
|
371
|
+
"%2d. %s: %s%s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title], co]
|
372
|
+
|
373
|
+
end.join("\n")
|
374
|
+
|
375
|
+
end
|
376
|
+
|
377
|
+
end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-05-12 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -63,20 +63,40 @@ dependencies:
|
|
63
63
|
requirements:
|
64
64
|
- - "~>"
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: '0.
|
66
|
+
version: '0.3'
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0.
|
69
|
+
version: 0.3.1
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '0.
|
76
|
+
version: '0.3'
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 0.
|
79
|
+
version: 0.3.1
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: reveal_url22
|
82
|
+
requirement: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - "~>"
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0.1'
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.0
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.1'
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: 0.1.0
|
80
100
|
description:
|
81
101
|
email: digital.robertson@gmail.com
|
82
102
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|