indeed_scraper2022 0.2.1 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +120 -4
- data.tar.gz.sig +0 -0
- metadata +6 -6
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 88f80a06ef0ab435c144d3b4ec53f1c98f2da7c427224c31dbef44f62fdafee3
|
4
|
+
data.tar.gz: d0f549053bb225e7c8ebb2492715c6c470f689e191e9ae747e8f97317a61c02c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 90d23c6c35a87cdcf763dc15a072f35ede166aafe2f1f5eed9294de28e916cdaca3eb043a034fb50750c9444af9e5042dc50d01390816efcdde360d2d01c4e55
|
7
|
+
data.tar.gz: 30153c9c5aafdb5e89d56632e223edfe0196a71c4d9294cc3f963f2039238cd27b836ac9ac42b49cad27988ada868d0d002246699399f7d175216b97689d46ed
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -4,11 +4,20 @@
|
|
4
4
|
|
5
5
|
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
|
+
require 'yaml'
|
7
8
|
|
8
9
|
# Given the nature of changes to jobsearch websites,
|
9
10
|
# don't rely upon this gem working in the near future.
|
10
11
|
|
11
12
|
|
13
|
+
|
14
|
+
# this gem consists of 3 main classes:
|
15
|
+
#
|
16
|
+
# * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
|
17
|
+
# * IS22Plus - Archives the scraped vacancies to a local file
|
18
|
+
# * IS22Archive - Allows viewing of archived vacancies offline
|
19
|
+
#
|
20
|
+
|
12
21
|
# Error class specific to this gem.
#
# It derives from StandardError (not Exception) so that a plain `rescue`
# clause can handle it. Code that rescues IndeedScraper2022Err or Exception
# explicitly keeps working exactly as before.
#
class IndeedScraper2022Err < StandardError
end
|
14
23
|
|
@@ -99,7 +108,9 @@ class IndeedScraper2022
|
|
99
108
|
div1 = td.element("div[@class='companyInfo']")
|
100
109
|
|
101
110
|
# company name (e.g. Coda Octopus Products Ltd)
|
102
|
-
|
111
|
+
coname = div1.element("span[@class='companyName']")
|
112
|
+
puts 'coname: ' + coname.text.inspect if @debug
|
113
|
+
company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
|
103
114
|
|
104
115
|
# company location (e.g. Edinburgh)
|
105
116
|
location = div1.element("div[@class='companyLocation']")&.text
|
@@ -144,52 +155,65 @@ class IndeedScraper2022
|
|
144
155
|
def fetchjob(url)
|
145
156
|
|
146
157
|
doc = Nokorexi.new(url).to_doc
|
158
|
+
puts 'before e0' if @debug
|
147
159
|
e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
|
148
160
|
|
149
161
|
#div = e0.element("//div[@class='jobsearch-JobComponent']")
|
162
|
+
puts 'before div1' if @debug
|
150
163
|
div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
|
164
|
+
puts 'before div2' if @debug
|
151
165
|
div2 = div1.element("div")
|
152
166
|
|
153
167
|
# jobsearch (e.g. Full Stack Website Developer (Wordpress))
|
168
|
+
puts 'before jobtitle' if @debug
|
154
169
|
jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
|
155
170
|
"er-title-container']/h1[@class='jobsearch-JobInfoHead" \
|
156
171
|
"er-title']")&.text
|
157
172
|
|
173
|
+
puts 'before div3' if @debug
|
158
174
|
div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
|
159
175
|
"tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
|
160
176
|
"erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
|
161
177
|
"ainer-subtitle']")
|
162
178
|
|
163
179
|
# icl (e.g. Lyles Sutherland)
|
180
|
+
puts 'before cname' if @debug
|
164
181
|
cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
|
165
182
|
"ickyContainer-companyrating']/div/div[@class='icl-u-x" \
|
166
183
|
"s-mr--xs']")[1]
|
184
|
+
puts 'before clink' if @debug
|
167
185
|
clink = div3.element('//a')
|
168
186
|
company = cname.text ? cname.text : clink.text
|
169
187
|
companylink = clink.attributes[:href] if clink
|
170
188
|
|
189
|
+
puts 'before salary' if @debug
|
171
190
|
salary = div1.element("//span[@class='attribute_snippet']")&.text
|
191
|
+
puts 'before type' if @debug
|
172
192
|
type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
|
173
193
|
div5 = div3.xpath("div/div")
|
174
194
|
location, worklocation = div5.map(&:text).compact
|
175
195
|
|
176
196
|
# icl (e.g. Full-time, Permanent)
|
197
|
+
puts 'before jobtype' if @debug
|
177
198
|
jobtype = div1.element("div/div/div[@class='jobsearch-J" \
|
178
199
|
"obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
|
179
200
|
jobtype = jobtype&.texts.join if jobtype
|
180
201
|
|
181
202
|
# jobsearch (e.g. Urgently needed)
|
203
|
+
puts 'before jobnote1' if @debug
|
182
204
|
jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
|
183
205
|
"']/div[@class='urgently-hiring']/div[@class='jobsearc" \
|
184
206
|
"h-DesktopTag-text']")&.text
|
185
207
|
|
186
208
|
# jobsearch (e.g. 10 days ago)
|
209
|
+
puts 'before days' if @debug
|
187
210
|
days = e0.element("//div[@class='jobsearch-JobTab-con" \
|
188
211
|
"tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
|
189
212
|
d = Date.today - days.to_i
|
190
213
|
datepost = d.strftime("%Y-%m-%d")
|
191
214
|
|
192
215
|
|
216
|
+
puts 'before jobdesc' if @debug
|
193
217
|
jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
|
194
218
|
"ass='jobsearch-jobDescriptionText']").xml
|
195
219
|
|
@@ -218,14 +242,70 @@ class IS22Plus < IndeedScraper2022
|
|
218
242
|
debug: debug)
|
219
243
|
end
|
220
244
|
|
221
|
-
|
245
|
+
# note: The most efficient method to accumulate vacancy articles is to
|
246
|
+
# execute archive() daily
|
247
|
+
#
|
248
|
+
# Saves the vacancies held in @results beneath +filepath+.
#
# For every result whose job key (the jk= query parameter of the revealed
# URL) is not yet in the index, the text returned by page(i+1) is written to
# 'j<jobkey>.txt' and a summary record is added to 'index.yml'. Vacancies
# already present in the index are left untouched.
#
# filepath - directory used for the article files and index.yml; it is
#            created when missing.
#
# Returns nil without writing anything when there are no results.
#
def archive(filepath='/tmp/indeed')

  search() if @results.nil?

  return unless @results

  FileUtils.mkdir_p filepath

  idxfile = File.join(filepath, 'index.yml')

  # File.exists? was removed in Ruby 3.2, hence File.exist?.
  # safe_load with Symbol permitted is needed because the index uses Symbol
  # keys, which Psych 4's YAML.load rejects by default. Date and Time are
  # permitted in case a record carries such a value (NOTE(review): confirm
  # the type of item[:date]). An empty file yields nil/false, so fall back
  # to an empty index.
  index = if File.exist? idxfile then
    YAML.safe_load(File.read(idxfile),
                   permitted_classes: [Symbol, Date, Time],
                   aliases: true) || {}
  else
    {}
  end

  @results.each.with_index do |item, i|

    puts 'saving ' + item[:title].to_s if @debug
    puts 'link: ' + item[:link].inspect if @debug
    links = RXFReader.reveal(item[:link])
    puts 'links: ' + links.inspect if @debug

    url = links.last
    puts 'url: ' + url.inspect if @debug
    id = url.to_s[/(?<=jk=)[^&]+/]

    # without a job key there is nothing to index the vacancy by; skip it
    # rather than aborting the whole run and losing the index update
    next unless id

    key = id.to_sym

    # the vacancy record has previously been saved
    #
    next if index[key]

    # write the full page vacancy article to file
    #
    File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)

    # add the vacancy snippet to the index
    #
    index[key] = {
      link: url[/^[^&]+/],
      title: item[:title].to_s,
      salary: item[:salary].to_s,
      company: item[:company].to_s.strip,
      location: item[:location].to_s,
      jobsnippet: item[:jobsnippet],
      date: item[:date],
      added: Time.now.strftime("%Y-%m-%d")
    }

  end

  # save the vacancy index file
  #
  File.write idxfile, index.to_yaml

end
|
230
310
|
|
231
311
|
def list()
|
@@ -238,3 +318,39 @@ class IS22Plus < IndeedScraper2022
|
|
238
318
|
|
239
319
|
|
240
320
|
end
|
321
|
+
|
322
|
+
|
323
|
+
# Offline viewer for an archive directory: loads 'index.yml' from the given
# directory and formats its records as a numbered listing.
#
class IS22Archive

  # The index loaded from index.yml (a Hash; empty when no index exists).
  attr_reader :index

  # filepath - directory expected to hold index.yml; created when missing.
  # debug    - when true, list() prints each record while formatting it.
  #
  def initialize(filepath='/tmp/indeed', debug: false)

    @debug = debug

    FileUtils.mkdir_p filepath
    @idxfile = File.join(filepath, 'index.yml')

    # File.exists? was removed in Ruby 3.2, hence File.exist?
    @index = File.exist?(@idxfile) ? load_index(@idxfile) : {}

  end

  # Returns the index as one String, one line per record in reverse index
  # order, formatted like " 1. 16 Apr: Job title (Company)". The company
  # part is omitted when the stored name is missing or shorter than
  # 2 characters.
  #
  def list()

    @index.to_a.reverse.each_with_index.map do |(_id, h), i|

      puts 'h: ' + h.inspect if @debug

      # to_s guards against a record saved without a company
      company = h[:company].to_s
      co = company.length > 1 ? " (%s)" % company : ''
      added = Date.parse(h[:added].to_s).strftime("%d %b")

      "%2d. %s: %s%s" % [i+1, added, h[:title], co]

    end.join("\n")

  end

  private

  # Reads the YAML index file. Symbol must be permitted explicitly because
  # the index uses Symbol keys, which Psych 4's YAML.load rejects by default.
  # Date and Time are permitted in case a record carries such a value.
  # An empty file parses to nil/false, so fall back to an empty index.
  #
  def load_index(file)

    YAML.safe_load(File.read(file),
                   permitted_classes: [Symbol, Date, Time],
                   aliases: true) || {}

  end

end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-04-16 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -63,20 +63,20 @@ dependencies:
|
|
63
63
|
requirements:
|
64
64
|
- - "~>"
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: '0.
|
66
|
+
version: '0.3'
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0.
|
69
|
+
version: 0.3.1
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '0.
|
76
|
+
version: '0.3'
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 0.
|
79
|
+
version: 0.3.1
|
80
80
|
description:
|
81
81
|
email: digital.robertson@gmail.com
|
82
82
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|