indeed_scraper2022 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +75 -3
- data.tar.gz.sig +0 -0
- metadata +6 -6
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7f98a83b7ed582d1b2973882701833688aec2d6d2bd132241a26c01a32915f93
|
4
|
+
data.tar.gz: dc5c34a5af19cdffbd244e15416914e91c8a06f365f0fff28bcd537a30ec468e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4ce40e021339f6b1c24faed495ebbd4b257200f62d1466ffa54cd05e654ccef29b23dbfcf4af64b9426f4d2dbde6bb778f6b920a7656af8289e3f81a269ba54a
|
7
|
+
data.tar.gz: 634325fed61c7888b08fd72bfc47f4d64f98f5514110169152389e49940f15a080c02f62811aae2b5f34c6a604c17d2512d2219dba0f4473dc1034900bdb7ec6
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -11,6 +11,13 @@ require 'yaml'
|
|
11
11
|
|
12
12
|
|
13
13
|
|
14
|
+
# this gem consists of 3 main classes:
|
15
|
+
#
|
16
|
+
# * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
|
17
|
+
# * IS22Plus - Archives the scraped vacancies to a local file
|
18
|
+
# * IS22Archive - Allows viewing of archived vacancies offline
|
19
|
+
#
|
20
|
+
|
14
21
|
# Gem-specific error class for indeed_scraper2022.
#
# Inherits from StandardError (rather than directly from Exception) so that
# a plain `rescue` / `rescue StandardError` in calling code catches it.
# Existing `rescue IndeedScraper2022Err` and `rescue Exception` clauses
# continue to match.
#
# NOTE(review): the raise sites are outside this excerpt - confirm none of
# them sits inside a begin/rescue block that should let this error through.
class IndeedScraper2022Err < StandardError
end
|
16
23
|
|
@@ -146,52 +153,65 @@ class IndeedScraper2022
|
|
146
153
|
def fetchjob(url)
|
147
154
|
|
148
155
|
doc = Nokorexi.new(url).to_doc
|
156
|
+
puts 'before e0' if @debug
|
149
157
|
e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
|
150
158
|
|
151
159
|
#div = e0.element("//div[@class='jobsearch-JobComponent']")
|
160
|
+
puts 'before div1' if @debug
|
152
161
|
div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
|
162
|
+
puts 'before div2' if @debug
|
153
163
|
div2 = div1.element("div")
|
154
164
|
|
155
165
|
# jobsearch (e.g. Full Stack Website Developer (Wordpress))
|
166
|
+
puts 'before jobtitle' if @debug
|
156
167
|
jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
|
157
168
|
"er-title-container']/h1[@class='jobsearch-JobInfoHead" \
|
158
169
|
"er-title']")&.text
|
159
170
|
|
171
|
+
puts 'before div3' if @debug
|
160
172
|
div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
|
161
173
|
"tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
|
162
174
|
"erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
|
163
175
|
"ainer-subtitle']")
|
164
176
|
|
165
177
|
# icl (e.g. Lyles Sutherland)
|
178
|
+
puts 'before cname' if @debug
|
166
179
|
cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
|
167
180
|
"ickyContainer-companyrating']/div/div[@class='icl-u-x" \
|
168
181
|
"s-mr--xs']")[1]
|
182
|
+
puts 'before clink' if @debug
|
169
183
|
clink = div3.element('//a')
|
170
184
|
company = cname.text ? cname.text : clink.text
|
171
185
|
companylink = clink.attributes[:href] if clink
|
172
186
|
|
187
|
+
puts 'before salary' if @debug
|
173
188
|
salary = div1.element("//span[@class='attribute_snippet']")&.text
|
189
|
+
puts 'before type' if @debug
|
174
190
|
type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
|
175
191
|
div5 = div3.xpath("div/div")
|
176
192
|
location, worklocation = div5.map(&:text).compact
|
177
193
|
|
178
194
|
# icl (e.g. Full-time, Permanent)
|
195
|
+
puts 'before jobtype' if @debug
|
179
196
|
jobtype = div1.element("div/div/div[@class='jobsearch-J" \
|
180
197
|
"obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
|
181
198
|
jobtype = jobtype&.texts.join if jobtype
|
182
199
|
|
183
200
|
# jobsearch (e.g. Urgently needed)
|
201
|
+
puts 'before jobnote1' if @debug
|
184
202
|
jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
|
185
203
|
"']/div[@class='urgently-hiring']/div[@class='jobsearc" \
|
186
204
|
"h-DesktopTag-text']")&.text
|
187
205
|
|
188
206
|
# jobsearch (e.g. 10 days ago)
|
207
|
+
puts 'before days' if @debug
|
189
208
|
days = e0.element("//div[@class='jobsearch-JobTab-con" \
|
190
209
|
"tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
|
191
210
|
d = Date.today - days.to_i
|
192
211
|
datepost = d.strftime("%Y-%m-%d")
|
193
212
|
|
194
213
|
|
214
|
+
puts 'before jobdesc' if @debug
|
195
215
|
jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
|
196
216
|
"ass='jobsearch-jobDescriptionText']").xml
|
197
217
|
|
@@ -220,8 +240,13 @@ class IS22Plus < IndeedScraper2022
|
|
220
240
|
debug: debug)
|
221
241
|
end
|
222
242
|
|
243
|
+
# note: The most efficient method to accumulate vacancy articles is to
|
244
|
+
# execute archive() daily
|
245
|
+
#
|
223
246
|
def archive(filepath='/tmp/indeed')
|
224
247
|
|
248
|
+
search() if @results.nil?
|
249
|
+
|
225
250
|
return unless @results
|
226
251
|
|
227
252
|
FileUtils.mkdir_p filepath
|
@@ -239,15 +264,22 @@ class IS22Plus < IndeedScraper2022
|
|
239
264
|
puts 'saving ' + item[:title] if @debug
|
240
265
|
puts 'link: ' + item[:link].inspect
|
241
266
|
links = RXFReader.reveal(item[:link])
|
242
|
-
puts 'links: ' + links.inspect
|
267
|
+
puts 'links: ' + links.inspect if @debug
|
243
268
|
|
244
269
|
url = links.last
|
245
|
-
|
270
|
+
puts 'url: ' + url.inspect if @debug
|
271
|
+
id = url[/(?<=jk=)[^&]+/]
|
246
272
|
|
247
273
|
if index[id.to_sym] then
|
274
|
+
|
275
|
+
# the vacancy record has previously been saved
|
276
|
+
#
|
248
277
|
next
|
278
|
+
|
249
279
|
else
|
250
280
|
|
281
|
+
# write the full page vacancy article to file
|
282
|
+
#
|
251
283
|
File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
|
252
284
|
|
253
285
|
h = {
|
@@ -257,14 +289,19 @@ class IS22Plus < IndeedScraper2022
|
|
257
289
|
company: item[:company].to_s.strip,
|
258
290
|
location: item[:location].to_s,
|
259
291
|
jobsnippet: item[:jobsnippet],
|
260
|
-
date: item[:date]
|
292
|
+
date: item[:date],
|
293
|
+
added: Time.now.strftime("%Y-%m-%d")
|
261
294
|
}
|
262
295
|
|
296
|
+
# add the vacancy snippet to the index file
|
297
|
+
#
|
263
298
|
index[id.to_sym] = h
|
264
299
|
end
|
265
300
|
|
266
301
|
end
|
267
302
|
|
303
|
+
# save the vacancy index file
|
304
|
+
#
|
268
305
|
File.write idxfile, index.to_yaml
|
269
306
|
|
270
307
|
end
|
@@ -279,3 +316,38 @@ class IS22Plus < IndeedScraper2022
|
|
279
316
|
|
280
317
|
|
281
318
|
end
|
319
|
+
|
320
|
+
|
321
|
+
# Offline viewer for an on-disk vacancy archive.
#
# Reads the YAML index file (index.yml) found in the archive directory and
# exposes it through #index; #list renders it as a numbered, human-readable
# summary.
#
class IS22Archive

  # Placeholder shown by #list when an entry has no usable :added date
  # (the :added key is only written from 0.4.0 onwards).
  NO_DATE = '-- ---'.freeze

  # Hash of archived vacancies as loaded from index.yml ({} when the index
  # file is absent or empty).
  attr_reader :index

  # filepath - directory holding the archive; created if it does not exist.
  # debug    - when true, #list prints each index record while rendering.
  #
  def initialize(filepath='/tmp/indeed', debug: false)

    @debug = debug

    FileUtils.mkdir_p filepath
    @idxfile = File.join(filepath, 'index.yml')

    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    @index = File.exist?(@idxfile) ? load_index(@idxfile) : {}

  end

  # Returns a String with one line per archived vacancy, formatted as
  # "<n>. <day> <month>: <title>", lines joined by newlines. Returns an
  # empty String when the index has no entries.
  #
  def list()

    @index.map.with_index do |(_id, h), i|

      puts 'h: ' + h.inspect if @debug
      "%2d. %s: %s" % [i+1, added_label(h[:added]), h[:title]]

    end.join("\n")

  end

  private

  # Loads the index from disk. Symbol is permitted explicitly because the
  # index lookups in this class use Symbol keys (h[:added], h[:title]) and
  # Psych 4's YAML.load refuses Symbols by default. Date and Time are
  # permitted in case a record stores a date object rather than a string.
  # An empty file yields {} instead of nil/false.
  #
  def load_index(path)

    YAML.safe_load(File.read(path),
                   permitted_classes: [Symbol, Date, Time],
                   aliases: true) || {}

  end

  # Formats the :added value of a record as "14 Apr". Accepts a date-like
  # object or a parseable date string; anything missing or unparseable is
  # rendered as NO_DATE so one bad record cannot break the whole listing.
  #
  def added_label(added)

    return NO_DATE if added.nil?

    date = added.respond_to?(:strftime) ? added : Date.parse(added.to_s)
    date.strftime("%d %b")

  rescue ArgumentError
    NO_DATE
  end

end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-04-
|
38
|
+
date: 2022-04-14 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -63,20 +63,20 @@ dependencies:
|
|
63
63
|
requirements:
|
64
64
|
- - "~>"
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: '0.
|
66
|
+
version: '0.3'
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0.
|
69
|
+
version: 0.3.1
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '0.
|
76
|
+
version: '0.3'
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 0.
|
79
|
+
version: 0.3.1
|
80
80
|
description:
|
81
81
|
email: digital.robertson@gmail.com
|
82
82
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|