indeed_scraper2022 0.4.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +103 -20
- data.tar.gz.sig +0 -0
- metadata +42 -2
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1cf994e7ea8c7fd89ad96e55d2d3f2d3908b8c565e3f07bad5056e4d63b90b65
|
4
|
+
data.tar.gz: eff54d2290a32a6a78cd31646ceef2736d329b40ea327f72f7a5eb4594dd3fe2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4da62ce0099ee02985981f8f166e371a4b05388998c94b5b5d3e33637e7dc649cc7aefbbeb19e002d777634b0bef9222563aad552c89d575757633c7764b5eb9
|
7
|
+
data.tar.gz: dc4a0bcdb86bef4925880c95c5b558de5e1c4cff0756e134f8502be97e0dc2a16e4e38fd7217db6b9994a02c2384956c90ec7a66913b8baaf5fbdfe92d46b322
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
7
|
require 'yaml'
|
8
|
+
require 'reveal_url22'
|
8
9
|
|
9
10
|
# Given the nature of changes to jobsearch websites,
|
10
11
|
# don't rely upon this gem working in the near future.
|
@@ -44,9 +45,10 @@ class IndeedScraper2022
|
|
44
45
|
end
|
45
46
|
|
46
47
|
def search(q: @q, location: @location, start: nil)
|
47
|
-
|
48
|
+
puts 'inside search' if @debug
|
48
49
|
url = @url_base
|
49
50
|
url += 'start=' + start if start
|
51
|
+
puts 'url: ' + url.inspect if @debug
|
50
52
|
|
51
53
|
@browser.goto(url)
|
52
54
|
#@browser.network.wait_for_idle
|
@@ -81,28 +83,44 @@ class IndeedScraper2022
|
|
81
83
|
sleep 2
|
82
84
|
|
83
85
|
doc2 = Nokogiri::XML(@browser.body)
|
86
|
+
File.write '/tmp/body.txt', doc2.to_s if @debug
|
84
87
|
|
85
|
-
a2 = doc2.xpath "//
|
88
|
+
a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
|
86
89
|
puts 'a2: ' + a2.length.inspect if @debug
|
87
90
|
|
88
91
|
@a2 = a2.map {|x| Rexle.new x.to_s }
|
89
92
|
|
90
93
|
@results = @a2.map do |doc|
|
91
94
|
|
92
|
-
div = doc.element("
|
95
|
+
div = doc.element("div[@class='cardOutline']/div[@class='slider" \
|
93
96
|
"_container']/div[@class='slider_list']/div[@class='sl" \
|
94
97
|
"ider_item']/div[@class='job_seen_beacon']")
|
98
|
+
|
95
99
|
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
96
100
|
"dy/tr/td[@class='resultContent']")
|
97
101
|
|
98
102
|
# job title (e.g. Software Developer)
|
99
|
-
|
100
|
-
"class='jobTitle-color-purple']/
|
103
|
+
job = td.element("div[@class='tapItem-gutter']/h2[@" \
|
104
|
+
"class='jobTitle-color-purple']/a")
|
105
|
+
href = job.attributes[:href]
|
106
|
+
jobtitle = job.element("span")&.text
|
107
|
+
|
101
108
|
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
102
109
|
|
103
|
-
|
104
|
-
"div[@class='salary-snippet-container']
|
105
|
-
|
110
|
+
sal = td.element("div[@class='metadataContainer']/" \
|
111
|
+
"div[@class='salary-snippet-container']")
|
112
|
+
|
113
|
+
salary = if sal then
|
114
|
+
sal_e = sal.element("div[@class='attribute_snippet']")
|
115
|
+
if sal_e then
|
116
|
+
sal_e.texts[0]
|
117
|
+
else
|
118
|
+
sal_e2 = sal.element("div[@class='salary-snippet']/span")
|
119
|
+
sal_e2 ? sal_e2.text : ''
|
120
|
+
end
|
121
|
+
else
|
122
|
+
''
|
123
|
+
end
|
106
124
|
|
107
125
|
puts 'salary: ' + salary.inspect if @debug
|
108
126
|
div1 = td.element("div[@class='companyInfo']")
|
@@ -120,7 +138,12 @@ class IndeedScraper2022
|
|
120
138
|
"v[@class='result-footer']")
|
121
139
|
|
122
140
|
# job (e.g. Our products are primarily written in C#, using...)
|
123
|
-
|
141
|
+
advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
|
142
|
+
jobsnippet = if advert_items.any? then
|
143
|
+
advert_items.join("\n")
|
144
|
+
else
|
145
|
+
div3.element("div[@class='job-snippet']").text
|
146
|
+
end
|
124
147
|
|
125
148
|
# visually (e.g. Posted 14 days ago)
|
126
149
|
dateposted = div3.element("span[@class='date']")&.texts
|
@@ -128,7 +151,7 @@ class IndeedScraper2022
|
|
128
151
|
|
129
152
|
{
|
130
153
|
link: @url_base.sub(/\/[^\/]+$/,'') \
|
131
|
-
+
|
154
|
+
+ href.gsub(/&amp;/,'&'),
|
132
155
|
title: jobtitle,
|
133
156
|
salary: salary,
|
134
157
|
company: company_name,
|
@@ -265,10 +288,8 @@ class IS22Plus < IndeedScraper2022
|
|
265
288
|
|
266
289
|
puts 'saving ' + item[:title] if @debug
|
267
290
|
puts 'link: ' + item[:link].inspect
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
url = links.last
|
291
|
+
url = URL.reveal(item[:link])
|
292
|
+
item[:link] = url
|
272
293
|
puts 'url: ' + url.inspect if @debug
|
273
294
|
id = url[/(?<=jk=)[^&]+/]
|
274
295
|
|
@@ -290,7 +311,7 @@ class IS22Plus < IndeedScraper2022
|
|
290
311
|
salary: item[:salary].to_s,
|
291
312
|
company: item[:company].to_s.strip,
|
292
313
|
location: item[:location].to_s,
|
293
|
-
jobsnippet: item[:jobsnippet],
|
314
|
+
jobsnippet: item[:jobsnippet].to_s,
|
294
315
|
date: item[:date],
|
295
316
|
added: Time.now.strftime("%Y-%m-%d")
|
296
317
|
}
|
@@ -321,18 +342,17 @@ end
|
|
321
342
|
|
322
343
|
|
323
344
|
class IS22Archive
|
345
|
+
include RXFReadWriteModule
|
324
346
|
|
325
347
|
attr_reader :index
|
326
348
|
|
327
349
|
def initialize(filepath='/tmp/indeed', debug: false)
|
328
350
|
|
329
|
-
|
330
|
-
|
331
|
-
FileUtils.mkdir_p filepath
|
351
|
+
FileX.mkdir_p filepath
|
332
352
|
@idxfile = File.join(filepath, 'index.yml')
|
333
353
|
|
334
|
-
@index = if
|
335
|
-
YAML.load(
|
354
|
+
@index = if FileX.exists? @idxfile then
|
355
|
+
YAML.load(FileX.read(@idxfile))
|
336
356
|
else
|
337
357
|
{}
|
338
358
|
end
|
@@ -353,4 +373,67 @@ class IS22Archive
|
|
353
373
|
|
354
374
|
end
|
355
375
|
|
376
|
+
def to_html()
|
377
|
+
|
378
|
+
rows = latest().map do |h|
|
379
|
+
|
380
|
+
puts 'h: ' + h.inspect if @debug
|
381
|
+
co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
|
382
|
+
"* %s: [%s](%s)%s" % [h[:added].strftime("%d %b"), h[:title], h[:link], co]
|
383
|
+
|
384
|
+
end.join("\n")
|
385
|
+
|
386
|
+
|
387
|
+
md = '# Indeed.com: Latest jobs
|
388
|
+
|
389
|
+
' + rows
|
390
|
+
|
391
|
+
RDiscount.new(md).to_html
|
392
|
+
|
393
|
+
end
|
394
|
+
|
395
|
+
def to_form(action: '')
|
396
|
+
|
397
|
+
rows = latest().map.with_index do |h, i|
|
398
|
+
|
399
|
+
co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
|
400
|
+
|
401
|
+
"<input type='checkbox' id='#{h[:jobid]}' name='#{h[:jobid]}' value='#{h[:title]}'/>
|
402
|
+
<label for='j#{i}'>#{h[:added].strftime("%d %b")}: #{h[:title] + ' ' + co}</label><br/>
|
403
|
+
"
|
404
|
+
|
405
|
+
end.join("\n")
|
406
|
+
|
407
|
+
|
408
|
+
return "<form action='#{action}'>#{rows}" +
|
409
|
+
"<input type='submit' value='submit'/></form>"
|
410
|
+
|
411
|
+
end
|
412
|
+
|
413
|
+
def filter(a)
|
414
|
+
|
415
|
+
dx = Dynarex.new
|
416
|
+
a2 = latest().select {|h| a.include? h[:jobid] }
|
417
|
+
dx.import a2
|
418
|
+
|
419
|
+
return dx
|
420
|
+
end
|
421
|
+
|
422
|
+
private
|
423
|
+
|
424
|
+
def latest()
|
425
|
+
|
426
|
+
a = @index.to_a.map do |id, h|
|
427
|
+
h[:jobid] = id
|
428
|
+
h[:added] = Date.parse(h[:added]) if h[:added].is_a? String
|
429
|
+
h
|
430
|
+
end
|
431
|
+
|
432
|
+
a.select do |x|
|
433
|
+
x[:added] >= (Date.today - 7)
|
434
|
+
end.reverse
|
435
|
+
|
436
|
+
end
|
437
|
+
|
356
438
|
end
|
439
|
+
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-05-25 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -77,6 +77,46 @@ dependencies:
|
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: 0.3.1
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: url_reveal22
|
82
|
+
requirement: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - "~>"
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0.1'
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.0
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.1'
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: 0.1.0
|
100
|
+
- !ruby/object:Gem::Dependency
|
101
|
+
name: dynarex
|
102
|
+
requirement: !ruby/object:Gem::Requirement
|
103
|
+
requirements:
|
104
|
+
- - "~>"
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '1.9'
|
107
|
+
- - ">="
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 1.9.11
|
110
|
+
type: :runtime
|
111
|
+
prerelease: false
|
112
|
+
version_requirements: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - "~>"
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '1.9'
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: 1.9.11
|
80
120
|
description:
|
81
121
|
email: digital.robertson@gmail.com
|
82
122
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|