indeed_scraper2022 0.4.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +103 -20
- data.tar.gz.sig +0 -0
- metadata +42 -2
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1cf994e7ea8c7fd89ad96e55d2d3f2d3908b8c565e3f07bad5056e4d63b90b65
|
4
|
+
data.tar.gz: eff54d2290a32a6a78cd31646ceef2736d329b40ea327f72f7a5eb4594dd3fe2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4da62ce0099ee02985981f8f166e371a4b05388998c94b5b5d3e33637e7dc649cc7aefbbeb19e002d777634b0bef9222563aad552c89d575757633c7764b5eb9
|
7
|
+
data.tar.gz: dc4a0bcdb86bef4925880c95c5b558de5e1c4cff0756e134f8502be97e0dc2a16e4e38fd7217db6b9994a02c2384956c90ec7a66913b8baaf5fbdfe92d46b322
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
7
|
require 'yaml'
|
8
|
+
require 'reveal_url22'
|
8
9
|
|
9
10
|
# Given the nature of changes to jobsearch websites,
|
10
11
|
# don't rely upon this gem working in the near future.
|
@@ -44,9 +45,10 @@ class IndeedScraper2022
|
|
44
45
|
end
|
45
46
|
|
46
47
|
def search(q: @q, location: @location, start: nil)
|
47
|
-
|
48
|
+
puts 'inside search' if @debug
|
48
49
|
url = @url_base
|
49
50
|
url += 'start=' + start if start
|
51
|
+
puts 'url: ' + url.inspect if @debug
|
50
52
|
|
51
53
|
@browser.goto(url)
|
52
54
|
#@browser.network.wait_for_idle
|
@@ -81,28 +83,44 @@ class IndeedScraper2022
|
|
81
83
|
sleep 2
|
82
84
|
|
83
85
|
doc2 = Nokogiri::XML(@browser.body)
|
86
|
+
File.write '/tmp/body.txt', doc2.to_s if @debug
|
84
87
|
|
85
|
-
a2 = doc2.xpath "//
|
88
|
+
a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
|
86
89
|
puts 'a2: ' + a2.length.inspect if @debug
|
87
90
|
|
88
91
|
@a2 = a2.map {|x| Rexle.new x.to_s }
|
89
92
|
|
90
93
|
@results = @a2.map do |doc|
|
91
94
|
|
92
|
-
div = doc.element("
|
95
|
+
div = doc.element("div[@class='cardOutline']/div[@class='slider" \
|
93
96
|
"_container']/div[@class='slider_list']/div[@class='sl" \
|
94
97
|
"ider_item']/div[@class='job_seen_beacon']")
|
98
|
+
|
95
99
|
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
96
100
|
"dy/tr/td[@class='resultContent']")
|
97
101
|
|
98
102
|
# job title (e.g. Software Developer)
|
99
|
-
|
100
|
-
"class='jobTitle-color-purple']/
|
103
|
+
job = td.element("div[@class='tapItem-gutter']/h2[@" \
|
104
|
+
"class='jobTitle-color-purple']/a")
|
105
|
+
href = job.attributes[:href]
|
106
|
+
jobtitle = job.element("span")&.text
|
107
|
+
|
101
108
|
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
102
109
|
|
103
|
-
|
104
|
-
"div[@class='salary-snippet-container']
|
105
|
-
|
110
|
+
sal = td.element("div[@class='metadataContainer']/" \
|
111
|
+
"div[@class='salary-snippet-container']")
|
112
|
+
|
113
|
+
salary = if sal then
|
114
|
+
sal_e = sal.element("div[@class='attribute_snippet']")
|
115
|
+
if sal_e then
|
116
|
+
sal_e.texts[0]
|
117
|
+
else
|
118
|
+
sal_e2 = sal.element("div[@class='salary-snippet']/span")
|
119
|
+
sal_e2 ? sal_e2.text : ''
|
120
|
+
end
|
121
|
+
else
|
122
|
+
''
|
123
|
+
end
|
106
124
|
|
107
125
|
puts 'salary: ' + salary.inspect if @debug
|
108
126
|
div1 = td.element("div[@class='companyInfo']")
|
@@ -120,7 +138,12 @@ class IndeedScraper2022
|
|
120
138
|
"v[@class='result-footer']")
|
121
139
|
|
122
140
|
# job (e.g. Our products are primarily written in C#, using...)
|
123
|
-
|
141
|
+
advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
|
142
|
+
jobsnippet = if advert_items.any? then
|
143
|
+
advert_items.join("\n")
|
144
|
+
else
|
145
|
+
div3.element("div[@class='job-snippet']").text
|
146
|
+
end
|
124
147
|
|
125
148
|
# visually (e.g. Posted 14 days ago)
|
126
149
|
dateposted = div3.element("span[@class='date']")&.texts
|
@@ -128,7 +151,7 @@ class IndeedScraper2022
|
|
128
151
|
|
129
152
|
{
|
130
153
|
link: @url_base.sub(/\/[^\/]+$/,'') \
|
131
|
-
+
|
154
|
+
+ href.gsub(/&amp;/,'&'),
|
132
155
|
title: jobtitle,
|
133
156
|
salary: salary,
|
134
157
|
company: company_name,
|
@@ -265,10 +288,8 @@ class IS22Plus < IndeedScraper2022
|
|
265
288
|
|
266
289
|
puts 'saving ' + item[:title] if @debug
|
267
290
|
puts 'link: ' + item[:link].inspect
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
url = links.last
|
291
|
+
url = URL.reveal(item[:link])
|
292
|
+
item[:link] = url
|
272
293
|
puts 'url: ' + url.inspect if @debug
|
273
294
|
id = url[/(?<=jk=)[^&]+/]
|
274
295
|
|
@@ -290,7 +311,7 @@ class IS22Plus < IndeedScraper2022
|
|
290
311
|
salary: item[:salary].to_s,
|
291
312
|
company: item[:company].to_s.strip,
|
292
313
|
location: item[:location].to_s,
|
293
|
-
jobsnippet: item[:jobsnippet],
|
314
|
+
jobsnippet: item[:jobsnippet].to_s,
|
294
315
|
date: item[:date],
|
295
316
|
added: Time.now.strftime("%Y-%m-%d")
|
296
317
|
}
|
@@ -321,18 +342,17 @@ end
|
|
321
342
|
|
322
343
|
|
323
344
|
class IS22Archive
|
345
|
+
include RXFReadWriteModule
|
324
346
|
|
325
347
|
attr_reader :index
|
326
348
|
|
327
349
|
def initialize(filepath='/tmp/indeed', debug: false)
|
328
350
|
|
329
|
-
|
330
|
-
|
331
|
-
FileUtils.mkdir_p filepath
|
351
|
+
FileX.mkdir_p filepath
|
332
352
|
@idxfile = File.join(filepath, 'index.yml')
|
333
353
|
|
334
|
-
@index = if
|
335
|
-
YAML.load(
|
354
|
+
@index = if FileX.exists? @idxfile then
|
355
|
+
YAML.load(FileX.read(@idxfile))
|
336
356
|
else
|
337
357
|
{}
|
338
358
|
end
|
@@ -353,4 +373,67 @@ class IS22Archive
|
|
353
373
|
|
354
374
|
end
|
355
375
|
|
376
|
+
def to_html()
|
377
|
+
|
378
|
+
rows = latest().map do |h|
|
379
|
+
|
380
|
+
puts 'h: ' + h.inspect if @debug
|
381
|
+
co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
|
382
|
+
"* %s: [%s](%s)%s" % [h[:added].strftime("%d %b"), h[:title], h[:link], co]
|
383
|
+
|
384
|
+
end.join("\n")
|
385
|
+
|
386
|
+
|
387
|
+
md = '# Indeed.com: Latest jobs
|
388
|
+
|
389
|
+
' + rows
|
390
|
+
|
391
|
+
RDiscount.new(md).to_html
|
392
|
+
|
393
|
+
end
|
394
|
+
|
395
|
+
def to_form(action: '')
|
396
|
+
|
397
|
+
rows = latest().map.with_index do |h, i|
|
398
|
+
|
399
|
+
co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
|
400
|
+
|
401
|
+
"<input type='checkbox' id='#{h[:jobid]}' name='#{h[:jobid]}' value='#{h[:title]}'/>
|
402
|
+
<label for='j#{i}'>#{h[:added].strftime("%d %b")}: #{h[:title] + ' ' + co}</label><br/>
|
403
|
+
"
|
404
|
+
|
405
|
+
end.join("\n")
|
406
|
+
|
407
|
+
|
408
|
+
return "<form action='#{action}'>#{rows}" +
|
409
|
+
"<input type='submit' value='submit'/></form>"
|
410
|
+
|
411
|
+
end
|
412
|
+
|
413
|
+
def filter(a)
|
414
|
+
|
415
|
+
dx = Dynarex.new
|
416
|
+
a2 = latest().select {|h| a.include? h[:jobid] }
|
417
|
+
dx.import a2
|
418
|
+
|
419
|
+
return dx
|
420
|
+
end
|
421
|
+
|
422
|
+
private
|
423
|
+
|
424
|
+
def latest()
|
425
|
+
|
426
|
+
a = @index.to_a.map do |id, h|
|
427
|
+
h[:jobid] = id
|
428
|
+
h[:added] = Date.parse(h[:added]) if h[:added].is_a? String
|
429
|
+
h
|
430
|
+
end
|
431
|
+
|
432
|
+
a.select do |x|
|
433
|
+
x[:added] >= (Date.today - 7)
|
434
|
+
end.reverse
|
435
|
+
|
436
|
+
end
|
437
|
+
|
356
438
|
end
|
439
|
+
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-05-25 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -77,6 +77,46 @@ dependencies:
|
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: 0.3.1
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: url_reveal22
|
82
|
+
requirement: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - "~>"
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0.1'
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.0
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.1'
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: 0.1.0
|
100
|
+
- !ruby/object:Gem::Dependency
|
101
|
+
name: dynarex
|
102
|
+
requirement: !ruby/object:Gem::Requirement
|
103
|
+
requirements:
|
104
|
+
- - "~>"
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '1.9'
|
107
|
+
- - ">="
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 1.9.11
|
110
|
+
type: :runtime
|
111
|
+
prerelease: false
|
112
|
+
version_requirements: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - "~>"
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '1.9'
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: 1.9.11
|
80
120
|
description:
|
81
121
|
email: digital.robertson@gmail.com
|
82
122
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|