indeed_scraper2022 0.4.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +43 -21
- data.tar.gz.sig +0 -0
- metadata +22 -2
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e9d1593bc7531fd77592f8765ccac9011f47980f0fb1f0d7e9c89513a319b85e
|
4
|
+
data.tar.gz: ac88d94ce2271fdc4561f43ff7218baed73db60867506c20bc34522fd357cb89
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 133e26d46ca8bae7b08aff57a7451c4cf23f507f4cb4e3875f3b3ca320c60e14f747952f1b893a8016cfd3dfb907440597864158e0cf9a5dc92a833d98855aff
|
7
|
+
data.tar.gz: 0c5d883b0b9a1582a2b83c8a64d47e420b7c0f79f2e6f8b1e409439fd171539adca26f10c66b57db19f9d5238c3017649c527d0e2f8fe405b3aa73c4cf276e8f
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
7
|
require 'yaml'
|
8
|
+
require 'reveal_url22'
|
8
9
|
|
9
10
|
# Given the nature of changes to jobsearch websites,
|
10
11
|
# don't rely upon this gem working in the near future.
|
@@ -44,9 +45,10 @@ class IndeedScraper2022
|
|
44
45
|
end
|
45
46
|
|
46
47
|
def search(q: @q, location: @location, start: nil)
|
47
|
-
|
48
|
+
puts 'inside search' if @debug
|
48
49
|
url = @url_base
|
49
50
|
url += 'start=' + start if start
|
51
|
+
puts 'url: ' + url.inspect if @debug
|
50
52
|
|
51
53
|
@browser.goto(url)
|
52
54
|
#@browser.network.wait_for_idle
|
@@ -81,34 +83,52 @@ class IndeedScraper2022
|
|
81
83
|
sleep 2
|
82
84
|
|
83
85
|
doc2 = Nokogiri::XML(@browser.body)
|
86
|
+
File.write '/tmp/body.txt', doc2.to_s if @debug
|
84
87
|
|
85
|
-
a2 = doc2.xpath "//
|
88
|
+
a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
|
86
89
|
puts 'a2: ' + a2.length.inspect if @debug
|
87
90
|
|
88
91
|
@a2 = a2.map {|x| Rexle.new x.to_s }
|
89
92
|
|
90
93
|
@results = @a2.map do |doc|
|
91
94
|
|
92
|
-
div = doc.element("
|
95
|
+
div = doc.element("div[@class='cardOutline']/div[@class='slider" \
|
93
96
|
"_container']/div[@class='slider_list']/div[@class='sl" \
|
94
97
|
"ider_item']/div[@class='job_seen_beacon']")
|
98
|
+
|
95
99
|
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
96
100
|
"dy/tr/td[@class='resultContent']")
|
97
101
|
|
98
102
|
# job title (e.g. Software Developer)
|
99
|
-
|
100
|
-
"class='jobTitle-color-purple']/
|
103
|
+
job = td.element("div[@class='tapItem-gutter']/h2[@" \
|
104
|
+
"class='jobTitle-color-purple']/a")
|
105
|
+
href = job.attributes[:href]
|
106
|
+
jobtitle = job.element("span")&.text
|
107
|
+
|
101
108
|
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
102
109
|
|
103
|
-
|
104
|
-
"div[@class='salary-snippet-container']
|
105
|
-
|
110
|
+
sal = td.element("div[@class='metadataContainer']/" \
|
111
|
+
"div[@class='salary-snippet-container']")
|
112
|
+
|
113
|
+
salary = if sal then
|
114
|
+
sal_e = sal.element("div[@class='attribute_snippet']")
|
115
|
+
if sal_e then
|
116
|
+
sal_e.texts[0]
|
117
|
+
else
|
118
|
+
sal_e2 = sal.element("div[@class='salary-snippet']/span")
|
119
|
+
sal_e2 ? sal_e2.text : ''
|
120
|
+
end
|
121
|
+
else
|
122
|
+
''
|
123
|
+
end
|
106
124
|
|
107
125
|
puts 'salary: ' + salary.inspect if @debug
|
108
126
|
div1 = td.element("div[@class='companyInfo']")
|
109
127
|
|
110
128
|
# company name (e.g. Coda Octopus Products Ltd)
|
111
|
-
|
129
|
+
coname = div1.element("span[@class='companyName']")
|
130
|
+
puts 'coname: ' + coname.text.inspect if @debug
|
131
|
+
company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
|
112
132
|
|
113
133
|
# company location (e.g. Edinburgh)
|
114
134
|
location = div1.element("div[@class='companyLocation']")&.text
|
@@ -118,7 +138,12 @@ class IndeedScraper2022
|
|
118
138
|
"v[@class='result-footer']")
|
119
139
|
|
120
140
|
# job (e.g. Our products are primarily written in C#, using...)
|
121
|
-
|
141
|
+
advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
|
142
|
+
jobsnippet = if advert_items.any? then
|
143
|
+
advert_items.join("\n")
|
144
|
+
else
|
145
|
+
div3.element("div[@class='job-snippet']").text
|
146
|
+
end
|
122
147
|
|
123
148
|
# visually (e.g. Posted 14 days ago)
|
124
149
|
dateposted = div3.element("span[@class='date']")&.texts
|
@@ -126,7 +151,7 @@ class IndeedScraper2022
|
|
126
151
|
|
127
152
|
{
|
128
153
|
link: @url_base.sub(/\/[^\/]+$/,'') \
|
129
|
-
+
|
154
|
+
+ href.gsub(/&amp;/,'&'),
|
130
155
|
title: jobtitle,
|
131
156
|
salary: salary,
|
132
157
|
company: company_name,
|
@@ -237,7 +262,7 @@ class IS22Plus < IndeedScraper2022
|
|
237
262
|
|
238
263
|
def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
|
239
264
|
super(q: q, location: location, headless: headless, cookies: cookies,
|
240
|
-
debug:
|
265
|
+
debug: true)
|
241
266
|
end
|
242
267
|
|
243
268
|
# note: The most efficient method to accumulate vacancy articles is to
|
@@ -263,10 +288,8 @@ class IS22Plus < IndeedScraper2022
|
|
263
288
|
|
264
289
|
puts 'saving ' + item[:title] if @debug
|
265
290
|
puts 'link: ' + item[:link].inspect
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
url = links.last
|
291
|
+
url = URL.reveal(item[:link])
|
292
|
+
item[:link] = url
|
270
293
|
puts 'url: ' + url.inspect if @debug
|
271
294
|
id = url[/(?<=jk=)[^&]+/]
|
272
295
|
|
@@ -288,7 +311,7 @@ class IS22Plus < IndeedScraper2022
|
|
288
311
|
salary: item[:salary].to_s,
|
289
312
|
company: item[:company].to_s.strip,
|
290
313
|
location: item[:location].to_s,
|
291
|
-
jobsnippet: item[:jobsnippet],
|
314
|
+
jobsnippet: item[:jobsnippet].to_s,
|
292
315
|
date: item[:date],
|
293
316
|
added: Time.now.strftime("%Y-%m-%d")
|
294
317
|
}
|
@@ -324,8 +347,6 @@ class IS22Archive
|
|
324
347
|
|
325
348
|
def initialize(filepath='/tmp/indeed', debug: false)
|
326
349
|
|
327
|
-
@debug = debug
|
328
|
-
|
329
350
|
FileUtils.mkdir_p filepath
|
330
351
|
@idxfile = File.join(filepath, 'index.yml')
|
331
352
|
|
@@ -339,12 +360,13 @@ class IS22Archive
|
|
339
360
|
|
340
361
|
def list()
|
341
362
|
|
342
|
-
@index.map.with_index do |x,i|
|
363
|
+
@index.to_a.reverse.map.with_index do |x,i|
|
343
364
|
|
344
365
|
id, h = x
|
345
366
|
|
346
367
|
puts 'h: ' + h.inspect if @debug
|
347
|
-
|
368
|
+
co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
|
369
|
+
"%2d. %s: %s%s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title], co]
|
348
370
|
|
349
371
|
end.join("\n")
|
350
372
|
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-05-12 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -77,6 +77,26 @@ dependencies:
|
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: 0.3.1
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: reveal_url22
|
82
|
+
requirement: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - "~>"
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0.1'
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.0
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.1'
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: 0.1.0
|
80
100
|
description:
|
81
101
|
email: digital.robertson@gmail.com
|
82
102
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|