indeed_scraper2022 0.4.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +43 -21
- data.tar.gz.sig +0 -0
- metadata +22 -2
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e9d1593bc7531fd77592f8765ccac9011f47980f0fb1f0d7e9c89513a319b85e
|
4
|
+
data.tar.gz: ac88d94ce2271fdc4561f43ff7218baed73db60867506c20bc34522fd357cb89
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 133e26d46ca8bae7b08aff57a7451c4cf23f507f4cb4e3875f3b3ca320c60e14f747952f1b893a8016cfd3dfb907440597864158e0cf9a5dc92a833d98855aff
|
7
|
+
data.tar.gz: 0c5d883b0b9a1582a2b83c8a64d47e420b7c0f79f2e6f8b1e409439fd171539adca26f10c66b57db19f9d5238c3017649c527d0e2f8fe405b3aa73c4cf276e8f
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
7
|
require 'yaml'
|
8
|
+
require 'reveal_url22'
|
8
9
|
|
9
10
|
# Given the nature of changes to jobsearch websites,
|
10
11
|
# don't rely upon this gem working in the near future.
|
@@ -44,9 +45,10 @@ class IndeedScraper2022
|
|
44
45
|
end
|
45
46
|
|
46
47
|
def search(q: @q, location: @location, start: nil)
|
47
|
-
|
48
|
+
puts 'inside search' if @debug
|
48
49
|
url = @url_base
|
49
50
|
url += 'start=' + start if start
|
51
|
+
puts 'url: ' + url.inspect if @debug
|
50
52
|
|
51
53
|
@browser.goto(url)
|
52
54
|
#@browser.network.wait_for_idle
|
@@ -81,34 +83,52 @@ class IndeedScraper2022
|
|
81
83
|
sleep 2
|
82
84
|
|
83
85
|
doc2 = Nokogiri::XML(@browser.body)
|
86
|
+
File.write '/tmp/body.txt', doc2.to_s if @debug
|
84
87
|
|
85
|
-
a2 = doc2.xpath "//
|
88
|
+
a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
|
86
89
|
puts 'a2: ' + a2.length.inspect if @debug
|
87
90
|
|
88
91
|
@a2 = a2.map {|x| Rexle.new x.to_s }
|
89
92
|
|
90
93
|
@results = @a2.map do |doc|
|
91
94
|
|
92
|
-
div = doc.element("
|
95
|
+
div = doc.element("div[@class='cardOutline']/div[@class='slider" \
|
93
96
|
"_container']/div[@class='slider_list']/div[@class='sl" \
|
94
97
|
"ider_item']/div[@class='job_seen_beacon']")
|
98
|
+
|
95
99
|
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
96
100
|
"dy/tr/td[@class='resultContent']")
|
97
101
|
|
98
102
|
# job title (e.g. Software Developer)
|
99
|
-
|
100
|
-
"class='jobTitle-color-purple']/
|
103
|
+
job = td.element("div[@class='tapItem-gutter']/h2[@" \
|
104
|
+
"class='jobTitle-color-purple']/a")
|
105
|
+
href = job.attributes[:href]
|
106
|
+
jobtitle = job.element("span")&.text
|
107
|
+
|
101
108
|
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
102
109
|
|
103
|
-
|
104
|
-
"div[@class='salary-snippet-container']
|
105
|
-
|
110
|
+
sal = td.element("div[@class='metadataContainer']/" \
|
111
|
+
"div[@class='salary-snippet-container']")
|
112
|
+
|
113
|
+
salary = if sal then
|
114
|
+
sal_e = sal.element("div[@class='attribute_snippet']")
|
115
|
+
if sal_e then
|
116
|
+
sal_e.texts[0]
|
117
|
+
else
|
118
|
+
sal_e2 = sal.element("div[@class='salary-snippet']/span")
|
119
|
+
sal_e2 ? sal_e2.text : ''
|
120
|
+
end
|
121
|
+
else
|
122
|
+
''
|
123
|
+
end
|
106
124
|
|
107
125
|
puts 'salary: ' + salary.inspect if @debug
|
108
126
|
div1 = td.element("div[@class='companyInfo']")
|
109
127
|
|
110
128
|
# company name (e.g. Coda Octopus Products Ltd)
|
111
|
-
|
129
|
+
coname = div1.element("span[@class='companyName']")
|
130
|
+
puts 'coname: ' + coname.text.inspect if @debug
|
131
|
+
company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
|
112
132
|
|
113
133
|
# company location (e.g. Edinburgh)
|
114
134
|
location = div1.element("div[@class='companyLocation']")&.text
|
@@ -118,7 +138,12 @@ class IndeedScraper2022
|
|
118
138
|
"v[@class='result-footer']")
|
119
139
|
|
120
140
|
# job (e.g. Our products are primarily written in C#, using...)
|
121
|
-
|
141
|
+
advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
|
142
|
+
jobsnippet = if advert_items.any? then
|
143
|
+
advert_items.join("\n")
|
144
|
+
else
|
145
|
+
div3.element("div[@class='job-snippet']").text
|
146
|
+
end
|
122
147
|
|
123
148
|
# visually (e.g. Posted 14 days ago)
|
124
149
|
dateposted = div3.element("span[@class='date']")&.texts
|
@@ -126,7 +151,7 @@ class IndeedScraper2022
|
|
126
151
|
|
127
152
|
{
|
128
153
|
link: @url_base.sub(/\/[^\/]+$/,'') \
|
129
|
-
+
|
154
|
+
+ href.gsub(/&amp;/,'&'),
|
130
155
|
title: jobtitle,
|
131
156
|
salary: salary,
|
132
157
|
company: company_name,
|
@@ -237,7 +262,7 @@ class IS22Plus < IndeedScraper2022
|
|
237
262
|
|
238
263
|
def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
|
239
264
|
super(q: q, location: location, headless: headless, cookies: cookies,
|
240
|
-
debug:
|
265
|
+
debug: true)
|
241
266
|
end
|
242
267
|
|
243
268
|
# note: The most efficient method to accumulate vacancy articles is to
|
@@ -263,10 +288,8 @@ class IS22Plus < IndeedScraper2022
|
|
263
288
|
|
264
289
|
puts 'saving ' + item[:title] if @debug
|
265
290
|
puts 'link: ' + item[:link].inspect
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
url = links.last
|
291
|
+
url = URL.reveal(item[:link])
|
292
|
+
item[:link] = url
|
270
293
|
puts 'url: ' + url.inspect if @debug
|
271
294
|
id = url[/(?<=jk=)[^&]+/]
|
272
295
|
|
@@ -288,7 +311,7 @@ class IS22Plus < IndeedScraper2022
|
|
288
311
|
salary: item[:salary].to_s,
|
289
312
|
company: item[:company].to_s.strip,
|
290
313
|
location: item[:location].to_s,
|
291
|
-
jobsnippet: item[:jobsnippet],
|
314
|
+
jobsnippet: item[:jobsnippet].to_s,
|
292
315
|
date: item[:date],
|
293
316
|
added: Time.now.strftime("%Y-%m-%d")
|
294
317
|
}
|
@@ -324,8 +347,6 @@ class IS22Archive
|
|
324
347
|
|
325
348
|
def initialize(filepath='/tmp/indeed', debug: false)
|
326
349
|
|
327
|
-
@debug = debug
|
328
|
-
|
329
350
|
FileUtils.mkdir_p filepath
|
330
351
|
@idxfile = File.join(filepath, 'index.yml')
|
331
352
|
|
@@ -339,12 +360,13 @@ class IS22Archive
|
|
339
360
|
|
340
361
|
def list()
|
341
362
|
|
342
|
-
@index.map.with_index do |x,i|
|
363
|
+
@index.to_a.reverse.map.with_index do |x,i|
|
343
364
|
|
344
365
|
id, h = x
|
345
366
|
|
346
367
|
puts 'h: ' + h.inspect if @debug
|
347
|
-
|
368
|
+
co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
|
369
|
+
"%2d. %s: %s%s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title], co]
|
348
370
|
|
349
371
|
end.join("\n")
|
350
372
|
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.5.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-05-12 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -77,6 +77,26 @@ dependencies:
|
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: 0.3.1
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: reveal_url22
|
82
|
+
requirement: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - "~>"
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0.1'
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.0
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.1'
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: 0.1.0
|
80
100
|
description:
|
81
101
|
email: digital.robertson@gmail.com
|
82
102
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|