indeed_scraper2022 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +2 -3
- data/lib/indeed_scraper2022.rb +35 -14
- data.tar.gz.sig +0 -0
- metadata +22 -2
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 833f3e77c7771f39e3eccbd5f277a8ca73fbd34d55d5efc7c2509b7a4dbf61bd
|
4
|
+
data.tar.gz: 707cf360d0ca30102e59bc0e5ead4111199db76a090d734729c327eaefbd6cdd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d3c8d5eacb62503e4b29634836b74a8b6c9636d9127fc345d79b9f177d75b41f2558c351ec7028d9a74887f964f440022b1d25e416f14752c16ded16055dcd2c
|
7
|
+
data.tar.gz: aea4011eea3c4f37f3537626e3ca2179bee215854a975d2b7618d09737395961fe88648460a8f58ce966acadc9a1fe8c21263319b07ff12147d4264b9374ae39
|
checksums.yaml.gz.sig
CHANGED
@@ -1,3 +1,2 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
�
|
1
|
+
}ә���T��*��-ԣ��xe�. �x�<>X�JX~���)���-zi���w��Lɜ!n��
|
2
|
+
˾/�x��0V��"�����䭴B��J�� Vt��<���3�8LW���R���k�K��������hÅ��y���{<��IJC�:D�>�F%�8��m�XҘ��Aw�O�GL����ޚZ,!"l��k����&�/��'����8V�]r#�qZ�Vm!������
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
7
|
require 'yaml'
|
8
|
+
require 'reveal_url22'
|
8
9
|
|
9
10
|
# Given the nature of changes to jobsearch websites,
|
10
11
|
# don't rely upon this gem working in the near future.
|
@@ -44,9 +45,10 @@ class IndeedScraper2022
|
|
44
45
|
end
|
45
46
|
|
46
47
|
def search(q: @q, location: @location, start: nil)
|
47
|
-
|
48
|
+
puts 'inside search' if @debug
|
48
49
|
url = @url_base
|
49
50
|
url += 'start=' + start if start
|
51
|
+
puts 'url: ' + url.inspect if @debug
|
50
52
|
|
51
53
|
@browser.goto(url)
|
52
54
|
#@browser.network.wait_for_idle
|
@@ -81,28 +83,44 @@ class IndeedScraper2022
|
|
81
83
|
sleep 2
|
82
84
|
|
83
85
|
doc2 = Nokogiri::XML(@browser.body)
|
86
|
+
File.write '/tmp/body.txt', doc2.to_s if @debug
|
84
87
|
|
85
|
-
a2 = doc2.xpath "//
|
88
|
+
a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
|
86
89
|
puts 'a2: ' + a2.length.inspect if @debug
|
87
90
|
|
88
91
|
@a2 = a2.map {|x| Rexle.new x.to_s }
|
89
92
|
|
90
93
|
@results = @a2.map do |doc|
|
91
94
|
|
92
|
-
div = doc.element("
|
95
|
+
div = doc.element("div[@class='cardOutline']/div[@class='slider" \
|
93
96
|
"_container']/div[@class='slider_list']/div[@class='sl" \
|
94
97
|
"ider_item']/div[@class='job_seen_beacon']")
|
98
|
+
|
95
99
|
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
96
100
|
"dy/tr/td[@class='resultContent']")
|
97
101
|
|
98
102
|
# job title (e.g. Software Developer)
|
99
|
-
|
100
|
-
"class='jobTitle-color-purple']/
|
103
|
+
job = td.element("div[@class='tapItem-gutter']/h2[@" \
|
104
|
+
"class='jobTitle-color-purple']/a")
|
105
|
+
href = job.attributes[:href]
|
106
|
+
jobtitle = job.element("span")&.text
|
107
|
+
|
101
108
|
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
102
109
|
|
103
|
-
|
104
|
-
"div[@class='salary-snippet-container']
|
105
|
-
|
110
|
+
sal = td.element("div[@class='metadataContainer']/" \
|
111
|
+
"div[@class='salary-snippet-container']")
|
112
|
+
|
113
|
+
salary = if sal then
|
114
|
+
sal_e = sal.element("div[@class='attribute_snippet']")
|
115
|
+
if sal_e then
|
116
|
+
sal_e.texts[0]
|
117
|
+
else
|
118
|
+
sal_e2 = sal.element("div[@class='salary-snippet']/span")
|
119
|
+
sal_e2 ? sal_e2.text : ''
|
120
|
+
end
|
121
|
+
else
|
122
|
+
''
|
123
|
+
end
|
106
124
|
|
107
125
|
puts 'salary: ' + salary.inspect if @debug
|
108
126
|
div1 = td.element("div[@class='companyInfo']")
|
@@ -120,7 +138,12 @@ class IndeedScraper2022
|
|
120
138
|
"v[@class='result-footer']")
|
121
139
|
|
122
140
|
# job (e.g. Our products are primarily written in C#, using...)
|
123
|
-
|
141
|
+
advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
|
142
|
+
jobsnippet = if advert_items.any? then
|
143
|
+
advert_items.join("\n")
|
144
|
+
else
|
145
|
+
div3.element("div[@class='job-snippet']").text
|
146
|
+
end
|
124
147
|
|
125
148
|
# visually (e.g. Posted 14 days ago)
|
126
149
|
dateposted = div3.element("span[@class='date']")&.texts
|
@@ -128,7 +151,7 @@ class IndeedScraper2022
|
|
128
151
|
|
129
152
|
{
|
130
153
|
link: @url_base.sub(/\/[^\/]+$/,'') \
|
131
|
-
+
|
154
|
+
+ href.gsub(/&amp;/,'&'),
|
132
155
|
title: jobtitle,
|
133
156
|
salary: salary,
|
134
157
|
company: company_name,
|
@@ -239,7 +262,7 @@ class IS22Plus < IndeedScraper2022
|
|
239
262
|
|
240
263
|
def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
|
241
264
|
super(q: q, location: location, headless: headless, cookies: cookies,
|
242
|
-
debug:
|
265
|
+
debug: true)
|
243
266
|
end
|
244
267
|
|
245
268
|
# note: The most efficient method to accumulate vacancy articles is to
|
@@ -265,7 +288,7 @@ class IS22Plus < IndeedScraper2022
|
|
265
288
|
|
266
289
|
puts 'saving ' + item[:title] if @debug
|
267
290
|
puts 'link: ' + item[:link].inspect
|
268
|
-
links =
|
291
|
+
links = URL.reveal(item[:link])
|
269
292
|
puts 'links: ' + links.inspect if @debug
|
270
293
|
|
271
294
|
url = links.last
|
@@ -326,8 +349,6 @@ class IS22Archive
|
|
326
349
|
|
327
350
|
def initialize(filepath='/tmp/indeed', debug: false)
|
328
351
|
|
329
|
-
@debug = debug
|
330
|
-
|
331
352
|
FileUtils.mkdir_p filepath
|
332
353
|
@idxfile = File.join(filepath, 'index.yml')
|
333
354
|
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-05-12 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -77,6 +77,26 @@ dependencies:
|
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: 0.3.1
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: reveal_url22
|
82
|
+
requirement: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - "~>"
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0.1'
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.0
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.1'
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: 0.1.0
|
80
100
|
description:
|
81
101
|
email: digital.robertson@gmail.com
|
82
102
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|