indeed_scraper2022 0.4.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +2 -3
- data/lib/indeed_scraper2022.rb +35 -14
- data.tar.gz.sig +0 -0
- metadata +22 -2
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 833f3e77c7771f39e3eccbd5f277a8ca73fbd34d55d5efc7c2509b7a4dbf61bd
|
4
|
+
data.tar.gz: 707cf360d0ca30102e59bc0e5ead4111199db76a090d734729c327eaefbd6cdd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d3c8d5eacb62503e4b29634836b74a8b6c9636d9127fc345d79b9f177d75b41f2558c351ec7028d9a74887f964f440022b1d25e416f14752c16ded16055dcd2c
|
7
|
+
data.tar.gz: aea4011eea3c4f37f3537626e3ca2179bee215854a975d2b7618d09737395961fe88648460a8f58ce966acadc9a1fe8c21263319b07ff12147d4264b9374ae39
|
checksums.yaml.gz.sig
CHANGED
@@ -1,3 +1,2 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
�
|
1
|
+
}ә���T��*��-ԣ��xe�. �x�<>X�JX~���)���-zi���w��Lɜ!n��
|
2
|
+
˾/�x��0V��"�����䭴B��J�� Vt��<���3�8LW���R���k�K��������hÅ��y���{<��IJC�:D�>�F%�8��m�XҘ��Aw�O�GL����ޚZ,!"l��k����&�/��'����8V�]r#�qZ�Vm!������
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
7
|
require 'yaml'
|
8
|
+
require 'reveal_url22'
|
8
9
|
|
9
10
|
# Given the nature of changes to jobsearch websites,
|
10
11
|
# don't rely upon this gem working in the near future.
|
@@ -44,9 +45,10 @@ class IndeedScraper2022
|
|
44
45
|
end
|
45
46
|
|
46
47
|
def search(q: @q, location: @location, start: nil)
|
47
|
-
|
48
|
+
puts 'inside search' if @debug
|
48
49
|
url = @url_base
|
49
50
|
url += 'start=' + start if start
|
51
|
+
puts 'url: ' + url.inspect if @debug
|
50
52
|
|
51
53
|
@browser.goto(url)
|
52
54
|
#@browser.network.wait_for_idle
|
@@ -81,28 +83,44 @@ class IndeedScraper2022
|
|
81
83
|
sleep 2
|
82
84
|
|
83
85
|
doc2 = Nokogiri::XML(@browser.body)
|
86
|
+
File.write '/tmp/body.txt', doc2.to_s if @debug
|
84
87
|
|
85
|
-
a2 = doc2.xpath "//
|
88
|
+
a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
|
86
89
|
puts 'a2: ' + a2.length.inspect if @debug
|
87
90
|
|
88
91
|
@a2 = a2.map {|x| Rexle.new x.to_s }
|
89
92
|
|
90
93
|
@results = @a2.map do |doc|
|
91
94
|
|
92
|
-
div = doc.element("
|
95
|
+
div = doc.element("div[@class='cardOutline']/div[@class='slider" \
|
93
96
|
"_container']/div[@class='slider_list']/div[@class='sl" \
|
94
97
|
"ider_item']/div[@class='job_seen_beacon']")
|
98
|
+
|
95
99
|
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
96
100
|
"dy/tr/td[@class='resultContent']")
|
97
101
|
|
98
102
|
# job title (e.g. Software Developer)
|
99
|
-
|
100
|
-
"class='jobTitle-color-purple']/
|
103
|
+
job = td.element("div[@class='tapItem-gutter']/h2[@" \
|
104
|
+
"class='jobTitle-color-purple']/a")
|
105
|
+
href = job.attributes[:href]
|
106
|
+
jobtitle = job.element("span")&.text
|
107
|
+
|
101
108
|
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
102
109
|
|
103
|
-
|
104
|
-
"div[@class='salary-snippet-container']
|
105
|
-
|
110
|
+
sal = td.element("div[@class='metadataContainer']/" \
|
111
|
+
"div[@class='salary-snippet-container']")
|
112
|
+
|
113
|
+
salary = if sal then
|
114
|
+
sal_e = sal.element("div[@class='attribute_snippet']")
|
115
|
+
if sal_e then
|
116
|
+
sal_e.texts[0]
|
117
|
+
else
|
118
|
+
sal_e2 = sal.element("div[@class='salary-snippet']/span")
|
119
|
+
sal_e2 ? sal_e2.text : ''
|
120
|
+
end
|
121
|
+
else
|
122
|
+
''
|
123
|
+
end
|
106
124
|
|
107
125
|
puts 'salary: ' + salary.inspect if @debug
|
108
126
|
div1 = td.element("div[@class='companyInfo']")
|
@@ -120,7 +138,12 @@ class IndeedScraper2022
|
|
120
138
|
"v[@class='result-footer']")
|
121
139
|
|
122
140
|
# job (e.g. Our products are primarily written in C#, using...)
|
123
|
-
|
141
|
+
advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
|
142
|
+
jobsnippet = if advert_items.any? then
|
143
|
+
advert_items.join("\n")
|
144
|
+
else
|
145
|
+
div3.element("div[@class='job-snippet']").text
|
146
|
+
end
|
124
147
|
|
125
148
|
# visually (e.g. Posted 14 days ago)
|
126
149
|
dateposted = div3.element("span[@class='date']")&.texts
|
@@ -128,7 +151,7 @@ class IndeedScraper2022
|
|
128
151
|
|
129
152
|
{
|
130
153
|
link: @url_base.sub(/\/[^\/]+$/,'') \
|
131
|
-
+
|
154
|
+
+ href.gsub(/&/,'&'),
|
132
155
|
title: jobtitle,
|
133
156
|
salary: salary,
|
134
157
|
company: company_name,
|
@@ -239,7 +262,7 @@ class IS22Plus < IndeedScraper2022
|
|
239
262
|
|
240
263
|
def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
|
241
264
|
super(q: q, location: location, headless: headless, cookies: cookies,
|
242
|
-
debug:
|
265
|
+
debug: true)
|
243
266
|
end
|
244
267
|
|
245
268
|
# note: The most efficient method to accumulate vacancy articles is to
|
@@ -265,7 +288,7 @@ class IS22Plus < IndeedScraper2022
|
|
265
288
|
|
266
289
|
puts 'saving ' + item[:title] if @debug
|
267
290
|
puts 'link: ' + item[:link].inspect
|
268
|
-
links =
|
291
|
+
links = URL.reveal(item[:link])
|
269
292
|
puts 'links: ' + links.inspect if @debug
|
270
293
|
|
271
294
|
url = links.last
|
@@ -326,8 +349,6 @@ class IS22Archive
|
|
326
349
|
|
327
350
|
def initialize(filepath='/tmp/indeed', debug: false)
|
328
351
|
|
329
|
-
@debug = debug
|
330
|
-
|
331
352
|
FileUtils.mkdir_p filepath
|
332
353
|
@idxfile = File.join(filepath, 'index.yml')
|
333
354
|
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-05-12 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -77,6 +77,26 @@ dependencies:
|
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
79
|
version: 0.3.1
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: reveal_url22
|
82
|
+
requirement: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - "~>"
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0.1'
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.1.0
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.1'
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: 0.1.0
|
80
100
|
description:
|
81
101
|
email: digital.robertson@gmail.com
|
82
102
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|