indeed_scraper2022 0.1.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +157 -62
- data.tar.gz.sig +0 -0
- metadata +7 -7
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 972f811430fae59121e39c9c4752b64fc43b37165a52dcec8c3eac42cf1e4555
|
4
|
+
data.tar.gz: 85e987eb264b098b4c892e2e05d2ab082e3b8968fff2bdd519552e889f014f9d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e7d3c2a13e315383248c557806dd0184d8831f46a2e314816395f62fcb886ba2e38a3e1f2deb180ceb33b614cf0b7be8a13379028fc56982e388948575bdb02c
|
7
|
+
data.tar.gz: 6ca4792d260c43b22fcee5ead8928525df17f1db112c49ac4cc7d7a5c9b29a8f483b94c18cd39bace1d4a4ee553ff83254a7fb445d76abf06c1705d19bac455c
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -2,23 +2,31 @@
|
|
2
2
|
|
3
3
|
# file: indeed_scraper2022.rb
|
4
4
|
|
5
|
-
require '
|
5
|
+
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
|
+
require 'yaml'
|
7
8
|
|
8
9
|
# Given the nature of changes to jobsearch websites,
|
9
10
|
# don't rely upon this gem working in the near future.
|
10
11
|
|
11
12
|
|
13
|
+
|
12
14
|
class IndeedScraper2022Err < Exception
|
13
15
|
end
|
14
16
|
|
15
17
|
class IndeedScraper2022
|
16
18
|
|
17
|
-
|
19
|
+
attr_reader :browser
|
20
|
+
|
21
|
+
def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
|
22
|
+
headless: true, cookies: nil, debug: false)
|
18
23
|
|
19
24
|
@debug = debug
|
20
25
|
@url_base, @q, @location = url, q, location
|
21
|
-
@
|
26
|
+
@headless, @cookies = headless, cookies
|
27
|
+
|
28
|
+
fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
|
29
|
+
@browser = fw.browser
|
22
30
|
|
23
31
|
end
|
24
32
|
|
@@ -28,6 +36,101 @@ class IndeedScraper2022
|
|
28
36
|
@results
|
29
37
|
end
|
30
38
|
|
39
|
+
def search(q: @q, location: @location, start: nil)
|
40
|
+
|
41
|
+
url = @url_base
|
42
|
+
url += 'start=' + start if start
|
43
|
+
|
44
|
+
@browser.goto(url)
|
45
|
+
#@browser.network.wait_for_idle
|
46
|
+
puts 'sleeping for 4 seconds' if @debug
|
47
|
+
sleep 4
|
48
|
+
|
49
|
+
if q.length > 1 then
|
50
|
+
|
51
|
+
input = @browser.at_xpath("//input[@name='q']")
|
52
|
+
|
53
|
+
# select any existing text and overwrite it
|
54
|
+
input.focus.type(:home); sleep 0.2
|
55
|
+
input.focus.type(:shift, :end); sleep 0.2
|
56
|
+
input.focus.type(q); sleep 0.2
|
57
|
+
end
|
58
|
+
|
59
|
+
if location.length > 1 then
|
60
|
+
|
61
|
+
input2 = @browser.at_xpath("//input[@name='l']")
|
62
|
+
|
63
|
+
# select any existing text and overwrite it
|
64
|
+
input2.focus.type(:home); sleep 0.2
|
65
|
+
input2.focus.type(:shift, :end); sleep 0.2
|
66
|
+
input2.focus.type(location); sleep 0.2
|
67
|
+
|
68
|
+
end
|
69
|
+
|
70
|
+
button = @browser.at_xpath("//button[@type='submit']")
|
71
|
+
button.click
|
72
|
+
#@browser.network.wait_for_idle
|
73
|
+
puts 'sleeping for 2 seconds' if @debug
|
74
|
+
sleep 2
|
75
|
+
|
76
|
+
doc2 = Nokogiri::XML(@browser.body)
|
77
|
+
|
78
|
+
a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
|
79
|
+
puts 'a2: ' + a2.length.inspect if @debug
|
80
|
+
|
81
|
+
@a2 = a2.map {|x| Rexle.new x.to_s }
|
82
|
+
|
83
|
+
@results = @a2.map do |doc|
|
84
|
+
|
85
|
+
div = doc.element("a[@class='desktop']/div[@class='slider" \
|
86
|
+
"_container']/div[@class='slider_list']/div[@class='sl" \
|
87
|
+
"ider_item']/div[@class='job_seen_beacon']")
|
88
|
+
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
89
|
+
"dy/tr/td[@class='resultContent']")
|
90
|
+
|
91
|
+
# job title (e.g. Software Developer)
|
92
|
+
jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
|
93
|
+
"class='jobTitle-color-purple']/span")&.text
|
94
|
+
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
95
|
+
|
96
|
+
salary = td.element("div[@class='metadataContainer']/" \
|
97
|
+
"div[@class='salary-snippet-container']/div[@class='sa" \
|
98
|
+
"lary-snippet']/span")&.text
|
99
|
+
|
100
|
+
puts 'salary: ' + salary.inspect if @debug
|
101
|
+
div1 = td.element("div[@class='companyInfo']")
|
102
|
+
|
103
|
+
# company name (e.g. Coda Octopus Products Ltd)
|
104
|
+
company_name = div1.element("span[@class='companyName']")&.text
|
105
|
+
|
106
|
+
# company location (e.g. Edinburgh)
|
107
|
+
location = div1.element("div[@class='companyLocation']")&.text
|
108
|
+
tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
|
109
|
+
|
110
|
+
div3 = tbody.element("tr[@class='underShelfFooter']/td/di" \
|
111
|
+
"v[@class='result-footer']")
|
112
|
+
|
113
|
+
# job (e.g. Our products are primarily written in C#, using...)
|
114
|
+
jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
|
115
|
+
|
116
|
+
# visually (e.g. Posted 14 days ago)
|
117
|
+
dateposted = div3.element("span[@class='date']")&.texts
|
118
|
+
date = (Date.today - dateposted.first.to_i).to_s if dateposted
|
119
|
+
|
120
|
+
{
|
121
|
+
link: @url_base.sub(/\/[^\/]+$/,'') \
|
122
|
+
+ doc.root.attributes[:href].gsub(/&/,'&'),
|
123
|
+
title: jobtitle,
|
124
|
+
salary: salary,
|
125
|
+
company: company_name,
|
126
|
+
location: location,
|
127
|
+
jobsnippet: jobsnippet,
|
128
|
+
date: date
|
129
|
+
}
|
130
|
+
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
31
134
|
def page(n)
|
32
135
|
|
33
136
|
if n < 1 or n > @results.length then
|
@@ -64,16 +167,18 @@ class IndeedScraper2022
|
|
64
167
|
"ickyContainer-companyrating']/div/div[@class='icl-u-x" \
|
65
168
|
"s-mr--xs']")[1]
|
66
169
|
clink = div3.element('//a')
|
67
|
-
company = cname ? cname.text : clink.text
|
170
|
+
company = cname.text ? cname.text : clink.text
|
68
171
|
companylink = clink.attributes[:href] if clink
|
69
172
|
|
173
|
+
salary = div1.element("//span[@class='attribute_snippet']")&.text
|
174
|
+
type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
|
70
175
|
div5 = div3.xpath("div/div")
|
71
176
|
location, worklocation = div5.map(&:text).compact
|
72
177
|
|
73
178
|
# icl (e.g. Full-time, Permanent)
|
74
179
|
jobtype = div1.element("div/div/div[@class='jobsearch-J" \
|
75
180
|
"obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
|
76
|
-
jobtype = jobtype
|
181
|
+
jobtype = jobtype&.texts.join if jobtype
|
77
182
|
|
78
183
|
# jobsearch (e.g. Urgently needed)
|
79
184
|
jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
|
@@ -81,98 +186,87 @@ class IndeedScraper2022
|
|
81
186
|
"h-DesktopTag-text']")&.text
|
82
187
|
|
83
188
|
# jobsearch (e.g. 10 days ago)
|
84
|
-
|
85
|
-
"tent']/div[@class='jobsearch-JobMetadataFooter']/div")&.text
|
189
|
+
days = e0.element("//div[@class='jobsearch-JobTab-con" \
|
190
|
+
"tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
|
191
|
+
d = Date.today - days.to_i
|
192
|
+
datepost = d.strftime("%Y-%m-%d")
|
193
|
+
|
86
194
|
|
87
195
|
jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
|
88
196
|
"ass='jobsearch-jobDescriptionText']").xml
|
89
197
|
|
90
198
|
{
|
91
199
|
title: jobtitle,
|
200
|
+
type: type,
|
92
201
|
company: company,
|
93
202
|
companylink: companylink,
|
94
203
|
location: location,
|
204
|
+
salary: salary,
|
95
205
|
worklocation: worklocation,
|
96
206
|
note: jobnote1,
|
97
|
-
date:
|
207
|
+
date: datepost,
|
98
208
|
desc: jobdesc
|
99
209
|
}
|
100
210
|
|
101
211
|
end
|
102
212
|
|
103
|
-
def search(q='', location='')
|
104
213
|
|
105
|
-
|
106
|
-
|
107
|
-
page = a.get(@url_base)
|
108
|
-
form = page.forms.first
|
109
|
-
form.fields[0].value = @q
|
110
|
-
form.fields[1].value = @location
|
111
|
-
pg = form.submit
|
214
|
+
end
|
112
215
|
|
113
|
-
|
216
|
+
class IS22Plus < IndeedScraper2022
|
114
217
|
|
115
|
-
|
116
|
-
|
218
|
+
def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
|
219
|
+
super(q: q, location: location, headless: headless, cookies: cookies,
|
220
|
+
debug: debug)
|
221
|
+
end
|
117
222
|
|
118
|
-
|
223
|
+
def archive(filepath='/tmp/indeed')
|
119
224
|
|
120
|
-
|
225
|
+
return unless @results
|
121
226
|
|
122
|
-
|
123
|
-
"_container']/div[@class='slider_list']/div[@class='sl" \
|
124
|
-
"ider_item']/div[@class='job_seen_beacon']")
|
125
|
-
td = div.element("table[@class='jobCard_mainContent']/tbo" \
|
126
|
-
"dy/tr/td[@class='resultContent']")
|
227
|
+
FileUtils.mkdir_p filepath
|
127
228
|
|
128
|
-
|
129
|
-
jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
|
130
|
-
"class='jobTitle-color-purple']/span")&.text
|
131
|
-
puts 'jobtitle: ' + jobtitle.inspect if @debug
|
229
|
+
idxfile = File.join(filepath, 'index.yml')
|
132
230
|
|
133
|
-
|
134
|
-
|
135
|
-
|
231
|
+
index = if File.exists? idxfile then
|
232
|
+
YAML.load(File.read(idxfile))
|
233
|
+
else
|
234
|
+
{}
|
235
|
+
end
|
136
236
|
|
137
|
-
|
138
|
-
div1 = td.element("div[@class='companyInfo']")
|
237
|
+
@results.each.with_index do |item, i|
|
139
238
|
|
140
|
-
|
141
|
-
|
239
|
+
puts 'saving ' + item[:title] if @debug
|
240
|
+
puts 'link: ' + item[:link].inspect
|
241
|
+
links = RXFReader.reveal(item[:link])
|
242
|
+
puts 'links: ' + links.inspect
|
142
243
|
|
143
|
-
|
144
|
-
|
145
|
-
tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
|
244
|
+
url = links.last
|
245
|
+
id = url[/(?<=\?jk=)[^&]+/]
|
146
246
|
|
147
|
-
|
148
|
-
|
247
|
+
if index[id.to_sym] then
|
248
|
+
next
|
249
|
+
else
|
149
250
|
|
150
|
-
|
151
|
-
jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
|
251
|
+
File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
|
152
252
|
|
153
|
-
|
154
|
-
|
155
|
-
|
253
|
+
h = {
|
254
|
+
link: url[/^[^&]+/],
|
255
|
+
title: item[:title].to_s,
|
256
|
+
salary: item[:salary].to_s,
|
257
|
+
company: item[:company].to_s.strip,
|
258
|
+
location: item[:location].to_s,
|
259
|
+
jobsnippet: item[:jobsnippet],
|
260
|
+
date: item[:date]
|
261
|
+
}
|
156
262
|
|
157
|
-
|
158
|
-
|
159
|
-
+ doc.root.attributes[:href].gsub(/&amp;/,'&'),
|
160
|
-
title: jobtitle,
|
161
|
-
salary: salary,
|
162
|
-
company: company_name,
|
163
|
-
location: location,
|
164
|
-
jobsnippet: jobsnippet,
|
165
|
-
date: date
|
166
|
-
}
|
263
|
+
index[id.to_sym] = h
|
264
|
+
end
|
167
265
|
|
168
266
|
end
|
169
|
-
end
|
170
|
-
end
|
171
267
|
|
172
|
-
|
268
|
+
File.write idxfile, index.to_yaml
|
173
269
|
|
174
|
-
def initialize(q: '', location: '', debug: false)
|
175
|
-
super(q: q, location: location, debug: debug)
|
176
270
|
end
|
177
271
|
|
178
272
|
def list()
|
@@ -183,4 +277,5 @@ class IS22Plus < IndeedScraper2022
|
|
183
277
|
|
184
278
|
end
|
185
279
|
|
280
|
+
|
186
281
|
end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-04-01 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -58,25 +58,25 @@ dependencies:
|
|
58
58
|
- !ruby/object:Gem::Version
|
59
59
|
version: 0.7.0
|
60
60
|
- !ruby/object:Gem::Dependency
|
61
|
-
name:
|
61
|
+
name: ferrumwizard
|
62
62
|
requirement: !ruby/object:Gem::Requirement
|
63
63
|
requirements:
|
64
64
|
- - "~>"
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: '2
|
66
|
+
version: '0.2'
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 2.
|
69
|
+
version: 0.2.2
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '2
|
76
|
+
version: '0.2'
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 2.
|
79
|
+
version: 0.2.2
|
80
80
|
description:
|
81
81
|
email: digital.robertson@gmail.com
|
82
82
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|