indeed_scraper2022 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +75 -3
- data.tar.gz.sig +0 -0
- metadata +6 -6
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7f98a83b7ed582d1b2973882701833688aec2d6d2bd132241a26c01a32915f93
|
4
|
+
data.tar.gz: dc5c34a5af19cdffbd244e15416914e91c8a06f365f0fff28bcd537a30ec468e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4ce40e021339f6b1c24faed495ebbd4b257200f62d1466ffa54cd05e654ccef29b23dbfcf4af64b9426f4d2dbde6bb778f6b920a7656af8289e3f81a269ba54a
|
7
|
+
data.tar.gz: 634325fed61c7888b08fd72bfc47f4d64f98f5514110169152389e49940f15a080c02f62811aae2b5f34c6a604c17d2512d2219dba0f4473dc1034900bdb7ec6
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -11,6 +11,13 @@ require 'yaml'
|
|
11
11
|
|
12
12
|
|
13
13
|
|
14
|
+
# this gem consists of 3 main classes:
|
15
|
+
#
|
16
|
+
# * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
|
17
|
+
# * IS22Plus - Archives the scraped vacancies to local file
|
18
|
+
# * IS22Archive - Allows viewing of archived vacancies offline
|
19
|
+
#
|
20
|
+
|
14
21
|
class IndeedScraper2022Err < Exception
|
15
22
|
end
|
16
23
|
|
@@ -146,52 +153,65 @@ class IndeedScraper2022
|
|
146
153
|
def fetchjob(url)
|
147
154
|
|
148
155
|
doc = Nokorexi.new(url).to_doc
|
156
|
+
puts 'before e0' if @debug
|
149
157
|
e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
|
150
158
|
|
151
159
|
#div = e0.element("//div[@class='jobsearch-JobComponent']")
|
160
|
+
puts 'before div1' if @debug
|
152
161
|
div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
|
162
|
+
puts 'before div2' if @debug
|
153
163
|
div2 = div1.element("div")
|
154
164
|
|
155
165
|
# jobsearch (e.g. Full Stack Website Developer (Wordpress))
|
166
|
+
puts 'before jobtitle' if @debug
|
156
167
|
jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
|
157
168
|
"er-title-container']/h1[@class='jobsearch-JobInfoHead" \
|
158
169
|
"er-title']")&.text
|
159
170
|
|
171
|
+
puts 'before div3' if @debug
|
160
172
|
div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
|
161
173
|
"tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
|
162
174
|
"erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
|
163
175
|
"ainer-subtitle']")
|
164
176
|
|
165
177
|
# icl (e.g. Lyles Sutherland)
|
178
|
+
puts 'before cname' if @debug
|
166
179
|
cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
|
167
180
|
"ickyContainer-companyrating']/div/div[@class='icl-u-x" \
|
168
181
|
"s-mr--xs']")[1]
|
182
|
+
puts 'before clink' if @debug
|
169
183
|
clink = div3.element('//a')
|
170
184
|
company = cname.text ? cname.text : clink.text
|
171
185
|
companylink = clink.attributes[:href] if clink
|
172
186
|
|
187
|
+
puts 'before salary' if @debug
|
173
188
|
salary = div1.element("//span[@class='attribute_snippet']")&.text
|
189
|
+
puts 'before type' if @debug
|
174
190
|
type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
|
175
191
|
div5 = div3.xpath("div/div")
|
176
192
|
location, worklocation = div5.map(&:text).compact
|
177
193
|
|
178
194
|
# icl (e.g. Full-time, Permanent)
|
195
|
+
puts 'before jobtype' if @debug
|
179
196
|
jobtype = div1.element("div/div/div[@class='jobsearch-J" \
|
180
197
|
"obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
|
181
198
|
jobtype = jobtype&.texts.join if jobtype
|
182
199
|
|
183
200
|
# jobsearch (e.g. Urgently needed)
|
201
|
+
puts 'before jobnote1' if @debug
|
184
202
|
jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
|
185
203
|
"']/div[@class='urgently-hiring']/div[@class='jobsearc" \
|
186
204
|
"h-DesktopTag-text']")&.text
|
187
205
|
|
188
206
|
# jobsearch (e.g. 10 days ago)
|
207
|
+
puts 'before days' if @debug
|
189
208
|
days = e0.element("//div[@class='jobsearch-JobTab-con" \
|
190
209
|
"tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
|
191
210
|
d = Date.today - days.to_i
|
192
211
|
datepost = d.strftime("%Y-%m-%d")
|
193
212
|
|
194
213
|
|
214
|
+
puts 'before jobdesc' if @debug
|
195
215
|
jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
|
196
216
|
"ass='jobsearch-jobDescriptionText']").xml
|
197
217
|
|
@@ -220,8 +240,13 @@ class IS22Plus < IndeedScraper2022
|
|
220
240
|
debug: debug)
|
221
241
|
end
|
222
242
|
|
243
|
+
# note: The most efficient method to accumulate vacancy articles is to
|
244
|
+
# execute archive() daily
|
245
|
+
#
|
223
246
|
def archive(filepath='/tmp/indeed')
|
224
247
|
|
248
|
+
search() if @results.nil?
|
249
|
+
|
225
250
|
return unless @results
|
226
251
|
|
227
252
|
FileUtils.mkdir_p filepath
|
@@ -239,15 +264,22 @@ class IS22Plus < IndeedScraper2022
|
|
239
264
|
puts 'saving ' + item[:title] if @debug
|
240
265
|
puts 'link: ' + item[:link].inspect
|
241
266
|
links = RXFReader.reveal(item[:link])
|
242
|
-
puts 'links: ' + links.inspect
|
267
|
+
puts 'links: ' + links.inspect if @debug
|
243
268
|
|
244
269
|
url = links.last
|
245
|
-
|
270
|
+
puts 'url: ' + url.inspect if @debug
|
271
|
+
id = url[/(?<=jk=)[^&]+/]
|
246
272
|
|
247
273
|
if index[id.to_sym] then
|
274
|
+
|
275
|
+
# the vacancy record has previously been saved
|
276
|
+
#
|
248
277
|
next
|
278
|
+
|
249
279
|
else
|
250
280
|
|
281
|
+
# write the full page vacancy article to file
|
282
|
+
#
|
251
283
|
File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
|
252
284
|
|
253
285
|
h = {
|
@@ -257,14 +289,19 @@ class IS22Plus < IndeedScraper2022
|
|
257
289
|
company: item[:company].to_s.strip,
|
258
290
|
location: item[:location].to_s,
|
259
291
|
jobsnippet: item[:jobsnippet],
|
260
|
-
date: item[:date]
|
292
|
+
date: item[:date],
|
293
|
+
added: Time.now.strftime("%Y-%m-%d")
|
261
294
|
}
|
262
295
|
|
296
|
+
# add the vacancy snippet to the index file
|
297
|
+
#
|
263
298
|
index[id.to_sym] = h
|
264
299
|
end
|
265
300
|
|
266
301
|
end
|
267
302
|
|
303
|
+
# save the vacancy index file
|
304
|
+
#
|
268
305
|
File.write idxfile, index.to_yaml
|
269
306
|
|
270
307
|
end
|
@@ -279,3 +316,38 @@ class IS22Plus < IndeedScraper2022
|
|
279
316
|
|
280
317
|
|
281
318
|
end
|
319
|
+
|
320
|
+
|
321
|
+
class IS22Archive
|
322
|
+
|
323
|
+
attr_reader :index
|
324
|
+
|
325
|
+
def initialize(filepath='/tmp/indeed', debug: false)
|
326
|
+
|
327
|
+
@debug = debug
|
328
|
+
|
329
|
+
FileUtils.mkdir_p filepath
|
330
|
+
@idxfile = File.join(filepath, 'index.yml')
|
331
|
+
|
332
|
+
@index = if File.exists? @idxfile then
|
333
|
+
YAML.load(File.read(@idxfile))
|
334
|
+
else
|
335
|
+
{}
|
336
|
+
end
|
337
|
+
|
338
|
+
end
|
339
|
+
|
340
|
+
def list()
|
341
|
+
|
342
|
+
@index.map.with_index do |x,i|
|
343
|
+
|
344
|
+
id, h = x
|
345
|
+
|
346
|
+
puts 'h: ' + h.inspect if @debug
|
347
|
+
"%2d. %s: %s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title]]
|
348
|
+
|
349
|
+
end.join("\n")
|
350
|
+
|
351
|
+
end
|
352
|
+
|
353
|
+
end
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-04-
|
38
|
+
date: 2022-04-14 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
@@ -63,20 +63,20 @@ dependencies:
|
|
63
63
|
requirements:
|
64
64
|
- - "~>"
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version: '0.
|
66
|
+
version: '0.3'
|
67
67
|
- - ">="
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: 0.
|
69
|
+
version: 0.3.1
|
70
70
|
type: :runtime
|
71
71
|
prerelease: false
|
72
72
|
version_requirements: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '0.
|
76
|
+
version: '0.3'
|
77
77
|
- - ">="
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version: 0.
|
79
|
+
version: 0.3.1
|
80
80
|
description:
|
81
81
|
email: digital.robertson@gmail.com
|
82
82
|
executables: []
|
metadata.gz.sig
CHANGED
Binary file
|