indeed_scraper2022 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 972f811430fae59121e39c9c4752b64fc43b37165a52dcec8c3eac42cf1e4555
4
- data.tar.gz: 85e987eb264b098b4c892e2e05d2ab082e3b8968fff2bdd519552e889f014f9d
3
+ metadata.gz: 7f98a83b7ed582d1b2973882701833688aec2d6d2bd132241a26c01a32915f93
4
+ data.tar.gz: dc5c34a5af19cdffbd244e15416914e91c8a06f365f0fff28bcd537a30ec468e
5
5
  SHA512:
6
- metadata.gz: e7d3c2a13e315383248c557806dd0184d8831f46a2e314816395f62fcb886ba2e38a3e1f2deb180ceb33b614cf0b7be8a13379028fc56982e388948575bdb02c
7
- data.tar.gz: 6ca4792d260c43b22fcee5ead8928525df17f1db112c49ac4cc7d7a5c9b29a8f483b94c18cd39bace1d4a4ee553ff83254a7fb445d76abf06c1705d19bac455c
6
+ metadata.gz: 4ce40e021339f6b1c24faed495ebbd4b257200f62d1466ffa54cd05e654ccef29b23dbfcf4af64b9426f4d2dbde6bb778f6b920a7656af8289e3f81a269ba54a
7
+ data.tar.gz: 634325fed61c7888b08fd72bfc47f4d64f98f5514110169152389e49940f15a080c02f62811aae2b5f34c6a604c17d2512d2219dba0f4473dc1034900bdb7ec6
checksums.yaml.gz.sig CHANGED
Binary file
@@ -11,6 +11,13 @@ require 'yaml'
11
11
 
12
12
 
13
13
 
14
+ # this gem consists of 3 main classes:
15
+ #
16
+ # * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
17
+ # * IS22Plus - Archives the scraped vacancies to local file
18
+ # * IS22Archive - Allows viewing of archived vacancies offline
19
+ #
20
+
14
21
  class IndeedScraper2022Err < Exception
15
22
  end
16
23
 
@@ -146,52 +153,65 @@ class IndeedScraper2022
146
153
  def fetchjob(url)
147
154
 
148
155
  doc = Nokorexi.new(url).to_doc
156
+ puts 'before e0' if @debug
149
157
  e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
150
158
 
151
159
  #div = e0.element("//div[@class='jobsearch-JobComponent']")
160
+ puts 'before div1' if @debug
152
161
  div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
162
+ puts 'before div2' if @debug
153
163
  div2 = div1.element("div")
154
164
 
155
165
  # jobsearch (e.g. Full Stack Website Developer (Wordpress))
166
+ puts 'before jobtitle' if @debug
156
167
  jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
157
168
  "er-title-container']/h1[@class='jobsearch-JobInfoHead" \
158
169
  "er-title']")&.text
159
170
 
171
+ puts 'before div3' if @debug
160
172
  div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
161
173
  "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
162
174
  "erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
163
175
  "ainer-subtitle']")
164
176
 
165
177
  # icl (e.g. Lyles Sutherland)
178
+ puts 'before cname' if @debug
166
179
  cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
167
180
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
168
181
  "s-mr--xs']")[1]
182
+ puts 'before clink' if @debug
169
183
  clink = div3.element('//a')
170
184
  company = cname.text ? cname.text : clink.text
171
185
  companylink = clink.attributes[:href] if clink
172
186
 
187
+ puts 'before salary' if @debug
173
188
  salary = div1.element("//span[@class='attribute_snippet']")&.text
189
+ puts 'before type' if @debug
174
190
  type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
175
191
  div5 = div3.xpath("div/div")
176
192
  location, worklocation = div5.map(&:text).compact
177
193
 
178
194
  # icl (e.g. Full-time, Permanent)
195
+ puts 'before jobtype' if @debug
179
196
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
180
197
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
181
198
  jobtype = jobtype&.texts.join if jobtype
182
199
 
183
200
  # jobsearch (e.g. Urgently needed)
201
+ puts 'before jobnote1' if @debug
184
202
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
185
203
  "']/div[@class='urgently-hiring']/div[@class='jobsearc" \
186
204
  "h-DesktopTag-text']")&.text
187
205
 
188
206
  # jobsearch (e.g. 10 days ago)
207
+ puts 'before days' if @debug
189
208
  days = e0.element("//div[@class='jobsearch-JobTab-con" \
190
209
  "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
191
210
  d = Date.today - days.to_i
192
211
  datepost = d.strftime("%Y-%m-%d")
193
212
 
194
213
 
214
+ puts 'before jobdesc' if @debug
195
215
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
196
216
  "ass='jobsearch-jobDescriptionText']").xml
197
217
 
@@ -220,8 +240,13 @@ class IS22Plus < IndeedScraper2022
220
240
  debug: debug)
221
241
  end
222
242
 
243
+ # note: The most efficient method to accumulate vacancy articles is to
244
+ # execute archive() daily
245
+ #
223
246
  def archive(filepath='/tmp/indeed')
224
247
 
248
+ search() if @results.nil?
249
+
225
250
  return unless @results
226
251
 
227
252
  FileUtils.mkdir_p filepath
@@ -239,15 +264,22 @@ class IS22Plus < IndeedScraper2022
239
264
  puts 'saving ' + item[:title] if @debug
240
265
  puts 'link: ' + item[:link].inspect
241
266
  links = RXFReader.reveal(item[:link])
242
- puts 'links: ' + links.inspect
267
+ puts 'links: ' + links.inspect if @debug
243
268
 
244
269
  url = links.last
245
- id = url[/(?<=\?jk=)[^&]+/]
270
+ puts 'url: ' + url.inspect if @debug
271
+ id = url[/(?<=jk=)[^&]+/]
246
272
 
247
273
  if index[id.to_sym] then
274
+
275
+ # the vacancy record has previously been saved
276
+ #
248
277
  next
278
+
249
279
  else
250
280
 
281
+ # write the full page vacancy article to file
282
+ #
251
283
  File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
252
284
 
253
285
  h = {
@@ -257,14 +289,19 @@ class IS22Plus < IndeedScraper2022
257
289
  company: item[:company].to_s.strip,
258
290
  location: item[:location].to_s,
259
291
  jobsnippet: item[:jobsnippet],
260
- date: item[:date]
292
+ date: item[:date],
293
+ added: Time.now.strftime("%Y-%m-%d")
261
294
  }
262
295
 
296
+ # add the vacancy snippet to the index file
297
+ #
263
298
  index[id.to_sym] = h
264
299
  end
265
300
 
266
301
  end
267
302
 
303
+ # save the vacancy index file
304
+ #
268
305
  File.write idxfile, index.to_yaml
269
306
 
270
307
  end
@@ -279,3 +316,38 @@ class IS22Plus < IndeedScraper2022
279
316
 
280
317
 
281
318
  end
319
+
320
+
321
# Offline view over the vacancy index file (index.yml) kept in an archive
# directory.
#
class IS22Archive

  # the index loaded from index.yml; an empty Hash when the file is missing
  # or empty
  attr_reader :index

  # filepath - directory expected to contain index.yml (created if missing)
  # debug    - when true, list() prints each index record while formatting it
  #
  def initialize(filepath='/tmp/indeed', debug: false)

    @debug = debug

    FileUtils.mkdir_p filepath
    @idxfile = File.join(filepath, 'index.yml')

    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere
    @index = File.exist?(@idxfile) ? load_index(@idxfile) : {}

  end

  # Returns the index as a numbered, newline-separated String, one line per
  # record: "<n>. <day month added>: <title>". Returns '' for an empty index.
  #
  def list

    @index.each_with_index.map do |(_id, h), i|

      puts 'h: ' + h.inspect if @debug
      "%2d. %s: %s" % [i+1, added_label(h[:added]), h[:title]]

    end.join("\n")

  end

  private

  # Parses the index file. The records use Symbol keys, which a
  # safe-by-default YAML.load (Psych 4+) rejects, so the permitted classes
  # are listed explicitly. An empty file parses to nil/false, hence the
  # fallback to {}.
  #
  def load_index(file)

    YAML.safe_load(File.read(file),
                   permitted_classes: [Symbol, Date, Time],
                   aliases: true) || {}

  end

  # Formats the :added value as e.g. "14 Apr". Records that carry no :added
  # value are shown as 'n/a' instead of raising from Date.parse(nil).
  #
  def added_label(added)

    return 'n/a' if added.nil? || added.to_s.empty?

    Date.parse(added.to_s).strftime("%d %b")

  end

end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-04-01 00:00:00.000000000 Z
38
+ date: 2022-04-14 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -63,20 +63,20 @@ dependencies:
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '0.2'
66
+ version: '0.3'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 0.2.2
69
+ version: 0.3.1
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '0.2'
76
+ version: '0.3'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 0.2.2
79
+ version: 0.3.1
80
80
  description:
81
81
  email: digital.robertson@gmail.com
82
82
  executables: []
metadata.gz.sig CHANGED
Binary file