indeed_scraper2022 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 972f811430fae59121e39c9c4752b64fc43b37165a52dcec8c3eac42cf1e4555
4
- data.tar.gz: 85e987eb264b098b4c892e2e05d2ab082e3b8968fff2bdd519552e889f014f9d
3
+ metadata.gz: 7f98a83b7ed582d1b2973882701833688aec2d6d2bd132241a26c01a32915f93
4
+ data.tar.gz: dc5c34a5af19cdffbd244e15416914e91c8a06f365f0fff28bcd537a30ec468e
5
5
  SHA512:
6
- metadata.gz: e7d3c2a13e315383248c557806dd0184d8831f46a2e314816395f62fcb886ba2e38a3e1f2deb180ceb33b614cf0b7be8a13379028fc56982e388948575bdb02c
7
- data.tar.gz: 6ca4792d260c43b22fcee5ead8928525df17f1db112c49ac4cc7d7a5c9b29a8f483b94c18cd39bace1d4a4ee553ff83254a7fb445d76abf06c1705d19bac455c
6
+ metadata.gz: 4ce40e021339f6b1c24faed495ebbd4b257200f62d1466ffa54cd05e654ccef29b23dbfcf4af64b9426f4d2dbde6bb778f6b920a7656af8289e3f81a269ba54a
7
+ data.tar.gz: 634325fed61c7888b08fd72bfc47f4d64f98f5514110169152389e49940f15a080c02f62811aae2b5f34c6a604c17d2512d2219dba0f4473dc1034900bdb7ec6
checksums.yaml.gz.sig CHANGED
Binary file
@@ -11,6 +11,13 @@ require 'yaml'
11
11
 
12
12
 
13
13
 
14
+ # this gem consists of 3 main classes:
15
+ #
16
+ # * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
17
+ # * IS22Plus - Archives the scraped vacancies to local file
18
+ # * IS22Archive - Allows viewing of archived vacancies offline
19
+ #
20
+
14
21
# Gem-specific exception class.
#
# Derives from StandardError (not Exception) so that a bare `rescue` handles
# it, while `rescue IndeedScraper2022Err` and `rescue Exception` still match
# exactly as before.
#
class IndeedScraper2022Err < StandardError
end
16
23
 
@@ -146,52 +153,65 @@ class IndeedScraper2022
146
153
  def fetchjob(url)
147
154
 
148
155
  doc = Nokorexi.new(url).to_doc
156
+ puts 'before e0' if @debug
149
157
  e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
150
158
 
151
159
  #div = e0.element("//div[@class='jobsearch-JobComponent']")
160
+ puts 'before div1' if @debug
152
161
  div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
162
+ puts 'before div2' if @debug
153
163
  div2 = div1.element("div")
154
164
 
155
165
  # jobsearch (e.g. Full Stack Website Developer (Wordpress))
166
+ puts 'before jobtitle' if @debug
156
167
  jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
157
168
  "er-title-container']/h1[@class='jobsearch-JobInfoHead" \
158
169
  "er-title']")&.text
159
170
 
171
+ puts 'before div3' if @debug
160
172
  div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
161
173
  "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
162
174
  "erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
163
175
  "ainer-subtitle']")
164
176
 
165
177
  # icl (e.g. Lyles Sutherland)
178
+ puts 'before cname' if @debug
166
179
  cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
167
180
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
168
181
  "s-mr--xs']")[1]
182
+ puts 'before clink' if @debug
169
183
  clink = div3.element('//a')
170
184
  company = cname.text ? cname.text : clink.text
171
185
  companylink = clink.attributes[:href] if clink
172
186
 
187
+ puts 'before salary' if @debug
173
188
  salary = div1.element("//span[@class='attribute_snippet']")&.text
189
+ puts 'before type' if @debug
174
190
  type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
175
191
  div5 = div3.xpath("div/div")
176
192
  location, worklocation = div5.map(&:text).compact
177
193
 
178
194
  # icl (e.g. Full-time, Permanent)
195
+ puts 'before jobtype' if @debug
179
196
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
180
197
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
181
198
  jobtype = jobtype&.texts.join if jobtype
182
199
 
183
200
  # jobsearch (e.g. Urgently needed)
201
+ puts 'before jobnote1' if @debug
184
202
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
185
203
  "']/div[@class='urgently-hiring']/div[@class='jobsearc" \
186
204
  "h-DesktopTag-text']")&.text
187
205
 
188
206
  # jobsearch (e.g. 10 days ago)
207
+ puts 'before days' if @debug
189
208
  days = e0.element("//div[@class='jobsearch-JobTab-con" \
190
209
  "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
191
210
  d = Date.today - days.to_i
192
211
  datepost = d.strftime("%Y-%m-%d")
193
212
 
194
213
 
214
+ puts 'before jobdesc' if @debug
195
215
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
196
216
  "ass='jobsearch-jobDescriptionText']").xml
197
217
 
@@ -220,8 +240,13 @@ class IS22Plus < IndeedScraper2022
220
240
  debug: debug)
221
241
  end
222
242
 
243
+ # note: The most efficient method to accumulate vacancy articles is to
244
+ # execute archive() daily
245
+ #
223
246
  def archive(filepath='/tmp/indeed')
224
247
 
248
+ search() if @results.nil?
249
+
225
250
  return unless @results
226
251
 
227
252
  FileUtils.mkdir_p filepath
@@ -239,15 +264,22 @@ class IS22Plus < IndeedScraper2022
239
264
  puts 'saving ' + item[:title] if @debug
240
265
  puts 'link: ' + item[:link].inspect
241
266
  links = RXFReader.reveal(item[:link])
242
- puts 'links: ' + links.inspect
267
+ puts 'links: ' + links.inspect if @debug
243
268
 
244
269
  url = links.last
245
- id = url[/(?<=\?jk=)[^&]+/]
270
+ puts 'url: ' + url.inspect if @debug
271
+ id = url[/(?<=jk=)[^&]+/]
246
272
 
247
273
  if index[id.to_sym] then
274
+
275
+ # the vacancy record has previously been saved
276
+ #
248
277
  next
278
+
249
279
  else
250
280
 
281
+ # write the full page vacancy article to file
282
+ #
251
283
  File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
252
284
 
253
285
  h = {
@@ -257,14 +289,19 @@ class IS22Plus < IndeedScraper2022
257
289
  company: item[:company].to_s.strip,
258
290
  location: item[:location].to_s,
259
291
  jobsnippet: item[:jobsnippet],
260
- date: item[:date]
292
+ date: item[:date],
293
+ added: Time.now.strftime("%Y-%m-%d")
261
294
  }
262
295
 
296
+ # add the vacancy snippet to the index file
297
+ #
263
298
  index[id.to_sym] = h
264
299
  end
265
300
 
266
301
  end
267
302
 
303
+ # save the vacancy index file
304
+ #
268
305
  File.write idxfile, index.to_yaml
269
306
 
270
307
  end
@@ -279,3 +316,38 @@ class IS22Plus < IndeedScraper2022
279
316
 
280
317
 
281
318
  end
319
+
320
+
321
# Offline viewer for the vacancy index file (index.yml) stored in the
# archive directory.
#
class IS22Archive

  # Text shown in place of the date when a record has no usable :added
  # value; same width as the "%d %b" output (e.g. "14 Apr").
  NO_DATE = '------'

  # Hash loaded from index.yml, or {} when there is no index file yet
  attr_reader :index

  # filepath - directory holding index.yml; it is created when missing
  # debug    - when true, list() prints each record before formatting it
  #
  def initialize(filepath='/tmp/indeed', debug: false)

    @debug = debug

    FileUtils.mkdir_p filepath
    @idxfile = File.join(filepath, 'index.yml')

    # File.exists? was removed in Ruby 3.2; File.exist? is the portable form
    @index = File.exist?(@idxfile) ? load_index(@idxfile) : {}

  end

  # Returns a String with one numbered line per index record, in the form
  # " 1. 14 Apr: Job title", joined by newlines ('' for an empty index).
  #
  def list()

    @index.map.with_index do |(_id, h), i|

      puts 'h: ' + h.inspect if @debug
      "%2d. %s: %s" % [i+1, added_label(h[:added]), h[:title]]

    end.join("\n")

  end

  private

  # Reads the index file into a Hash.
  #
  # list() reads records through Symbol keys, and Psych 4's YAML.load refuses
  # Symbols, so they are permitted explicitly here. Date and Time are allowed
  # as well in case a record stores one. An empty file yields {}.
  #
  # NOTE(review): assumes records only hold plain values (strings, symbols,
  # dates) - confirm against the code that writes index.yml.
  #
  def load_index(file)

    YAML.safe_load(File.read(file), permitted_classes: [Symbol, Date, Time],
                   aliases: true) || {}

  end

  # Formats the :added value of a record as "%d %b".
  #
  # Records may lack :added (nil) or hold text that is not a date; those
  # produce NO_DATE instead of raising, so one bad record cannot break the
  # whole listing.
  #
  def added_label(added)

    date = added.respond_to?(:strftime) ? added : Date.parse(added.to_s)
    date.strftime("%d %b")

  rescue ArgumentError
    NO_DATE
  end

end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-04-01 00:00:00.000000000 Z
38
+ date: 2022-04-14 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -63,20 +63,20 @@ dependencies:
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '0.2'
66
+ version: '0.3'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 0.2.2
69
+ version: 0.3.1
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '0.2'
76
+ version: '0.3'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 0.2.2
79
+ version: 0.3.1
80
80
  description:
81
81
  email: digital.robertson@gmail.com
82
82
  executables: []
metadata.gz.sig CHANGED
Binary file