indeed_scraper2022 0.4.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 88f80a06ef0ab435c144d3b4ec53f1c98f2da7c427224c31dbef44f62fdafee3
4
- data.tar.gz: d0f549053bb225e7c8ebb2492715c6c470f689e191e9ae747e8f97317a61c02c
3
+ metadata.gz: 1cf994e7ea8c7fd89ad96e55d2d3f2d3908b8c565e3f07bad5056e4d63b90b65
4
+ data.tar.gz: eff54d2290a32a6a78cd31646ceef2736d329b40ea327f72f7a5eb4594dd3fe2
5
5
  SHA512:
6
- metadata.gz: 90d23c6c35a87cdcf763dc15a072f35ede166aafe2f1f5eed9294de28e916cdaca3eb043a034fb50750c9444af9e5042dc50d01390816efcdde360d2d01c4e55
7
- data.tar.gz: 30153c9c5aafdb5e89d56632e223edfe0196a71c4d9294cc3f963f2039238cd27b836ac9ac42b49cad27988ada868d0d002246699399f7d175216b97689d46ed
6
+ metadata.gz: 4da62ce0099ee02985981f8f166e371a4b05388998c94b5b5d3e33637e7dc649cc7aefbbeb19e002d777634b0bef9222563aad552c89d575757633c7764b5eb9
7
+ data.tar.gz: dc4a0bcdb86bef4925880c95c5b558de5e1c4cff0756e134f8502be97e0dc2a16e4e38fd7217db6b9994a02c2384956c90ec7a66913b8baaf5fbdfe92d46b322
checksums.yaml.gz.sig CHANGED
Binary file
@@ -5,6 +5,7 @@
5
5
  require 'ferrumwizard'
6
6
  require 'nokorexi'
7
7
  require 'yaml'
8
+ require 'reveal_url22'
8
9
 
9
10
  # Given the nature of changes to jobsearch websites,
10
11
  # don't rely upon this gem working in the near future.
@@ -44,9 +45,10 @@ class IndeedScraper2022
44
45
  end
45
46
 
46
47
  def search(q: @q, location: @location, start: nil)
47
-
48
+ puts 'inside search' if @debug
48
49
  url = @url_base
49
50
  url += 'start=' + start if start
51
+ puts 'url: ' + url.inspect if @debug
50
52
 
51
53
  @browser.goto(url)
52
54
  #@browser.network.wait_for_idle
@@ -81,28 +83,44 @@ class IndeedScraper2022
81
83
  sleep 2
82
84
 
83
85
  doc2 = Nokogiri::XML(@browser.body)
86
+ File.write '/tmp/body.txt', doc2.to_s if @debug
84
87
 
85
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
88
+ a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
86
89
  puts 'a2: ' + a2.length.inspect if @debug
87
90
 
88
91
  @a2 = a2.map {|x| Rexle.new x.to_s }
89
92
 
90
93
  @results = @a2.map do |doc|
91
94
 
92
- div = doc.element("a[@class='desktop']/div[@class='slider" \
95
+ div = doc.element("div[@class='cardOutline']/div[@class='slider" \
93
96
  "_container']/div[@class='slider_list']/div[@class='sl" \
94
97
  "ider_item']/div[@class='job_seen_beacon']")
98
+
95
99
  td = div.element("table[@class='jobCard_mainContent']/tbo" \
96
100
  "dy/tr/td[@class='resultContent']")
97
101
 
98
102
  # job title (e.g. Software Developer)
99
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
100
- "class='jobTitle-color-purple']/span")&.text
103
+ job = td.element("div[@class='tapItem-gutter']/h2[@" \
104
+ "class='jobTitle-color-purple']/a")
105
+ href = job.attributes[:href]
106
+ jobtitle = job.element("span")&.text
107
+
101
108
  puts 'jobtitle: ' + jobtitle.inspect if @debug
102
109
 
103
- salary = td.element("div[@class='metadataContainer']/" \
104
- "div[@class='salary-snippet-container']/div[@class='sa" \
105
- "lary-snippet']/span")&.text
110
+ sal = td.element("div[@class='metadataContainer']/" \
111
+ "div[@class='salary-snippet-container']")
112
+
113
+ salary = if sal then
114
+ sal_e = sal.element("div[@class='attribute_snippet']")
115
+ if sal_e then
116
+ sal_e.texts[0]
117
+ else
118
+ sal_e2 = sal.element("div[@class='salary-snippet']/span")
119
+ sal_e2 ? sal_e2.text : ''
120
+ end
121
+ else
122
+ ''
123
+ end
106
124
 
107
125
  puts 'salary: ' + salary.inspect if @debug
108
126
  div1 = td.element("div[@class='companyInfo']")
@@ -120,7 +138,12 @@ class IndeedScraper2022
120
138
  "v[@class='result-footer']")
121
139
 
122
140
  # job (e.g. Our products are primarily written in C#, using...)
123
- jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
141
+ advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
142
+ jobsnippet = if advert_items.any? then
143
+ advert_items.join("\n")
144
+ else
145
+ div3.element("div[@class='job-snippet']").text
146
+ end
124
147
 
125
148
  # visually (e.g. Posted 14 days ago)
126
149
  dateposted = div3.element("span[@class='date']")&.texts
@@ -128,7 +151,7 @@ class IndeedScraper2022
128
151
 
129
152
  {
130
153
  link: @url_base.sub(/\/[^\/]+$/,'') \
131
- + doc.root.attributes[:href].gsub(/&amp;/,'&'),
154
+ + href.gsub(/&amp;/,'&'),
132
155
  title: jobtitle,
133
156
  salary: salary,
134
157
  company: company_name,
@@ -265,10 +288,8 @@ class IS22Plus < IndeedScraper2022
265
288
 
266
289
  puts 'saving ' + item[:title] if @debug
267
290
  puts 'link: ' + item[:link].inspect
268
- links = RXFReader.reveal(item[:link])
269
- puts 'links: ' + links.inspect if @debug
270
-
271
- url = links.last
291
+ url = URL.reveal(item[:link])
292
+ item[:link] = url
272
293
  puts 'url: ' + url.inspect if @debug
273
294
  id = url[/(?<=jk=)[^&]+/]
274
295
 
@@ -290,7 +311,7 @@ class IS22Plus < IndeedScraper2022
290
311
  salary: item[:salary].to_s,
291
312
  company: item[:company].to_s.strip,
292
313
  location: item[:location].to_s,
293
- jobsnippet: item[:jobsnippet],
314
+ jobsnippet: item[:jobsnippet].to_s,
294
315
  date: item[:date],
295
316
  added: Time.now.strftime("%Y-%m-%d")
296
317
  }
@@ -321,18 +342,17 @@ end
321
342
 
322
343
 
323
344
  class IS22Archive
345
+ include RXFReadWriteModule
324
346
 
325
347
  attr_reader :index
326
348
 
327
349
  def initialize(filepath='/tmp/indeed', debug: false)
328
350
 
329
- @debug = debug
330
-
331
- FileUtils.mkdir_p filepath
351
+ FileX.mkdir_p filepath
332
352
  @idxfile = File.join(filepath, 'index.yml')
333
353
 
334
- @index = if File.exists? @idxfile then
335
- YAML.load(File.read(@idxfile))
354
+ @index = if FileX.exists? @idxfile then
355
+ YAML.load(FileX.read(@idxfile))
336
356
  else
337
357
  {}
338
358
  end
@@ -353,4 +373,67 @@ class IS22Archive
353
373
 
354
374
  end
355
375
 
376
+ def to_html()
377
+
378
+ rows = latest().map do |h|
379
+
380
+ puts 'h: ' + h.inspect if @debug
381
+ co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
382
+ "* %s: [%s](%s)%s" % [h[:added].strftime("%d %b"), h[:title], h[:link], co]
383
+
384
+ end.join("\n")
385
+
386
+
387
+ md = '# Indeed.com: Latest jobs
388
+
389
+ ' + rows
390
+
391
+ RDiscount.new(md).to_html
392
+
393
+ end
394
+
395
+ def to_form(action: '')
396
+
397
+ rows = latest().map.with_index do |h, i|
398
+
399
+ co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
400
+
401
+ "<input type='checkbox' id='#{h[:jobid]}' name='#{h[:jobid]}' value='#{h[:title]}'/>
402
+ <label for='j#{i}'>#{h[:added].strftime("%d %b")}: #{h[:title] + ' ' + co}</label><br/>
403
+ "
404
+
405
+ end.join("\n")
406
+
407
+
408
+ return "<form action='#{action}'>#{rows}" +
409
+ "<input type='submit' value='submit'/></form>"
410
+
411
+ end
412
+
413
+ def filter(a)
414
+
415
+ dx = Dynarex.new
416
+ a2 = latest().select {|h| a.include? h[:jobid] }
417
+ dx.import a2
418
+
419
+ return dx
420
+ end
421
+
422
+ private
423
+
424
+ def latest()
425
+
426
+ a = @index.to_a.map do |id, h|
427
+ h[:jobid] = id
428
+ h[:added] = Date.parse(h[:added]) if h[:added].is_a? String
429
+ h
430
+ end
431
+
432
+ a.select do |x|
433
+ x[:added] >= (Date.today - 7)
434
+ end.reverse
435
+
436
+ end
437
+
356
438
  end
439
+
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-04-16 00:00:00.000000000 Z
38
+ date: 2022-05-25 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -77,6 +77,46 @@ dependencies:
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
79
  version: 0.3.1
80
+ - !ruby/object:Gem::Dependency
81
+ name: url_reveal22
82
+ requirement: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - "~>"
85
+ - !ruby/object:Gem::Version
86
+ version: '0.1'
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.1'
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: 0.1.0
100
+ - !ruby/object:Gem::Dependency
101
+ name: dynarex
102
+ requirement: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - "~>"
105
+ - !ruby/object:Gem::Version
106
+ version: '1.9'
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: 1.9.11
110
+ type: :runtime
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - "~>"
115
+ - !ruby/object:Gem::Version
116
+ version: '1.9'
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: 1.9.11
80
120
  description:
81
121
  email: digital.robertson@gmail.com
82
122
  executables: []
metadata.gz.sig CHANGED
Binary file