indeed_scraper2022 0.4.1 → 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 88f80a06ef0ab435c144d3b4ec53f1c98f2da7c427224c31dbef44f62fdafee3
4
- data.tar.gz: d0f549053bb225e7c8ebb2492715c6c470f689e191e9ae747e8f97317a61c02c
3
+ metadata.gz: 1cf994e7ea8c7fd89ad96e55d2d3f2d3908b8c565e3f07bad5056e4d63b90b65
4
+ data.tar.gz: eff54d2290a32a6a78cd31646ceef2736d329b40ea327f72f7a5eb4594dd3fe2
5
5
  SHA512:
6
- metadata.gz: 90d23c6c35a87cdcf763dc15a072f35ede166aafe2f1f5eed9294de28e916cdaca3eb043a034fb50750c9444af9e5042dc50d01390816efcdde360d2d01c4e55
7
- data.tar.gz: 30153c9c5aafdb5e89d56632e223edfe0196a71c4d9294cc3f963f2039238cd27b836ac9ac42b49cad27988ada868d0d002246699399f7d175216b97689d46ed
6
+ metadata.gz: 4da62ce0099ee02985981f8f166e371a4b05388998c94b5b5d3e33637e7dc649cc7aefbbeb19e002d777634b0bef9222563aad552c89d575757633c7764b5eb9
7
+ data.tar.gz: dc4a0bcdb86bef4925880c95c5b558de5e1c4cff0756e134f8502be97e0dc2a16e4e38fd7217db6b9994a02c2384956c90ec7a66913b8baaf5fbdfe92d46b322
checksums.yaml.gz.sig CHANGED
Binary file
@@ -5,6 +5,7 @@
5
5
  require 'ferrumwizard'
6
6
  require 'nokorexi'
7
7
  require 'yaml'
8
+ require 'reveal_url22'
8
9
 
9
10
  # Given the nature of changes to jobsearch websites,
10
11
  # don't rely upon this gem working in the near future.
@@ -44,9 +45,10 @@ class IndeedScraper2022
44
45
  end
45
46
 
46
47
  def search(q: @q, location: @location, start: nil)
47
-
48
+ puts 'inside search' if @debug
48
49
  url = @url_base
49
50
  url += 'start=' + start if start
51
+ puts 'url: ' + url.inspect if @debug
50
52
 
51
53
  @browser.goto(url)
52
54
  #@browser.network.wait_for_idle
@@ -81,28 +83,44 @@ class IndeedScraper2022
81
83
  sleep 2
82
84
 
83
85
  doc2 = Nokogiri::XML(@browser.body)
86
+ File.write '/tmp/body.txt', doc2.to_s if @debug
84
87
 
85
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
88
+ a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
86
89
  puts 'a2: ' + a2.length.inspect if @debug
87
90
 
88
91
  @a2 = a2.map {|x| Rexle.new x.to_s }
89
92
 
90
93
  @results = @a2.map do |doc|
91
94
 
92
- div = doc.element("a[@class='desktop']/div[@class='slider" \
95
+ div = doc.element("div[@class='cardOutline']/div[@class='slider" \
93
96
  "_container']/div[@class='slider_list']/div[@class='sl" \
94
97
  "ider_item']/div[@class='job_seen_beacon']")
98
+
95
99
  td = div.element("table[@class='jobCard_mainContent']/tbo" \
96
100
  "dy/tr/td[@class='resultContent']")
97
101
 
98
102
  # job title (e.g. Software Developer)
99
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
100
- "class='jobTitle-color-purple']/span")&.text
103
+ job = td.element("div[@class='tapItem-gutter']/h2[@" \
104
+ "class='jobTitle-color-purple']/a")
105
+ href = job.attributes[:href]
106
+ jobtitle = job.element("span")&.text
107
+
101
108
  puts 'jobtitle: ' + jobtitle.inspect if @debug
102
109
 
103
- salary = td.element("div[@class='metadataContainer']/" \
104
- "div[@class='salary-snippet-container']/div[@class='sa" \
105
- "lary-snippet']/span")&.text
110
+ sal = td.element("div[@class='metadataContainer']/" \
111
+ "div[@class='salary-snippet-container']")
112
+
113
+ salary = if sal then
114
+ sal_e = sal.element("div[@class='attribute_snippet']")
115
+ if sal_e then
116
+ sal_e.texts[0]
117
+ else
118
+ sal_e2 = sal.element("div[@class='salary-snippet']/span")
119
+ sal_e2 ? sal_e2.text : ''
120
+ end
121
+ else
122
+ ''
123
+ end
106
124
 
107
125
  puts 'salary: ' + salary.inspect if @debug
108
126
  div1 = td.element("div[@class='companyInfo']")
@@ -120,7 +138,12 @@ class IndeedScraper2022
120
138
  "v[@class='result-footer']")
121
139
 
122
140
  # job (e.g. Our products are primarily written in C#, using...)
123
- jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
141
+ advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
142
+ jobsnippet = if advert_items.any? then
143
+ advert_items.join("\n")
144
+ else
145
+ div3.element("div[@class='job-snippet']").text
146
+ end
124
147
 
125
148
  # visually (e.g. Posted 14 days ago)
126
149
  dateposted = div3.element("span[@class='date']")&.texts
@@ -128,7 +151,7 @@ class IndeedScraper2022
128
151
 
129
152
  {
130
153
  link: @url_base.sub(/\/[^\/]+$/,'') \
131
- + doc.root.attributes[:href].gsub(/&amp;/,'&'),
154
+ + href.gsub(/&amp;/,'&'),
132
155
  title: jobtitle,
133
156
  salary: salary,
134
157
  company: company_name,
@@ -265,10 +288,8 @@ class IS22Plus < IndeedScraper2022
265
288
 
266
289
  puts 'saving ' + item[:title] if @debug
267
290
  puts 'link: ' + item[:link].inspect
268
- links = RXFReader.reveal(item[:link])
269
- puts 'links: ' + links.inspect if @debug
270
-
271
- url = links.last
291
+ url = URL.reveal(item[:link])
292
+ item[:link] = url
272
293
  puts 'url: ' + url.inspect if @debug
273
294
  id = url[/(?<=jk=)[^&]+/]
274
295
 
@@ -290,7 +311,7 @@ class IS22Plus < IndeedScraper2022
290
311
  salary: item[:salary].to_s,
291
312
  company: item[:company].to_s.strip,
292
313
  location: item[:location].to_s,
293
- jobsnippet: item[:jobsnippet],
314
+ jobsnippet: item[:jobsnippet].to_s,
294
315
  date: item[:date],
295
316
  added: Time.now.strftime("%Y-%m-%d")
296
317
  }
@@ -321,18 +342,17 @@ end
321
342
 
322
343
 
323
344
  class IS22Archive
345
+ include RXFReadWriteModule
324
346
 
325
347
  attr_reader :index
326
348
 
327
349
  def initialize(filepath='/tmp/indeed', debug: false)
328
350
 
329
- @debug = debug
330
-
331
- FileUtils.mkdir_p filepath
351
+ FileX.mkdir_p filepath
332
352
  @idxfile = File.join(filepath, 'index.yml')
333
353
 
334
- @index = if File.exists? @idxfile then
335
- YAML.load(File.read(@idxfile))
354
+ @index = if FileX.exists? @idxfile then
355
+ YAML.load(FileX.read(@idxfile))
336
356
  else
337
357
  {}
338
358
  end
@@ -353,4 +373,67 @@ class IS22Archive
353
373
 
354
374
  end
355
375
 
376
+ def to_html()
377
+
378
+ rows = latest().map do |h|
379
+
380
+ puts 'h: ' + h.inspect if @debug
381
+ co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
382
+ "* %s: [%s](%s)%s" % [h[:added].strftime("%d %b"), h[:title], h[:link], co]
383
+
384
+ end.join("\n")
385
+
386
+
387
+ md = '# Indeed.com: Latest jobs
388
+
389
+ ' + rows
390
+
391
+ RDiscount.new(md).to_html
392
+
393
+ end
394
+
395
+ def to_form(action: '')
396
+
397
+ rows = latest().map.with_index do |h, i|
398
+
399
+ co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
400
+
401
+ "<input type='checkbox' id='#{h[:jobid]}' name='#{h[:jobid]}' value='#{h[:title]}'/>
402
+ <label for='j#{i}'>#{h[:added].strftime("%d %b")}: #{h[:title] + ' ' + co}</label><br/>
403
+ "
404
+
405
+ end.join("\n")
406
+
407
+
408
+ return "<form action='#{action}'>#{rows}" +
409
+ "<input type='submit' value='submit'/></form>"
410
+
411
+ end
412
+
413
+ def filter(a)
414
+
415
+ dx = Dynarex.new
416
+ a2 = latest().select {|h| a.include? h[:jobid] }
417
+ dx.import a2
418
+
419
+ return dx
420
+ end
421
+
422
+ private
423
+
424
+ def latest()
425
+
426
+ a = @index.to_a.map do |id, h|
427
+ h[:jobid] = id
428
+ h[:added] = Date.parse(h[:added]) if h[:added].is_a? String
429
+ h
430
+ end
431
+
432
+ a.select do |x|
433
+ x[:added] >= (Date.today - 7)
434
+ end.reverse
435
+
436
+ end
437
+
356
438
  end
439
+
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-04-16 00:00:00.000000000 Z
38
+ date: 2022-05-25 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -77,6 +77,46 @@ dependencies:
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
79
  version: 0.3.1
80
+ - !ruby/object:Gem::Dependency
81
+ name: url_reveal22
82
+ requirement: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - "~>"
85
+ - !ruby/object:Gem::Version
86
+ version: '0.1'
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.1'
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: 0.1.0
100
+ - !ruby/object:Gem::Dependency
101
+ name: dynarex
102
+ requirement: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - "~>"
105
+ - !ruby/object:Gem::Version
106
+ version: '1.9'
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: 1.9.11
110
+ type: :runtime
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - "~>"
115
+ - !ruby/object:Gem::Version
116
+ version: '1.9'
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: 1.9.11
80
120
  description:
81
121
  email: digital.robertson@gmail.com
82
122
  executables: []
metadata.gz.sig CHANGED
Binary file