indeed_scraper2022 0.4.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7f98a83b7ed582d1b2973882701833688aec2d6d2bd132241a26c01a32915f93
4
- data.tar.gz: dc5c34a5af19cdffbd244e15416914e91c8a06f365f0fff28bcd537a30ec468e
3
+ metadata.gz: e9d1593bc7531fd77592f8765ccac9011f47980f0fb1f0d7e9c89513a319b85e
4
+ data.tar.gz: ac88d94ce2271fdc4561f43ff7218baed73db60867506c20bc34522fd357cb89
5
5
  SHA512:
6
- metadata.gz: 4ce40e021339f6b1c24faed495ebbd4b257200f62d1466ffa54cd05e654ccef29b23dbfcf4af64b9426f4d2dbde6bb778f6b920a7656af8289e3f81a269ba54a
7
- data.tar.gz: 634325fed61c7888b08fd72bfc47f4d64f98f5514110169152389e49940f15a080c02f62811aae2b5f34c6a604c17d2512d2219dba0f4473dc1034900bdb7ec6
6
+ metadata.gz: 133e26d46ca8bae7b08aff57a7451c4cf23f507f4cb4e3875f3b3ca320c60e14f747952f1b893a8016cfd3dfb907440597864158e0cf9a5dc92a833d98855aff
7
+ data.tar.gz: 0c5d883b0b9a1582a2b83c8a64d47e420b7c0f79f2e6f8b1e409439fd171539adca26f10c66b57db19f9d5238c3017649c527d0e2f8fe405b3aa73c4cf276e8f
checksums.yaml.gz.sig CHANGED
Binary file
@@ -5,6 +5,7 @@
5
5
  require 'ferrumwizard'
6
6
  require 'nokorexi'
7
7
  require 'yaml'
8
+ require 'reveal_url22'
8
9
 
9
10
  # Given the nature of changes to jobsearch websites,
10
11
  # don't rely upon this gem working in the near future.
@@ -44,9 +45,10 @@ class IndeedScraper2022
44
45
  end
45
46
 
46
47
  def search(q: @q, location: @location, start: nil)
47
-
48
+ puts 'inside search' if @debug
48
49
  url = @url_base
49
50
  url += 'start=' + start if start
51
+ puts 'url: ' + url.inspect if @debug
50
52
 
51
53
  @browser.goto(url)
52
54
  #@browser.network.wait_for_idle
@@ -81,34 +83,52 @@ class IndeedScraper2022
81
83
  sleep 2
82
84
 
83
85
  doc2 = Nokogiri::XML(@browser.body)
86
+ File.write '/tmp/body.txt', doc2.to_s if @debug
84
87
 
85
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
88
+ a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
86
89
  puts 'a2: ' + a2.length.inspect if @debug
87
90
 
88
91
  @a2 = a2.map {|x| Rexle.new x.to_s }
89
92
 
90
93
  @results = @a2.map do |doc|
91
94
 
92
- div = doc.element("a[@class='desktop']/div[@class='slider" \
95
+ div = doc.element("div[@class='cardOutline']/div[@class='slider" \
93
96
  "_container']/div[@class='slider_list']/div[@class='sl" \
94
97
  "ider_item']/div[@class='job_seen_beacon']")
98
+
95
99
  td = div.element("table[@class='jobCard_mainContent']/tbo" \
96
100
  "dy/tr/td[@class='resultContent']")
97
101
 
98
102
  # job title (e.g. Software Developer)
99
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
100
- "class='jobTitle-color-purple']/span")&.text
103
+ job = td.element("div[@class='tapItem-gutter']/h2[@" \
104
+ "class='jobTitle-color-purple']/a")
105
+ href = job.attributes[:href]
106
+ jobtitle = job.element("span")&.text
107
+
101
108
  puts 'jobtitle: ' + jobtitle.inspect if @debug
102
109
 
103
- salary = td.element("div[@class='metadataContainer']/" \
104
- "div[@class='salary-snippet-container']/div[@class='sa" \
105
- "lary-snippet']/span")&.text
110
+ sal = td.element("div[@class='metadataContainer']/" \
111
+ "div[@class='salary-snippet-container']")
112
+
113
+ salary = if sal then
114
+ sal_e = sal.element("div[@class='attribute_snippet']")
115
+ if sal_e then
116
+ sal_e.texts[0]
117
+ else
118
+ sal_e2 = sal.element("div[@class='salary-snippet']/span")
119
+ sal_e2 ? sal_e2.text : ''
120
+ end
121
+ else
122
+ ''
123
+ end
106
124
 
107
125
  puts 'salary: ' + salary.inspect if @debug
108
126
  div1 = td.element("div[@class='companyInfo']")
109
127
 
110
128
  # company name (e.g. Coda Octopus Products Ltd)
111
- company_name = div1.element("span[@class='companyName']")&.text
129
+ coname = div1.element("span[@class='companyName']")
130
+ puts 'coname: ' + coname.text.inspect if @debug
131
+ company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
112
132
 
113
133
  # company location (e.g. Edinburgh)
114
134
  location = div1.element("div[@class='companyLocation']")&.text
@@ -118,7 +138,12 @@ class IndeedScraper2022
118
138
  "v[@class='result-footer']")
119
139
 
120
140
  # job (e.g. Our products are primarily written in C#, using...)
121
- jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
141
+ advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
142
+ jobsnippet = if advert_items.any? then
143
+ advert_items.join("\n")
144
+ else
145
+ div3.element("div[@class='job-snippet']").text
146
+ end
122
147
 
123
148
  # visually (e.g. Posted 14 days ago)
124
149
  dateposted = div3.element("span[@class='date']")&.texts
@@ -126,7 +151,7 @@ class IndeedScraper2022
126
151
 
127
152
  {
128
153
  link: @url_base.sub(/\/[^\/]+$/,'') \
129
- doc.root.attributes[:href].gsub(/&amp;/,'&'),
154
+ href.gsub(/&amp;/,'&'),
130
155
  title: jobtitle,
131
156
  salary: salary,
132
157
  company: company_name,
@@ -237,7 +262,7 @@ class IS22Plus < IndeedScraper2022
237
262
 
238
263
  def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
239
264
  super(q: q, location: location, headless: headless, cookies: cookies,
240
- debug: debug)
265
+ debug: true)
241
266
  end
242
267
 
243
268
  # note: The most efficient method to accumulate vacancy articles is to
@@ -263,10 +288,8 @@ class IS22Plus < IndeedScraper2022
263
288
 
264
289
  puts 'saving ' + item[:title] if @debug
265
290
  puts 'link: ' + item[:link].inspect
266
- links = RXFReader.reveal(item[:link])
267
- puts 'links: ' + links.inspect if @debug
268
-
269
- url = links.last
291
+ url = URL.reveal(item[:link])
292
+ item[:link] = url
270
293
  puts 'url: ' + url.inspect if @debug
271
294
  id = url[/(?<=jk=)[^&]+/]
272
295
 
@@ -288,7 +311,7 @@ class IS22Plus < IndeedScraper2022
288
311
  salary: item[:salary].to_s,
289
312
  company: item[:company].to_s.strip,
290
313
  location: item[:location].to_s,
291
- jobsnippet: item[:jobsnippet],
314
+ jobsnippet: item[:jobsnippet].to_s,
292
315
  date: item[:date],
293
316
  added: Time.now.strftime("%Y-%m-%d")
294
317
  }
@@ -324,8 +347,6 @@ class IS22Archive
324
347
 
325
348
  def initialize(filepath='/tmp/indeed', debug: false)
326
349
 
327
- @debug = debug
328
-
329
350
  FileUtils.mkdir_p filepath
330
351
  @idxfile = File.join(filepath, 'index.yml')
331
352
 
@@ -339,12 +360,13 @@ class IS22Archive
339
360
 
340
361
  def list()
341
362
 
342
- @index.map.with_index do |x,i|
363
+ @index.to_a.reverse.map.with_index do |x,i|
343
364
 
344
365
  id, h = x
345
366
 
346
367
  puts 'h: ' + h.inspect if @debug
347
- "%2d. %s: %s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title]]
368
+ co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
369
+ "%2d. %s: %s%s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title], co]
348
370
 
349
371
  end.join("\n")
350
372
 
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-04-14 00:00:00.000000000 Z
38
+ date: 2022-05-12 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -77,6 +77,26 @@ dependencies:
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
79
  version: 0.3.1
80
+ - !ruby/object:Gem::Dependency
81
+ name: reveal_url22
82
+ requirement: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - "~>"
85
+ - !ruby/object:Gem::Version
86
+ version: '0.1'
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.1'
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: 0.1.0
80
100
  description:
81
101
  email: digital.robertson@gmail.com
82
102
  executables: []
metadata.gz.sig CHANGED
Binary file