indeed_scraper2022 0.4.0 → 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7f98a83b7ed582d1b2973882701833688aec2d6d2bd132241a26c01a32915f93
4
- data.tar.gz: dc5c34a5af19cdffbd244e15416914e91c8a06f365f0fff28bcd537a30ec468e
3
+ metadata.gz: e9d1593bc7531fd77592f8765ccac9011f47980f0fb1f0d7e9c89513a319b85e
4
+ data.tar.gz: ac88d94ce2271fdc4561f43ff7218baed73db60867506c20bc34522fd357cb89
5
5
  SHA512:
6
- metadata.gz: 4ce40e021339f6b1c24faed495ebbd4b257200f62d1466ffa54cd05e654ccef29b23dbfcf4af64b9426f4d2dbde6bb778f6b920a7656af8289e3f81a269ba54a
7
- data.tar.gz: 634325fed61c7888b08fd72bfc47f4d64f98f5514110169152389e49940f15a080c02f62811aae2b5f34c6a604c17d2512d2219dba0f4473dc1034900bdb7ec6
6
+ metadata.gz: 133e26d46ca8bae7b08aff57a7451c4cf23f507f4cb4e3875f3b3ca320c60e14f747952f1b893a8016cfd3dfb907440597864158e0cf9a5dc92a833d98855aff
7
+ data.tar.gz: 0c5d883b0b9a1582a2b83c8a64d47e420b7c0f79f2e6f8b1e409439fd171539adca26f10c66b57db19f9d5238c3017649c527d0e2f8fe405b3aa73c4cf276e8f
checksums.yaml.gz.sig CHANGED
Binary file
@@ -5,6 +5,7 @@
5
5
  require 'ferrumwizard'
6
6
  require 'nokorexi'
7
7
  require 'yaml'
8
+ require 'reveal_url22'
8
9
 
9
10
  # Given the nature of changes to jobsearch websites,
10
11
  # don't rely upon this gem working in the near future.
@@ -44,9 +45,10 @@ class IndeedScraper2022
44
45
  end
45
46
 
46
47
  def search(q: @q, location: @location, start: nil)
47
-
48
+ puts 'inside search' if @debug
48
49
  url = @url_base
49
50
  url += 'start=' + start if start
51
+ puts 'url: ' + url.inspect if @debug
50
52
 
51
53
  @browser.goto(url)
52
54
  #@browser.network.wait_for_idle
@@ -81,34 +83,52 @@ class IndeedScraper2022
81
83
  sleep 2
82
84
 
83
85
  doc2 = Nokogiri::XML(@browser.body)
86
+ File.write '/tmp/body.txt', doc2.to_s if @debug
84
87
 
85
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
88
+ a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
86
89
  puts 'a2: ' + a2.length.inspect if @debug
87
90
 
88
91
  @a2 = a2.map {|x| Rexle.new x.to_s }
89
92
 
90
93
  @results = @a2.map do |doc|
91
94
 
92
- div = doc.element("a[@class='desktop']/div[@class='slider" \
95
+ div = doc.element("div[@class='cardOutline']/div[@class='slider" \
93
96
  "_container']/div[@class='slider_list']/div[@class='sl" \
94
97
  "ider_item']/div[@class='job_seen_beacon']")
98
+
95
99
  td = div.element("table[@class='jobCard_mainContent']/tbo" \
96
100
  "dy/tr/td[@class='resultContent']")
97
101
 
98
102
  # job title (e.g. Software Developer)
99
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
100
- "class='jobTitle-color-purple']/span")&.text
103
+ job = td.element("div[@class='tapItem-gutter']/h2[@" \
104
+ "class='jobTitle-color-purple']/a")
105
+ href = job.attributes[:href]
106
+ jobtitle = job.element("span")&.text
107
+
101
108
  puts 'jobtitle: ' + jobtitle.inspect if @debug
102
109
 
103
- salary = td.element("div[@class='metadataContainer']/" \
104
- "div[@class='salary-snippet-container']/div[@class='sa" \
105
- "lary-snippet']/span")&.text
110
+ sal = td.element("div[@class='metadataContainer']/" \
111
+ "div[@class='salary-snippet-container']")
112
+
113
+ salary = if sal then
114
+ sal_e = sal.element("div[@class='attribute_snippet']")
115
+ if sal_e then
116
+ sal_e.texts[0]
117
+ else
118
+ sal_e2 = sal.element("div[@class='salary-snippet']/span")
119
+ sal_e2 ? sal_e2.text : ''
120
+ end
121
+ else
122
+ ''
123
+ end
106
124
 
107
125
  puts 'salary: ' + salary.inspect if @debug
108
126
  div1 = td.element("div[@class='companyInfo']")
109
127
 
110
128
  # company name (e.g. Coda Octopus Products Ltd)
111
- company_name = div1.element("span[@class='companyName']")&.text
129
+ coname = div1.element("span[@class='companyName']")
130
+ puts 'coname: ' + coname.text.inspect if @debug
131
+ company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
112
132
 
113
133
  # company location (e.g. Edinburgh)
114
134
  location = div1.element("div[@class='companyLocation']")&.text
@@ -118,7 +138,12 @@ class IndeedScraper2022
118
138
  "v[@class='result-footer']")
119
139
 
120
140
  # job (e.g. Our products are primarily written in C#, using...)
121
- jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
141
+ advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
142
+ jobsnippet = if advert_items.any? then
143
+ advert_items.join("\n")
144
+ else
145
+ div3.element("div[@class='job-snippet']").text
146
+ end
122
147
 
123
148
  # visually (e.g. Posted 14 days ago)
124
149
  dateposted = div3.element("span[@class='date']")&.texts
@@ -126,7 +151,7 @@ class IndeedScraper2022
126
151
 
127
152
  {
128
153
  link: @url_base.sub(/\/[^\/]+$/,'') \
129
- + doc.root.attributes[:href].gsub(/&amp;/,'&'),
154
+ href.gsub(/&amp;/,'&'),
130
155
  title: jobtitle,
131
156
  salary: salary,
132
157
  company: company_name,
@@ -237,7 +262,7 @@ class IS22Plus < IndeedScraper2022
237
262
 
238
263
  def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
239
264
  super(q: q, location: location, headless: headless, cookies: cookies,
240
- debug: debug)
265
+ debug: true)
241
266
  end
242
267
 
243
268
  # note: The most efficient method to accumulate vacancy articles is to
@@ -263,10 +288,8 @@ class IS22Plus < IndeedScraper2022
263
288
 
264
289
  puts 'saving ' + item[:title] if @debug
265
290
  puts 'link: ' + item[:link].inspect
266
- links = RXFReader.reveal(item[:link])
267
- puts 'links: ' + links.inspect if @debug
268
-
269
- url = links.last
291
+ url = URL.reveal(item[:link])
292
+ item[:link] = url
270
293
  puts 'url: ' + url.inspect if @debug
271
294
  id = url[/(?<=jk=)[^&]+/]
272
295
 
@@ -288,7 +311,7 @@ class IS22Plus < IndeedScraper2022
288
311
  salary: item[:salary].to_s,
289
312
  company: item[:company].to_s.strip,
290
313
  location: item[:location].to_s,
291
- jobsnippet: item[:jobsnippet],
314
+ jobsnippet: item[:jobsnippet].to_s,
292
315
  date: item[:date],
293
316
  added: Time.now.strftime("%Y-%m-%d")
294
317
  }
@@ -324,8 +347,6 @@ class IS22Archive
324
347
 
325
348
  def initialize(filepath='/tmp/indeed', debug: false)
326
349
 
327
- @debug = debug
328
-
329
350
  FileUtils.mkdir_p filepath
330
351
  @idxfile = File.join(filepath, 'index.yml')
331
352
 
@@ -339,12 +360,13 @@ class IS22Archive
339
360
 
340
361
  def list()
341
362
 
342
- @index.map.with_index do |x,i|
363
+ @index.to_a.reverse.map.with_index do |x,i|
343
364
 
344
365
  id, h = x
345
366
 
346
367
  puts 'h: ' + h.inspect if @debug
347
- "%2d. %s: %s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title]]
368
+ co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
369
+ "%2d. %s: %s%s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title], co]
348
370
 
349
371
  end.join("\n")
350
372
 
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-04-14 00:00:00.000000000 Z
38
+ date: 2022-05-12 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -77,6 +77,26 @@ dependencies:
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
79
  version: 0.3.1
80
+ - !ruby/object:Gem::Dependency
81
+ name: reveal_url22
82
+ requirement: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - "~>"
85
+ - !ruby/object:Gem::Version
86
+ version: '0.1'
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.1'
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: 0.1.0
80
100
  description:
81
101
  email: digital.robertson@gmail.com
82
102
  executables: []
metadata.gz.sig CHANGED
Binary file