indeed_scraper2022 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 972f811430fae59121e39c9c4752b64fc43b37165a52dcec8c3eac42cf1e4555
4
- data.tar.gz: 85e987eb264b098b4c892e2e05d2ab082e3b8968fff2bdd519552e889f014f9d
3
+ metadata.gz: 833f3e77c7771f39e3eccbd5f277a8ca73fbd34d55d5efc7c2509b7a4dbf61bd
4
+ data.tar.gz: 707cf360d0ca30102e59bc0e5ead4111199db76a090d734729c327eaefbd6cdd
5
5
  SHA512:
6
- metadata.gz: e7d3c2a13e315383248c557806dd0184d8831f46a2e314816395f62fcb886ba2e38a3e1f2deb180ceb33b614cf0b7be8a13379028fc56982e388948575bdb02c
7
- data.tar.gz: 6ca4792d260c43b22fcee5ead8928525df17f1db112c49ac4cc7d7a5c9b29a8f483b94c18cd39bace1d4a4ee553ff83254a7fb445d76abf06c1705d19bac455c
6
+ metadata.gz: d3c8d5eacb62503e4b29634836b74a8b6c9636d9127fc345d79b9f177d75b41f2558c351ec7028d9a74887f964f440022b1d25e416f14752c16ded16055dcd2c
7
+ data.tar.gz: aea4011eea3c4f37f3537626e3ca2179bee215854a975d2b7618d09737395961fe88648460a8f58ce966acadc9a1fe8c21263319b07ff12147d4264b9374ae39
checksums.yaml.gz.sig CHANGED
Binary file
@@ -5,12 +5,20 @@
5
5
  require 'ferrumwizard'
6
6
  require 'nokorexi'
7
7
  require 'yaml'
8
+ require 'reveal_url22'
8
9
 
9
10
  # Given the nature of changes to jobsearch websites,
10
11
  # don't rely upon this gem working in the near future.
11
12
 
12
13
 
13
14
 
15
+ # this gem consists of 3 main classes:
16
+ #
17
+ # * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
18
+ # * IS22Plus - Archives the scraped vacancies to local file
19
+ # * IS22Archive - Allows viewing of archived vacancies offline
20
+ #
21
+
14
22
  class IndeedScraper2022Err < Exception
15
23
  end
16
24
 
@@ -37,9 +45,10 @@ class IndeedScraper2022
37
45
  end
38
46
 
39
47
  def search(q: @q, location: @location, start: nil)
40
-
48
+ puts 'inside search' if @debug
41
49
  url = @url_base
42
50
  url += 'start=' + start if start
51
+ puts 'url: ' + url.inspect if @debug
43
52
 
44
53
  @browser.goto(url)
45
54
  #@browser.network.wait_for_idle
@@ -74,34 +83,52 @@ class IndeedScraper2022
74
83
  sleep 2
75
84
 
76
85
  doc2 = Nokogiri::XML(@browser.body)
86
+ File.write '/tmp/body.txt', doc2.to_s if @debug
77
87
 
78
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
88
+ a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
79
89
  puts 'a2: ' + a2.length.inspect if @debug
80
90
 
81
91
  @a2 = a2.map {|x| Rexle.new x.to_s }
82
92
 
83
93
  @results = @a2.map do |doc|
84
94
 
85
- div = doc.element("a[@class='desktop']/div[@class='slider" \
95
+ div = doc.element("div[@class='cardOutline']/div[@class='slider" \
86
96
  "_container']/div[@class='slider_list']/div[@class='sl" \
87
97
  "ider_item']/div[@class='job_seen_beacon']")
98
+
88
99
  td = div.element("table[@class='jobCard_mainContent']/tbo" \
89
100
  "dy/tr/td[@class='resultContent']")
90
101
 
91
102
  # job title (e.g. Software Developer)
92
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
93
- "class='jobTitle-color-purple']/span")&.text
103
+ job = td.element("div[@class='tapItem-gutter']/h2[@" \
104
+ "class='jobTitle-color-purple']/a")
105
+ href = job.attributes[:href]
106
+ jobtitle = job.element("span")&.text
107
+
94
108
  puts 'jobtitle: ' + jobtitle.inspect if @debug
95
109
 
96
- salary = td.element("div[@class='metadataContainer']/" \
97
- "div[@class='salary-snippet-container']/div[@class='sa" \
98
- "lary-snippet']/span")&.text
110
+ sal = td.element("div[@class='metadataContainer']/" \
111
+ "div[@class='salary-snippet-container']")
112
+
113
+ salary = if sal then
114
+ sal_e = sal.element("div[@class='attribute_snippet']")
115
+ if sal_e then
116
+ sal_e.texts[0]
117
+ else
118
+ sal_e2 = sal.element("div[@class='salary-snippet']/span")
119
+ sal_e2 ? sal_e2.text : ''
120
+ end
121
+ else
122
+ ''
123
+ end
99
124
 
100
125
  puts 'salary: ' + salary.inspect if @debug
101
126
  div1 = td.element("div[@class='companyInfo']")
102
127
 
103
128
  # company name (e.g. Coda Octopus Products Ltd)
104
- company_name = div1.element("span[@class='companyName']")&.text
129
+ coname = div1.element("span[@class='companyName']")
130
+ puts 'coname: ' + coname.text.inspect if @debug
131
+ company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
105
132
 
106
133
  # company location (e.g. Edinburgh)
107
134
  location = div1.element("div[@class='companyLocation']")&.text
@@ -111,7 +138,12 @@ class IndeedScraper2022
111
138
  "v[@class='result-footer']")
112
139
 
113
140
  # job (e.g. Our products are primarily written in C#, using...)
114
- jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
141
+ advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
142
+ jobsnippet = if advert_items.any? then
143
+ advert_items.join("\n")
144
+ else
145
+ div3.element("div[@class='job-snippet']").text
146
+ end
115
147
 
116
148
  # visually (e.g. Posted 14 days ago)
117
149
  dateposted = div3.element("span[@class='date']")&.texts
@@ -119,7 +151,7 @@ class IndeedScraper2022
119
151
 
120
152
  {
121
153
  link: @url_base.sub(/\/[^\/]+$/,'') \
122
- + doc.root.attributes[:href].gsub(/&amp;/,'&'),
154
+ + href.gsub(/&amp;/,'&'),
123
155
  title: jobtitle,
124
156
  salary: salary,
125
157
  company: company_name,
@@ -146,52 +178,65 @@ class IndeedScraper2022
146
178
  def fetchjob(url)
147
179
 
148
180
  doc = Nokorexi.new(url).to_doc
181
+ puts 'before e0' if @debug
149
182
  e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
150
183
 
151
184
  #div = e0.element("//div[@class='jobsearch-JobComponent']")
185
+ puts 'before div1' if @debug
152
186
  div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
187
+ puts 'before div2' if @debug
153
188
  div2 = div1.element("div")
154
189
 
155
190
  # jobsearch (e.g. Full Stack Website Developer (Wordpress))
191
+ puts 'before jobtitle' if @debug
156
192
  jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
157
193
  "er-title-container']/h1[@class='jobsearch-JobInfoHead" \
158
194
  "er-title']")&.text
159
195
 
196
+ puts 'before div3' if @debug
160
197
  div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
161
198
  "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
162
199
  "erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
163
200
  "ainer-subtitle']")
164
201
 
165
202
  # icl (e.g. Lyles Sutherland)
203
+ puts 'before cname' if @debug
166
204
  cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
167
205
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
168
206
  "s-mr--xs']")[1]
207
+ puts 'before clink' if @debug
169
208
  clink = div3.element('//a')
170
209
  company = cname.text ? cname.text : clink.text
171
210
  companylink = clink.attributes[:href] if clink
172
211
 
212
+ puts 'before salary' if @debug
173
213
  salary = div1.element("//span[@class='attribute_snippet']")&.text
214
+ puts 'before type' if @debug
174
215
  type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
175
216
  div5 = div3.xpath("div/div")
176
217
  location, worklocation = div5.map(&:text).compact
177
218
 
178
219
  # icl (e.g. Full-time, Permanent)
220
+ puts 'before jobtype' if @debug
179
221
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
180
222
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
181
223
  jobtype = jobtype&.texts.join if jobtype
182
224
 
183
225
  # jobsearch (e.g. Urgently needed)
226
+ puts 'before jobnote1' if @debug
184
227
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
185
228
  "']/div[@class='urgently-hiring']/div[@class='jobsearc" \
186
229
  "h-DesktopTag-text']")&.text
187
230
 
188
231
  # jobsearch (e.g. 10 days ago)
232
+ puts 'before days' if @debug
189
233
  days = e0.element("//div[@class='jobsearch-JobTab-con" \
190
234
  "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
191
235
  d = Date.today - days.to_i
192
236
  datepost = d.strftime("%Y-%m-%d")
193
237
 
194
238
 
239
+ puts 'before jobdesc' if @debug
195
240
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
196
241
  "ass='jobsearch-jobDescriptionText']").xml
197
242
 
@@ -217,11 +262,16 @@ class IS22Plus < IndeedScraper2022
217
262
 
218
263
  def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
219
264
  super(q: q, location: location, headless: headless, cookies: cookies,
220
- debug: debug)
265
+ debug: true)
221
266
  end
222
267
 
268
+ # note: The most efficient method to accumulate vacancy articles is to
269
+ # execute archive() daily
270
+ #
223
271
  def archive(filepath='/tmp/indeed')
224
272
 
273
+ search() if @results.nil?
274
+
225
275
  return unless @results
226
276
 
227
277
  FileUtils.mkdir_p filepath
@@ -238,16 +288,23 @@ class IS22Plus < IndeedScraper2022
238
288
 
239
289
  puts 'saving ' + item[:title] if @debug
240
290
  puts 'link: ' + item[:link].inspect
241
- links = RXFReader.reveal(item[:link])
242
- puts 'links: ' + links.inspect
291
+ links = URL.reveal(item[:link])
292
+ puts 'links: ' + links.inspect if @debug
243
293
 
244
294
  url = links.last
245
- id = url[/(?<=\?jk=)[^&]+/]
295
+ puts 'url: ' + url.inspect if @debug
296
+ id = url[/(?<=jk=)[^&]+/]
246
297
 
247
298
  if index[id.to_sym] then
299
+
300
+ # the vacancy record has previously been saved
301
+ #
248
302
  next
303
+
249
304
  else
250
305
 
306
+ # write the full page vacancy article to file
307
+ #
251
308
  File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
252
309
 
253
310
  h = {
@@ -257,14 +314,19 @@ class IS22Plus < IndeedScraper2022
257
314
  company: item[:company].to_s.strip,
258
315
  location: item[:location].to_s,
259
316
  jobsnippet: item[:jobsnippet],
260
- date: item[:date]
317
+ date: item[:date],
318
+ added: Time.now.strftime("%Y-%m-%d")
261
319
  }
262
320
 
321
+ # add the vacancy snippet to the index file
322
+ #
263
323
  index[id.to_sym] = h
264
324
  end
265
325
 
266
326
  end
267
327
 
328
+ # save the vacancy index file
329
+ #
268
330
  File.write idxfile, index.to_yaml
269
331
 
270
332
  end
@@ -279,3 +341,37 @@ class IS22Plus < IndeedScraper2022
279
341
 
280
342
 
281
343
  end
344
+
345
+
346
+ class IS22Archive
347
+
348
+ attr_reader :index
349
+
350
+ def initialize(filepath='/tmp/indeed', debug: false)
351
+
352
+ FileUtils.mkdir_p filepath
353
+ @idxfile = File.join(filepath, 'index.yml')
354
+
355
+ @index = if File.exists? @idxfile then
356
+ YAML.load(File.read(@idxfile))
357
+ else
358
+ {}
359
+ end
360
+
361
+ end
362
+
363
+ def list()
364
+
365
+ @index.to_a.reverse.map.with_index do |x,i|
366
+
367
+ id, h = x
368
+
369
+ puts 'h: ' + h.inspect if @debug
370
+ co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
371
+ "%2d. %s: %s%s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title], co]
372
+
373
+ end.join("\n")
374
+
375
+ end
376
+
377
+ end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-04-01 00:00:00.000000000 Z
38
+ date: 2022-05-12 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -63,20 +63,40 @@ dependencies:
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '0.2'
66
+ version: '0.3'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 0.2.2
69
+ version: 0.3.1
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '0.2'
76
+ version: '0.3'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 0.2.2
79
+ version: 0.3.1
80
+ - !ruby/object:Gem::Dependency
81
+ name: reveal_url22
82
+ requirement: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - "~>"
85
+ - !ruby/object:Gem::Version
86
+ version: '0.1'
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.1'
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: 0.1.0
80
100
  description:
81
101
  email: digital.robertson@gmail.com
82
102
  executables: []
metadata.gz.sig CHANGED
Binary file