indeed_scraper2022 0.2.1 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5e33dfd54667ecc9f8b7985aa07af403be8d95729ce68e8a40d8c985d57bd4e1
4
- data.tar.gz: a2c041ec8103b6afac3a422e7b73bc82c89fd7f8d955240439a29ec0347c8a5f
3
+ metadata.gz: 88f80a06ef0ab435c144d3b4ec53f1c98f2da7c427224c31dbef44f62fdafee3
4
+ data.tar.gz: d0f549053bb225e7c8ebb2492715c6c470f689e191e9ae747e8f97317a61c02c
5
5
  SHA512:
6
- metadata.gz: 8e640cb8262a057bb588b501ee1122a59e6e239e2a5988dd0566ffffb814a2fef763c36fdeae1ba5dc4e6f819ca145374058bd62373ce776df1e393057a49fc0
7
- data.tar.gz: 0a6bfe0ef2b685d5711a95704cee3fa67d58eb7c9d0f149c872f9c23b0cc489382ab1327b101f7accf982f7ea1f1a6d56dc20ae528ef1fd4d6105c9ef93067da
6
+ metadata.gz: 90d23c6c35a87cdcf763dc15a072f35ede166aafe2f1f5eed9294de28e916cdaca3eb043a034fb50750c9444af9e5042dc50d01390816efcdde360d2d01c4e55
7
+ data.tar.gz: 30153c9c5aafdb5e89d56632e223edfe0196a71c4d9294cc3f963f2039238cd27b836ac9ac42b49cad27988ada868d0d002246699399f7d175216b97689d46ed
checksums.yaml.gz.sig CHANGED
Binary file
@@ -4,11 +4,20 @@
4
4
 
5
5
  require 'ferrumwizard'
6
6
  require 'nokorexi'
7
+ require 'yaml'
7
8
 
8
9
  # Given the nature of changes to jobsearch websites,
9
10
  # don't rely upon this gem working in the near future.
10
11
 
11
12
 
13
+
14
+ # this gem consists of 3 main classes:
15
+ #
16
+ # * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
17
+ # * IS22Plus - Archives the scraped vacancies to local file
18
+ # * IS22Archive - Allows viewing of archived vacancies offline
19
+ #
20
+
12
21
  class IndeedScraper2022Err < Exception
13
22
  end
14
23
 
@@ -99,7 +108,9 @@ class IndeedScraper2022
99
108
  div1 = td.element("div[@class='companyInfo']")
100
109
 
101
110
  # company name (e.g. Coda Octopus Products Ltd)
102
- company_name = div1.element("span[@class='companyName']")&.text
111
+ coname = div1.element("span[@class='companyName']")
112
+ puts 'coname: ' + coname.text.inspect if @debug
113
+ company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
103
114
 
104
115
  # company location (e.g. Edinburgh)
105
116
  location = div1.element("div[@class='companyLocation']")&.text
@@ -144,52 +155,65 @@ class IndeedScraper2022
144
155
  def fetchjob(url)
145
156
 
146
157
  doc = Nokorexi.new(url).to_doc
158
+ puts 'before e0' if @debug
147
159
  e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
148
160
 
149
161
  #div = e0.element("//div[@class='jobsearch-JobComponent']")
162
+ puts 'before div1' if @debug
150
163
  div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
164
+ puts 'before div2' if @debug
151
165
  div2 = div1.element("div")
152
166
 
153
167
  # jobsearch (e.g. Full Stack Website Developer (Wordpress))
168
+ puts 'before jobtitle' if @debug
154
169
  jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
155
170
  "er-title-container']/h1[@class='jobsearch-JobInfoHead" \
156
171
  "er-title']")&.text
157
172
 
173
+ puts 'before div3' if @debug
158
174
  div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
159
175
  "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
160
176
  "erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
161
177
  "ainer-subtitle']")
162
178
 
163
179
  # icl (e.g. Lyles Sutherland)
180
+ puts 'before cname' if @debug
164
181
  cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
165
182
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
166
183
  "s-mr--xs']")[1]
184
+ puts 'before clink' if @debug
167
185
  clink = div3.element('//a')
168
186
  company = cname.text ? cname.text : clink.text
169
187
  companylink = clink.attributes[:href] if clink
170
188
 
189
+ puts 'before salary' if @debug
171
190
  salary = div1.element("//span[@class='attribute_snippet']")&.text
191
+ puts 'before type' if @debug
172
192
  type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
173
193
  div5 = div3.xpath("div/div")
174
194
  location, worklocation = div5.map(&:text).compact
175
195
 
176
196
  # icl (e.g. Full-time, Permanent)
197
+ puts 'before jobtype' if @debug
177
198
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
178
199
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
179
200
  jobtype = jobtype&.texts.join if jobtype
180
201
 
181
202
  # jobsearch (e.g. Urgently needed)
203
+ puts 'before jobnote1' if @debug
182
204
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
183
205
  "']/div[@class='urgently-hiring']/div[@class='jobsearc" \
184
206
  "h-DesktopTag-text']")&.text
185
207
 
186
208
  # jobsearch (e.g. 10 days ago)
209
+ puts 'before days' if @debug
187
210
  days = e0.element("//div[@class='jobsearch-JobTab-con" \
188
211
  "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
189
212
  d = Date.today - days.to_i
190
213
  datepost = d.strftime("%Y-%m-%d")
191
214
 
192
215
 
216
+ puts 'before jobdesc' if @debug
193
217
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
194
218
  "ass='jobsearch-jobDescriptionText']").xml
195
219
 
@@ -218,14 +242,70 @@ class IS22Plus < IndeedScraper2022
218
242
  debug: debug)
219
243
  end
220
244
 
221
- def archive()
245
+ # note: The most efficient method to accumulate vacancy articles is to
246
+ # execute archive() daily
247
+ #
248
+ def archive(filepath='/tmp/indeed')
249
+
250
+ search() if @results.nil?
222
251
 
223
252
  return unless @results
224
253
 
225
- 1.upto(@results.length).each do |n|
226
- page(n)
254
+ FileUtils.mkdir_p filepath
255
+
256
+ idxfile = File.join(filepath, 'index.yml')
257
+
258
+ index = if File.exists? idxfile then
259
+ YAML.load(File.read(idxfile))
260
+ else
261
+ {}
262
+ end
263
+
264
+ @results.each.with_index do |item, i|
265
+
266
+ puts 'saving ' + item[:title] if @debug
267
+ puts 'link: ' + item[:link].inspect
268
+ links = RXFReader.reveal(item[:link])
269
+ puts 'links: ' + links.inspect if @debug
270
+
271
+ url = links.last
272
+ puts 'url: ' + url.inspect if @debug
273
+ id = url[/(?<=jk=)[^&]+/]
274
+
275
+ if index[id.to_sym] then
276
+
277
+ # the vacancy record has previously been saved
278
+ #
279
+ next
280
+
281
+ else
282
+
283
+ # write the full page vacancy article to file
284
+ #
285
+ File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
286
+
287
+ h = {
288
+ link: url[/^[^&]+/],
289
+ title: item[:title].to_s,
290
+ salary: item[:salary].to_s,
291
+ company: item[:company].to_s.strip,
292
+ location: item[:location].to_s,
293
+ jobsnippet: item[:jobsnippet],
294
+ date: item[:date],
295
+ added: Time.now.strftime("%Y-%m-%d")
296
+ }
297
+
298
+ # add the vacancy snippet to the index file
299
+ #
300
+ index[id.to_sym] = h
301
+ end
302
+
227
303
  end
228
304
 
305
+ # save the vacancy index file
306
+ #
307
+ File.write idxfile, index.to_yaml
308
+
229
309
  end
230
310
 
231
311
  def list()
@@ -238,3 +318,39 @@ class IS22Plus < IndeedScraper2022
238
318
 
239
319
 
240
320
  end
321
+
322
+
323
+ class IS22Archive
324
+
325
+ attr_reader :index
326
+
327
+ def initialize(filepath='/tmp/indeed', debug: false)
328
+
329
+ @debug = debug
330
+
331
+ FileUtils.mkdir_p filepath
332
+ @idxfile = File.join(filepath, 'index.yml')
333
+
334
+ @index = if File.exists? @idxfile then
335
+ YAML.load(File.read(@idxfile))
336
+ else
337
+ {}
338
+ end
339
+
340
+ end
341
+
342
+ def list()
343
+
344
+ @index.to_a.reverse.map.with_index do |x,i|
345
+
346
+ id, h = x
347
+
348
+ puts 'h: ' + h.inspect if @debug
349
+ co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
350
+ "%2d. %s: %s%s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title], co]
351
+
352
+ end.join("\n")
353
+
354
+ end
355
+
356
+ end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-03-30 00:00:00.000000000 Z
38
+ date: 2022-04-16 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -63,20 +63,20 @@ dependencies:
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '0.2'
66
+ version: '0.3'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 0.2.2
69
+ version: 0.3.1
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '0.2'
76
+ version: '0.3'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 0.2.2
79
+ version: 0.3.1
80
80
  description:
81
81
  email: digital.robertson@gmail.com
82
82
  executables: []
metadata.gz.sig CHANGED
Binary file