indeed_scraper2022 0.2.1 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5e33dfd54667ecc9f8b7985aa07af403be8d95729ce68e8a40d8c985d57bd4e1
4
- data.tar.gz: a2c041ec8103b6afac3a422e7b73bc82c89fd7f8d955240439a29ec0347c8a5f
3
+ metadata.gz: 88f80a06ef0ab435c144d3b4ec53f1c98f2da7c427224c31dbef44f62fdafee3
4
+ data.tar.gz: d0f549053bb225e7c8ebb2492715c6c470f689e191e9ae747e8f97317a61c02c
5
5
  SHA512:
6
- metadata.gz: 8e640cb8262a057bb588b501ee1122a59e6e239e2a5988dd0566ffffb814a2fef763c36fdeae1ba5dc4e6f819ca145374058bd62373ce776df1e393057a49fc0
7
- data.tar.gz: 0a6bfe0ef2b685d5711a95704cee3fa67d58eb7c9d0f149c872f9c23b0cc489382ab1327b101f7accf982f7ea1f1a6d56dc20ae528ef1fd4d6105c9ef93067da
6
+ metadata.gz: 90d23c6c35a87cdcf763dc15a072f35ede166aafe2f1f5eed9294de28e916cdaca3eb043a034fb50750c9444af9e5042dc50d01390816efcdde360d2d01c4e55
7
+ data.tar.gz: 30153c9c5aafdb5e89d56632e223edfe0196a71c4d9294cc3f963f2039238cd27b836ac9ac42b49cad27988ada868d0d002246699399f7d175216b97689d46ed
checksums.yaml.gz.sig CHANGED
Binary file
@@ -4,11 +4,20 @@
4
4
 
5
5
  require 'ferrumwizard'
6
6
  require 'nokorexi'
7
+ require 'yaml'
7
8
 
8
9
  # Given the nature of changes to jobsearch websites,
9
10
  # don't rely upon this gem working in the near future.
10
11
 
11
12
 
13
+
14
+ # this gem consists of 3 main classes:
15
+ #
16
+ # * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
17
+ # * IS22Plus - Archives the scraped vacancies to local file
18
+ # * IS22Archive - Allows viewing of archived vacancies offline
19
+ #
20
+
12
21
  class IndeedScraper2022Err < Exception
13
22
  end
14
23
 
@@ -99,7 +108,9 @@ class IndeedScraper2022
99
108
  div1 = td.element("div[@class='companyInfo']")
100
109
 
101
110
  # company name (e.g. Coda Octopus Products Ltd)
102
- company_name = div1.element("span[@class='companyName']")&.text
111
+ coname = div1.element("span[@class='companyName']")
112
+ puts 'coname: ' + coname.text.inspect if @debug
113
+ company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
103
114
 
104
115
  # company location (e.g. Edinburgh)
105
116
  location = div1.element("div[@class='companyLocation']")&.text
@@ -144,52 +155,65 @@ class IndeedScraper2022
144
155
  def fetchjob(url)
145
156
 
146
157
  doc = Nokorexi.new(url).to_doc
158
+ puts 'before e0' if @debug
147
159
  e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
148
160
 
149
161
  #div = e0.element("//div[@class='jobsearch-JobComponent']")
162
+ puts 'before div1' if @debug
150
163
  div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
164
+ puts 'before div2' if @debug
151
165
  div2 = div1.element("div")
152
166
 
153
167
  # jobsearch (e.g. Full Stack Website Developer (Wordpress))
168
+ puts 'before jobtitle' if @debug
154
169
  jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
155
170
  "er-title-container']/h1[@class='jobsearch-JobInfoHead" \
156
171
  "er-title']")&.text
157
172
 
173
+ puts 'before div3' if @debug
158
174
  div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
159
175
  "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
160
176
  "erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
161
177
  "ainer-subtitle']")
162
178
 
163
179
  # icl (e.g. Lyles Sutherland)
180
+ puts 'before cname' if @debug
164
181
  cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
165
182
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
166
183
  "s-mr--xs']")[1]
184
+ puts 'before clink' if @debug
167
185
  clink = div3.element('//a')
168
186
  company = cname.text ? cname.text : clink.text
169
187
  companylink = clink.attributes[:href] if clink
170
188
 
189
+ puts 'before salary' if @debug
171
190
  salary = div1.element("//span[@class='attribute_snippet']")&.text
191
+ puts 'before type' if @debug
172
192
  type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
173
193
  div5 = div3.xpath("div/div")
174
194
  location, worklocation = div5.map(&:text).compact
175
195
 
176
196
  # icl (e.g. Full-time, Permanent)
197
+ puts 'before jobtype' if @debug
177
198
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
178
199
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
179
200
  jobtype = jobtype&.texts.join if jobtype
180
201
 
181
202
  # jobsearch (e.g. Urgently needed)
203
+ puts 'before jobnote1' if @debug
182
204
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
183
205
  "']/div[@class='urgently-hiring']/div[@class='jobsearc" \
184
206
  "h-DesktopTag-text']")&.text
185
207
 
186
208
  # jobsearch (e.g. 10 days ago)
209
+ puts 'before days' if @debug
187
210
  days = e0.element("//div[@class='jobsearch-JobTab-con" \
188
211
  "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
189
212
  d = Date.today - days.to_i
190
213
  datepost = d.strftime("%Y-%m-%d")
191
214
 
192
215
 
216
+ puts 'before jobdesc' if @debug
193
217
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
194
218
  "ass='jobsearch-jobDescriptionText']").xml
195
219
 
@@ -218,14 +242,70 @@ class IS22Plus < IndeedScraper2022
218
242
  debug: debug)
219
243
  end
220
244
 
221
- def archive()
245
+ # note: The most efficient method to accumulate vacancy articles is to
246
+ # execute archive() daily
247
+ #
248
+ def archive(filepath='/tmp/indeed')
249
+
250
+ search() if @results.nil?
222
251
 
223
252
  return unless @results
224
253
 
225
- 1.upto(@results.length).each do |n|
226
- page(n)
254
+ FileUtils.mkdir_p filepath
255
+
256
+ idxfile = File.join(filepath, 'index.yml')
257
+
258
+ index = if File.exists? idxfile then
259
+ YAML.load(File.read(idxfile))
260
+ else
261
+ {}
262
+ end
263
+
264
+ @results.each.with_index do |item, i|
265
+
266
+ puts 'saving ' + item[:title] if @debug
267
+ puts 'link: ' + item[:link].inspect
268
+ links = RXFReader.reveal(item[:link])
269
+ puts 'links: ' + links.inspect if @debug
270
+
271
+ url = links.last
272
+ puts 'url: ' + url.inspect if @debug
273
+ id = url[/(?<=jk=)[^&]+/]
274
+
275
+ if index[id.to_sym] then
276
+
277
+ # the vacancy record has previously been saved
278
+ #
279
+ next
280
+
281
+ else
282
+
283
+ # write the full page vacancy article to file
284
+ #
285
+ File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
286
+
287
+ h = {
288
+ link: url[/^[^&]+/],
289
+ title: item[:title].to_s,
290
+ salary: item[:salary].to_s,
291
+ company: item[:company].to_s.strip,
292
+ location: item[:location].to_s,
293
+ jobsnippet: item[:jobsnippet],
294
+ date: item[:date],
295
+ added: Time.now.strftime("%Y-%m-%d")
296
+ }
297
+
298
+ # add the vacancy snippet to the index file
299
+ #
300
+ index[id.to_sym] = h
301
+ end
302
+
227
303
  end
228
304
 
305
+ # save the vacancy index file
306
+ #
307
+ File.write idxfile, index.to_yaml
308
+
229
309
  end
230
310
 
231
311
  def list()
@@ -238,3 +318,39 @@ class IS22Plus < IndeedScraper2022
238
318
 
239
319
 
240
320
  end
321
+
322
+
323
+ class IS22Archive
324
+
325
+ attr_reader :index
326
+
327
+ def initialize(filepath='/tmp/indeed', debug: false)
328
+
329
+ @debug = debug
330
+
331
+ FileUtils.mkdir_p filepath
332
+ @idxfile = File.join(filepath, 'index.yml')
333
+
334
+ @index = if File.exists? @idxfile then
335
+ YAML.load(File.read(@idxfile))
336
+ else
337
+ {}
338
+ end
339
+
340
+ end
341
+
342
+ def list()
343
+
344
+ @index.to_a.reverse.map.with_index do |x,i|
345
+
346
+ id, h = x
347
+
348
+ puts 'h: ' + h.inspect if @debug
349
+ co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
350
+ "%2d. %s: %s%s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title], co]
351
+
352
+ end.join("\n")
353
+
354
+ end
355
+
356
+ end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-03-30 00:00:00.000000000 Z
38
+ date: 2022-04-16 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -63,20 +63,20 @@ dependencies:
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '0.2'
66
+ version: '0.3'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 0.2.2
69
+ version: 0.3.1
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '0.2'
76
+ version: '0.3'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 0.2.2
79
+ version: 0.3.1
80
80
  description:
81
81
  email: digital.robertson@gmail.com
82
82
  executables: []
metadata.gz.sig CHANGED
Binary file