indeed_scraper2022 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 07f5323381b5751c470454f6f4c3ba6dced6f1424e054b85360a49d814d662ba
4
- data.tar.gz: 3d25353b9f8a0543944cac82ef6dc91adf7d3e83444f3c6ef469f15cbba8a3d8
3
+ metadata.gz: 7f98a83b7ed582d1b2973882701833688aec2d6d2bd132241a26c01a32915f93
4
+ data.tar.gz: dc5c34a5af19cdffbd244e15416914e91c8a06f365f0fff28bcd537a30ec468e
5
5
  SHA512:
6
- metadata.gz: 5e13ae04b46bfa3eb15aab8d0aff388d8caec591c413493db591c37da099d2bcd5ba340a72137d4aa7d374652b68bc1d037b86fe4cc2ed2ae5b0a56c5202f00b
7
- data.tar.gz: ca14ae99251aabbcaee08a3bb6f240742ed1fab0f438496dc742ef39a10abb13e310b2d6a93bc472f5e1b3e45cfd8956d6a62f803b1d3a152054cf4e1ae35402
6
+ metadata.gz: 4ce40e021339f6b1c24faed495ebbd4b257200f62d1466ffa54cd05e654ccef29b23dbfcf4af64b9426f4d2dbde6bb778f6b920a7656af8289e3f81a269ba54a
7
+ data.tar.gz: 634325fed61c7888b08fd72bfc47f4d64f98f5514110169152389e49940f15a080c02f62811aae2b5f34c6a604c17d2512d2219dba0f4473dc1034900bdb7ec6
checksums.yaml.gz.sig CHANGED
Binary file
@@ -4,23 +4,36 @@
4
4
 
5
5
  require 'ferrumwizard'
6
6
  require 'nokorexi'
7
+ require 'yaml'
7
8
 
8
9
  # Given the nature of changes to jobsearch websites,
9
10
  # don't rely upon this gem working in the near future.
10
11
 
11
12
 
13
+
14
+ # this gem consists of 3 main classes:
15
+ #
16
+ # * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
17
+ # * IS22Plus - Archives the scraped vacancies to local file
18
+ # * IS22Archive - Allows viewing of archived vacancies offline
19
+ #
20
+
12
21
  class IndeedScraper2022Err < Exception
13
22
  end
14
23
 
15
24
  class IndeedScraper2022
16
25
 
26
+ attr_reader :browser
27
+
17
28
  def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
18
29
  headless: true, cookies: nil, debug: false)
19
30
 
20
31
  @debug = debug
21
32
  @url_base, @q, @location = url, q, location
22
33
  @headless, @cookies = headless, cookies
23
- @results = search(q: @q, location: @location)
34
+
35
+ fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
36
+ @browser = fw.browser
24
37
 
25
38
  end
26
39
 
@@ -32,35 +45,49 @@ class IndeedScraper2022
32
45
 
33
46
  def search(q: @q, location: @location, start: nil)
34
47
 
35
- fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
36
-
37
48
  url = @url_base
38
49
  url += 'start=' + start if start
39
50
 
40
- browser = fw.browser
41
- browser.goto(url)
51
+ @browser.goto(url)
52
+ #@browser.network.wait_for_idle
53
+ puts 'sleeping for 4 seconds' if @debug
54
+ sleep 4
42
55
 
43
56
  if q.length > 1 then
44
- input = browser.at_xpath("//input[@name='q']")
45
- input.focus.type(q)
57
+
58
+ input = @browser.at_xpath("//input[@name='q']")
59
+
60
+ # select any existing text and overwrite it
61
+ input.focus.type(:home); sleep 0.2
62
+ input.focus.type(:shift, :end); sleep 0.2
63
+ input.focus.type(q); sleep 0.2
46
64
  end
47
65
 
48
66
  if location.length > 1 then
49
- input2 = browser.at_xpath("//input[@name='l']")
50
- input2.focus.type(location)
67
+
68
+ input2 = @browser.at_xpath("//input[@name='l']")
69
+
70
+ # select any existing text and overwrite it
71
+ input2.focus.type(:home); sleep 0.2
72
+ input2.focus.type(:shift, :end); sleep 0.2
73
+ input2.focus.type(location); sleep 0.2
74
+
51
75
  end
52
76
 
53
- button = browser.at_xpath("//button[@type='submit']")
77
+ button = @browser.at_xpath("//button[@type='submit']")
54
78
  button.click
79
+ #@browser.network.wait_for_idle
80
+ puts 'sleeping for 2 seconds' if @debug
81
+ sleep 2
55
82
 
56
- doc2 = Nokogiri::XML(browser.body)
83
+ doc2 = Nokogiri::XML(@browser.body)
57
84
 
58
85
  a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
59
86
  puts 'a2: ' + a2.length.inspect if @debug
60
87
 
61
88
  @a2 = a2.map {|x| Rexle.new x.to_s }
62
89
 
63
- @a2.map do |doc|
90
+ @results = @a2.map do |doc|
64
91
 
65
92
  div = doc.element("a[@class='desktop']/div[@class='slider" \
66
93
  "_container']/div[@class='slider_list']/div[@class='sl" \
@@ -126,52 +153,65 @@ class IndeedScraper2022
126
153
  def fetchjob(url)
127
154
 
128
155
  doc = Nokorexi.new(url).to_doc
156
+ puts 'before e0' if @debug
129
157
  e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
130
158
 
131
159
  #div = e0.element("//div[@class='jobsearch-JobComponent']")
160
+ puts 'before div1' if @debug
132
161
  div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
162
+ puts 'before div2' if @debug
133
163
  div2 = div1.element("div")
134
164
 
135
165
  # jobsearch (e.g. Full Stack Website Developer (Wordpress))
166
+ puts 'before jobtitle' if @debug
136
167
  jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
137
168
  "er-title-container']/h1[@class='jobsearch-JobInfoHead" \
138
169
  "er-title']")&.text
139
170
 
171
+ puts 'before div3' if @debug
140
172
  div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
141
173
  "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
142
174
  "erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
143
175
  "ainer-subtitle']")
144
176
 
145
177
  # icl (e.g. Lyles Sutherland)
178
+ puts 'before cname' if @debug
146
179
  cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
147
180
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
148
181
  "s-mr--xs']")[1]
182
+ puts 'before clink' if @debug
149
183
  clink = div3.element('//a')
150
184
  company = cname.text ? cname.text : clink.text
151
185
  companylink = clink.attributes[:href] if clink
152
186
 
187
+ puts 'before salary' if @debug
153
188
  salary = div1.element("//span[@class='attribute_snippet']")&.text
189
+ puts 'before type' if @debug
154
190
  type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
155
191
  div5 = div3.xpath("div/div")
156
192
  location, worklocation = div5.map(&:text).compact
157
193
 
158
194
  # icl (e.g. Full-time, Permanent)
195
+ puts 'before jobtype' if @debug
159
196
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
160
197
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
161
198
  jobtype = jobtype&.texts.join if jobtype
162
199
 
163
200
  # jobsearch (e.g. Urgently needed)
201
+ puts 'before jobnote1' if @debug
164
202
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
165
203
  "']/div[@class='urgently-hiring']/div[@class='jobsearc" \
166
204
  "h-DesktopTag-text']")&.text
167
205
 
168
206
  # jobsearch (e.g. 10 days ago)
207
+ puts 'before days' if @debug
169
208
  days = e0.element("//div[@class='jobsearch-JobTab-con" \
170
209
  "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
171
210
  d = Date.today - days.to_i
172
211
  datepost = d.strftime("%Y-%m-%d")
173
212
 
174
213
 
214
+ puts 'before jobdesc' if @debug
175
215
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
176
216
  "ass='jobsearch-jobDescriptionText']").xml
177
217
 
@@ -200,12 +240,70 @@ class IS22Plus < IndeedScraper2022
200
240
  debug: debug)
201
241
  end
202
242
 
203
- def archive()
243
+ # note: The most efficient method to accumulate vacancy articles is to
244
+ # execute archive() daily
245
+ #
246
+ def archive(filepath='/tmp/indeed')
247
+
248
+ search() if @results.nil?
249
+
250
+ return unless @results
251
+
252
+ FileUtils.mkdir_p filepath
253
+
254
+ idxfile = File.join(filepath, 'index.yml')
204
255
 
205
- 1.upto(15).each do |n|
206
- page(n)
256
+ index = if File.exists? idxfile then
257
+ YAML.load(File.read(idxfile))
258
+ else
259
+ {}
207
260
  end
208
261
 
262
+ @results.each.with_index do |item, i|
263
+
264
+ puts 'saving ' + item[:title] if @debug
265
+ puts 'link: ' + item[:link].inspect
266
+ links = RXFReader.reveal(item[:link])
267
+ puts 'links: ' + links.inspect if @debug
268
+
269
+ url = links.last
270
+ puts 'url: ' + url.inspect if @debug
271
+ id = url[/(?<=jk=)[^&]+/]
272
+
273
+ if index[id.to_sym] then
274
+
275
+ # the vacancy record has previously been saved
276
+ #
277
+ next
278
+
279
+ else
280
+
281
+ # write the full page vacancy article to file
282
+ #
283
+ File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
284
+
285
+ h = {
286
+ link: url[/^[^&]+/],
287
+ title: item[:title].to_s,
288
+ salary: item[:salary].to_s,
289
+ company: item[:company].to_s.strip,
290
+ location: item[:location].to_s,
291
+ jobsnippet: item[:jobsnippet],
292
+ date: item[:date],
293
+ added: Time.now.strftime("%Y-%m-%d")
294
+ }
295
+
296
+ # add the vacancy snippet to the index file
297
+ #
298
+ index[id.to_sym] = h
299
+ end
300
+
301
+ end
302
+
303
+ # save the vacancy index file
304
+ #
305
+ File.write idxfile, index.to_yaml
306
+
209
307
  end
210
308
 
211
309
  def list()
@@ -218,3 +316,38 @@ class IS22Plus < IndeedScraper2022
218
316
 
219
317
 
220
318
  end
319
+
320
+
321
+ class IS22Archive
322
+
323
+ attr_reader :index
324
+
325
+ def initialize(filepath='/tmp/indeed', debug: false)
326
+
327
+ @debug = debug
328
+
329
+ FileUtils.mkdir_p filepath
330
+ @idxfile = File.join(filepath, 'index.yml')
331
+
332
+ @index = if File.exists? @idxfile then
333
+ YAML.load(File.read(@idxfile))
334
+ else
335
+ {}
336
+ end
337
+
338
+ end
339
+
340
+ def list()
341
+
342
+ @index.map.with_index do |x,i|
343
+
344
+ id, h = x
345
+
346
+ puts 'h: ' + h.inspect if @debug
347
+ "%2d. %s: %s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title]]
348
+
349
+ end.join("\n")
350
+
351
+ end
352
+
353
+ end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-03-30 00:00:00.000000000 Z
38
+ date: 2022-04-14 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -63,20 +63,20 @@ dependencies:
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '0.2'
66
+ version: '0.3'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 0.2.2
69
+ version: 0.3.1
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '0.2'
76
+ version: '0.3'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 0.2.2
79
+ version: 0.3.1
80
80
  description:
81
81
  email: digital.robertson@gmail.com
82
82
  executables: []
metadata.gz.sig CHANGED
Binary file