indeed_scraper2022 0.2.0 → 0.4.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 07f5323381b5751c470454f6f4c3ba6dced6f1424e054b85360a49d814d662ba
-  data.tar.gz: 3d25353b9f8a0543944cac82ef6dc91adf7d3e83444f3c6ef469f15cbba8a3d8
+  metadata.gz: 7f98a83b7ed582d1b2973882701833688aec2d6d2bd132241a26c01a32915f93
+  data.tar.gz: dc5c34a5af19cdffbd244e15416914e91c8a06f365f0fff28bcd537a30ec468e
 SHA512:
-  metadata.gz: 5e13ae04b46bfa3eb15aab8d0aff388d8caec591c413493db591c37da099d2bcd5ba340a72137d4aa7d374652b68bc1d037b86fe4cc2ed2ae5b0a56c5202f00b
-  data.tar.gz: ca14ae99251aabbcaee08a3bb6f240742ed1fab0f438496dc742ef39a10abb13e310b2d6a93bc472f5e1b3e45cfd8956d6a62f803b1d3a152054cf4e1ae35402
+  metadata.gz: 4ce40e021339f6b1c24faed495ebbd4b257200f62d1466ffa54cd05e654ccef29b23dbfcf4af64b9426f4d2dbde6bb778f6b920a7656af8289e3f81a269ba54a
+  data.tar.gz: 634325fed61c7888b08fd72bfc47f4d64f98f5514110169152389e49940f15a080c02f62811aae2b5f34c6a604c17d2512d2219dba0f4473dc1034900bdb7ec6
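
The digests above are the SHA-256 (and SHA-512) hashes of the metadata.gz and data.tar.gz entries inside the .gem package. A minimal sketch (not part of the gem) of checking the SHA-256 values against a downloaded copy; the local filename is an assumption:

    require 'digest'
    require 'rubygems/package'
    require 'yaml'
    require 'zlib'

    digests   = {}
    checksums = nil

    # a .gem file is a plain tar archive containing metadata.gz,
    # data.tar.gz and checksums.yaml.gz
    File.open('indeed_scraper2022-0.4.0.gem', 'rb') do |f|
      Gem::Package::TarReader.new(f) do |tar|
        tar.each do |entry|
          case entry.full_name
          when 'checksums.yaml.gz'
            checksums = YAML.safe_load(Zlib.gunzip(entry.read))
          when 'metadata.gz', 'data.tar.gz'
            digests[entry.full_name] = Digest::SHA256.hexdigest(entry.read)
          end
        end
      end
    end

    digests.each do |name, sha|
      puts "#{name}: #{sha == checksums['SHA256'][name] ? 'OK' : 'MISMATCH'}"
    end
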
checksums.yaml.gz.sig CHANGED
Binary file
@@ -4,23 +4,36 @@

  require 'ferrumwizard'
  require 'nokorexi'
+ require 'yaml'

  # Given the nature of changes to jobsearch websites,
  # don't rely upon this gem working in the near future.


+
+ # this gem consists of 3 main classes:
+ #
+ # * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
+ # * IS22Plus - Archives the scraped vacancies to local file
+ # * IS22Archive - Allows viewing of archived vacancies offline
+ #
+
  class IndeedScraper2022Err < Exception
  end

  class IndeedScraper2022

+ attr_reader :browser
+
  def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
  headless: true, cookies: nil, debug: false)

  @debug = debug
  @url_base, @q, @location = url, q, location
  @headless, @cookies = headless, cookies
- @results = search(q: @q, location: @location)
+
+ fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
+ @browser = fw.browser

  end

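A minimal usage sketch based on the three classes listed in the comment above; the require path is assumed from the gem name, and the query and location values are illustrative. Note that from this version the constructor only launches the Ferrum browser and no longer runs search() automatically:

    require 'indeed_scraper2022'

    # scrape a page of vacancies from indeed.com
    is = IndeedScraper2022.new(q: 'ruby developer', location: 'Leeds',
                               headless: true, debug: true)
    is.search   # fills in the search form via the browser and collects results

    # IS22Plus adds archiving of the scraped vacancies to a local file, and
    # IS22Archive reads that archive back for offline viewing; see the
    # sketches further down.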
@@ -32,35 +45,49 @@ class IndeedScraper2022

  def search(q: @q, location: @location, start: nil)

- fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
-
  url = @url_base
  url += 'start=' + start if start

- browser = fw.browser
- browser.goto(url)
+ @browser.goto(url)
+ #@browser.network.wait_for_idle
+ puts 'sleeping for 4 seconds' if @debug
+ sleep 4

  if q.length > 1 then
- input = browser.at_xpath("//input[@name='q']")
- input.focus.type(q)
+
+ input = @browser.at_xpath("//input[@name='q']")
+
+ # select any existing text and overwrite it
+ input.focus.type(:home); sleep 0.2
+ input.focus.type(:shift, :end); sleep 0.2
+ input.focus.type(q); sleep 0.2
  end

  if location.length > 1 then
- input2 = browser.at_xpath("//input[@name='l']")
- input2.focus.type(location)
+
+ input2 = @browser.at_xpath("//input[@name='l']")
+
+ # select any existing text and overwrite it
+ input2.focus.type(:home); sleep 0.2
+ input2.focus.type(:shift, :end); sleep 0.2
+ input2.focus.type(location); sleep 0.2
+
  end

- button = browser.at_xpath("//button[@type='submit']")
+ button = @browser.at_xpath("//button[@type='submit']")
  button.click
+ #@browser.network.wait_for_idle
+ puts 'sleeping for 2 seconds' if @debug
+ sleep 2

- doc2 = Nokogiri::XML(browser.body)
+ doc2 = Nokogiri::XML(@browser.body)

  a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
  puts 'a2: ' + a2.length.inspect if @debug

  @a2 = a2.map {|x| Rexle.new x.to_s }

- @a2.map do |doc|
+ @results = @a2.map do |doc|

  div = doc.element("a[@class='desktop']/div[@class='slider" \
  "_container']/div[@class='slider_list']/div[@class='sl" \
@@ -126,52 +153,65 @@ class IndeedScraper2022
  def fetchjob(url)

  doc = Nokorexi.new(url).to_doc
+ puts 'before e0' if @debug
  e0 = doc.element("html/body/div/div/div/div/div/div/div/div")

  #div = e0.element("//div[@class='jobsearch-JobComponent']")
+ puts 'before div1' if @debug
  div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
+ puts 'before div2' if @debug
  div2 = div1.element("div")

  # jobsearch (e.g. Full Stack Website Developer (Wordpress))
+ puts 'before jobtitle' if @debug
  jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
  "er-title-container']/h1[@class='jobsearch-JobInfoHead" \
  "er-title']")&.text

+ puts 'before div3' if @debug
  div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
  "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
  "erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
  "ainer-subtitle']")

  # icl (e.g. Lyles Sutherland)
+ puts 'before cname' if @debug
  cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
  "s-mr--xs']")[1]
+ puts 'before clink' if @debug
  clink = div3.element('//a')
  company = cname.text ? cname.text : clink.text
  companylink = clink.attributes[:href] if clink

+ puts 'before salary' if @debug
  salary = div1.element("//span[@class='attribute_snippet']")&.text
+ puts 'before type' if @debug
  type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
  div5 = div3.xpath("div/div")
  location, worklocation = div5.map(&:text).compact

  # icl (e.g. Full-time, Permanent)
+ puts 'before jobtype' if @debug
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
  jobtype = jobtype&.texts.join if jobtype

  # jobsearch (e.g. Urgently needed)
+ puts 'before jobnote1' if @debug
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
  "']/div[@class='urgently-hiring']/div[@class='jobsearc" \
  "h-DesktopTag-text']")&.text

  # jobsearch (e.g. 10 days ago)
+ puts 'before days' if @debug
  days = e0.element("//div[@class='jobsearch-JobTab-con" \
  "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
  d = Date.today - days.to_i
  datepost = d.strftime("%Y-%m-%d")


+ puts 'before jobdesc' if @debug
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
  "ass='jobsearch-jobDescriptionText']").xml

@@ -200,12 +240,70 @@ class IS22Plus < IndeedScraper2022
  debug: debug)
  end

- def archive()
+ # note: The most efficient method to accumulate vacancy articles is to
+ # execute archive() daily
+ #
+ def archive(filepath='/tmp/indeed')
+
+ search() if @results.nil?
+
+ return unless @results
+
+ FileUtils.mkdir_p filepath
+
+ idxfile = File.join(filepath, 'index.yml')

- 1.upto(15).each do |n|
- page(n)
+ index = if File.exists? idxfile then
+ YAML.load(File.read(idxfile))
+ else
+ {}
  end

+ @results.each.with_index do |item, i|
+
+ puts 'saving ' + item[:title] if @debug
+ puts 'link: ' + item[:link].inspect
+ links = RXFReader.reveal(item[:link])
+ puts 'links: ' + links.inspect if @debug
+
+ url = links.last
+ puts 'url: ' + url.inspect if @debug
+ id = url[/(?<=jk=)[^&]+/]
+
+ if index[id.to_sym] then
+
+ # the vacancy record has previously been saved
+ #
+ next
+
+ else
+
+ # write the full page vacancy article to file
+ #
+ File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
+
+ h = {
+ link: url[/^[^&]+/],
+ title: item[:title].to_s,
+ salary: item[:salary].to_s,
+ company: item[:company].to_s.strip,
+ location: item[:location].to_s,
+ jobsnippet: item[:jobsnippet],
+ date: item[:date],
+ added: Time.now.strftime("%Y-%m-%d")
+ }
+
+ # add the vacancy snippet to the index file
+ #
+ index[id.to_sym] = h
+ end
+
+ end
+
+ # save the vacancy index file
+ #
+ File.write idxfile, index.to_yaml
+
  end

  def list()
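
As the note in archive() suggests, the method is intended to run once a day so the local index accumulates vacancies over time. A rough sketch of such a run; the search terms, schedule and path are illustrative, and IS22Plus is assumed to accept the same keyword arguments as IndeedScraper2022:

    # e.g. invoked from cron: 0 9 * * * ruby /home/user/archive_jobs.rb
    require 'indeed_scraper2022'

    is = IS22Plus.new(q: 'ruby developer', location: 'Leeds', headless: true)

    # runs search() if needed, then writes each new vacancy to
    # /tmp/indeed/j<id>.txt and adds a snippet entry to /tmp/indeed/index.yml;
    # previously saved vacancies are skipped
    is.archive('/tmp/indeed')
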
@@ -218,3 +316,38 @@ class IS22Plus < IndeedScraper2022


  end
+
+
+ class IS22Archive
+
+ attr_reader :index
+
+ def initialize(filepath='/tmp/indeed', debug: false)
+
+ @debug = debug
+
+ FileUtils.mkdir_p filepath
+ @idxfile = File.join(filepath, 'index.yml')
+
+ @index = if File.exists? @idxfile then
+ YAML.load(File.read(@idxfile))
+ else
+ {}
+ end
+
+ end
+
+ def list()
+
+ @index.map.with_index do |x,i|
+
+ id, h = x
+
+ puts 'h: ' + h.inspect if @debug
+ "%2d. %s: %s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title]]
+
+ end.join("\n")
+
+ end
+
+ end
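
A brief sketch of reading the archive back offline with the IS22Archive class added above; the archive path is the class default, and reading a saved article by id follows the j<id>.txt naming used by IS22Plus#archive:

    require 'indeed_scraper2022'

    arc = IS22Archive.new('/tmp/indeed')
    puts arc.list   # e.g. " 1. 14 Apr: Full Stack Website Developer (Wordpress)"

    # index entries are keyed by the Indeed job id (the jk= URL parameter);
    # the matching full-page article was saved as j<id>.txt
    id, record = arc.index.first
    puts record[:company]
    puts File.read(File.join('/tmp/indeed', "j#{id}.txt"))
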
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: indeed_scraper2022
  version: !ruby/object:Gem::Version
- version: 0.2.0
+ version: 0.4.0
  platform: ruby
  authors:
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
  SW/2zInu2bkj/meWm5eBoWHT
  -----END CERTIFICATE-----
- date: 2022-03-30 00:00:00.000000000 Z
+ date: 2022-04-14 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokorexi
@@ -63,20 +63,20 @@ dependencies:
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '0.2'
+ version: '0.3'
  - - ">="
  - !ruby/object:Gem::Version
- version: 0.2.2
+ version: 0.3.1
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: '0.2'
+ version: '0.3'
  - - ">="
  - !ruby/object:Gem::Version
- version: 0.2.2
+ version: 0.3.1
  description:
  email: digital.robertson@gmail.com
  executables: []
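
The requirement pair changed above ('~> 0.3' together with '>= 0.3.1') is the usual gemspec idiom for "any 0.3.x release, 0.3.1 or newer". A sketch of how such a constraint is declared; the dependency name is a placeholder, since this hunk does not show which runtime dependency it belongs to:

    Gem::Specification.new do |s|
      s.name    = 'indeed_scraper2022'
      s.version = '0.4.0'

      # '~> 0.3' allows 0.3.x but not 0.4.0; '>= 0.3.1' additionally rules out 0.3.0
      s.add_runtime_dependency 'some_dependency', '~> 0.3', '>= 0.3.1'
    end
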
metadata.gz.sig CHANGED
Binary file