indeed_scraper2022 0.3.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 972f811430fae59121e39c9c4752b64fc43b37165a52dcec8c3eac42cf1e4555
4
- data.tar.gz: 85e987eb264b098b4c892e2e05d2ab082e3b8968fff2bdd519552e889f014f9d
3
+ metadata.gz: 833f3e77c7771f39e3eccbd5f277a8ca73fbd34d55d5efc7c2509b7a4dbf61bd
4
+ data.tar.gz: 707cf360d0ca30102e59bc0e5ead4111199db76a090d734729c327eaefbd6cdd
5
5
  SHA512:
6
- metadata.gz: e7d3c2a13e315383248c557806dd0184d8831f46a2e314816395f62fcb886ba2e38a3e1f2deb180ceb33b614cf0b7be8a13379028fc56982e388948575bdb02c
7
- data.tar.gz: 6ca4792d260c43b22fcee5ead8928525df17f1db112c49ac4cc7d7a5c9b29a8f483b94c18cd39bace1d4a4ee553ff83254a7fb445d76abf06c1705d19bac455c
6
+ metadata.gz: d3c8d5eacb62503e4b29634836b74a8b6c9636d9127fc345d79b9f177d75b41f2558c351ec7028d9a74887f964f440022b1d25e416f14752c16ded16055dcd2c
7
+ data.tar.gz: aea4011eea3c4f37f3537626e3ca2179bee215854a975d2b7618d09737395961fe88648460a8f58ce966acadc9a1fe8c21263319b07ff12147d4264b9374ae39
checksums.yaml.gz.sig CHANGED
Binary file
@@ -5,12 +5,20 @@
5
5
  require 'ferrumwizard'
6
6
  require 'nokorexi'
7
7
  require 'yaml'
8
+ require 'reveal_url22'
8
9
 
9
10
  # Given the nature of changes to jobsearch websites,
10
11
  # don't rely upon this gem working in the near future.
11
12
 
12
13
 
13
14
 
15
+ # this gem consists of 3 main classes:
16
+ #
17
+ # * IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
18
+ # * IS22Plus - Archives the scraped vacancies to a local file
19
+ # * IS22Archive - Allows viewing of archived vacancies offline
20
+ #
21
+
14
22
  class IndeedScraper2022Err < Exception
15
23
  end
16
24
 
@@ -37,9 +45,10 @@ class IndeedScraper2022
37
45
  end
38
46
 
39
47
  def search(q: @q, location: @location, start: nil)
40
-
48
+ puts 'inside search' if @debug
41
49
  url = @url_base
42
50
  url += 'start=' + start if start
51
+ puts 'url: ' + url.inspect if @debug
43
52
 
44
53
  @browser.goto(url)
45
54
  #@browser.network.wait_for_idle
@@ -74,34 +83,52 @@ class IndeedScraper2022
74
83
  sleep 2
75
84
 
76
85
  doc2 = Nokogiri::XML(@browser.body)
86
+ File.write '/tmp/body.txt', doc2.to_s if @debug
77
87
 
78
- a2 = doc2.xpath "//a[div/div/div/div/table/tbody/tr/td/div]"
88
+ a2 = doc2.root.xpath "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
79
89
  puts 'a2: ' + a2.length.inspect if @debug
80
90
 
81
91
  @a2 = a2.map {|x| Rexle.new x.to_s }
82
92
 
83
93
  @results = @a2.map do |doc|
84
94
 
85
- div = doc.element("a[@class='desktop']/div[@class='slider" \
95
+ div = doc.element("div[@class='cardOutline']/div[@class='slider" \
86
96
  "_container']/div[@class='slider_list']/div[@class='sl" \
87
97
  "ider_item']/div[@class='job_seen_beacon']")
98
+
88
99
  td = div.element("table[@class='jobCard_mainContent']/tbo" \
89
100
  "dy/tr/td[@class='resultContent']")
90
101
 
91
102
  # job title (e.g. Software Developer)
92
- jobtitle = td.element("div[@class='tapItem-gutter']/h2[@" \
93
- "class='jobTitle-color-purple']/span")&.text
103
+ job = td.element("div[@class='tapItem-gutter']/h2[@" \
104
+ "class='jobTitle-color-purple']/a")
105
+ href = job.attributes[:href]
106
+ jobtitle = job.element("span")&.text
107
+
94
108
  puts 'jobtitle: ' + jobtitle.inspect if @debug
95
109
 
96
- salary = td.element("div[@class='metadataContainer']/" \
97
- "div[@class='salary-snippet-container']/div[@class='sa" \
98
- "lary-snippet']/span")&.text
110
+ sal = td.element("div[@class='metadataContainer']/" \
111
+ "div[@class='salary-snippet-container']")
112
+
113
+ salary = if sal then
114
+ sal_e = sal.element("div[@class='attribute_snippet']")
115
+ if sal_e then
116
+ sal_e.texts[0]
117
+ else
118
+ sal_e2 = sal.element("div[@class='salary-snippet']/span")
119
+ sal_e2 ? sal_e2.text : ''
120
+ end
121
+ else
122
+ ''
123
+ end
99
124
 
100
125
  puts 'salary: ' + salary.inspect if @debug
101
126
  div1 = td.element("div[@class='companyInfo']")
102
127
 
103
128
  # company name (e.g. Coda Octopus Products Ltd)
104
- company_name = div1.element("span[@class='companyName']")&.text
129
+ coname = div1.element("span[@class='companyName']")
130
+ puts 'coname: ' + coname.text.inspect if @debug
131
+ company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
105
132
 
106
133
  # company location (e.g. Edinburgh)
107
134
  location = div1.element("div[@class='companyLocation']")&.text
@@ -111,7 +138,12 @@ class IndeedScraper2022
111
138
  "v[@class='result-footer']")
112
139
 
113
140
  # job (e.g. Our products are primarily written in C#, using...)
114
- jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
141
+ advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
142
+ jobsnippet = if advert_items.any? then
143
+ advert_items.join("\n")
144
+ else
145
+ div3.element("div[@class='job-snippet']").text
146
+ end
115
147
 
116
148
  # visually (e.g. Posted 14 days ago)
117
149
  dateposted = div3.element("span[@class='date']")&.texts
@@ -119,7 +151,7 @@ class IndeedScraper2022
119
151
 
120
152
  {
121
153
  link: @url_base.sub(/\/[^\/]+$/,'') \
122
- + doc.root.attributes[:href].gsub(/&amp;/,'&'),
154
+ + href.gsub(/&amp;/,'&'),
123
155
  title: jobtitle,
124
156
  salary: salary,
125
157
  company: company_name,
@@ -146,52 +178,65 @@ class IndeedScraper2022
146
178
  def fetchjob(url)
147
179
 
148
180
  doc = Nokorexi.new(url).to_doc
181
+ puts 'before e0' if @debug
149
182
  e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
150
183
 
151
184
  #div = e0.element("//div[@class='jobsearch-JobComponent']")
185
+ puts 'before div1' if @debug
152
186
  div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
187
+ puts 'before div2' if @debug
153
188
  div2 = div1.element("div")
154
189
 
155
190
  # jobsearch (e.g. Full Stack Website Developer (Wordpress))
191
+ puts 'before jobtitle' if @debug
156
192
  jobtitle = div2.element("div[@class='jobsearch-JobInfoHead" \
157
193
  "er-title-container']/h1[@class='jobsearch-JobInfoHead" \
158
194
  "er-title']")&.text
159
195
 
196
+ puts 'before div3' if @debug
160
197
  div3 = div2.element("div[@class='jobsearch-CompanyInfoCon" \
161
198
  "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead" \
162
199
  "erImage']/div/div[@class='jobsearch-DesktopStickyCont" \
163
200
  "ainer-subtitle']")
164
201
 
165
202
  # icl (e.g. Lyles Sutherland)
203
+ puts 'before cname' if @debug
166
204
  cname = div3.xpath("div[@class='jobsearch-DesktopSt" \
167
205
  "ickyContainer-companyrating']/div/div[@class='icl-u-x" \
168
206
  "s-mr--xs']")[1]
207
+ puts 'before clink' if @debug
169
208
  clink = div3.element('//a')
170
209
  company = cname.text ? cname.text : clink.text
171
210
  companylink = clink.attributes[:href] if clink
172
211
 
212
+ puts 'before salary' if @debug
173
213
  salary = div1.element("//span[@class='attribute_snippet']")&.text
214
+ puts 'before type' if @debug
174
215
  type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
175
216
  div5 = div3.xpath("div/div")
176
217
  location, worklocation = div5.map(&:text).compact
177
218
 
178
219
  # icl (e.g. Full-time, Permanent)
220
+ puts 'before jobtype' if @debug
179
221
  jobtype = div1.element("div/div/div[@class='jobsearch-J" \
180
222
  "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
181
223
  jobtype = jobtype&.texts.join if jobtype
182
224
 
183
225
  # jobsearch (e.g. Urgently needed)
226
+ puts 'before jobnote1' if @debug
184
227
  jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag" \
185
228
  "']/div[@class='urgently-hiring']/div[@class='jobsearc" \
186
229
  "h-DesktopTag-text']")&.text
187
230
 
188
231
  # jobsearch (e.g. 10 days ago)
232
+ puts 'before days' if @debug
189
233
  days = e0.element("//div[@class='jobsearch-JobTab-con" \
190
234
  "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
191
235
  d = Date.today - days.to_i
192
236
  datepost = d.strftime("%Y-%m-%d")
193
237
 
194
238
 
239
+ puts 'before jobdesc' if @debug
195
240
  jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl" \
196
241
  "ass='jobsearch-jobDescriptionText']").xml
197
242
 
@@ -217,11 +262,16 @@ class IS22Plus < IndeedScraper2022
217
262
 
218
263
  def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
219
264
  super(q: q, location: location, headless: headless, cookies: cookies,
220
- debug: debug)
265
+ debug: true)
221
266
  end
222
267
 
268
+ # note: The most efficient method to accumulate vacancy articles is to
269
+ # execute archive() daily
270
+ #
223
271
  def archive(filepath='/tmp/indeed')
224
272
 
273
+ search() if @results.nil?
274
+
225
275
  return unless @results
226
276
 
227
277
  FileUtils.mkdir_p filepath
@@ -238,16 +288,23 @@ class IS22Plus < IndeedScraper2022
238
288
 
239
289
  puts 'saving ' + item[:title] if @debug
240
290
  puts 'link: ' + item[:link].inspect
241
- links = RXFReader.reveal(item[:link])
242
- puts 'links: ' + links.inspect
291
+ links = URL.reveal(item[:link])
292
+ puts 'links: ' + links.inspect if @debug
243
293
 
244
294
  url = links.last
245
- id = url[/(?<=\?jk=)[^&]+/]
295
+ puts 'url: ' + url.inspect if @debug
296
+ id = url[/(?<=jk=)[^&]+/]
246
297
 
247
298
  if index[id.to_sym] then
299
+
300
+ # the vacancy record has previously been saved
301
+ #
248
302
  next
303
+
249
304
  else
250
305
 
306
+ # write the full page vacancy article to file
307
+ #
251
308
  File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
252
309
 
253
310
  h = {
@@ -257,14 +314,19 @@ class IS22Plus < IndeedScraper2022
257
314
  company: item[:company].to_s.strip,
258
315
  location: item[:location].to_s,
259
316
  jobsnippet: item[:jobsnippet],
260
- date: item[:date]
317
+ date: item[:date],
318
+ added: Time.now.strftime("%Y-%m-%d")
261
319
  }
262
320
 
321
+ # add the vacancy snippet to the index file
322
+ #
263
323
  index[id.to_sym] = h
264
324
  end
265
325
 
266
326
  end
267
327
 
328
+ # save the vacancy index file
329
+ #
268
330
  File.write idxfile, index.to_yaml
269
331
 
270
332
  end
@@ -279,3 +341,37 @@ class IS22Plus < IndeedScraper2022
279
341
 
280
342
 
281
343
  end
344
+
345
+
346
# Offline view of the archived vacancies: loads index.yml from the
# archive directory and renders it as a numbered text listing.
#
class IS22Archive

  # the Hash loaded from index.yml (vacancy id => record Hash), or {}
  # when no usable index file is present
  attr_reader :index

  # filepath - directory expected to contain index.yml; it is created
  #            when missing
  # debug    - when true, list() prints each record before formatting it
  #
  def initialize(filepath='/tmp/indeed', debug: false)

    # store the flag; without this the `if @debug` check in list()
    # could never be true
    @debug = debug

    FileUtils.mkdir_p filepath
    @idxfile = File.join(filepath, 'index.yml')

    # File.exist? is used because File.exists? was removed in Ruby 3.2
    @index = File.exist?(@idxfile) ? load_index : {}

  end

  # Returns the archived vacancies as one String, one line per vacancy,
  # in reverse index order, formatted as
  #   " 1. 12 May: Job title (Company)"
  # The company part is left out when the name is not longer than 1
  # character.
  #
  def list()

    @index.values.reverse.map.with_index(1) do |h, n|

      puts 'h: ' + h.inspect if @debug

      company = h[:company].to_s
      co = company.length > 1 ? " (%s)" % company : ''

      "%2d. %s: %s%s" % [n, added_label(h[:added]), h[:title], co]

    end.join("\n")

  end

  private

  # Reads the index file; an empty file (which YAML loads as a falsy
  # value) yields an empty index instead of breaking list().
  #
  # NOTE(review): YAML.load is kept so existing index files continue to
  # load exactly as before. This assumes the archive directory is only
  # written by this gem - consider YAML.safe_load if that cannot be
  # guaranteed.
  #
  def load_index
    YAML.load(File.read(@idxfile)) || {}
  end

  # Formats the :added value of a record as e.g. "12 May". Records
  # without an :added value, or with one that cannot be parsed as a
  # date, are labelled 'undated' rather than raising.
  #
  def added_label(added)
    return 'undated' if added.nil?

    Date.parse(added.to_s).strftime("%d %b")
  rescue ArgumentError
    'undated'
  end

end
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeed_scraper2022
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
35
35
  YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
36
36
  SW/2zInu2bkj/meWm5eBoWHT
37
37
  -----END CERTIFICATE-----
38
- date: 2022-04-01 00:00:00.000000000 Z
38
+ date: 2022-05-12 00:00:00.000000000 Z
39
39
  dependencies:
40
40
  - !ruby/object:Gem::Dependency
41
41
  name: nokorexi
@@ -63,20 +63,40 @@ dependencies:
63
63
  requirements:
64
64
  - - "~>"
65
65
  - !ruby/object:Gem::Version
66
- version: '0.2'
66
+ version: '0.3'
67
67
  - - ">="
68
68
  - !ruby/object:Gem::Version
69
- version: 0.2.2
69
+ version: 0.3.1
70
70
  type: :runtime
71
71
  prerelease: false
72
72
  version_requirements: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - "~>"
75
75
  - !ruby/object:Gem::Version
76
- version: '0.2'
76
+ version: '0.3'
77
77
  - - ">="
78
78
  - !ruby/object:Gem::Version
79
- version: 0.2.2
79
+ version: 0.3.1
80
+ - !ruby/object:Gem::Dependency
81
+ name: reveal_url22
82
+ requirement: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - "~>"
85
+ - !ruby/object:Gem::Version
86
+ version: '0.1'
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: 0.1.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.1'
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: 0.1.0
80
100
  description:
81
101
  email: digital.robertson@gmail.com
82
102
  executables: []
metadata.gz.sig CHANGED
Binary file