indeed_scraper2022 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +44 -3
- data.tar.gz.sig +0 -0
- metadata +2 -2
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 972f811430fae59121e39c9c4752b64fc43b37165a52dcec8c3eac42cf1e4555
|
4
|
+
data.tar.gz: 85e987eb264b098b4c892e2e05d2ab082e3b8968fff2bdd519552e889f014f9d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e7d3c2a13e315383248c557806dd0184d8831f46a2e314816395f62fcb886ba2e38a3e1f2deb180ceb33b614cf0b7be8a13379028fc56982e388948575bdb02c
|
7
|
+
data.tar.gz: 6ca4792d260c43b22fcee5ead8928525df17f1db112c49ac4cc7d7a5c9b29a8f483b94c18cd39bace1d4a4ee553ff83254a7fb445d76abf06c1705d19bac455c
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -4,11 +4,13 @@
|
|
4
4
|
|
5
5
|
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
|
+
require 'yaml'
|
7
8
|
|
8
9
|
# Given the nature of changes to jobsearch websites,
|
9
10
|
# don't rely upon this gem working in the near future.
|
10
11
|
|
11
12
|
|
13
|
+
|
12
14
|
class IndeedScraper2022Err < Exception
|
13
15
|
end
|
14
16
|
|
@@ -218,14 +220,53 @@ class IS22Plus < IndeedScraper2022
|
|
218
220
|
debug: debug)
|
219
221
|
end
|
220
222
|
|
221
|
-
def archive()
|
223
|
+
def archive(filepath='/tmp/indeed')
|
222
224
|
|
223
225
|
return unless @results
|
224
226
|
|
225
|
-
|
226
|
-
|
227
|
+
FileUtils.mkdir_p filepath
|
228
|
+
|
229
|
+
idxfile = File.join(filepath, 'index.yml')
|
230
|
+
|
231
|
+
index = if File.exists? idxfile then
|
232
|
+
YAML.load(File.read(idxfile))
|
233
|
+
else
|
234
|
+
{}
|
227
235
|
end
|
228
236
|
|
237
|
+
@results.each.with_index do |item, i|
|
238
|
+
|
239
|
+
puts 'saving ' + item[:title] if @debug
|
240
|
+
puts 'link: ' + item[:link].inspect
|
241
|
+
links = RXFReader.reveal(item[:link])
|
242
|
+
puts 'links: ' + links.inspect
|
243
|
+
|
244
|
+
url = links.last
|
245
|
+
id = url[/(?<=\?jk=)[^&]+/]
|
246
|
+
|
247
|
+
if index[id.to_sym] then
|
248
|
+
next
|
249
|
+
else
|
250
|
+
|
251
|
+
File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
|
252
|
+
|
253
|
+
h = {
|
254
|
+
link: url[/^[^&]+/],
|
255
|
+
title: item[:title].to_s,
|
256
|
+
salary: item[:salary].to_s,
|
257
|
+
company: item[:company].to_s.strip,
|
258
|
+
location: item[:location].to_s,
|
259
|
+
jobsnippet: item[:jobsnippet],
|
260
|
+
date: item[:date]
|
261
|
+
}
|
262
|
+
|
263
|
+
index[id.to_sym] = h
|
264
|
+
end
|
265
|
+
|
266
|
+
end
|
267
|
+
|
268
|
+
File.write idxfile, index.to_yaml
|
269
|
+
|
229
270
|
end
|
230
271
|
|
231
272
|
def list()
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-04-01 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
metadata.gz.sig
CHANGED
Binary file
|