indeed_scraper2022 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- checksums.yaml.gz.sig +0 -0
- data/lib/indeed_scraper2022.rb +44 -3
- data.tar.gz.sig +0 -0
- metadata +2 -2
- metadata.gz.sig +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 972f811430fae59121e39c9c4752b64fc43b37165a52dcec8c3eac42cf1e4555
|
4
|
+
data.tar.gz: 85e987eb264b098b4c892e2e05d2ab082e3b8968fff2bdd519552e889f014f9d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e7d3c2a13e315383248c557806dd0184d8831f46a2e314816395f62fcb886ba2e38a3e1f2deb180ceb33b614cf0b7be8a13379028fc56982e388948575bdb02c
|
7
|
+
data.tar.gz: 6ca4792d260c43b22fcee5ead8928525df17f1db112c49ac4cc7d7a5c9b29a8f483b94c18cd39bace1d4a4ee553ff83254a7fb445d76abf06c1705d19bac455c
|
checksums.yaml.gz.sig
CHANGED
Binary file
|
data/lib/indeed_scraper2022.rb
CHANGED
@@ -4,11 +4,13 @@
|
|
4
4
|
|
5
5
|
require 'ferrumwizard'
|
6
6
|
require 'nokorexi'
|
7
|
+
require 'yaml'
|
7
8
|
|
8
9
|
# Given the nature of changes to jobsearch websites,
|
9
10
|
# don't rely upon this gem working in the near future.
|
10
11
|
|
11
12
|
|
13
|
+
|
12
14
|
class IndeedScraper2022Err < Exception
|
13
15
|
end
|
14
16
|
|
@@ -218,14 +220,53 @@ class IS22Plus < IndeedScraper2022
|
|
218
220
|
debug: debug)
|
219
221
|
end
|
220
222
|
|
221
|
-
def archive()
|
223
|
+
def archive(filepath='/tmp/indeed')
|
222
224
|
|
223
225
|
return unless @results
|
224
226
|
|
225
|
-
|
226
|
-
|
227
|
+
FileUtils.mkdir_p filepath
|
228
|
+
|
229
|
+
idxfile = File.join(filepath, 'index.yml')
|
230
|
+
|
231
|
+
index = if File.exists? idxfile then
|
232
|
+
YAML.load(File.read(idxfile))
|
233
|
+
else
|
234
|
+
{}
|
227
235
|
end
|
228
236
|
|
237
|
+
@results.each.with_index do |item, i|
|
238
|
+
|
239
|
+
puts 'saving ' + item[:title] if @debug
|
240
|
+
puts 'link: ' + item[:link].inspect
|
241
|
+
links = RXFReader.reveal(item[:link])
|
242
|
+
puts 'links: ' + links.inspect
|
243
|
+
|
244
|
+
url = links.last
|
245
|
+
id = url[/(?<=\?jk=)[^&]+/]
|
246
|
+
|
247
|
+
if index[id.to_sym] then
|
248
|
+
next
|
249
|
+
else
|
250
|
+
|
251
|
+
File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
|
252
|
+
|
253
|
+
h = {
|
254
|
+
link: url[/^[^&]+/],
|
255
|
+
title: item[:title].to_s,
|
256
|
+
salary: item[:salary].to_s,
|
257
|
+
company: item[:company].to_s.strip,
|
258
|
+
location: item[:location].to_s,
|
259
|
+
jobsnippet: item[:jobsnippet],
|
260
|
+
date: item[:date]
|
261
|
+
}
|
262
|
+
|
263
|
+
index[id.to_sym] = h
|
264
|
+
end
|
265
|
+
|
266
|
+
end
|
267
|
+
|
268
|
+
File.write idxfile, index.to_yaml
|
269
|
+
|
229
270
|
end
|
230
271
|
|
231
272
|
def list()
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeed_scraper2022
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Robertson
|
@@ -35,7 +35,7 @@ cert_chain:
|
|
35
35
|
YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
|
36
36
|
SW/2zInu2bkj/meWm5eBoWHT
|
37
37
|
-----END CERTIFICATE-----
|
38
|
-
date: 2022-
|
38
|
+
date: 2022-04-01 00:00:00.000000000 Z
|
39
39
|
dependencies:
|
40
40
|
- !ruby/object:Gem::Dependency
|
41
41
|
name: nokorexi
|
metadata.gz.sig
CHANGED
Binary file
|