indeedcrawler 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/indeedcrawler.rb +33 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8d72ad14e0778839a3c4240834a4c10ebb94111e
|
4
|
+
data.tar.gz: e5a4bc28084ad26a63d0814c868eb18a6ba4565d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9d27cfc99eae6badb5643454a3b4b45192720d9e9df3530dbfe4bc2b766b84fb1dd414082a66c2d2cb777c634e6de542f19e9e69551a099f9ce95e182372b5b1
|
7
|
+
data.tar.gz: a4af2a1ed5dad981de77f48fccaa9025f2ceae7f3d59ad487f4698f89c3b10d523890b0552f3a9e86bf8942dc7526341938d3863da066330893a30480209071f
|
data/lib/indeedcrawler.rb
CHANGED
@@ -3,9 +3,10 @@ require 'uri'
|
|
3
3
|
require 'requestmanager'
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'indeedparser'
|
6
|
+
require 'curb'
|
6
7
|
|
7
8
|
class IndeedCrawler
|
8
|
-
def initialize(search_query, location, proxy_list, wait_time, browser_num)
|
9
|
+
def initialize(search_query, location, proxy_list, wait_time, browser_num, cm_hash)
|
9
10
|
# Info for query
|
10
11
|
@search_query = search_query
|
11
12
|
@location = location
|
@@ -16,6 +17,10 @@ class IndeedCrawler
|
|
16
17
|
# Result tracking
|
17
18
|
@all_resume_links = Array.new
|
18
19
|
@output = Array.new
|
20
|
+
|
21
|
+
# Handle crawler manager info
|
22
|
+
@cm_url = cm_hash[:crawler_manager_url] if cm_hash
|
23
|
+
@selector_id = cm_hash[:selector_id] if cm_hash
|
19
24
|
end
|
20
25
|
|
21
26
|
# Append query
|
@@ -70,15 +75,38 @@ class IndeedCrawler
|
|
70
75
|
# Parse resume and add to results
|
71
76
|
i = IndeedParser.new(resume, link, {time_scraped: Time.now})
|
72
77
|
results = JSON.parse(i.get_results_by_job)
|
73
|
-
|
74
|
-
results.each do |result|
|
75
|
-
@output.push(result)
|
76
|
-
end
|
78
|
+
report_results(results, link)
|
77
79
|
rescue
|
80
|
+
|
78
81
|
end
|
79
82
|
end
|
80
83
|
end
|
81
84
|
|
85
|
+
# Figure out how to report results
|
86
|
+
def report_results(results, link)
|
87
|
+
if @cm_url
|
88
|
+
report_incremental(results, link)
|
89
|
+
else
|
90
|
+
report_batch(results)
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
# Report all results in one JSON
|
95
|
+
def report_batch(results)
|
96
|
+
results.each do |result|
|
97
|
+
@output.push(result)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
# Report results back to Harvester incrementally
|
102
|
+
def report_incremental(results, link)
|
103
|
+
curl_url = @cm_url+"/relay_results"
|
104
|
+
c = Curl::Easy.http_post(curl_url,
|
105
|
+
Curl::PostField.content('selector_id', @selector_id),
|
106
|
+
Curl::PostField.content('status_message', "Collected " + link),
|
107
|
+
Curl::PostField.content('results', JSON.pretty_generate(results)))
|
108
|
+
end
|
109
|
+
|
82
110
|
# Get all the profile links
|
83
111
|
def collect_it_all
|
84
112
|
# Generate URL
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeedcrawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-10-05 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Crawls Indeed resumes
|
14
14
|
email: shidash@transparencytoolkit.org
|