indeedcrawler 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/indeedcrawler.rb +33 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8d72ad14e0778839a3c4240834a4c10ebb94111e
|
4
|
+
data.tar.gz: e5a4bc28084ad26a63d0814c868eb18a6ba4565d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9d27cfc99eae6badb5643454a3b4b45192720d9e9df3530dbfe4bc2b766b84fb1dd414082a66c2d2cb777c634e6de542f19e9e69551a099f9ce95e182372b5b1
|
7
|
+
data.tar.gz: a4af2a1ed5dad981de77f48fccaa9025f2ceae7f3d59ad487f4698f89c3b10d523890b0552f3a9e86bf8942dc7526341938d3863da066330893a30480209071f
|
data/lib/indeedcrawler.rb
CHANGED
@@ -3,9 +3,10 @@ require 'uri'
|
|
3
3
|
require 'requestmanager'
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'indeedparser'
|
6
|
+
require 'curb'
|
6
7
|
|
7
8
|
class IndeedCrawler
|
8
|
-
def initialize(search_query, location, proxy_list, wait_time, browser_num)
|
9
|
+
def initialize(search_query, location, proxy_list, wait_time, browser_num, cm_hash)
|
9
10
|
# Info for query
|
10
11
|
@search_query = search_query
|
11
12
|
@location = location
|
@@ -16,6 +17,10 @@ class IndeedCrawler
|
|
16
17
|
# Result tracking
|
17
18
|
@all_resume_links = Array.new
|
18
19
|
@output = Array.new
|
20
|
+
|
21
|
+
# Handle crawler manager info
|
22
|
+
@cm_url = cm_hash[:crawler_manager_url] if cm_hash
|
23
|
+
@selector_id = cm_hash[:selector_id] if cm_hash
|
19
24
|
end
|
20
25
|
|
21
26
|
# Append query
|
@@ -70,15 +75,38 @@ class IndeedCrawler
|
|
70
75
|
# Parse resume and add to results
|
71
76
|
i = IndeedParser.new(resume, link, {time_scraped: Time.now})
|
72
77
|
results = JSON.parse(i.get_results_by_job)
|
73
|
-
|
74
|
-
results.each do |result|
|
75
|
-
@output.push(result)
|
76
|
-
end
|
78
|
+
report_results(results, link)
|
77
79
|
rescue
|
80
|
+
|
78
81
|
end
|
79
82
|
end
|
80
83
|
end
|
81
84
|
|
85
|
+
# Figure out how to report results
|
86
|
+
def report_results(results, link)
|
87
|
+
if @cm_url
|
88
|
+
report_incremental(results, link)
|
89
|
+
else
|
90
|
+
report_batch(results)
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
# Report all results in one JSON
|
95
|
+
def report_batch(results)
|
96
|
+
results.each do |result|
|
97
|
+
@output.push(result)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
# Report results back to Harvester incrementally
|
102
|
+
def report_incremental(results, link)
|
103
|
+
curl_url = @cm_url+"/relay_results"
|
104
|
+
c = Curl::Easy.http_post(curl_url,
|
105
|
+
Curl::PostField.content('selector_id', @selector_id),
|
106
|
+
Curl::PostField.content('status_message', "Collected " + link),
|
107
|
+
Curl::PostField.content('results', JSON.pretty_generate(results)))
|
108
|
+
end
|
109
|
+
|
82
110
|
# Get all the profile links
|
83
111
|
def collect_it_all
|
84
112
|
# Generate URL
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indeedcrawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-10-05 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Crawls Indeed resumes
|
14
14
|
email: shidash@transparencytoolkit.org
|