indeedcrawler 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/indeedcrawler.rb +33 -5
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 733c0a9b4b19d7441025971ab8cd432a4a016b65
4
- data.tar.gz: 165539a0a501c1f730b408a5ffaa7d9c9bce2f27
3
+ metadata.gz: 8d72ad14e0778839a3c4240834a4c10ebb94111e
4
+ data.tar.gz: e5a4bc28084ad26a63d0814c868eb18a6ba4565d
5
5
  SHA512:
6
- metadata.gz: f0de486ae6199adec6ef6b432bd879b22f8fd9cad92c746fad755cba272c778f52cd125d1ed3198c14540712726432417d8784869cc714d144e4af8a71e37865
7
- data.tar.gz: 22666895be68a355c658c7321c763c7de50ef29a29ad63d56f7221e8b970f5cb6a0ec87cc48ec66cd8ea4519b30c7fde5bbb3c048347e88aab5a403e5fd63f20
6
+ metadata.gz: 9d27cfc99eae6badb5643454a3b4b45192720d9e9df3530dbfe4bc2b766b84fb1dd414082a66c2d2cb777c634e6de542f19e9e69551a099f9ce95e182372b5b1
7
+ data.tar.gz: a4af2a1ed5dad981de77f48fccaa9025f2ceae7f3d59ad487f4698f89c3b10d523890b0552f3a9e86bf8942dc7526341938d3863da066330893a30480209071f
data/lib/indeedcrawler.rb CHANGED
@@ -3,9 +3,10 @@ require 'uri'
3
3
  require 'requestmanager'
4
4
  require 'nokogiri'
5
5
  require 'indeedparser'
6
+ require 'curb'
6
7
 
7
8
  class IndeedCrawler
8
- def initialize(search_query, location, proxy_list, wait_time, browser_num)
9
+ def initialize(search_query, location, proxy_list, wait_time, browser_num, cm_hash)
9
10
  # Info for query
10
11
  @search_query = search_query
11
12
  @location = location
@@ -16,6 +17,10 @@ class IndeedCrawler
16
17
  # Result tracking
17
18
  @all_resume_links = Array.new
18
19
  @output = Array.new
20
+
21
+ # Handle crawler manager info
22
+ @cm_url = cm_hash[:crawler_manager_url] if cm_hash
23
+ @selector_id = cm_hash[:selector_id] if cm_hash
19
24
  end
20
25
 
21
26
  # Append query
@@ -70,15 +75,38 @@ class IndeedCrawler
70
75
  # Parse resume and add to results
71
76
  i = IndeedParser.new(resume, link, {time_scraped: Time.now})
72
77
  results = JSON.parse(i.get_results_by_job)
73
-
74
- results.each do |result|
75
- @output.push(result)
76
- end
78
+ report_results(results, link)
77
79
  rescue
80
+
78
81
  end
79
82
  end
80
83
  end
81
84
 
85
+ # Figure out how to report results
86
+ def report_results(results, link)
87
+ if @cm_url
88
+ report_incremental(results, link)
89
+ else
90
+ report_batch(results)
91
+ end
92
+ end
93
+
94
+ # Report all results in one JSON
95
+ def report_batch(results)
96
+ results.each do |result|
97
+ @output.push(result)
98
+ end
99
+ end
100
+
101
+ # Report results back to Harvester incrementally
102
+ def report_incremental(results, link)
103
+ curl_url = @cm_url+"/relay_results"
104
+ c = Curl::Easy.http_post(curl_url,
105
+ Curl::PostField.content('selector_id', @selector_id),
106
+ Curl::PostField.content('status_message', "Collected " + link),
107
+ Curl::PostField.content('results', JSON.pretty_generate(results)))
108
+ end
109
+
82
110
  # Get all the profile links
83
111
  def collect_it_all
84
112
  # Generate URL
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeedcrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-12-23 00:00:00.000000000 Z
11
+ date: 2016-10-05 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Crawls Indeed resumes
14
14
  email: shidash@transparencytoolkit.org