indeedcrawler 0.0.2 → 0.0.3

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/indeedcrawler.rb +33 -5
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 733c0a9b4b19d7441025971ab8cd432a4a016b65
4
- data.tar.gz: 165539a0a501c1f730b408a5ffaa7d9c9bce2f27
3
+ metadata.gz: 8d72ad14e0778839a3c4240834a4c10ebb94111e
4
+ data.tar.gz: e5a4bc28084ad26a63d0814c868eb18a6ba4565d
5
5
  SHA512:
6
- metadata.gz: f0de486ae6199adec6ef6b432bd879b22f8fd9cad92c746fad755cba272c778f52cd125d1ed3198c14540712726432417d8784869cc714d144e4af8a71e37865
7
- data.tar.gz: 22666895be68a355c658c7321c763c7de50ef29a29ad63d56f7221e8b970f5cb6a0ec87cc48ec66cd8ea4519b30c7fde5bbb3c048347e88aab5a403e5fd63f20
6
+ metadata.gz: 9d27cfc99eae6badb5643454a3b4b45192720d9e9df3530dbfe4bc2b766b84fb1dd414082a66c2d2cb777c634e6de542f19e9e69551a099f9ce95e182372b5b1
7
+ data.tar.gz: a4af2a1ed5dad981de77f48fccaa9025f2ceae7f3d59ad487f4698f89c3b10d523890b0552f3a9e86bf8942dc7526341938d3863da066330893a30480209071f
data/lib/indeedcrawler.rb CHANGED
@@ -3,9 +3,10 @@ require 'uri'
3
3
  require 'requestmanager'
4
4
  require 'nokogiri'
5
5
  require 'indeedparser'
6
+ require 'curb'
6
7
 
7
8
  class IndeedCrawler
8
- def initialize(search_query, location, proxy_list, wait_time, browser_num)
9
+ def initialize(search_query, location, proxy_list, wait_time, browser_num, cm_hash)
9
10
  # Info for query
10
11
  @search_query = search_query
11
12
  @location = location
@@ -16,6 +17,10 @@ class IndeedCrawler
16
17
  # Result tracking
17
18
  @all_resume_links = Array.new
18
19
  @output = Array.new
20
+
21
+ # Handle crawler manager info
22
+ @cm_url = cm_hash[:crawler_manager_url] if cm_hash
23
+ @selector_id = cm_hash[:selector_id] if cm_hash
19
24
  end
20
25
 
21
26
  # Append query
@@ -70,15 +75,38 @@ class IndeedCrawler
70
75
  # Parse resume and add to results
71
76
  i = IndeedParser.new(resume, link, {time_scraped: Time.now})
72
77
  results = JSON.parse(i.get_results_by_job)
73
-
74
- results.each do |result|
75
- @output.push(result)
76
- end
78
+ report_results(results, link)
77
79
  rescue
80
+
78
81
  end
79
82
  end
80
83
  end
81
84
 
85
+ # Figure out how to report results
86
+ def report_results(results, link)
87
+ if @cm_url
88
+ report_incremental(results, link)
89
+ else
90
+ report_batch(results)
91
+ end
92
+ end
93
+
94
+ # Report all results in one JSON
95
+ def report_batch(results)
96
+ results.each do |result|
97
+ @output.push(result)
98
+ end
99
+ end
100
+
101
+ # Report results back to Harvester incrementally
102
+ def report_incremental(results, link)
103
+ curl_url = @cm_url+"/relay_results"
104
+ c = Curl::Easy.http_post(curl_url,
105
+ Curl::PostField.content('selector_id', @selector_id),
106
+ Curl::PostField.content('status_message', "Collected " + link),
107
+ Curl::PostField.content('results', JSON.pretty_generate(results)))
108
+ end
109
+
82
110
  # Get all the profile links
83
111
  def collect_it_all
84
112
  # Generate URL
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indeedcrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-12-23 00:00:00.000000000 Z
11
+ date: 2016-10-05 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Crawls Indeed resumes
14
14
  email: shidash@transparencytoolkit.org