linkedincrawler 0.0.15 → 0.0.16

Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/linkedincrawler.rb +31 -3
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d133e662b783dcf61594013b375ddec17cb22102
-  data.tar.gz: 4c477ee0f284c4bc13171d25c747cf9f2b42c090
+  metadata.gz: e497521e3f492f55300fe474d802e36645cb9ff3
+  data.tar.gz: 1c7599985292c36c758577b0325dfa49fa671ec7
 SHA512:
-  metadata.gz: 47a121927417ce0d195d5be681d533f35c88613710f104d540e032bc9539ebd25840163eda54c75a63caeb27924f1fc65fd767042e5e18664d4c0fd13809e01e
-  data.tar.gz: 87f53246bb4d98c67683525ea32095729a453502e0426f87268edd949809fea93ea27da0f288598a1698202b433e7a75f660c64820a2bb66b2605a56ad8d563c
+  metadata.gz: a7b2616237182ca01b2144ae3b527c27c7adb76f41d454b6cb01f7ae7e10151cbe519815aadeb838ecbb1d0f9ae247829b584bbf74d8c69364cda0d7b262481f
+  data.tar.gz: 4911139eb248197b6bf503fe5d3373beee04e3033eae8283106fa4141e3911a938f353190e1ffbe92abc633f184b3c401239c3208469a4f2c13bd36e1166efda
data/lib/linkedincrawler.rb CHANGED
@@ -6,7 +6,7 @@ require 'selenium-webdriver'
 require 'pry'
 
 class LinkedinCrawler
-  def initialize(search_terms, retry_limit, requests, requests_google, requests_google2, solver_details)
+  def initialize(search_terms, retry_limit, requests, requests_google, requests_google2, solver_details, cm_hash)
     @search_terms = search_terms
     @output = Array.new
 
@@ -17,10 +17,16 @@ class LinkedinCrawler
     @requests_google = requests_google
     @requests_google2 = requests_google2
     @solver_details = solver_details
+
+    # Handle crawler manager info
+    @cm_url = cm_hash[:crawler_manager_url] if cm_hash
+    @selector_id = cm_hash[:selector_id] if cm_hash
   end
 
   # Run search terms and get results
   def search
+
+    begin
     # Run Google search
     g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details)
     urls = g.getURLs
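
Both reads of the new cm_hash argument are guarded with "if cm_hash", so a caller that passes nil keeps the old save-to-disk behavior. A minimal caller sketch, assuming illustrative values for the pre-existing positional arguments (the search terms, retry limit, and request/solver settings below are placeholders, not values from this gem):

    # Hypothetical usage -- all argument values are placeholders
    crawler = LinkedinCrawler.new("john smith", 3,
                                  requests, requests_google, requests_google2,
                                  solver_details,
                                  {crawler_manager_url: "http://harvester.example:3000",
                                   selector_id: "42"})
    crawler.search
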
@@ -28,6 +34,9 @@ class LinkedinCrawler
     # Look for new LI urls
     g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details)
     urls = JSON.parse(urls) + JSON.parse(g2.getURLs)
+    rescue Exception
+      binding.pry
+    end
 
     # Scrape each resulting LinkedIn page
     urls.each do |profile|
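
The rescue added here pairs with the begin from the previous hunk, so only the two Google queries are wrapped. Note that rescue Exception catches everything (including interrupts) and binding.pry opens an interactive Pry console, which is a debugging aid rather than production error handling. A narrower hypothetical alternative, assuming the intent is to retry transient query failures:

    # Hypothetical alternative -- retry the queries a few times, then re-raise
    attempts = 0
    begin
      urls = JSON.parse(g.getURLs) + JSON.parse(g2.getURLs)
    rescue StandardError
      attempts += 1
      retry if attempts < 3
      raise
    end
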
@@ -77,14 +86,33 @@
         @retry_count += 1
         scrape(profile_url)
       else # Just save it and move on
-        save_and_continue(parsed_profile)
+        report_results(parsed_profile, profile_url)
       end
 
     else # It succeeded!
-      save_and_continue(parsed_profile)
+      report_results(parsed_profile, profile_url)
     end
   end
 
+  # Figure out how to report results
+  def report_results(results, link)
+    if @cm_url
+      report_incremental(results, link)
+    else
+      save_and_continue(results)
+    end
+  end
+
+  # Report results back to Harvester incrementally
+  def report_incremental(results, link)
+    curl_url = @cm_url+"/relay_results"
+    @retry_count = 0
+    c = Curl::Easy.http_post(curl_url,
+                             Curl::PostField.content('selector_id', @selector_id),
+                             Curl::PostField.content('status_message', "Collected " + link),
+                             Curl::PostField.content('results', JSON.pretty_generate(results)))
+  end
+
   # Print output in JSON
   def gen_json
     JSON.pretty_generate(@output)
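
The Curl constants in report_incremental come from the curb gem, so the file needs require 'curb' if it is not loaded elsewhere. Each scraped profile is relayed as an HTTP form POST to the crawler manager's /relay_results endpoint with three fields: selector_id, status_message, and the JSON-encoded results. The receiving side is not part of this gem; a minimal sketch of a compatible endpoint, assuming a Sinatra app and purely illustrative handling:

    # Hypothetical receiver -- illustrative only, not part of this gem
    require 'sinatra'
    require 'json'

    post '/relay_results' do
      results = JSON.parse(params[:results])
      puts "#{params[:status_message]}: selector #{params[:selector_id]}, #{results.length} record(s)"
      status 200
    end
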
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: linkedincrawler
 version: !ruby/object:Gem::Version
-  version: 0.0.15
+  version: 0.0.16
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-03-16 00:00:00.000000000 Z
+date: 2016-10-07 00:00:00.000000000 Z
 dependencies: []
 description: Crawls public LinkedIn profiles via Google
 email: shidash@shidash.com