linkedincrawler 0.0.15 → 0.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3):
  1. checksums.yaml +4 -4
  2. data/lib/linkedincrawler.rb +31 -3
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d133e662b783dcf61594013b375ddec17cb22102
4
- data.tar.gz: 4c477ee0f284c4bc13171d25c747cf9f2b42c090
3
+ metadata.gz: e497521e3f492f55300fe474d802e36645cb9ff3
4
+ data.tar.gz: 1c7599985292c36c758577b0325dfa49fa671ec7
5
5
  SHA512:
6
- metadata.gz: 47a121927417ce0d195d5be681d533f35c88613710f104d540e032bc9539ebd25840163eda54c75a63caeb27924f1fc65fd767042e5e18664d4c0fd13809e01e
7
- data.tar.gz: 87f53246bb4d98c67683525ea32095729a453502e0426f87268edd949809fea93ea27da0f288598a1698202b433e7a75f660c64820a2bb66b2605a56ad8d563c
6
+ metadata.gz: a7b2616237182ca01b2144ae3b527c27c7adb76f41d454b6cb01f7ae7e10151cbe519815aadeb838ecbb1d0f9ae247829b584bbf74d8c69364cda0d7b262481f
7
+ data.tar.gz: 4911139eb248197b6bf503fe5d3373beee04e3033eae8283106fa4141e3911a938f353190e1ffbe92abc633f184b3c401239c3208469a4f2c13bd36e1166efda
@@ -6,7 +6,7 @@ require 'selenium-webdriver'
6
6
  require 'pry'
7
7
 
8
8
  class LinkedinCrawler
9
- def initialize(search_terms, retry_limit, requests, requests_google, requests_google2, solver_details)
9
+ def initialize(search_terms, retry_limit, requests, requests_google, requests_google2, solver_details, cm_hash)
10
10
  @search_terms = search_terms
11
11
  @output = Array.new
12
12
 
@@ -17,10 +17,16 @@ class LinkedinCrawler
17
17
  @requests_google = requests_google
18
18
  @requests_google2 = requests_google2
19
19
  @solver_details = solver_details
20
+
21
+ # Handle crawler manager info
22
+ @cm_url = cm_hash[:crawler_manager_url] if cm_hash
23
+ @selector_id = cm_hash[:selector_id] if cm_hash
20
24
  end
21
25
 
22
26
  # Run search terms and get results
23
27
  def search
28
+
29
+ begin
24
30
  # Run Google search
25
31
  g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details)
26
32
  urls = g.getURLs
@@ -28,6 +34,9 @@ class LinkedinCrawler
28
34
  # Look for new LI urls
29
35
  g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details)
30
36
  urls = JSON.parse(urls) + JSON.parse(g2.getURLs)
37
+ rescue Exception
38
+ binding.pry
39
+ end
31
40
 
32
41
  # Scrape each resulting LinkedIn page
33
42
  urls.each do |profile|
@@ -77,14 +86,33 @@ class LinkedinCrawler
77
86
  @retry_count += 1
78
87
  scrape(profile_url)
79
88
  else # Just save it and move on
80
- save_and_continue(parsed_profile)
89
+ report_results(parsed_profile, profile_url)
81
90
  end
82
91
 
83
92
  else # It succeeded!
84
- save_and_continue(parsed_profile)
93
+ report_results(parsed_profile, profile_url)
85
94
  end
86
95
  end
87
96
 
97
+ # Figure out how to report results
98
+ def report_results(results, link)
99
+ if @cm_url
100
+ report_incremental(results, link)
101
+ else
102
+ save_and_continue(results)
103
+ end
104
+ end
105
+
106
+ # Report results back to Harvester incrementally
107
+ def report_incremental(results, link)
108
+ curl_url = @cm_url+"/relay_results"
109
+ @retry_count = 0
110
+ c = Curl::Easy.http_post(curl_url,
111
+ Curl::PostField.content('selector_id', @selector_id),
112
+ Curl::PostField.content('status_message', "Collected " + link),
113
+ Curl::PostField.content('results', JSON.pretty_generate(results)))
114
+ end
115
+
88
116
  # Print output in JSON
89
117
  def gen_json
90
118
  JSON.pretty_generate(@output)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkedincrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.15
4
+ version: 0.0.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-16 00:00:00.000000000 Z
11
+ date: 2016-10-07 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Crawls public LinkedIn profiles via Google
14
14
  email: shidash@shidash.com