linkedincrawler 0.0.18 → 0.0.19

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/linkedincrawler.rb +19 -4
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 0256d272cfb618594d7bc0bb926252d890f86101
-  data.tar.gz: f1a370dcc7e4b28dda00b885a66496bc31d9d733
+  metadata.gz: b55cee51c70158577d2c1e1c618aa284f6a4647c
+  data.tar.gz: 7c2ab05da2829fb7f85f6510a3eee45ba7ef65df
 SHA512:
-  metadata.gz: a134b51328467c71c7e3fe709fb59ed290380244aa77015539eeeff84c23468f5afd8da7964f2a09ee695b81b2832569d7c9f8fcbe7b3da4d8d8e44428920588
-  data.tar.gz: 93ae61b064aa8bdd24beaa8e07c5e0b4fed0943a18c689907c03cb997db43891990f5be4c1c97c7e7696a99aeda536faf7be37cd899a36bdb63391b5dc2f3457
+  metadata.gz: 51c17a2cd6eb7a819a095291aa84fc52cbb7ddf00e7fbb0c627f9dd19a10181c86f195a2b807253ad89820761e0025eed2c07865b184d763d4a94359f7fad077
+  data.tar.gz: 9f9f9ef9af43157daac01b2a2e085782538eb5441f46c98f754f26c9428260b0193df59125cc8dd8bfa352a7b46b88a79c39279ce5666aabc2e95afdf71be652
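
The checksum changes are the routine result of repackaging the gem, and they are what a downloaded copy can be verified against. A rough sketch of that check, assuming linkedincrawler-0.0.19.gem has already been unpacked (a .gem is a tar archive containing metadata.gz, data.tar.gz, and checksums.yaml.gz; the file paths below are hypothetical):

# Recompute the SHA512 digests of the unpacked gem components and compare
# them with the values shipped in checksums.yaml.gz.
require 'digest'
require 'yaml'
require 'zlib'

expected = YAML.safe_load(Zlib::GzipReader.open('checksums.yaml.gz', &:read))

%w[metadata.gz data.tar.gz].each do |file|
  actual = Digest::SHA512.file(file).hexdigest
  puts "#{file}: #{actual == expected['SHA512'][file] ? 'OK' : 'MISMATCH'}"
end

For 0.0.19, the data.tar.gz digest should come out to the 9f9f9ef9... value shown above.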
data/lib/linkedincrawler.rb CHANGED
@@ -19,22 +19,24 @@ class LinkedinCrawler
     @solver_details = solver_details
 
     # Handle crawler manager info
+    @cm_hash = cm_hash
     @cm_url = cm_hash[:crawler_manager_url] if cm_hash
     @selector_id = cm_hash[:selector_id] if cm_hash
   end
 
   # Run search terms and get results
   def search
-
+
     begin
       # Run Google search
-      g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details, nil)
+      g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details, @cm_hash)
       urls = g.getURLs
 
       # Look for new LI urls
-      g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details, nil)
+      g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details, @cm_hash)
       urls = JSON.parse(urls) + JSON.parse(g2.getURLs)
-    rescue Exception
+    rescue => e
+      report_status("Error running Google Crawler from LinkedIn Crawler: " +e.to_s)
       binding.pry
     end
 
@@ -47,6 +49,7 @@ class LinkedinCrawler
 
     # Close all the browsers when done
     @requests.close_all_browsers
+    report_status("Data collection completed for " + @search_terms.to_s)
   end
 
   # Check that it is actually a LinkedIn profile page
@@ -84,8 +87,10 @@ class LinkedinCrawler
     if @retry_count < @retry_limit
       @requests.restart_browser
       @retry_count += 1
+      report_status("Profile parsing failed for "+profile_url.to_s+". Retrying...")
      scrape(profile_url)
    else # Just save it and move on
+      report_status("Profile parsing failed for "+profile_url.to_s+". Moving on.")
      report_results(parsed_profile, profile_url)
    end
 
@@ -113,6 +118,16 @@ class LinkedinCrawler
                              Curl::PostField.content('results', JSON.pretty_generate(results)))
   end
 
+  # Report Harvester status message
+  def report_status(status_msg)
+    if @cm_url
+      curl_url = @cm_url+"/update_status"
+      c = Curl::Easy.http_post(curl_url,
+                               Curl::PostField.content('selector_id', @selector_id),
+                               Curl::PostField.content('status_message', status_msg))
+    end
+  end
+
   # Print output in JSON
   def gen_json
     JSON.pretty_generate(@output)
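
The substantive change in this release is the status-reporting path shown above: the crawler manager hash is stored, forwarded to both GeneralScraper instances instead of nil, and used by the new report_status method to POST progress and error messages. A minimal sketch of what that POST looks like, assuming a crawler manager that exposes /update_status as in the diff (the URL and selector_id values are hypothetical placeholders; Curl::Easy and Curl::PostField come from the curb gem):

# Hash shape expected by the crawler, per the keys read in initialize.
require 'curb'

cm_hash = {
  crawler_manager_url: "http://crawler-manager.example.com",  # hypothetical URL
  selector_id: "linkedin-demo"                                # hypothetical id
}

# Equivalent of the report_status call made after data collection when a
# cm_hash was supplied: a form POST to <crawler_manager_url>/update_status.
Curl::Easy.http_post(cm_hash[:crawler_manager_url] + "/update_status",
                     Curl::PostField.content('selector_id', cm_hash[:selector_id]),
                     Curl::PostField.content('status_message', 'Data collection completed for ["example search"]'))

Because @cm_url is only set when a cm_hash with :crawler_manager_url is passed in, report_status is a no-op without a crawler manager, so the crawler still runs standalone exactly as before.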
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: linkedincrawler
 version: !ruby/object:Gem::Version
-  version: 0.0.18
+  version: 0.0.19
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-10-08 00:00:00.000000000 Z
+date: 2016-10-30 00:00:00.000000000 Z
 dependencies: []
 description: Crawls public LinkedIn profiles via Google
 email: shidash@shidash.com