linkedincrawler 0.0.18 → 0.0.19

Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/linkedincrawler.rb +19 -4
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 0256d272cfb618594d7bc0bb926252d890f86101
-  data.tar.gz: f1a370dcc7e4b28dda00b885a66496bc31d9d733
+  metadata.gz: b55cee51c70158577d2c1e1c618aa284f6a4647c
+  data.tar.gz: 7c2ab05da2829fb7f85f6510a3eee45ba7ef65df
 SHA512:
-  metadata.gz: a134b51328467c71c7e3fe709fb59ed290380244aa77015539eeeff84c23468f5afd8da7964f2a09ee695b81b2832569d7c9f8fcbe7b3da4d8d8e44428920588
-  data.tar.gz: 93ae61b064aa8bdd24beaa8e07c5e0b4fed0943a18c689907c03cb997db43891990f5be4c1c97c7e7696a99aeda536faf7be37cd899a36bdb63391b5dc2f3457
+  metadata.gz: 51c17a2cd6eb7a819a095291aa84fc52cbb7ddf00e7fbb0c627f9dd19a10181c86f195a2b807253ad89820761e0025eed2c07865b184d763d4a94359f7fad077
+  data.tar.gz: 9f9f9ef9af43157daac01b2a2e085782538eb5441f46c98f754f26c9428260b0193df59125cc8dd8bfa352a7b46b88a79c39279ce5666aabc2e95afdf71be652
data/lib/linkedincrawler.rb CHANGED
@@ -19,22 +19,24 @@ class LinkedinCrawler
     @solver_details = solver_details
 
     # Handle crawler manager info
+    @cm_hash = cm_hash
     @cm_url = cm_hash[:crawler_manager_url] if cm_hash
     @selector_id = cm_hash[:selector_id] if cm_hash
   end
 
   # Run search terms and get results
   def search
-
+
     begin
       # Run Google search
-      g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details, nil)
+      g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details, @cm_hash)
       urls = g.getURLs
 
       # Look for new LI urls
-      g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details, nil)
+      g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details, @cm_hash)
       urls = JSON.parse(urls) + JSON.parse(g2.getURLs)
-    rescue Exception
+    rescue => e
+      report_status("Error running Google Crawler from LinkedIn Crawler: " +e.to_s)
       binding.pry
     end
 
@@ -47,6 +49,7 @@ class LinkedinCrawler
 
     # Close all the browsers when done
     @requests.close_all_browsers
+    report_status("Data collection completed for " + @search_terms.to_s)
   end
 
   # Check that it is actually a LinkedIn profile page
@@ -84,8 +87,10 @@ class LinkedinCrawler
     if @retry_count < @retry_limit
       @requests.restart_browser
       @retry_count += 1
+      report_status("Profile parsing failed for "+profile_url.to_s+". Retrying...")
       scrape(profile_url)
     else # Just save it and move on
+      report_status("Profile parsing failed for "+profile_url.to_s+". Moving on.")
       report_results(parsed_profile, profile_url)
     end
 
@@ -113,6 +118,16 @@ class LinkedinCrawler
                              Curl::PostField.content('results', JSON.pretty_generate(results)))
   end
 
+  # Report Harvester status message
+  def report_status(status_msg)
+    if @cm_url
+      curl_url = @cm_url+"/update_status"
+      c = Curl::Easy.http_post(curl_url,
+                               Curl::PostField.content('selector_id', @selector_id),
+                               Curl::PostField.content('status_message', status_msg))
+    end
+  end
+
   # Print output in JSON
   def gen_json
     JSON.pretty_generate(@output)
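
Note on the status reporting added in this release: report_status is a no-op unless a crawler manager hash was supplied, i.e. cm_hash carried a :crawler_manager_url (and a :selector_id) when the crawler was constructed. The snippet below is a minimal standalone sketch of the request report_status issues, using the same curb calls as the diff; the manager URL and selector id values are made-up examples, not part of the gem.

  require 'curb'

  # Hypothetical example values -- substitute your own crawler manager details.
  cm_url = "http://crawler-manager.example.com"
  selector_id = "linkedin-profiles"

  # Mirrors report_status: POST the selector id and a status message to
  # <crawler_manager_url>/update_status on the crawler manager.
  Curl::Easy.http_post(cm_url + "/update_status",
                       Curl::PostField.content('selector_id', selector_id),
                       Curl::PostField.content('status_message', "Data collection completed"))
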
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: linkedincrawler
 version: !ruby/object:Gem::Version
-  version: 0.0.18
+  version: 0.0.19
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-10-08 00:00:00.000000000 Z
+date: 2016-10-30 00:00:00.000000000 Z
 dependencies: []
 description: Crawls public LinkedIn profiles via Google
 email: shidash@shidash.com