linkedincrawler 0.0.18 → 0.0.19
- checksums.yaml +4 -4
- data/lib/linkedincrawler.rb +19 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b55cee51c70158577d2c1e1c618aa284f6a4647c
+  data.tar.gz: 7c2ab05da2829fb7f85f6510a3eee45ba7ef65df
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 51c17a2cd6eb7a819a095291aa84fc52cbb7ddf00e7fbb0c627f9dd19a10181c86f195a2b807253ad89820761e0025eed2c07865b184d763d4a94359f7fad077
+  data.tar.gz: 9f9f9ef9af43157daac01b2a2e085782538eb5441f46c98f754f26c9428260b0193df59125cc8dd8bfa352a7b46b88a79c39279ce5666aabc2e95afdf71be652
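A .gem file is itself a plain tar archive containing metadata.gz, data.tar.gz, and checksums.yaml.gz, so the digests above can be recomputed locally with Ruby's standard library. A minimal verification sketch, assuming the gem has first been unpacked with "tar -xf linkedincrawler-0.0.19.gem":

require 'digest'
require 'yaml'
require 'zlib'

# Read the checksum table shipped inside the gem
checksums = YAML.safe_load(Zlib::GzipReader.open('checksums.yaml.gz', &:read))

# Recompute SHA1 and SHA512 over the two inner archives and compare
%w[metadata.gz data.tar.gz].each do |file|
  data = File.binread(file)
  sha1_ok   = checksums['SHA1'][file]   == Digest::SHA1.hexdigest(data)
  sha512_ok = checksums['SHA512'][file] == Digest::SHA512.hexdigest(data)
  puts "#{file}: SHA1 #{sha1_ok ? 'ok' : 'MISMATCH'}, SHA512 #{sha512_ok ? 'ok' : 'MISMATCH'}"
end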
data/lib/linkedincrawler.rb
CHANGED
@@ -19,22 +19,24 @@ class LinkedinCrawler
     @solver_details = solver_details
 
     # Handle crawler manager info
+    @cm_hash = cm_hash
     @cm_url = cm_hash[:crawler_manager_url] if cm_hash
     @selector_id = cm_hash[:selector_id] if cm_hash
   end
 
   # Run search terms and get results
   def search
-
+
     begin
       # Run Google search
-      g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details,
+      g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details, @cm_hash)
       urls = g.getURLs
 
       # Look for new LI urls
-      g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details,
+      g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details, @cm_hash)
       urls = JSON.parse(urls) + JSON.parse(g2.getURLs)
-    rescue
+    rescue => e
+      report_status("Error running Google Crawler from LinkedIn Crawler: " + e.to_s)
       binding.pry
     end
 
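The rescue change above is worth a note: a bare rescue already limits itself to StandardError, but it discards the exception object, so nothing useful can be forwarded. Binding it with rescue => e is what lets the error text reach the crawler manager. A self-contained illustration with a simulated failure:

begin
  raise ArgumentError, "simulated Google crawler failure"
rescue => e
  # With the exception bound to e, its message can be reported upstream
  puts "Error running Google Crawler from LinkedIn Crawler: " + e.to_s
end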
@@ -47,6 +49,7 @@ class LinkedinCrawler
 
     # Close all the browsers when done
     @requests.close_all_browsers
+    report_status("Data collection completed for " + @search_terms.to_s)
   end
 
   # Check that it is actually a LinkedIn profile page
@@ -84,8 +87,10 @@ class LinkedinCrawler
     if @retry_count < @retry_limit
       @requests.restart_browser
       @retry_count += 1
+      report_status("Profile parsing failed for " + profile_url.to_s + ". Retrying...")
       scrape(profile_url)
     else # Just save it and move on
+      report_status("Profile parsing failed for " + profile_url.to_s + ". Moving on.")
       report_results(parsed_profile, profile_url)
     end
 
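This hunk instruments the existing bounded-retry path: scrape re-invokes itself until @retry_count reaches @retry_limit, and each failure is now reported before retrying or giving up. The same pattern in isolation, using Ruby's retry keyword instead of the gem's recursive call, with a hypothetical limit and a stand-in for the parse step:

retry_count = 0
retry_limit = 3                            # hypothetical; the gem sets its own limit
begin
  raise "parse failed" if retry_count < 2  # stand-in for profile parsing
  puts "parsed after #{retry_count} retries"
rescue
  if retry_count < retry_limit
    retry_count += 1
    retry                                  # loop analogue of scrape(profile_url)
  else
    puts "giving up and saving what we have"
  end
end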
@@ -113,6 +118,16 @@ class LinkedinCrawler
                             Curl::PostField.content('results', JSON.pretty_generate(results)))
   end
 
+  # Report Harvester status message
+  def report_status(status_msg)
+    if @cm_url
+      curl_url = @cm_url + "/update_status"
+      c = Curl::Easy.http_post(curl_url,
+                               Curl::PostField.content('selector_id', @selector_id),
+                               Curl::PostField.content('status_message', status_msg))
+    end
+  end
+
   # Print output in JSON
   def gen_json
     JSON.pretty_generate(@output)
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: linkedincrawler
 version: !ruby/object:Gem::Version
-  version: 0.0.18
+  version: 0.0.19
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-10-
+date: 2016-10-30 00:00:00.000000000 Z
 dependencies: []
 description: Crawls public LinkedIn profiles via Google
 email: shidash@shidash.com