linkedincrawler 0.0.18 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/linkedincrawler.rb +19 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b55cee51c70158577d2c1e1c618aa284f6a4647c
|
4
|
+
data.tar.gz: 7c2ab05da2829fb7f85f6510a3eee45ba7ef65df
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 51c17a2cd6eb7a819a095291aa84fc52cbb7ddf00e7fbb0c627f9dd19a10181c86f195a2b807253ad89820761e0025eed2c07865b184d763d4a94359f7fad077
|
7
|
+
data.tar.gz: 9f9f9ef9af43157daac01b2a2e085782538eb5441f46c98f754f26c9428260b0193df59125cc8dd8bfa352a7b46b88a79c39279ce5666aabc2e95afdf71be652
|
data/lib/linkedincrawler.rb
CHANGED
@@ -19,22 +19,24 @@ class LinkedinCrawler
|
|
19
19
|
@solver_details = solver_details
|
20
20
|
|
21
21
|
# Handle crawler manager info
|
22
|
+
@cm_hash = cm_hash
|
22
23
|
@cm_url = cm_hash[:crawler_manager_url] if cm_hash
|
23
24
|
@selector_id = cm_hash[:selector_id] if cm_hash
|
24
25
|
end
|
25
26
|
|
26
27
|
# Run search terms and get results
|
27
28
|
def search
|
28
|
-
|
29
|
+
|
29
30
|
begin
|
30
31
|
# Run Google search
|
31
|
-
g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details,
|
32
|
+
g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details, @cm_hash)
|
32
33
|
urls = g.getURLs
|
33
34
|
|
34
35
|
# Look for new LI urls
|
35
|
-
g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details,
|
36
|
+
g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details, @cm_hash)
|
36
37
|
urls = JSON.parse(urls) + JSON.parse(g2.getURLs)
|
37
|
-
rescue
|
38
|
+
rescue => e
|
39
|
+
report_status("Error running Google Crawler from LinkedIn Crawler: " +e.to_s)
|
38
40
|
binding.pry
|
39
41
|
end
|
40
42
|
|
@@ -47,6 +49,7 @@ class LinkedinCrawler
|
|
47
49
|
|
48
50
|
# Close all the browsers when done
|
49
51
|
@requests.close_all_browsers
|
52
|
+
report_status("Data collection completed for " + @search_terms.to_s)
|
50
53
|
end
|
51
54
|
|
52
55
|
# Check that it is actually a LinkedIn profile page
|
@@ -84,8 +87,10 @@ class LinkedinCrawler
|
|
84
87
|
if @retry_count < @retry_limit
|
85
88
|
@requests.restart_browser
|
86
89
|
@retry_count += 1
|
90
|
+
report_status("Profile parsing failed for "+profile_url.to_s+". Retrying...")
|
87
91
|
scrape(profile_url)
|
88
92
|
else # Just save it and move on
|
93
|
+
report_status("Profile parsing failed for "+profile_url.to_s+". Moving on.")
|
89
94
|
report_results(parsed_profile, profile_url)
|
90
95
|
end
|
91
96
|
|
@@ -113,6 +118,16 @@ class LinkedinCrawler
|
|
113
118
|
Curl::PostField.content('results', JSON.pretty_generate(results)))
|
114
119
|
end
|
115
120
|
|
121
|
+
# Report Harvester status message
|
122
|
+
def report_status(status_msg)
|
123
|
+
if @cm_url
|
124
|
+
curl_url = @cm_url+"/update_status"
|
125
|
+
c = Curl::Easy.http_post(curl_url,
|
126
|
+
Curl::PostField.content('selector_id', @selector_id),
|
127
|
+
Curl::PostField.content('status_message', status_msg))
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
116
131
|
# Print output in JSON
|
117
132
|
def gen_json
|
118
133
|
JSON.pretty_generate(@output)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedincrawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.18
|
4
|
+
version: 0.0.19
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-10-
|
11
|
+
date: 2016-10-30 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Crawls public LinkedIn profiles via Google
|
14
14
|
email: shidash@shidash.com
|