linkedincrawler 0.0.15 → 0.0.16
- checksums.yaml +4 -4
- data/lib/linkedincrawler.rb +31 -3
- metadata +2 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e497521e3f492f55300fe474d802e36645cb9ff3
+  data.tar.gz: 1c7599985292c36c758577b0325dfa49fa671ec7
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a7b2616237182ca01b2144ae3b527c27c7adb76f41d454b6cb01f7ae7e10151cbe519815aadeb838ecbb1d0f9ae247829b584bbf74d8c69364cda0d7b262481f
+  data.tar.gz: 4911139eb248197b6bf503fe5d3373beee04e3033eae8283106fa4141e3911a938f353190e1ffbe92abc633f184b3c401239c3208469a4f2c13bd36e1166efda
data/lib/linkedincrawler.rb
CHANGED

@@ -6,7 +6,7 @@ require 'selenium-webdriver'
 require 'pry'
 
 class LinkedinCrawler
-  def initialize(search_terms, retry_limit, requests, requests_google, requests_google2, solver_details)
+  def initialize(search_terms, retry_limit, requests, requests_google, requests_google2, solver_details, cm_hash)
     @search_terms = search_terms
     @output = Array.new
 
@@ -17,10 +17,16 @@ class LinkedinCrawler
     @requests_google = requests_google
     @requests_google2 = requests_google2
     @solver_details = solver_details
+
+    # Handle crawler manager info
+    @cm_url = cm_hash[:crawler_manager_url] if cm_hash
+    @selector_id = cm_hash[:selector_id] if cm_hash
   end
 
   # Run search terms and get results
   def search
+
+    begin
     # Run Google search
     g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details)
     urls = g.getURLs
@@ -28,6 +34,9 @@ class LinkedinCrawler
     # Look for new LI urls
     g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details)
     urls = JSON.parse(urls) + JSON.parse(g2.getURLs)
+    rescue Exception
+      binding.pry
+    end
 
     # Scrape each resulting LinkedIn page
     urls.each do |profile|
@@ -77,14 +86,33 @@ class LinkedinCrawler
         @retry_count += 1
         scrape(profile_url)
       else # Just save it and move on
-
+        report_results(parsed_profile, profile_url)
       end
 
     else # It succeeded!
-
+      report_results(parsed_profile, profile_url)
     end
   end
 
+  # Figure out how to report results
+  def report_results(results, link)
+    if @cm_url
+      report_incremental(results, link)
+    else
+      save_and_continue(results)
+    end
+  end
+
+  # Report results back to Harvester incrementally
+  def report_incremental(results, link)
+    curl_url = @cm_url+"/relay_results"
+    @retry_count = 0
+    c = Curl::Easy.http_post(curl_url,
+          Curl::PostField.content('selector_id', @selector_id),
+          Curl::PostField.content('status_message', "Collected " + link),
+          Curl::PostField.content('results', JSON.pretty_generate(results)))
+  end
+
   # Print output in JSON
   def gen_json
     JSON.pretty_generate(@output)
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: linkedincrawler
 version: !ruby/object:Gem::Version
-  version: 0.0.15
+  version: 0.0.16
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-10-07 00:00:00.000000000 Z
 dependencies: []
 description: Crawls public LinkedIn profiles via Google
 email: shidash@shidash.com