linkedincrawler 0.0.4 → 0.0.5
- checksums.yaml +4 -4
- data/lib/linkedincrawler.rb +28 -12
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c1933a4a905a2b52b2d3b888d54c4d8d044ec7c5
+  data.tar.gz: 771175b55c592fc4bf14317a1c63cc76efb4ecd9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a427d86d2ddc648e1bb6362b93f97ffb56e1f68420ce418d6b4754062e752b8961b8e4e72da6a465c990d69367097e37e1d3a9ac054394e077de6449ca1fe62d
+  data.tar.gz: 23618949f1ce1bd4eaf16589bed5078e6cc840b36f8c8624f6e477e0f89890c48ac1c5efa5a8d2d191119689509f5284090714c8380693d1ae7fc93bd014c92f
data/lib/linkedincrawler.rb
CHANGED
@@ -1,45 +1,61 @@
+require 'requestmanager'
 require 'linkedinparser'
 require 'generalscraper'
+
 require 'selenium-webdriver'
 require 'pry'

 class LinkedinCrawler
-
-  def initialize(search_terms)
+  def initialize(search_terms, retry_limit, proxy_list, request_time)
     @search_terms = search_terms
     @output = Array.new
+    @retry_limit = retry_limit
+    @retry_count = 0
+    @proxy_list = proxy_list
+    @requests = RequestManager.new(@proxy_list, request_time, 5)
   end

   # Run search terms and get results
   def search
     # Run Google search
-    g = GeneralScraper.new("site:linkedin.com/pub
+    g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @proxy_list)

     # Scrape each resulting LinkedIn page
-    gen_driver
     JSON.parse(g.getURLs).each do |profile|
-
+      if profile.include?(".linkedin.") && !profile.include?("/search")
+        scrape(profile)
+      end
     end
+
+    # Close all the browsers
+    @requests.close_all_browsers
   end

-  #
-  def
-
-
-
-
+  # Check that it is actually a LinkedIn profile page
+  def check_right_page(profile_url)
+    return !profile_url.include?("www.google") &&
+           !profile_url.include?("linkedin.com/pub/dir") &&
+           !profile_url.include?("/search") &&
+           @retry_count < @retry_limit
   end

   # Scrape each page
   def scrape(profile_url)
     # Get profile page
-    profile_html =
+    profile_html = @requests.get_page(profile_url)

     # Parse profile and add to output
     begin
       l = LinkedinParser.new(profile_html, profile_url, {timestamp: Time.now})
       @output += JSON.parse(l.results_by_job)
+      @retry_count = 0
     rescue
+      # If proxy doesn't work, try another a few times
+      if check_right_page(profile_url)
+        @requests.restart_browser
+        @retry_count += 1
+        scrape(profile_url)
+      end
     end
   end

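For context, a minimal usage sketch of the new 0.0.5 constructor signature introduced above. All values below (search terms, proxy address, retry limit, request time) are placeholders, and the meaning of request_time is assumed from its pass-through to RequestManager.new; this file exposes no reader for @output, so only search is called.

    require 'linkedincrawler'

    # Placeholder inputs; any search terms and proxy "host:port" strings work here.
    search_terms = ["data journalist Boston"]
    proxy_list   = ["127.0.0.1:8118"]   # handed straight to RequestManager
    retry_limit  = 3                    # stop retrying a profile after 3 failed attempts
    request_time = 10                   # assumed: wait time passed to RequestManager.new

    crawler = LinkedinCrawler.new(search_terms, retry_limit, proxy_list, request_time)
    crawler.search  # googles site:linkedin.com/pub, scrapes each profile hit,
                    # then closes all RequestManager browsers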
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: linkedincrawler
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-11-
+date: 2015-11-03 00:00:00.000000000 Z
 dependencies: []
 description: Crawls public LinkedIn profiles via Google
 email: shidash@shidash.com