linkedincrawler 0.0.19 → 0.0.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/linkedincrawler.rb +31 -33
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 66c4b7e1c8b816cc32145cfbee2531a3669a4709
|
4
|
+
data.tar.gz: fd88835b7695d9493b2521461fc9af53f7f34884
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 221a8351e11bc443a609eaf460651e03442ff179b333a7bd596b9e3c6ee725d37c407f6bf34eb566c7864ee3cdacbfcfe2d3676455ae2169ce1c5afddb8145cb
|
7
|
+
data.tar.gz: 1bc4ec5b78fe5bae53b5e494a87c538759aeee72bd5ffe895f3b43e04fd1ac9e99292811f1fa5c37e8863c2bf8e3b0611d8aa33abc3c0a640fddf9641b83b8f3
|
data/lib/linkedincrawler.rb
CHANGED
@@ -4,6 +4,7 @@ require 'generalscraper'
|
|
4
4
|
|
5
5
|
require 'selenium-webdriver'
|
6
6
|
require 'pry'
|
7
|
+
require 'headless'
|
7
8
|
|
8
9
|
class LinkedinCrawler
|
9
10
|
def initialize(search_terms, retry_limit, requests, requests_google, requests_google2, solver_details, cm_hash)
|
@@ -26,30 +27,39 @@ class LinkedinCrawler
|
|
26
27
|
|
27
28
|
# Run search terms and get results
|
28
29
|
def search
|
29
|
-
|
30
|
+
# Get matching profiles
|
31
|
+
urls = google_queries
|
32
|
+
|
33
|
+
# Get pages and report results
|
34
|
+
get_pages(urls)
|
35
|
+
report_status("Data collection completed for " + @search_terms.to_s)
|
36
|
+
end
|
37
|
+
|
38
|
+
# Run queries on google
|
39
|
+
def google_queries
|
30
40
|
begin
|
31
41
|
# Run Google search
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
42
|
+
g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details, @cm_hash)
|
43
|
+
urls = g.getURLs
|
44
|
+
|
45
|
+
# Look for new LI urls
|
46
|
+
g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details, @cm_hash)
|
47
|
+
urls = JSON.parse(urls) + JSON.parse(g2.getURLs)
|
38
48
|
rescue => e
|
39
49
|
report_status("Error running Google Crawler from LinkedIn Crawler: " +e.to_s)
|
40
50
|
binding.pry
|
41
51
|
end
|
42
|
-
|
43
|
-
|
44
|
-
urls.each do |profile|
|
45
|
-
if check_right_page(profile)
|
46
|
-
scrape(profile)
|
47
|
-
end
|
48
|
-
end
|
52
|
+
return urls
|
53
|
+
end
|
49
54
|
|
50
|
-
|
51
|
-
|
52
|
-
|
55
|
+
# Get each page itself
|
56
|
+
def get_pages(urls)
|
57
|
+
profiles = urls.select{|u| check_right_page(u)}
|
58
|
+
t = TranslatePage.new(profiles, @requests)
|
59
|
+
parsed_profiles = t.translate
|
60
|
+
parsed_profiles.each do |profile|
|
61
|
+
parse_and_report(profile[:url], profile[:html])
|
62
|
+
end
|
53
63
|
end
|
54
64
|
|
55
65
|
# Check that it is actually a LinkedIn profile page
|
@@ -72,28 +82,16 @@ class LinkedinCrawler
|
|
72
82
|
return (parsed_profile == nil) || parsed_profile.empty? || parsed_profile.first["parsing_failed"]
|
73
83
|
end
|
74
84
|
|
75
|
-
#
|
76
|
-
def scrape(profile_url)
|
77
|
-
# Get profile page
|
78
|
-
profile_html = @requests.get_page(profile_url)
|
79
|
-
|
85
|
+
# Parse each page
|
86
|
+
def parse_and_report(profile_url, profile_html)
|
80
87
|
# Parse profile
|
81
88
|
l = LinkedinParser.new(profile_html, profile_url, {timestamp: Time.now, search_terms: @search_terms})
|
82
89
|
parsed_profile = JSON.parse(l.results_by_job)
|
83
90
|
|
84
91
|
# Check if it failed or succeeded
|
85
92
|
if profile_parsing_failed?(parsed_profile)
|
86
|
-
|
87
|
-
|
88
|
-
@requests.restart_browser
|
89
|
-
@retry_count += 1
|
90
|
-
report_status("Profile parsing failed for "+profile_url.to_s+". Retrying...")
|
91
|
-
scrape(profile_url)
|
92
|
-
else # Just save it and move on
|
93
|
-
report_status("Profile parsing failed for "+profile_url.to_s+". Moving on.")
|
94
|
-
report_results(parsed_profile, profile_url)
|
95
|
-
end
|
96
|
-
|
93
|
+
report_status("Profile parsing failed for "+profile_url.to_s+". Moving on.")
|
94
|
+
report_results(parsed_profile, profile_url)
|
97
95
|
else # It succeeded!
|
98
96
|
report_results(parsed_profile, profile_url)
|
99
97
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedincrawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.19
|
4
|
+
version: 0.0.20
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-02-18 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Crawls public LinkedIn profiles via Google
|
14
14
|
email: shidash@shidash.com
|