linkedincrawler 0.0.19 → 0.0.20

Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/linkedincrawler.rb +31 -33
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: b55cee51c70158577d2c1e1c618aa284f6a4647c
-  data.tar.gz: 7c2ab05da2829fb7f85f6510a3eee45ba7ef65df
+  metadata.gz: 66c4b7e1c8b816cc32145cfbee2531a3669a4709
+  data.tar.gz: fd88835b7695d9493b2521461fc9af53f7f34884
 SHA512:
-  metadata.gz: 51c17a2cd6eb7a819a095291aa84fc52cbb7ddf00e7fbb0c627f9dd19a10181c86f195a2b807253ad89820761e0025eed2c07865b184d763d4a94359f7fad077
-  data.tar.gz: 9f9f9ef9af43157daac01b2a2e085782538eb5441f46c98f754f26c9428260b0193df59125cc8dd8bfa352a7b46b88a79c39279ce5666aabc2e95afdf71be652
+  metadata.gz: 221a8351e11bc443a609eaf460651e03442ff179b333a7bd596b9e3c6ee725d37c407f6bf34eb566c7864ee3cdacbfcfe2d3676455ae2169ce1c5afddb8145cb
+  data.tar.gz: 1bc4ec5b78fe5bae53b5e494a87c538759aeee72bd5ffe895f3b43e04fd1ac9e99292811f1fa5c37e8863c2bf8e3b0611d8aa33abc3c0a640fddf9641b83b8f3
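
The checksum bump itself is routine for a release. For reference, checksums.yaml maps each file packaged inside the .gem archive to its SHA1 and SHA512 digests; a minimal verification sketch in Ruby (hypothetical, not part of this gem; it assumes metadata.gz, data.tar.gz, and checksums.yaml.gz have already been extracted from the .gem tar into the current directory):

require 'digest'
require 'yaml'
require 'zlib'

# Hypothetical check: a .gem is a tar archive containing metadata.gz,
# data.tar.gz, and checksums.yaml.gz. Compare recorded vs. actual digests.
sums = YAML.safe_load(Zlib::GzipReader.open('checksums.yaml.gz', &:read))

%w[metadata.gz data.tar.gz].each do |file|
  actual = Digest::SHA512.file(file).hexdigest
  status = actual == sums['SHA512'][file] ? 'OK' : 'MISMATCH'
  puts "#{file}: #{status}"
end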
data/lib/linkedincrawler.rb CHANGED
@@ -4,6 +4,7 @@ require 'generalscraper'
 
 require 'selenium-webdriver'
 require 'pry'
+require 'headless'
 
 class LinkedinCrawler
   def initialize(search_terms, retry_limit, requests, requests_google, requests_google2, solver_details, cm_hash)
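
This hunk only adds the require; none of the hunks shown here call the headless gem directly, so how the crawler drives it is an assumption. For context, headless wraps Xvfb so selenium-webdriver can run without a real display; the gem's usual pattern looks like this (illustrative only):

require 'headless'

# Start a virtual X display, run browser work inside it, then tear it down.
headless = Headless.new
headless.start

# ... drive selenium-webdriver sessions here ...

headless.destroy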
@@ -26,30 +27,39 @@ class LinkedinCrawler
 
   # Run search terms and get results
   def search
-
+    # Get matching profiles
+    urls = google_queries
+
+    # Get pages and report results
+    get_pages(urls)
+    report_status("Data collection completed for " + @search_terms.to_s)
+  end
+
+  # Run queries on google
+  def google_queries
     begin
       # Run Google search
-      g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details, @cm_hash)
-      urls = g.getURLs
-
-      # Look for new LI urls
-      g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details, @cm_hash)
-      urls = JSON.parse(urls) + JSON.parse(g2.getURLs)
+      g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details, @cm_hash)
+      urls = g.getURLs
+
+      # Look for new LI urls
+      g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details, @cm_hash)
+      urls = JSON.parse(urls) + JSON.parse(g2.getURLs)
     rescue => e
       report_status("Error running Google Crawler from LinkedIn Crawler: " +e.to_s)
       binding.pry
     end
-
-    # Scrape each resulting LinkedIn page
-    urls.each do |profile|
-      if check_right_page(profile)
-        scrape(profile)
-      end
-    end
+    return urls
+  end
 
-    # Close all the browsers when done
-    @requests.close_all_browsers
-    report_status("Data collection completed for " + @search_terms.to_s)
+  # Get each page itself
+  def get_pages(urls)
+    profiles = urls.select{|u| check_right_page(u)}
+    t = TranslatePage.new(profiles, @requests)
+    parsed_profiles = t.translate
+    parsed_profiles.each do |profile|
+      parse_and_report(profile[:url], profile[:html])
+    end
   end
 
   # Check that it is actually a LinkedIn profile page
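
TranslatePage, which the new get_pages calls, is not defined anywhere in this diff, so its interface is an assumption. A minimal sketch of what get_pages appears to rely on, reusing @requests.get_page from the removed scrape method and the {url:, html:} shape that parse_and_report consumes:

# Hypothetical stand-in; the real TranslatePage presumably ships with one
# of the required libraries and may fetch or batch pages differently.
class TranslatePage
  def initialize(urls, requests)
    @urls = urls
    @requests = requests  # browser/request pool that responds to get_page
  end

  # Fetch each URL and return an array of {url:, html:} hashes.
  def translate
    @urls.map { |url| { url: url, html: @requests.get_page(url) } }
  end
end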
@@ -72,28 +82,16 @@ class LinkedinCrawler
     return (parsed_profile == nil) || parsed_profile.empty? || parsed_profile.first["parsing_failed"]
   end
 
-  # Scrape each page
-  def scrape(profile_url)
-    # Get profile page
-    profile_html = @requests.get_page(profile_url)
-
+  # Parse each page
+  def parse_and_report(profile_url, profile_html)
     # Parse profile
     l = LinkedinParser.new(profile_html, profile_url, {timestamp: Time.now, search_terms: @search_terms})
     parsed_profile = JSON.parse(l.results_by_job)
 
     # Check if it failed or succeeded
     if profile_parsing_failed?(parsed_profile)
-      # Handle something wrong- restart in case it is blocked and rescrape
-      if @retry_count < @retry_limit
-        @requests.restart_browser
-        @retry_count += 1
-        report_status("Profile parsing failed for "+profile_url.to_s+". Retrying...")
-        scrape(profile_url)
-      else # Just save it and move on
-        report_status("Profile parsing failed for "+profile_url.to_s+". Moving on.")
-        report_results(parsed_profile, profile_url)
-      end
-
+      report_status("Profile parsing failed for "+profile_url.to_s+". Moving on.")
+      report_results(parsed_profile, profile_url)
     else # It succeeded!
       report_results(parsed_profile, profile_url)
     end
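
Net effect of the refactor: the monolithic search is now google_queries (collect URLs) → get_pages (fetch) → parse_and_report (parse and report), the explicit browser teardown and the blocked-browser retry loop are gone, and a failed parse is reported once and saved as-is. Note that initialize still accepts retry_limit even though the shown hunks no longer consult it. A hypothetical driver, with placeholder arguments inferred only from the initialize signature:

require 'linkedincrawler'

# Every value below is a placeholder; only the parameter order comes
# from the initialize signature in this diff.
requests       = nil  # stand-in for the browser/request pool (must respond to get_page)
solver_details = nil  # stand-in for captcha-solver credentials
cm_hash        = nil  # stand-in for the status/reporting config

crawler = LinkedinCrawler.new(
  ["data engineer Boston"],  # search_terms
  3,                         # retry_limit (unused by the shown hunks)
  requests,
  10,                        # requests_google
  10,                        # requests_google2
  solver_details,
  cm_hash
)
crawler.search
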
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: linkedincrawler
 version: !ruby/object:Gem::Version
-  version: 0.0.19
+  version: 0.0.20
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-10-30 00:00:00.000000000 Z
+date: 2017-02-18 00:00:00.000000000 Z
 dependencies: []
 description: Crawls public LinkedIn profiles via Google
 email: shidash@shidash.com