linkedincrawler 0.0.19 → 0.0.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/linkedincrawler.rb +31 -33
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b55cee51c70158577d2c1e1c618aa284f6a4647c
4
- data.tar.gz: 7c2ab05da2829fb7f85f6510a3eee45ba7ef65df
3
+ metadata.gz: 66c4b7e1c8b816cc32145cfbee2531a3669a4709
4
+ data.tar.gz: fd88835b7695d9493b2521461fc9af53f7f34884
5
5
  SHA512:
6
- metadata.gz: 51c17a2cd6eb7a819a095291aa84fc52cbb7ddf00e7fbb0c627f9dd19a10181c86f195a2b807253ad89820761e0025eed2c07865b184d763d4a94359f7fad077
7
- data.tar.gz: 9f9f9ef9af43157daac01b2a2e085782538eb5441f46c98f754f26c9428260b0193df59125cc8dd8bfa352a7b46b88a79c39279ce5666aabc2e95afdf71be652
6
+ metadata.gz: 221a8351e11bc443a609eaf460651e03442ff179b333a7bd596b9e3c6ee725d37c407f6bf34eb566c7864ee3cdacbfcfe2d3676455ae2169ce1c5afddb8145cb
7
+ data.tar.gz: 1bc4ec5b78fe5bae53b5e494a87c538759aeee72bd5ffe895f3b43e04fd1ac9e99292811f1fa5c37e8863c2bf8e3b0611d8aa33abc3c0a640fddf9641b83b8f3
@@ -4,6 +4,7 @@ require 'generalscraper'
4
4
 
5
5
  require 'selenium-webdriver'
6
6
  require 'pry'
7
+ require 'headless'
7
8
 
8
9
  class LinkedinCrawler
9
10
  def initialize(search_terms, retry_limit, requests, requests_google, requests_google2, solver_details, cm_hash)
@@ -26,30 +27,39 @@ class LinkedinCrawler
26
27
 
27
28
  # Run search terms and get results
28
29
  def search
29
-
30
+ # Get matching profiles
31
+ urls = google_queries
32
+
33
+ # Get pages and report results
34
+ get_pages(urls)
35
+ report_status("Data collection completed for " + @search_terms.to_s)
36
+ end
37
+
38
+ # Run queries on google
39
+ def google_queries
30
40
  begin
31
41
  # Run Google search
32
- g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details, @cm_hash)
33
- urls = g.getURLs
34
-
35
- # Look for new LI urls
36
- g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details, @cm_hash)
37
- urls = JSON.parse(urls) + JSON.parse(g2.getURLs)
42
+ g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details, @cm_hash)
43
+ urls = g.getURLs
44
+
45
+ # Look for new LI urls
46
+ g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details, @cm_hash)
47
+ urls = JSON.parse(urls) + JSON.parse(g2.getURLs)
38
48
  rescue => e
39
49
  report_status("Error running Google Crawler from LinkedIn Crawler: " +e.to_s)
40
50
  binding.pry
41
51
  end
42
-
43
- # Scrape each resulting LinkedIn page
44
- urls.each do |profile|
45
- if check_right_page(profile)
46
- scrape(profile)
47
- end
48
- end
52
+ return urls
53
+ end
49
54
 
50
- # Close all the browsers when done
51
- @requests.close_all_browsers
52
- report_status("Data collection completed for " + @search_terms.to_s)
55
+ # Get each page itself
56
+ def get_pages(urls)
57
+ profiles = urls.select{|u| check_right_page(u)}
58
+ t = TranslatePage.new(profiles, @requests)
59
+ parsed_profiles = t.translate
60
+ parsed_profiles.each do |profile|
61
+ parse_and_report(profile[:url], profile[:html])
62
+ end
53
63
  end
54
64
 
55
65
  # Check that it is actually a LinkedIn profile page
@@ -72,28 +82,16 @@ class LinkedinCrawler
72
82
  return (parsed_profile == nil) || parsed_profile.empty? || parsed_profile.first["parsing_failed"]
73
83
  end
74
84
 
75
- # Scrape each page
76
- def scrape(profile_url)
77
- # Get profile page
78
- profile_html = @requests.get_page(profile_url)
79
-
85
+ # Parse each page
86
+ def parse_and_report(profile_url, profile_html)
80
87
  # Parse profile
81
88
  l = LinkedinParser.new(profile_html, profile_url, {timestamp: Time.now, search_terms: @search_terms})
82
89
  parsed_profile = JSON.parse(l.results_by_job)
83
90
 
84
91
  # Check if it failed or succeeded
85
92
  if profile_parsing_failed?(parsed_profile)
86
- # Handle something wrong- restart in case it is blocked and rescrape
87
- if @retry_count < @retry_limit
88
- @requests.restart_browser
89
- @retry_count += 1
90
- report_status("Profile parsing failed for "+profile_url.to_s+". Retrying...")
91
- scrape(profile_url)
92
- else # Just save it and move on
93
- report_status("Profile parsing failed for "+profile_url.to_s+". Moving on.")
94
- report_results(parsed_profile, profile_url)
95
- end
96
-
93
+ report_status("Profile parsing failed for "+profile_url.to_s+". Moving on.")
94
+ report_results(parsed_profile, profile_url)
97
95
  else # It succeeded!
98
96
  report_results(parsed_profile, profile_url)
99
97
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkedincrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.19
4
+ version: 0.0.20
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-10-30 00:00:00.000000000 Z
11
+ date: 2017-02-18 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Crawls public LinkedIn profiles via Google
14
14
  email: shidash@shidash.com