linkedincrawler 0.0.19 → 0.0.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/linkedincrawler.rb +31 -33
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b55cee51c70158577d2c1e1c618aa284f6a4647c
4
- data.tar.gz: 7c2ab05da2829fb7f85f6510a3eee45ba7ef65df
3
+ metadata.gz: 66c4b7e1c8b816cc32145cfbee2531a3669a4709
4
+ data.tar.gz: fd88835b7695d9493b2521461fc9af53f7f34884
5
5
  SHA512:
6
- metadata.gz: 51c17a2cd6eb7a819a095291aa84fc52cbb7ddf00e7fbb0c627f9dd19a10181c86f195a2b807253ad89820761e0025eed2c07865b184d763d4a94359f7fad077
7
- data.tar.gz: 9f9f9ef9af43157daac01b2a2e085782538eb5441f46c98f754f26c9428260b0193df59125cc8dd8bfa352a7b46b88a79c39279ce5666aabc2e95afdf71be652
6
+ metadata.gz: 221a8351e11bc443a609eaf460651e03442ff179b333a7bd596b9e3c6ee725d37c407f6bf34eb566c7864ee3cdacbfcfe2d3676455ae2169ce1c5afddb8145cb
7
+ data.tar.gz: 1bc4ec5b78fe5bae53b5e494a87c538759aeee72bd5ffe895f3b43e04fd1ac9e99292811f1fa5c37e8863c2bf8e3b0611d8aa33abc3c0a640fddf9641b83b8f3
@@ -4,6 +4,7 @@ require 'generalscraper'
4
4
 
5
5
  require 'selenium-webdriver'
6
6
  require 'pry'
7
+ require 'headless'
7
8
 
8
9
  class LinkedinCrawler
9
10
  def initialize(search_terms, retry_limit, requests, requests_google, requests_google2, solver_details, cm_hash)
@@ -26,30 +27,39 @@ class LinkedinCrawler
26
27
 
27
28
  # Run search terms and get results
28
29
  def search
29
-
30
+ # Get matching profiles
31
+ urls = google_queries
32
+
33
+ # Get pages and report results
34
+ get_pages(urls)
35
+ report_status("Data collection completed for " + @search_terms.to_s)
36
+ end
37
+
38
+ # Run queries on google
39
+ def google_queries
30
40
  begin
31
41
  # Run Google search
32
- g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details, @cm_hash)
33
- urls = g.getURLs
34
-
35
- # Look for new LI urls
36
- g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details, @cm_hash)
37
- urls = JSON.parse(urls) + JSON.parse(g2.getURLs)
42
+ g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details, @cm_hash)
43
+ urls = g.getURLs
44
+
45
+ # Look for new LI urls
46
+ g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details, @cm_hash)
47
+ urls = JSON.parse(urls) + JSON.parse(g2.getURLs)
38
48
  rescue => e
39
49
  report_status("Error running Google Crawler from LinkedIn Crawler: " +e.to_s)
40
50
  binding.pry
41
51
  end
42
-
43
- # Scrape each resulting LinkedIn page
44
- urls.each do |profile|
45
- if check_right_page(profile)
46
- scrape(profile)
47
- end
48
- end
52
+ return urls
53
+ end
49
54
 
50
- # Close all the browsers when done
51
- @requests.close_all_browsers
52
- report_status("Data collection completed for " + @search_terms.to_s)
55
+ # Get each page itself
56
+ def get_pages(urls)
57
+ profiles = urls.select{|u| check_right_page(u)}
58
+ t = TranslatePage.new(profiles, @requests)
59
+ parsed_profiles = t.translate
60
+ parsed_profiles.each do |profile|
61
+ parse_and_report(profile[:url], profile[:html])
62
+ end
53
63
  end
54
64
 
55
65
  # Check that it is actually a LinkedIn profile page
@@ -72,28 +82,16 @@ class LinkedinCrawler
72
82
  return (parsed_profile == nil) || parsed_profile.empty? || parsed_profile.first["parsing_failed"]
73
83
  end
74
84
 
75
- # Scrape each page
76
- def scrape(profile_url)
77
- # Get profile page
78
- profile_html = @requests.get_page(profile_url)
79
-
85
+ # Parse each page
86
+ def parse_and_report(profile_url, profile_html)
80
87
  # Parse profile
81
88
  l = LinkedinParser.new(profile_html, profile_url, {timestamp: Time.now, search_terms: @search_terms})
82
89
  parsed_profile = JSON.parse(l.results_by_job)
83
90
 
84
91
  # Check if it failed or succeeded
85
92
  if profile_parsing_failed?(parsed_profile)
86
- # Handle something wrong- restart in case it is blocked and rescrape
87
- if @retry_count < @retry_limit
88
- @requests.restart_browser
89
- @retry_count += 1
90
- report_status("Profile parsing failed for "+profile_url.to_s+". Retrying...")
91
- scrape(profile_url)
92
- else # Just save it and move on
93
- report_status("Profile parsing failed for "+profile_url.to_s+". Moving on.")
94
- report_results(parsed_profile, profile_url)
95
- end
96
-
93
+ report_status("Profile parsing failed for "+profile_url.to_s+". Moving on.")
94
+ report_results(parsed_profile, profile_url)
97
95
  else # It succeeded!
98
96
  report_results(parsed_profile, profile_url)
99
97
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkedincrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.19
4
+ version: 0.0.20
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-10-30 00:00:00.000000000 Z
11
+ date: 2017-02-18 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Crawls public LinkedIn profiles via Google
14
14
  email: shidash@shidash.com