linkedincrawler 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/linkedincrawler.rb +28 -12
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3be949ebe25839f19c8c03dc0be404dc8e15928f
4
- data.tar.gz: b070cd1c81b4e91d31f38d13856279c27c9455ff
3
+ metadata.gz: c1933a4a905a2b52b2d3b888d54c4d8d044ec7c5
4
+ data.tar.gz: 771175b55c592fc4bf14317a1c63cc76efb4ecd9
5
5
  SHA512:
6
- metadata.gz: fae6bb4559925406d27e631e08c901d5d86a7089b5d94f7e31ebe16f6849d284da8a4070af5d3931de73ea17a1cb3779eb728a8b9036989dfb5cdc6873588727
7
- data.tar.gz: f100d18fd9951c0baa93e5dbbe6be08a4977cfb4aade1cb2e872d6bfecb6bc5f7f082ce85b4a5e671a1507287f9cd6cf7d1ab84121d36cd9c02be54984197646
6
+ metadata.gz: a427d86d2ddc648e1bb6362b93f97ffb56e1f68420ce418d6b4754062e752b8961b8e4e72da6a465c990d69367097e37e1d3a9ac054394e077de6449ca1fe62d
7
+ data.tar.gz: 23618949f1ce1bd4eaf16589bed5078e6cc840b36f8c8624f6e477e0f89890c48ac1c5efa5a8d2d191119689509f5284090714c8380693d1ae7fc93bd014c92f
@@ -1,45 +1,61 @@
1
+ require 'requestmanager'
1
2
  require 'linkedinparser'
2
3
  require 'generalscraper'
4
+
3
5
  require 'selenium-webdriver'
4
6
  require 'pry'
5
7
 
6
8
  class LinkedinCrawler
7
- include ProxyManager
8
- def initialize(search_terms)
9
+ def initialize(search_terms, retry_limit, proxy_list, request_time)
9
10
  @search_terms = search_terms
10
11
  @output = Array.new
12
+ @retry_limit = retry_limit
13
+ @retry_count = 0
14
+ @proxy_list = proxy_list
15
+ @requests = RequestManager.new(@proxy_list, request_time, 5)
11
16
  end
12
17
 
13
18
  # Run search terms and get results
14
19
  def search
15
20
  # Run Google search
16
- g = GeneralScraper.new("site:linkedin.com/pub", @search_terms, "/home/shidash/proxies", false)
21
+ g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @proxy_list)
17
22
 
18
23
  # Scrape each resulting LinkedIn page
19
- gen_driver
20
24
  JSON.parse(g.getURLs).each do |profile|
21
- scrape(profile)
25
+ if profile.include?(".linkedin.") && !profile.include?("/search")
26
+ scrape(profile)
27
+ end
22
28
  end
29
+
30
+ # Close all the browsers
31
+ @requests.close_all_browsers
23
32
  end
24
33
 
25
- # Generate driver for searches
26
- def gen_driver
27
- profile = Selenium::WebDriver::Firefox::Profile.new
28
- profile['intl.accept_languages'] = 'en'
29
- profile["javascript.enabled"] = false
30
- @driver = Selenium::WebDriver.for :firefox, profile: profile
34
+ # Check that it is actually a LinkedIn profile page
35
+ def check_right_page(profile_url)
36
+ return !profile_url.include?("www.google") &&
37
+ !profile_url.include?("linkedin.com/pub/dir") &&
38
+ !profile_url.include?("/search") &&
39
+ @retry_count < @retry_limit
31
40
  end
32
41
 
33
42
  # Scrape each page
34
43
  def scrape(profile_url)
35
44
  # Get profile page
36
- profile_html = getPage(profile_url, @driver, nil, 5, false).page_source
45
+ profile_html = @requests.get_page(profile_url)
37
46
 
38
47
  # Parse profile and add to output
39
48
  begin
40
49
  l = LinkedinParser.new(profile_html, profile_url, {timestamp: Time.now})
41
50
  @output += JSON.parse(l.results_by_job)
51
+ @retry_count = 0
42
52
  rescue
53
+ # If proxy doesn't work, try another a few times
54
+ if check_right_page(profile_url)
55
+ @requests.restart_browser
56
+ @retry_count += 1
57
+ scrape(profile_url)
58
+ end
43
59
  end
44
60
  end
45
61
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkedincrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-01 00:00:00.000000000 Z
11
+ date: 2015-11-03 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Crawls public LinkedIn profiles via Google
14
14
  email: shidash@shidash.com