linkedincrawler 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/linkedincrawler.rb +28 -12
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3be949ebe25839f19c8c03dc0be404dc8e15928f
4
- data.tar.gz: b070cd1c81b4e91d31f38d13856279c27c9455ff
3
+ metadata.gz: c1933a4a905a2b52b2d3b888d54c4d8d044ec7c5
4
+ data.tar.gz: 771175b55c592fc4bf14317a1c63cc76efb4ecd9
5
5
  SHA512:
6
- metadata.gz: fae6bb4559925406d27e631e08c901d5d86a7089b5d94f7e31ebe16f6849d284da8a4070af5d3931de73ea17a1cb3779eb728a8b9036989dfb5cdc6873588727
7
- data.tar.gz: f100d18fd9951c0baa93e5dbbe6be08a4977cfb4aade1cb2e872d6bfecb6bc5f7f082ce85b4a5e671a1507287f9cd6cf7d1ab84121d36cd9c02be54984197646
6
+ metadata.gz: a427d86d2ddc648e1bb6362b93f97ffb56e1f68420ce418d6b4754062e752b8961b8e4e72da6a465c990d69367097e37e1d3a9ac054394e077de6449ca1fe62d
7
+ data.tar.gz: 23618949f1ce1bd4eaf16589bed5078e6cc840b36f8c8624f6e477e0f89890c48ac1c5efa5a8d2d191119689509f5284090714c8380693d1ae7fc93bd014c92f
@@ -1,45 +1,61 @@
1
+ require 'requestmanager'
1
2
  require 'linkedinparser'
2
3
  require 'generalscraper'
4
+
3
5
  require 'selenium-webdriver'
4
6
  require 'pry'
5
7
 
6
8
  class LinkedinCrawler
7
- include ProxyManager
8
- def initialize(search_terms)
9
+ def initialize(search_terms, retry_limit, proxy_list, request_time)
9
10
  @search_terms = search_terms
10
11
  @output = Array.new
12
+ @retry_limit = retry_limit
13
+ @retry_count = 0
14
+ @proxy_list = proxy_list
15
+ @requests = RequestManager.new(@proxy_list, request_time, 5)
11
16
  end
12
17
 
13
18
  # Run search terms and get results
14
19
  def search
15
20
  # Run Google search
16
- g = GeneralScraper.new("site:linkedin.com/pub", @search_terms, "/home/shidash/proxies", false)
21
+ g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @proxy_list)
17
22
 
18
23
  # Scrape each resulting LinkedIn page
19
- gen_driver
20
24
  JSON.parse(g.getURLs).each do |profile|
21
- scrape(profile)
25
+ if profile.include?(".linkedin.") && !profile.include?("/search")
26
+ scrape(profile)
27
+ end
22
28
  end
29
+
30
+ # Close all the browsers
31
+ @requests.close_all_browsers
23
32
  end
24
33
 
25
- # Generate driver for searches
26
- def gen_driver
27
- profile = Selenium::WebDriver::Firefox::Profile.new
28
- profile['intl.accept_languages'] = 'en'
29
- profile["javascript.enabled"] = false
30
- @driver = Selenium::WebDriver.for :firefox, profile: profile
34
+ # Check that it is actually a LinkedIn profile page
35
+ def check_right_page(profile_url)
36
+ return !profile_url.include?("www.google") &&
37
+ !profile_url.include?("linkedin.com/pub/dir") &&
38
+ !profile_url.include?("/search") &&
39
+ @retry_count < @retry_limit
31
40
  end
32
41
 
33
42
  # Scrape each page
34
43
  def scrape(profile_url)
35
44
  # Get profile page
36
- profile_html = getPage(profile_url, @driver, nil, 5, false).page_source
45
+ profile_html = @requests.get_page(profile_url)
37
46
 
38
47
  # Parse profile and add to output
39
48
  begin
40
49
  l = LinkedinParser.new(profile_html, profile_url, {timestamp: Time.now})
41
50
  @output += JSON.parse(l.results_by_job)
51
+ @retry_count = 0
42
52
  rescue
53
+ # If proxy doesn't work, try another a few times
54
+ if check_right_page(profile_url)
55
+ @requests.restart_browser
56
+ @retry_count += 1
57
+ scrape(profile_url)
58
+ end
43
59
  end
44
60
  end
45
61
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkedincrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-01 00:00:00.000000000 Z
11
+ date: 2015-11-03 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Crawls public LinkedIn profiles via Google
14
14
  email: shidash@shidash.com