requestmanager 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/requestmanager.rb +77 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 4f67cdd4a24c5e45ff2509d3d934a1a9024178b8
4
+ data.tar.gz: ff5e527045259cb2207062af68042afd968f4c3f
5
+ SHA512:
6
+ metadata.gz: 3419dea05f953ecb677c97cb4a7c06ba6cd264e79b4b8baa4cfa94bac8c0081550615a9b2039402c32e960900eb48e651723380b19d5309137c83e6d4ff78a92
7
+ data.tar.gz: cdfec6f8eda2883212ee9fdb0cd4fabe11c37b6e8cb525f86fbc05576e8f84dca905a3ecb3bc689474b308d9c4faa68ebdbe47163c4cf83896364a6988e2bff9
@@ -0,0 +1,77 @@
1
+ require 'selenium-webdriver'
2
+ require 'uri'
3
+ require 'pry'
4
+
5
+
6
# Fetches pages with a Selenium-driven Firefox. When a proxy list is supplied,
# each request goes through a randomly chosen proxy that has not been used on
# the same host within the configured interval; without proxies, the manager
# sleeps for a random time before every request instead.
class RequestManager
  # Seconds to wait before re-checking whether any proxy has become available.
  PROXY_RETRY_DELAY = 0.005

  # @param proxy_list [String, nil] path to a file with one "host:port" entry
  #   per line, or nil to run without proxies
  # @param request_interval [Array<Numeric>] two-element [min, max] in seconds;
  #   min is the per-host proxy cool-down, and min..max is the random sleep
  #   range used when no proxy is available
  def initialize(proxy_list, request_interval)
    @proxy_list = parse_proxy_list(proxy_list)
    @request_interval = request_interval
    @used_proxies = {}
  end

  # Get the page requested.
  #
  # @param url [String] address to load
  # @param form_input [String, nil] if given, typed into the element named "q"
  #   and submitted before the page source is read
  # @return [String] the page source reported by the driver
  def get_page(url, form_input = nil)
    # get_random_proxy returns nil when there are no proxies to choose from.
    chosen_proxy = get_random_proxy(url)
    driver = gen_driver(chosen_proxy)

    begin
      driver.navigate.to url
      puts "Getting page #{url}"

      # Handle form input if there is any
      if form_input
        element = driver.find_element(name: "q")
        element.send_keys form_input
        element.submit
      end

      driver.page_source
    ensure
      # Always close the browser, even when navigation or form handling raises,
      # so failed requests do not leave Firefox processes behind.
      driver.quit
    end
  end

  # Generate driver for searches.
  #
  # @param chosen_proxy [String, nil] "host:port" to route traffic through, or
  #   nil to use no proxy
  # @return [Selenium::WebDriver::Driver] a new Firefox driver
  def gen_driver(chosen_proxy)
    # Profile settings
    profile = Selenium::WebDriver::Firefox::Profile.new
    profile['intl.accept_languages'] = 'en'

    # Set proxy if one was chosen, otherwise throttle by sleeping
    if chosen_proxy
      profile.proxy = Selenium::WebDriver::Proxy.new(http: chosen_proxy, ssl: chosen_proxy)
    else
      sleep(rand(@request_interval[0]..@request_interval[1]))
    end

    Selenium::WebDriver.for :firefox, profile: profile
  end

  # Choose a random proxy that hasn't been used recently on the URL's host.
  # Blocks (polling every PROXY_RETRY_DELAY seconds) until one is available.
  #
  # @param url [String] address the proxy will be used for
  # @return [String, nil] "host:port", or nil when no proxies are configured
  def get_random_proxy(url)
    return nil if @proxy_list.nil? || @proxy_list.empty?

    # Iterate rather than recurse: the wait can span many polls, and a
    # recursive retry would grow the call stack on every one of them.
    loop do
      chosen = @proxy_list.sample

      # Only use proxy if it hasn't been used in last n seconds on same host
      if is_not_used?(chosen, url)
        @used_proxies[chosen] = [Time.now, URI.parse(url).host]
        return "#{chosen[0]}:#{chosen[1]}"
      end

      sleep(PROXY_RETRY_DELAY)
    end
  end

  # Checks whether a proxy is free to use for the given URL: it has never been
  # used, its last use is at least @request_interval[0] seconds old, or its
  # last use was on a different host.
  #
  # @param chosen [Array<String>] parsed proxy entry ([host, port, ...])
  # @param url [String] address about to be requested
  # @return [Boolean]
  def is_not_used?(chosen, url)
    last_use = @used_proxies[chosen]
    return true unless last_use

    used_at, used_host = last_use
    used_at <= Time.now - @request_interval[0] || used_host != URI.parse(url).host
  end

  # Parse the proxy list.
  #
  # @param proxy_file [String, nil] path to the proxy file
  # @return [Array<Array<String>>, nil] one array of colon-separated fields per
  #   non-blank line, or nil when no file was given
  def parse_proxy_list(proxy_file)
    return nil unless proxy_file

    # File.readlines (unlike IO.readlines) never treats a leading "|" in the
    # path as a command to run. Blank lines parse to [] and are dropped so they
    # can never be picked as a proxy.
    File.readlines(proxy_file)
        .map { |line| line.strip.split(":") }
        .reject(&:empty?)
  end
end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: requestmanager
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-11-01 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Manages proxies, wait intervals, etc
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/requestmanager.rb
20
+ homepage: https://github.com/TransparencyToolkit/linkedincrawler
21
+ licenses:
22
+ - GPL
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.4.8
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Manages scraper http requests
44
+ test_files: []
45
+ has_rdoc: