requestmanager 0.0.1
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/requestmanager.rb +77 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 4f67cdd4a24c5e45ff2509d3d934a1a9024178b8
+  data.tar.gz: ff5e527045259cb2207062af68042afd968f4c3f
+SHA512:
+  metadata.gz: 3419dea05f953ecb677c97cb4a7c06ba6cd264e79b4b8baa4cfa94bac8c0081550615a9b2039402c32e960900eb48e651723380b19d5309137c83e6d4ff78a92
+  data.tar.gz: cdfec6f8eda2883212ee9fdb0cd4fabe11c37b6e8cb525f86fbc05576e8f84dca905a3ecb3bc689474b308d9c4faa68ebdbe47163c4cf83896364a6988e2bff9
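These digests cover the two archives packed inside the released .gem file (a .gem is a tar archive holding metadata.gz, data.tar.gz, and checksums.yaml.gz). Below is a minimal Ruby sketch of how they could be re-checked against a locally downloaded copy; the filename requestmanager-0.0.1.gem is an assumption.

require 'digest'
require 'rubygems/package'

# Hash metadata.gz and data.tar.gz inside the gem archive and compare the
# output with the SHA1/SHA512 values listed above.
File.open('requestmanager-0.0.1.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    body = entry.read
    puts "#{entry.full_name} SHA1:   #{Digest::SHA1.hexdigest(body)}"
    puts "#{entry.full_name} SHA512: #{Digest::SHA512.hexdigest(body)}"
  end
end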
data/lib/requestmanager.rb
ADDED
@@ -0,0 +1,77 @@
+require 'selenium-webdriver'
+require 'uri'
+require 'pry'
+
+
+class RequestManager
+  def initialize(proxy_list, request_interval)
+    @proxy_list = parse_proxy_list(proxy_list)
+    @request_interval = request_interval
+    @used_proxies = Hash.new
+  end
+
+  # Get the page requested
+  def get_page(url, form_input = nil)
+    chosen_proxy = @proxy_list != nil ? get_random_proxy(url) : nil
+    driver = gen_driver(chosen_proxy)
+    driver.navigate.to url
+    puts "Getting page " + url
+
+    # Handle form input if there is any
+    if form_input
+      element = driver.find_element(name: "q")
+      element.send_keys form_input
+      element.submit
+    end
+
+    page_html = driver.page_source
+    driver.quit
+    return page_html
+  end
+
+  # Generate driver for searches
+  def gen_driver(chosen_proxy)
+    # Profile settings
+    profile = Selenium::WebDriver::Firefox::Profile.new
+    profile['intl.accept_languages'] = 'en'
+
+    # Set proxy if proxy list, otherwise sleep
+    if chosen_proxy
+      proxy = Selenium::WebDriver::Proxy.new(http: chosen_proxy, ssl: chosen_proxy)
+      profile.proxy = proxy
+    else
+      sleep(rand(@request_interval[0]..@request_interval[1]))
+    end
+
+    return Selenium::WebDriver.for :firefox, profile: profile
+  end
+
+  # Choose a random proxy that hasn't been used recently
+  def get_random_proxy(url)
+    max = @proxy_list.length
+    chosen = @proxy_list[Random.rand(max)]
+
+    # Only use proxy if it hasn't been used in last n seconds on same host
+    if is_not_used?(chosen, url)
+      @used_proxies[chosen] = [Time.now, URI.parse(url).host]
+      return chosen[0]+":"+chosen[1]
+    else
+      sleep(0.005)
+      get_random_proxy(url)
+    end
+  end
+
+  # Checks if a proxy has been used on domain in the last 20 seconds
+  def is_not_used?(chosen, url)
+    return (!@used_proxies[chosen] ||
+            @used_proxies[chosen][0] <= Time.now-@request_interval[0] ||
+            @used_proxies[chosen][1] != URI.parse(url).host)
+  end
+
+  # Parse the proxy list
+  def parse_proxy_list(proxy_file)
+    if proxy_file
+      return IO.readlines(proxy_file).map{ |proxy| proxy.strip.split(":")}
+    end
+  end
+end
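For orientation, a brief usage sketch of the class above. The file name proxies.txt and the URLs are hypothetical; per parse_proxy_list, the proxy file is expected to hold one host:port entry per line, and request_interval is a two-element [min, max] array of seconds.

require 'requestmanager'

# Without proxies: pass nil for the proxy list; gen_driver then sleeps a
# random number of seconds in the given range before creating each driver.
manager = RequestManager.new(nil, [2, 5])
html = manager.get_page('https://example.com')

# With a proxy list, the minimum interval also acts as the per-host window
# during which a proxy will not be reused. Passing form_input fills the field
# named "q" on the page and submits it.
proxied = RequestManager.new('proxies.txt', [20, 40])
results = proxied.get_page('https://example.com/search', 'search terms')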
metadata
ADDED
@@ -0,0 +1,45 @@
+--- !ruby/object:Gem::Specification
+name: requestmanager
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- M. C. McGrath
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-11-01 00:00:00.000000000 Z
+dependencies: []
+description: Manages proxies, wait intervals, etc
+email: shidash@shidash.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/requestmanager.rb
+homepage: https://github.com/TransparencyToolkit/linkedincrawler
+licenses:
+- GPL
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.8
+signing_key:
+specification_version: 4
+summary: Manages scraper http requests
+test_files: []
+has_rdoc:
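One thing worth noting in the spec: dependencies: [] is empty even though lib/requestmanager.rb requires selenium-webdriver and pry, so consumers of this release have to install those gems themselves. A hypothetical Gemfile for a project pulling in this version might look like the following sketch.

source 'https://rubygems.org'

gem 'requestmanager', '0.0.1'

# Required by lib/requestmanager.rb but not declared as gem dependencies,
# so they are added explicitly here (no version pins; purely illustrative).
gem 'selenium-webdriver'
gem 'pry'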