scruber 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.rspec +2 -0
- data/.travis.yml +5 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +39 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/scruber +4 -0
- data/lib/scruber/app_searcher.rb +31 -0
- data/lib/scruber/cli/project_generator.rb +47 -0
- data/lib/scruber/cli/templates/Gemfile.tt +6 -0
- data/lib/scruber/cli/templates/application.tt +18 -0
- data/lib/scruber/cli/templates/bin/scruber.tt +6 -0
- data/lib/scruber/cli/templates/boot.tt +3 -0
- data/lib/scruber/cli/templates/gitignore.tt +12 -0
- data/lib/scruber/cli/templates/initializers/proxies.tt +10 -0
- data/lib/scruber/cli/templates/initializers/user_agents.tt +14 -0
- data/lib/scruber/cli/templates/scrapers/sample.tt +7 -0
- data/lib/scruber/cli.rb +40 -0
- data/lib/scruber/core/configuration.rb +30 -0
- data/lib/scruber/core/crawler.rb +92 -0
- data/lib/scruber/core/extensions/base.rb +26 -0
- data/lib/scruber/core/extensions/csv_output.rb +62 -0
- data/lib/scruber/core/extensions/loop.rb +39 -0
- data/lib/scruber/core/page_format/base.rb +11 -0
- data/lib/scruber/core/page_format/html.rb +13 -0
- data/lib/scruber/core/page_format/xml.rb +13 -0
- data/lib/scruber/core/page_format.rb +33 -0
- data/lib/scruber/fetcher.rb +34 -0
- data/lib/scruber/fetcher_adapters/abstract_adapter.rb +119 -0
- data/lib/scruber/fetcher_adapters/typhoeus_fetcher.rb +78 -0
- data/lib/scruber/helpers/dictionary_reader/csv.rb +27 -0
- data/lib/scruber/helpers/dictionary_reader/xml.rb +23 -0
- data/lib/scruber/helpers/dictionary_reader.rb +33 -0
- data/lib/scruber/helpers/fetcher_agent.rb +40 -0
- data/lib/scruber/helpers/fetcher_agent_adapters/abstract_adapter.rb +69 -0
- data/lib/scruber/helpers/fetcher_agent_adapters/memory.rb +41 -0
- data/lib/scruber/helpers/proxy_rotator.rb +125 -0
- data/lib/scruber/helpers/user_agent_rotator.rb +91 -0
- data/lib/scruber/queue.rb +34 -0
- data/lib/scruber/queue_adapters/abstract_adapter.rb +112 -0
- data/lib/scruber/queue_adapters/memory.rb +70 -0
- data/lib/scruber/version.rb +3 -0
- data/lib/scruber.rb +69 -0
- data/scruber.gemspec +43 -0
- metadata +233 -0
data/lib/scruber/core/page_format.rb
@@ -0,0 +1,33 @@
+module Scruber
+  module Core
+    module PageFormat
+      class << self
+        def process(page, page_format)
+          if page_format.nil?
+            nil
+          elsif _registered_formats.keys.include?(page_format.to_sym)
+            _registered_formats[page_format.to_sym].process(page)
+          else
+            raise "Unsupported format"
+          end
+        end
+
+        def add(label, claz)
+          unless claz.respond_to?(:process)
+            raise NoMethodError, "process is not declared in the #{claz.inspect}"
+          end
+
+          _registered_formats[label] = claz
+        end
+
+        def [](label)
+          _registered_formats[label]
+        end
+
+        def _registered_formats
+          @registered_formats ||= {}
+        end
+      end
+    end
+  end
+end
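A quick usage sketch of the registry above. `JsonFormat` is a hypothetical example class, not part of the gem; `PageFormat.add` accepts any object that responds to `process`, and `page` stands for a previously fetched page object:

    require 'json'

    # Hypothetical custom format: `add` only checks respond_to?(:process).
    class JsonFormat
      def self.process(page)
        JSON.parse(page.response_body)
      end
    end

    Scruber::Core::PageFormat.add(:json, JsonFormat)
    Scruber::Core::PageFormat.process(page, :json)  # dispatches to JsonFormat.process(page)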
data/lib/scruber/fetcher.rb
@@ -0,0 +1,34 @@
+module Scruber
+  module Fetcher
+    class << self
+      attr_writer :adapter
+
+      def new(options={})
+        adapter.new(::Scruber.configuration.fetcher_options.merge(options))
+      end
+
+      def adapter
+        unless @adapter
+          @adapter = ::Scruber.configuration.fetcher_adapter || _adapters.keys.first
+        end
+        raise Scruber::ArgumentError.new("Adapter not found") unless @adapter
+        _adapters[@adapter]
+      end
+
+      def add_adapter(label, claz)
+        unless claz.method_defined?(:run)
+          raise NoMethodError, "run is not declared in the #{label.inspect}"
+        end
+        _adapters[label] = claz
+      end
+
+      def [](label)
+        _adapters[label]
+      end
+
+      def _adapters
+        @_adapters ||= {}
+      end
+    end
+  end
+end
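The module is a factory over registered adapters: it resolves `Scruber.configuration.fetcher_adapter`, falling back to the first registered adapter. A minimal sketch, assuming `Scruber.configuration` has been initialized (it supplies `fetcher_options`) and `queue` is a Scruber queue of pending pages:

    fetcher = Scruber::Fetcher.new(max_concurrency: 10)  # instantiates the resolved adapter
    fetcher.run(queue)                                   # delegates the fetch loop to the adapter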
data/lib/scruber/fetcher_adapters/abstract_adapter.rb
@@ -0,0 +1,119 @@
+module Scruber
+  module FetcherAdapters
+    class AbstractAdapter
+      attr_accessor :options, # all passed options
+                    :max_concurrency,
+                    :max_retry_times,
+                    :retry_delays,
+                    :followlocation,
+                    :request_timeout
+
+      def initialize(options={})
+        @options = options
+        @max_concurrency = options.fetch(:max_concurrency) { 1 }
+        @max_retry_times = options.fetch(:max_retry_times) { 5 }
+        @retry_delays = options.fetch(:retry_delays) { [1,2,2,4,4] }
+        @followlocation = options.fetch(:followlocation) { false }
+        @request_timeout = options.fetch(:request_timeout) { 15 }
+      end
+
+      def run(queue)
+        raise NotImplementedError
+      end
+
+      def before_request_callback(page)
+        page
+      end
+
+      def after_request_callback(page)
+        if bad_response?(page)
+          page.retry_at = determine_retry_at(page)
+          page.retry_count += 1
+          if page.max_retry_times.nil?
+            page.max_retry_times = @max_retry_times
+          end
+        else
+          page.fetched_at = Time.now.to_i
+        end
+        page
+      end
+
+      def headers_for(page)
+        if page.fetcher_agent
+          headers = page.fetcher_agent.headers
+        else
+          headers = page.headers
+        end
+        headers = {} unless headers.is_a?(Hash)
+        headers["User-Agent"] = user_agent_for(page)
+        cookie = cookie_for(page)
+        if cookie
+          headers["Cookie"] = cookie
+        end
+        headers
+      end
+
+      def cookie_for(page)
+        if page.fetcher_agent
+          cookie = page.fetcher_agent.cookie_for(page.url)
+          cookie.blank? ? nil : cookie
+        else
+          nil
+        end
+      end
+
+      def user_agent_for(page)
+        if page.user_agent
+          page.user_agent
+        elsif page.fetcher_agent && page.fetcher_agent.user_agent
+          page.fetcher_agent.user_agent
+        else
+          Scruber::Helpers::UserAgentRotator.next
+        end
+      end
+
+      def proxy_for(page)
+        if page.proxy
+          page.proxy
+        elsif page.fetcher_agent && page.fetcher_agent.proxy
+          page.fetcher_agent.proxy
+        else
+          Scruber::Helpers::ProxyRotator.next
+        end
+      end
+
+      def determine_retry_at(page)
+        delay = @retry_delays[page.retry_count] || @retry_delays.last
+        Time.now.to_i + delay
+      end
+
+      def bad_response?(page)
+        case page.response_code
+        when 0
+          true
+        when 1
+          true
+        when 100..199
+          true
+        when 200
+          false
+        when 201..299
+          false
+        when 300..399
+          @options.fetch(:followlocation) { false }
+        when 404
+          false
+        when 407
+          raise "RejectedByProxyError"
+        when 400..499
+          true
+        when 500..599
+          true
+        else
+          true
+        end
+      end
+
+    end
+  end
+end
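A worked example of the retry scheduling in `determine_retry_at`: the delay table is indexed by the page's current retry count, and any retry past the end of the table reuses the last entry.

    retry_delays = [1, 2, 2, 4, 4]            # the default table
    delay = retry_delays[7] || retry_delays.last
    # retry_delays[7] is nil, so delay == 4: every retry beyond the fifth waits 4 seconds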
data/lib/scruber/fetcher_adapters/typhoeus_fetcher.rb
@@ -0,0 +1,78 @@
+require 'typhoeus'
+module Scruber
+  module FetcherAdapters
+    class TyphoeusFetcher < AbstractAdapter
+      attr_accessor :ssl_verifypeer,
+                    :ssl_verifyhost
+
+      def initialize(options={})
+        super(options)
+        @ssl_verifypeer = options.fetch(:ssl_verifypeer) { false }
+        @ssl_verifyhost = options.fetch(:ssl_verifyhost) { 0 }
+        @max_requests = options.fetch(:max_requests) { @max_concurrency * 10 }
+      end
+
+      def run(queue)
+        queue.fetch_pending(@max_requests).each do |page|
+          request = build_request(page)
+
+          hydra.queue(request)
+        end
+        if hydra.queued_requests.count > 0
+          hydra.run
+        else
+          sleep 1
+        end
+      end
+
+      def build_request(page)
+        page = before_request_callback(page)
+        request_options = {
+          method: page[:method],
+          body: page[:body],
+          # params: page[:params],
+          headers: headers_for(page),
+          accept_encoding: 'gzip',
+          forbid_reuse: true,
+          followlocation: page.options.fetch(:followlocation){ @followlocation },
+          ssl_verifypeer: page.options.fetch(:ssl_verifypeer){ @ssl_verifypeer },
+          ssl_verifyhost: page.options.fetch(:ssl_verifyhost){ @ssl_verifyhost },
+          timeout: @request_timeout
+        }
+
+        proxy = proxy_for(page)
+        request_options.merge!({proxy: proxy.http? ? proxy.address : "socks://#{proxy.address}"}) if proxy
+        request_options.merge!({proxyuserpwd: proxy.proxyuserpwd}) if proxy && proxy.proxyuserpwd.present?
+
+        request = Typhoeus::Request.new(page[:url], request_options)
+
+        request.on_complete do |response|
+          on_complete_callback(page, response)
+        end
+
+        request
+      end
+
+      def hydra
+        @hydra ||= Typhoeus::Hydra.new(max_concurrency: @max_concurrency)
+      end
+
+      def on_complete_callback(page, response)
+        page.response_code = response.code
+        page.response_body = response.body
+        page.response_headers = response.response_headers
+        page.response_total_time = response.total_time
+
+        if response.timed_out?
+          page[:response_code] = 1
+        end
+
+        page = after_request_callback(page)
+        page.save
+      end
+
+    end
+  end
+end
+
+Scruber::Fetcher.add_adapter(:typhoeus_fetcher, Scruber::FetcherAdapters::TyphoeusFetcher)
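Each `run` drains up to `max_requests` pages (10x `max_concurrency` by default) into a single reused hydra, so one batch keeps libcurl saturated without rebuilding the pool. A hypothetical driver loop; the `has_pending?` predicate is illustrative only, not the gem's actual queue API (see the queue adapters in the file list):

    fetcher = Scruber::Fetcher.new
    while queue.has_pending?  # hypothetical predicate; the real queue interface may differ
      fetcher.run(queue)      # blocks in hydra.run, or sleeps 1s when nothing was queued
    end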
data/lib/scruber/helpers/dictionary_reader/csv.rb
@@ -0,0 +1,27 @@
+module Scruber
+  module Helpers
+    module DictionaryReader
+      class Csv
+        def initialize(file_path)
+          @file_path = file_path
+        end
+
+        def read(options={})
+          col_sep = options.delete(:col_sep) || ';'
+
+          CSV.foreach(@file_path, col_sep: col_sep, headers: true, encoding: 'utf-8') do |csv_row|
+            if options.blank?
+              yield csv_row
+            else
+              if options.all?{|(k,v)| csv_row[k.to_s] == v }
+                yield csv_row
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end
+
+Scruber::Helpers::DictionaryReader.add(:csv, Scruber::Helpers::DictionaryReader::Csv)
data/lib/scruber/helpers/dictionary_reader/xml.rb
@@ -0,0 +1,23 @@
+module Scruber
+  module Helpers
+    module DictionaryReader
+      class Xml
+        def initialize(file_path)
+          @xml = Nokogiri.parse(File.open(file_path).read)
+        end
+
+        def read(options={})
+          selector = options.delete(:selector) || 'item'
+          options.each do |k,v|
+            selector = "#{selector}[#{k}=\"#{v}\"]"
+          end
+          @xml.search(selector).each do |item|
+            yield item.to_h
+          end
+        end
+      end
+    end
+  end
+end
+
+Scruber::Helpers::DictionaryReader.add(:xml, Scruber::Helpers::DictionaryReader::Xml)
data/lib/scruber/helpers/dictionary_reader.rb
@@ -0,0 +1,33 @@
+module Scruber
+  module Helpers
+    module DictionaryReader
+      class << self
+        def read(file_path, file_type, options)
+          if _registered_types.keys.include?(file_type.to_sym)
+            _registered_types[file_type.to_sym].new(file_path).read(options) do |obj|
+              yield obj
+            end
+          else
+            raise "Unsupported type, supported types #{_registered_types.keys}"
+          end
+        end
+
+        def add(label, claz)
+          unless claz.instance_methods.include?(:read)
+            raise NoMethodError, "read is not declared in the #{claz.inspect}"
+          end
+
+          _registered_types[label] = claz
+        end
+
+        def [](label)
+          _registered_types[label]
+        end
+
+        def _registered_types
+          @registered_types ||= {}
+        end
+      end
+    end
+  end
+end
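Putting the three dictionary files together, a usage sketch (the file name and column are illustrative, and Ruby's csv stdlib must be loaded). `:col_sep` is consumed by the CSV reader; any remaining options become row filters compared against the CSV headers:

    Scruber::Helpers::DictionaryReader.read('data/cities.csv', :csv, col_sep: ',', country: 'US') do |row|
      puts row['name']  # only rows whose "country" column equals "US" are yielded
    end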
data/lib/scruber/helpers/fetcher_agent.rb
@@ -0,0 +1,40 @@
+module Scruber
+  module Helpers
+    module FetcherAgent
+      class << self
+        attr_writer :adapter
+
+        def new(options={})
+          adapter.new(::Scruber.configuration.fetcher_agent_options.merge(options))
+        end
+
+        def find(id)
+          adapter.find(id)
+        end
+
+        def adapter
+          unless @adapter
+            @adapter = ::Scruber.configuration.fetcher_agent_adapter || _adapters.keys.first
+          end
+          raise Scruber::ArgumentError.new("Adapter not found") unless @adapter
+          _adapters[@adapter]
+        end
+
+        def add_adapter(label, claz)
+          # unless claz.method_defined?(:run)
+          #   raise NoMethodError, "run is not declared in the #{label.inspect}"
+          # end
+          _adapters[label] = claz
+        end
+
+        def [](label)
+          _adapters[label]
+        end
+
+        def _adapters
+          @_adapters ||= {}
+        end
+      end
+    end
+  end
+end
data/lib/scruber/helpers/fetcher_agent_adapters/abstract_adapter.rb
@@ -0,0 +1,69 @@
+module Scruber
+  module Helpers
+    module FetcherAgentAdapters
+      class AbstractAdapter
+        attr_accessor :id, :user_agent, :proxy_id, :headers, :cookie_jar, :updated_at, :created_at, :disable_proxy
+        attr_reader :jar
+
+        def initialize(options={})
+          @id = options.fetch(:id) { nil }
+          @user_agent = options.fetch(:user_agent) { nil }
+          @proxy_id = options.fetch(:proxy_id) { nil }
+          @headers = options.fetch(:headers) { {} }
+          @cookie_jar = options.fetch(:cookie_jar) { {} }
+          @disable_proxy = options.fetch(:disable_proxy) { false }
+          @updated_at = options.fetch(:updated_at) { Time.now }
+          @created_at = options.fetch(:created_at) { Time.now }
+          @jar = HTTP::CookieJar.new
+          if @cookie_jar.is_a?(String)
+            @jar.load(StringIO.new(@cookie_jar))
+          end
+          @_proxy = false
+        end
+
+        def proxy
+          if @_proxy == false
+            @_proxy = (@proxy_id ? Scruber::Helpers::ProxyRotator.find(@proxy_id) : nil)
+          else
+            @_proxy
+          end
+        end
+
+        def parse_cookies_from_page!(page)
+          cookies = page.response_cookies
+          cookies.each do |cookie|
+            @jar.parse(cookie, URI(page.url))
+          end
+        end
+
+        def serialize_cookies
+          io = StringIO.new
+          @jar.save(io)
+          @cookie_jar = io.string
+        end
+
+        def cookie_for(uri_or_url)
+          if uri_or_url.is_a?(String)
+            uri_or_url = URI(uri_or_url)
+          end
+          HTTP::Cookie.cookie_value(@jar.cookies(uri_or_url))
+        end
+
+        def save
+          raise NotImplementedError
+        end
+
+        def delete
+          raise NotImplementedError
+        end
+
+        class << self
+          def find(id)
+            raise NotImplementedError
+          end
+        end
+
+      end
+    end
+  end
+end
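The cookie plumbing builds on the http-cookie gem's `HTTP::CookieJar`. A round-trip sketch, assuming `agent` is an adapter instance and `page` a fetched page:

    agent.parse_cookies_from_page!(page)   # folds the response's Set-Cookie values into the jar
    agent.cookie_for(page.url)             # Cookie header value for that URL, per jar matching rules
    agent.serialize_cookies                # dumps the jar into @cookie_jar as a string for persistence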
data/lib/scruber/helpers/fetcher_agent_adapters/memory.rb
@@ -0,0 +1,41 @@
+module Scruber
+  module Helpers
+    module FetcherAgentAdapters
+      class Memory < AbstractAdapter
+        def initialize(options={})
+          super(options)
+          @id = Time.now.to_i.to_s+'_'+rand(1_000..999_999).to_s if @id.nil?
+        end
+
+        def save
+          Scruber::Helpers::FetcherAgentAdapters::Memory.store(self)
+        end
+
+        def delete
+          Scruber::Helpers::FetcherAgentAdapters::Memory.delete(self)
+        end
+
+        class << self
+          def find(id)
+            _collection[id]
+          end
+
+          def _collection
+            @_collection ||= {}
+          end
+
+          def store(fetcher_agent)
+            _collection[fetcher_agent.id] = fetcher_agent
+          end
+
+          def delete(fetcher_agent)
+            _collection.delete fetcher_agent.id
+          end
+        end
+
+      end
+    end
+  end
+end
+
+Scruber::Helpers::FetcherAgent.add_adapter(:memory, Scruber::Helpers::FetcherAgentAdapters::Memory)
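A minimal sketch of the in-memory lifecycle, assuming `Scruber.configuration` is initialized so the `FetcherAgent` factory can merge its `fetcher_agent_options` defaults:

    agent = Scruber::Helpers::FetcherAgent.new(user_agent: 'MyAgent/1.0')
    agent.save                                     # stores the agent in the class-level collection
    Scruber::Helpers::FetcherAgent.find(agent.id)  # => agent
    agent.delete                                   # removes it again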
data/lib/scruber/helpers/proxy_rotator.rb
@@ -0,0 +1,125 @@
+module Scruber
+  module Helpers
+    class ProxyRotator
+
+      class Proxy
+        attr_accessor :host, :port, :user, :password, :probability, :type
+
+        def initialize(proxy, options={})
+          @host = proxy.split(':', 2).first
+          raise Scruber::ArgumentError.new("You need to specify proxy address") if @host.blank?
+          @port = options.fetch(:port) { proxy.split(':', 2)[1] }.to_i rescue nil
+          raise Scruber::ArgumentError.new("You need to specify :port for this proxy or pass full proxy address like 127.0.0.1:100") if @port.nil? || @port.zero?
+          @type = options.fetch(:type) { 'http' }
+          @user = options.fetch(:user) { nil }
+          @password = options.fetch(:password) { nil }
+          @probability = options.fetch(:probability) { 1 }
+        end
+
+        def id
+          (@host + ':' + @port.to_s)
+        end
+
+        def address
+          @host + ':' + @port.to_s
+        end
+
+        def http?
+          @type == 'http'
+        end
+
+        def proxyuserpwd
+          if @user.blank?
+            nil
+          else
+            "#{@user}:#{@password}"
+          end
+        end
+      end
+
+      class Configuration
+        include Scruber::Core::Extensions::Loop::CoreMethods
+
+        AVAILABLE_MODES=[:random, :round_robin]
+
+        attr_reader :mode, :proxies, :proxy_keys, :pickup
+
+        def initialize
+          @mode = :round_robin
+          @proxies = {}
+          @proxy_keys = []
+          @pickup = nil
+        end
+
+        def configure(&block)
+          instance_eval &block
+          rebuild_caches
+        end
+
+        def clean
+          @proxies = {}
+        end
+
+        def add(proxy_address, options={})
+          proxy = Proxy.new(proxy_address, options)
+          @proxies[proxy.id] = proxy
+        end
+
+        def set_mode(mode)
+          if AVAILABLE_MODES.include?(mode)
+            @mode = mode
+          else
+            raise Scruber::ArgumentError.new("Wrong mode. Available modes: #{AVAILABLE_MODES}")
+          end
+        end
+
+        private
+
+        def rebuild_caches
+          if @mode == :random
+            @pickup = Pickup.new(@proxies.inject({}){ |acc,(k,p)| acc[p] = p.probability; acc })
+          else
+            @proxy_keys = @proxies.keys
+          end
+        end
+      end
+
+      class << self
+        attr_writer :configuration
+        attr_accessor :cursor
+
+        def configuration
+          @configuration ||= Configuration.new
+        end
+
+        def configure(&block)
+          configuration.configure(&block)
+        end
+
+        def next(options={})
+          # raise Scruber::ArgumentError.new("Proxy rotator not configured") if @configuration.nil?
+          return nil if @configuration.nil?
+          if @configuration.mode == :random
+            @configuration.pickup.pick
+          else
+            if @cursor.nil? || @cursor >= @configuration.proxy_keys.count-1
+              @cursor = 0
+            else
+              @cursor += 1
+            end
+            @configuration.proxies[@configuration.proxy_keys[@cursor]]
+          end
+        end
+        alias_method :random, :next
+
+        def find(id)
+          @configuration.proxies[id] rescue nil
+        end
+
+        def configured?
+          !@configuration.nil? && !@configuration.proxies.blank?
+        end
+      end
+    end
+  end
+end
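A configuration sketch for the rotator above (addresses and credentials are illustrative). In `:round_robin` mode `next` cycles through proxies in insertion order; in `:random` mode it draws from a Pickup distribution weighted by each proxy's `:probability`:

    Scruber::Helpers::ProxyRotator.configure do
      set_mode :round_robin
      add '127.0.0.1:3128'                                              # host:port in one string, HTTP by default
      add '10.0.0.2', port: 1080, type: 'socks', user: 'u', password: 'p'
    end

    proxy = Scruber::Helpers::ProxyRotator.next
    proxy.address  # => "127.0.0.1:3128"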