proxy_fetcher 0.10.2 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +8 -5
- data/Rakefile +4 -2
- data/gemfiles/nokogiri.gemfile +8 -6
- data/gemfiles/oga.gemfile +8 -6
- data/lib/proxy_fetcher.rb +30 -30
- data/lib/proxy_fetcher/client/client.rb +10 -3
- data/lib/proxy_fetcher/client/request.rb +4 -4
- data/lib/proxy_fetcher/configuration.rb +12 -11
- data/lib/proxy_fetcher/document.rb +0 -9
- data/lib/proxy_fetcher/document/adapters.rb +1 -1
- data/lib/proxy_fetcher/document/adapters/abstract_adapter.rb +3 -12
- data/lib/proxy_fetcher/document/adapters/nokogiri_adapter.rb +1 -1
- data/lib/proxy_fetcher/document/adapters/oga_adapter.rb +1 -1
- data/lib/proxy_fetcher/document/node.rb +2 -2
- data/lib/proxy_fetcher/exceptions.rb +6 -6
- data/lib/proxy_fetcher/manager.rb +2 -2
- data/lib/proxy_fetcher/providers/base.rb +42 -22
- data/lib/proxy_fetcher/providers/free_proxy_list.rb +30 -10
- data/lib/proxy_fetcher/providers/free_proxy_list_ssl.rb +7 -16
- data/lib/proxy_fetcher/providers/gather_proxy.rb +9 -17
- data/lib/proxy_fetcher/providers/http_tunnel.rb +11 -19
- data/lib/proxy_fetcher/providers/proxy_list.rb +8 -16
- data/lib/proxy_fetcher/providers/xroxy.rb +9 -17
- data/lib/proxy_fetcher/proxy.rb +4 -4
- data/lib/proxy_fetcher/utils/http_client.rb +10 -8
- data/lib/proxy_fetcher/utils/proxy_list_validator.rb +3 -1
- data/lib/proxy_fetcher/utils/proxy_validator.rb +1 -1
- data/lib/proxy_fetcher/version.rb +3 -3
- data/proxy_fetcher.gemspec +19 -16
- data/spec/proxy_fetcher/client/client_spec.rb +72 -57
- data/spec/proxy_fetcher/configuration_spec.rb +11 -11
- data/spec/proxy_fetcher/document/adapters_spec.rb +8 -8
- data/spec/proxy_fetcher/document/node_spec.rb +4 -4
- data/spec/proxy_fetcher/providers/base_spec.rb +9 -9
- data/spec/proxy_fetcher/providers/free_proxy_list_spec.rb +2 -2
- data/spec/proxy_fetcher/providers/free_proxy_list_ssl_spec.rb +2 -2
- data/spec/proxy_fetcher/providers/gather_proxy_spec.rb +2 -2
- data/spec/proxy_fetcher/providers/http_tunnel_spec.rb +2 -2
- data/spec/proxy_fetcher/providers/multiple_providers_spec.rb +4 -4
- data/spec/proxy_fetcher/providers/proxy_list_spec.rb +2 -2
- data/spec/proxy_fetcher/providers/xroxy_spec.rb +2 -2
- data/spec/proxy_fetcher/proxy_spec.rb +14 -14
- data/spec/proxy_fetcher/version_spec.rb +2 -0
- data/spec/spec_helper.rb +10 -10
- data/spec/support/manager_examples.rb +21 -21
- metadata +14 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5a0d7b377ed3c25e50552e89ba76c0e73fad3923bf171e1cee2f592d777787c3
+  data.tar.gz: 83b594e04e03c74a63146a6907c99025d50607c88f8cf94f6d5ce044795243ad
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1f096c2473035255eb9492297b5641ab5caee62566eb20eb40d3b2f02eea5d06fa1279a2cadeb3266c0f52ce98040185a69678faf116b6398c2c75f79d5c4ebd
+  data.tar.gz: a9372ef8bdbb3c51c5060308cbc46c905df3819e682bceb858ba494f5f94722095f2a2bee94575606f628d091b87325425c218f8c31a2b807bcb159c59ba6e65
data/Gemfile
CHANGED
@@ -1,11 +1,14 @@
-
+# frozen_string_literal: true
+
+source "https://rubygems.org"
 
 gemspec
 
-gem
-gem
+gem "nokogiri", "~> 1.8"
+gem "oga", "~> 2.0"
+gem "rubocop", "~> 0.74"
 
 group :test do
-  gem
-  gem
+  gem "coveralls", require: false
+  gem "evil-proxy", "~> 0.2"
 end
data/Rakefile
CHANGED
data/gemfiles/nokogiri.gemfile
CHANGED
@@ -1,11 +1,13 @@
-
+# frozen_string_literal: true
 
-
+source "https://rubygems.org"
 
-
+gemspec path: "../"
+
+gem "nokogiri", "~> 1.8"
 
 group :test do
-  gem
-  gem
-  gem
+  gem "coveralls", require: false
+  gem "evil-proxy", "~> 0.2"
+  gem "rspec", "~> 3.6"
 end
data/gemfiles/oga.gemfile
CHANGED
@@ -1,11 +1,13 @@
-
+# frozen_string_literal: true
 
-
+source "https://rubygems.org"
 
-
+gemspec path: "../"
+
+gem "oga", "~> 2.0"
 
 group :test do
-  gem
-  gem
-  gem
+  gem "coveralls", require: false
+  gem "evil-proxy", "~> 0.2"
+  gem "rspec", "~> 3.6"
 end
data/lib/proxy_fetcher.rb
CHANGED
@@ -1,44 +1,44 @@
 # frozen_string_literal: true
 
-require
-require
-require
+require "uri"
+require "http"
+require "logger"
 
-require File.dirname(__FILE__) +
+require File.dirname(__FILE__) + "/proxy_fetcher/version"
 
-require File.dirname(__FILE__) +
-require File.dirname(__FILE__) +
-require File.dirname(__FILE__) +
-require File.dirname(__FILE__) +
-require File.dirname(__FILE__) +
-require File.dirname(__FILE__) +
+require File.dirname(__FILE__) + "/proxy_fetcher/exceptions"
+require File.dirname(__FILE__) + "/proxy_fetcher/configuration"
+require File.dirname(__FILE__) + "/proxy_fetcher/configuration/providers_registry"
+require File.dirname(__FILE__) + "/proxy_fetcher/proxy"
+require File.dirname(__FILE__) + "/proxy_fetcher/manager"
+require File.dirname(__FILE__) + "/proxy_fetcher/null_logger"
 
-require File.dirname(__FILE__) +
-require File.dirname(__FILE__) +
-require File.dirname(__FILE__) +
-require File.dirname(__FILE__) +
-require File.dirname(__FILE__) +
-require File.dirname(__FILE__) +
+require File.dirname(__FILE__) + "/proxy_fetcher/utils/http_client"
+require File.dirname(__FILE__) + "/proxy_fetcher/utils/proxy_validator"
+require File.dirname(__FILE__) + "/proxy_fetcher/utils/proxy_list_validator"
+require File.dirname(__FILE__) + "/proxy_fetcher/client/client"
+require File.dirname(__FILE__) + "/proxy_fetcher/client/request"
+require File.dirname(__FILE__) + "/proxy_fetcher/client/proxies_registry"
 
-require File.dirname(__FILE__) +
-require File.dirname(__FILE__) +
-require File.dirname(__FILE__) +
-require File.dirname(__FILE__) +
-require File.dirname(__FILE__) +
-require File.dirname(__FILE__) +
+require File.dirname(__FILE__) + "/proxy_fetcher/document"
+require File.dirname(__FILE__) + "/proxy_fetcher/document/adapters"
+require File.dirname(__FILE__) + "/proxy_fetcher/document/node"
+require File.dirname(__FILE__) + "/proxy_fetcher/document/adapters/abstract_adapter"
+require File.dirname(__FILE__) + "/proxy_fetcher/document/adapters/nokogiri_adapter"
+require File.dirname(__FILE__) + "/proxy_fetcher/document/adapters/oga_adapter"
 
 ##
 # Ruby / JRuby lib for managing proxies
 module ProxyFetcher
   # ProxyFetcher providers namespace
   module Providers
-    require File.dirname(__FILE__) +
-    require File.dirname(__FILE__) +
-    require File.dirname(__FILE__) +
-    require File.dirname(__FILE__) +
-    require File.dirname(__FILE__) +
-    require File.dirname(__FILE__) +
-    require File.dirname(__FILE__) +
+    require File.dirname(__FILE__) + "/proxy_fetcher/providers/base"
+    require File.dirname(__FILE__) + "/proxy_fetcher/providers/free_proxy_list"
+    require File.dirname(__FILE__) + "/proxy_fetcher/providers/free_proxy_list_ssl"
+    require File.dirname(__FILE__) + "/proxy_fetcher/providers/gather_proxy"
+    require File.dirname(__FILE__) + "/proxy_fetcher/providers/http_tunnel"
+    require File.dirname(__FILE__) + "/proxy_fetcher/providers/proxy_list"
+    require File.dirname(__FILE__) + "/proxy_fetcher/providers/xroxy"
   end
 
   # Main ProxyFetcher module.
@@ -75,7 +75,7 @@ module ProxyFetcher
 
   # Returns ProxyFetcher logger instance.
   #
-  # @return [Logger, NullLogger] logger object
+  # @return [Logger, ProxyFetcher::NullLogger] logger object
   #
   def logger
     return @logger if defined?(@logger)
data/lib/proxy_fetcher/client/client.rb
CHANGED
@@ -128,7 +128,11 @@ module ProxyFetcher
     #
     def request_with_payload(method, url, payload, headers, options)
       with_proxy_for(url, options.fetch(:max_retries, 1000)) do |proxy|
-        opts = options.merge(
+        opts = options.merge(
+          payload: payload,
+          proxy: options.fetch(:proxy, proxy),
+          headers: default_headers.merge(headers)
+        )
 
         Request.execute(url: url, method: method, **opts)
       end
@@ -138,7 +142,10 @@ module ProxyFetcher
     #
     def request_without_payload(method, url, headers, options)
       with_proxy_for(url, options.fetch(:max_retries, 1000)) do |proxy|
-        opts = options.merge(
+        opts = options.merge(
+          proxy: options.fetch(:proxy, proxy),
+          headers: default_headers.merge(headers)
+        )
 
         Request.execute(url: url, method: method, **opts)
       end
@@ -152,7 +159,7 @@ module ProxyFetcher
     #
     def default_headers
       {
-
+        "User-Agent" => ProxyFetcher.config.user_agent
       }
     end
 
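Both request helpers now build the options hash explicitly (the payload, a :proxy override, and headers merged over the default User-Agent) before handing it to Request.execute. A rough usage sketch from the public client side, assuming the usual ProxyFetcher::Client.get/.post wrappers delegate to these private helpers; the URL, payload and option values are illustrative:

    require "proxy_fetcher"

    # GET through an automatically picked proxy, retrying over up to 10 proxies.
    # (wrapper signature assumed from the private helpers shown above)
    html = ProxyFetcher::Client.get("http://example.com", options: { max_retries: 10 })

    # POST with a payload and an extra header merged over the default User-Agent.
    ProxyFetcher::Client.post(
      "http://example.com/search",
      "query=proxy",
      headers: { "Accept" => "text/html" }
    )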
data/lib/proxy_fetcher/client/request.rb
CHANGED
@@ -50,7 +50,7 @@ module ProxyFetcher
     # @return [Request]
     #
     def initialize(args)
-      raise ArgumentError,
+      raise ArgumentError, "args must be a Hash!" unless args.is_a?(Hash)
 
       @url = args.fetch(:url)
       @method = args.fetch(:method).to_s.downcase
@@ -86,9 +86,9 @@ module ProxyFetcher
     #
     def build_http_client
       HTTP.via(proxy.addr, proxy.port.to_i)
-
-
-
+          .headers(headers)
+          .timeout(connect: timeout, read: timeout)
+          .follow(max_hops: max_redirects)
     end
 
     # Default SSL options that will be used for connecting to resources
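The rebuilt build_http_client chain is plain http gem usage. For reference, the same chain called standalone; the proxy address, timeouts and hop limit below are placeholder values:

    require "http"

    client = HTTP.via("10.10.10.10", 3128)                  # proxy.addr, proxy.port
                 .headers("User-Agent" => "Mozilla/5.0")    # headers
                 .timeout(connect: 5, read: 5)              # timeout
                 .follow(max_hops: 3)                       # max_redirects

    response = client.get("http://example.com")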
data/lib/proxy_fetcher/configuration.rb
CHANGED
@@ -6,15 +6,20 @@ module ProxyFetcher
   #
   class Configuration
     # @!attribute client_timeout
-    #   @return [Integer]
+    #   @return [Integer]
+    #     HTTP request timeout (connect / open) for [ProxyFetcher::Client]
    attr_accessor :client_timeout
 
     # @!attribute provider_proxies_load_timeout
-    #   @return [Integer]
+    #   @return [Integer]
+    #     HTTP request timeout (connect / open) for loading
+    #     of proxies list by provider
    attr_accessor :provider_proxies_load_timeout
 
     # @!attribute proxy_validation_timeout
-    #   @return [Integer]
+    #   @return [Integer]
+    #     HTTP request timeout (connect / open) for proxy
+    #     validation with [ProxyFetcher::ProxyValidator]
    attr_accessor :proxy_validation_timeout
 
     # to save compatibility
@@ -30,16 +35,12 @@ module ProxyFetcher
    attr_accessor :user_agent
 
     # @!attribute [r] logger
-    #   @return [
+    #   @return [Logger] Logger object
    attr_accessor :logger
 
     # @!attribute [r] adapter
     #   @return [Object] HTML parser adapter
-
-
-    # @!attribute [r] adapter_class
-    #   @return [Object] HTML adapter class
-    attr_reader :adapter_class
+    attr_reader :adapter
 
     # @!attribute [r] http_client
     #   @return [Object] HTTP client class
@@ -58,8 +59,8 @@ module ProxyFetcher
     #
     # Default is Google Chrome 60, but can be changed in <code>ProxyFetcher.config</code>.
     #
-    DEFAULT_USER_AGENT =
-
+    DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 " \
+                         "(KHTML, like Gecko) Chrome/60.0.3112 Safari/537.36"
 
     # HTML parser adapter name.
     #
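The expanded attribute docs above correspond to plain accessors on ProxyFetcher.config. A minimal tuning sketch; the values are arbitrary examples, not the gem's defaults:

    require "proxy_fetcher"

    ProxyFetcher.config.adapter = :nokogiri                  # or :oga, or a custom adapter
    ProxyFetcher.config.user_agent = "MyCrawler/1.0"         # example value
    ProxyFetcher.config.client_timeout = 10                  # ProxyFetcher::Client requests
    ProxyFetcher.config.provider_proxies_load_timeout = 10   # loading provider proxy lists
    ProxyFetcher.config.proxy_validation_timeout = 5         # ProxyFetcher::ProxyValidator checks
    ProxyFetcher.config.logger = Logger.new($stdout)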
data/lib/proxy_fetcher/document.rb
CHANGED
@@ -36,14 +36,5 @@ module ProxyFetcher
     def xpath(*args)
       backend.xpath(*args).map { |node| backend.proxy_node.new(node) }
     end
-
-    # Searches elements by CSS selector.
-    #
-    # @return [Array<ProxyFetcher::Document::Node>]
-    #   collection of nodes
-    #
-    def css(*args)
-      backend.css(*args).map { |node| backend.proxy_node.new(node) }
-    end
   end
 end
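With the CSS helper removed, Document (and the adapters below) expose only XPath traversal. A small sketch of querying a parsed page; the selector is a made-up example, not one used by the bundled providers:

    doc  = ProxyFetcher::Document.parse(html)               # html loaded elsewhere
    rows = doc.xpath("//table[@id='proxies']/tbody/tr")     # Array of ProxyFetcher::Document::Node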
data/lib/proxy_fetcher/document/adapters/abstract_adapter.rb
CHANGED
@@ -26,15 +26,6 @@ module ProxyFetcher
       document.xpath(selector)
     end
 
-    # You can override this method in your own adapter class
-    #
-    # @param selector [String]
-    #   CSS selector
-    #
-    def css(selector)
-      document.css(selector)
-    end
-
     # Returns <code>Node</code> class that will handle HTML
     # nodes for particular adapter.
     #
@@ -42,7 +33,7 @@ module ProxyFetcher
     #   node
     #
     def proxy_node
-      self.class.const_get(
+      self.class.const_get("Node")
     end
 
     # Installs adapter requirements.
@@ -53,8 +44,8 @@ module ProxyFetcher
     def self.setup!(*args)
       install_requirements!(*args)
       self
-    rescue LoadError, StandardError =>
-      raise Exceptions::AdapterSetupError.new(name,
+    rescue LoadError, StandardError => e
+      raise Exceptions::AdapterSetupError.new(name, e.message)
     end
   end
 end
data/lib/proxy_fetcher/exceptions.rb
CHANGED
@@ -13,7 +13,7 @@ module ProxyFetcher
     # @return [WrongCustomClass]
     #
     def initialize(klass, methods)
-      required_methods = Array(methods).join(
+      required_methods = Array(methods).join(", ")
       super("#{klass} must respond to [#{required_methods}] class methods!")
     end
   end
@@ -53,7 +53,7 @@ module ProxyFetcher
     # @return [MaximumRedirectsReached]
     #
     def initialize(*)
-      super(
+      super("maximum redirects reached")
     end
   end
 
@@ -66,7 +66,7 @@ module ProxyFetcher
     # @return [MaximumRetriesReached]
     #
     def initialize(*)
-      super(
+      super("reached the maximum number of retries")
     end
   end
 
@@ -95,7 +95,7 @@ module ProxyFetcher
       super(<<-MSG.strip.squeeze
 you need to specify adapter for HTML parsing: ProxyFetcher.config.adapter = :nokogiri.
 You can use one of the predefined adapters (:nokogiri or :oga) or your own implementation.
-
+      MSG
       )
     end
   end
@@ -111,7 +111,7 @@ module ProxyFetcher
     # @return [AdapterSetupError]
     #
     def initialize(adapter_name, error)
-      adapter = demodulize(adapter_name.gsub(
+      adapter = demodulize(adapter_name.gsub("Adapter", ""))
 
       super("can't setup '#{adapter}' adapter during the following error:\n\t#{error}'")
     end
@@ -127,7 +127,7 @@ module ProxyFetcher
     #
     def demodulize(path)
       path = path.to_s
-      index = path.rindex(
+      index = path.rindex("::")
 
       index ? path[(index + 2)..-1] : path
     end
data/lib/proxy_fetcher/manager.rb
CHANGED
@@ -55,7 +55,7 @@ module ProxyFetcher
 
     # Pop just first proxy (and back it to the end of the proxy list).
     #
-    # @return [Proxy]
+    # @return [ProxyFetcher::Proxy, NilClass]
     #   proxy object from the list
     #
     def get
@@ -72,7 +72,7 @@ module ProxyFetcher
     # Pop first valid proxy (and back it to the end of the proxy list)
     # Invalid proxies will be removed from the list
     #
-    # @return [Proxy]
+    # @return [ProxyFetcher::Proxy, NilClass]
     #   proxy object from the list
     #
     def get!
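The updated docs make explicit that both get and get! may return nil once the list is exhausted, so callers should guard against that. A short sketch, assuming Manager.new loads the proxy list using the configured provider:

    manager = ProxyFetcher::Manager.new        # assumed to fetch the list on initialization
    proxy   = manager.get!                     # ProxyFetcher::Proxy or nil
    puts "#{proxy.addr}:#{proxy.port}" if proxy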
data/lib/proxy_fetcher/providers/base.rb
CHANGED
@@ -6,12 +6,15 @@ module ProxyFetcher
     class Base
       # Loads proxy provider page content, extract proxy list from it
       # and convert every entry to proxy object.
-      def fetch_proxies
+      def fetch_proxies(filters = {})
         raw_proxies = load_proxy_list(filters)
         proxies = raw_proxies.map { |html_node| build_proxy(html_node) }.compact
         proxies.reject { |proxy| proxy.addr.nil? }
       end
 
+      # For retro-compatibility
+      alias fetch_proxies! fetch_proxies
+
       def provider_url
         raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
       end
@@ -24,10 +27,17 @@ module ProxyFetcher
         {}
       end
 
+      # @return [Hash]
+      #   Provider headers required to fetch the proxy list
+      #
       def provider_headers
         {}
       end
 
+      def xpath
+        raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
+      end
+
       # Just synthetic sugar to make it easier to call #fetch_proxies! method.
       def self.fetch_proxies!(*args)
         new.fetch_proxies!(*args)
@@ -37,18 +47,29 @@ module ProxyFetcher
 
       # Loads raw provider HTML with proxies.
       #
+      # @param url [String]
+      #   Provider URL
+      #
+      # @param filters [#to_h]
+      #   Provider filters (Hash-like object)
+      #
       # @return [String]
-      #   HTML body
+      #   HTML body from the response
       #
       def load_html(url, filters = {})
-
+        unless filters.respond_to?(:to_h)
+          raise ArgumentError, "filters must be a Hash or respond to #to_h"
+        end
 
-
-
-
+        if filters&.any?
+          # TODO: query for post request?
+          uri = URI.parse(url)
+          uri.query = URI.encode_www_form(provider_params.merge(filters.to_h))
+          url = uri.to_s
+        end
 
         ProxyFetcher.config.http_client.fetch(
-
+          url,
           method: provider_method,
           headers: provider_headers,
           params: provider_params
@@ -71,29 +92,28 @@ module ProxyFetcher
         ProxyFetcher::Document.parse(html)
       end
 
+      # Fetches HTML content by sending HTTP request to the provider URL and
+      # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
+      # to return all the proxy entries (HTML nodes).
+      #
+      # @return [Array<ProxyFetcher::Document::Node>]
+      #   Collection of extracted HTML nodes with full proxy info
+      #
+      def load_proxy_list(filters = {})
+        doc = load_document(provider_url, filters)
+        doc.xpath(xpath)
+      end
+
       def build_proxy(*args)
         to_proxy(*args)
-      rescue StandardError =>
+      rescue StandardError => e
         ProxyFetcher.logger.warn(
-          "Failed to build Proxy object for #{self.class.name} due to error: #{
+          "Failed to build Proxy object for #{self.class.name} due to error: #{e.message}"
         )
 
         nil
       end
 
-      # Fetches HTML content by sending HTTP request to the provider URL and
-      # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
-      # to return all the proxy entries (HTML nodes).
-      #
-      # Abstract method. Must be implemented in a descendant class
-      #
-      # @return [Array<Document::Node>]
-      #   list of proxy elements from the providers HTML content
-      #
-      def load_proxy_list(*)
-        raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
-      end
-
       # Convert HTML element with proxy info to ProxyFetcher::Proxy instance.
       #
       # Abstract method. Must be implemented in a descendant class
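Base now owns a concrete load_proxy_list built on the new abstract xpath method, so a provider only has to declare its URL, an XPath row selector and the node-to-proxy mapping. A hypothetical custom provider against this template; the class name, URL, selector and the content_at helper are assumptions for illustration, not part of the gem:

    module ProxyFetcher
      module Providers
        class ExampleProvider < Base
          def provider_url
            "https://proxies.example.com/list"       # assumed URL
          end

          def xpath
            "//table[@id='proxies']/tbody/tr"        # assumed row selector
          end

          # html_node is one ProxyFetcher::Document::Node per matched row;
          # content_at is assumed here to read the text at a relative XPath.
          def to_proxy(html_node)
            ProxyFetcher::Proxy.new.tap do |proxy|
              proxy.addr = html_node.content_at("td[1]")
              proxy.port = Integer(html_node.content_at("td[2]"))
            end
          end
        end
      end
    end

    # ExampleProvider.fetch_proxies!(country: "US")  # filters become query params via load_html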
|