proxy_fetcher 0.11.0 → 0.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +32 -1
- data/Gemfile +4 -2
- data/Rakefile +3 -1
- data/gemfiles/nokogiri.gemfile +1 -1
- data/gemfiles/oga.gemfile +2 -2
- data/lib/proxy_fetcher.rb +42 -31
- data/lib/proxy_fetcher/client/request.rb +3 -3
- data/lib/proxy_fetcher/configuration.rb +13 -9
- data/lib/proxy_fetcher/document/node.rb +1 -1
- data/lib/proxy_fetcher/manager.rb +40 -7
- data/lib/proxy_fetcher/providers/base.rb +2 -1
- data/lib/proxy_fetcher/providers/free_proxy_list.rb +0 -21
- data/lib/proxy_fetcher/providers/free_proxy_list_socks.rb +58 -0
- data/lib/proxy_fetcher/providers/free_proxy_list_ssl.rb +1 -0
- data/lib/proxy_fetcher/providers/free_proxy_list_us.rb +54 -0
- data/lib/proxy_fetcher/providers/mtpro.rb +43 -0
- data/lib/proxy_fetcher/providers/proxypedia.rb +48 -0
- data/lib/proxy_fetcher/providers/proxyscrape_http.rb +65 -0
- data/lib/proxy_fetcher/providers/proxyscrape_socks4.rb +65 -0
- data/lib/proxy_fetcher/providers/proxyscrape_socks5.rb +65 -0
- data/lib/proxy_fetcher/providers/xroxy.rb +2 -2
- data/lib/proxy_fetcher/proxy.rb +12 -0
- data/lib/proxy_fetcher/utils/http_client.rb +25 -21
- data/lib/proxy_fetcher/utils/proxy_validator.rb +20 -8
- data/lib/proxy_fetcher/version.rb +2 -2
- data/proxy_fetcher.gemspec +6 -4
- data/spec/fixtures/proxies.txt +14 -0
- data/spec/proxy_fetcher/client/client_spec.rb +10 -5
- data/spec/proxy_fetcher/manager_spec.rb +18 -0
- data/spec/proxy_fetcher/providers/proxy_classes_spec.rb +28 -0
- metadata +15 -12
- data/lib/proxy_fetcher/providers/gather_proxy.rb +0 -50
- data/spec/proxy_fetcher/providers/free_proxy_list_spec.rb +0 -13
- data/spec/proxy_fetcher/providers/free_proxy_list_ssl_spec.rb +0 -11
- data/spec/proxy_fetcher/providers/gather_proxy_spec.rb +0 -11
- data/spec/proxy_fetcher/providers/http_tunnel_spec.rb +0 -11
- data/spec/proxy_fetcher/providers/proxy_list_spec.rb +0 -11
- data/spec/proxy_fetcher/providers/xroxy_spec.rb +0 -11
# frozen_string_literal: true

module ProxyFetcher
  module Providers
    # FreeProxyListUS provider class.
    class FreeProxyListUS < Base
      # Provider URL to fetch proxy list
      def provider_url
        "https://www.us-proxy.org/"
      end

      # [NOTE] Doesn't support filtering
      def xpath
        '//table[@id="proxylisttable"]/tbody/tr'
      end

      # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
      # object.
      #
      # @param html_node [Object]
      #   HTML node from the <code>ProxyFetcher::Document</code> DOM model.
      #
      # @return [ProxyFetcher::Proxy]
      #   Proxy object
      #
      def to_proxy(html_node)
        ProxyFetcher::Proxy.new.tap do |proxy|
          proxy.addr = html_node.content_at("td[1]")
          # Parse in base 10 so leading zeros (e.g. "080") are not read as
          # octal. Unlike stripping zeros with gsub, this also accepts "0"
          # instead of raising ArgumentError on the resulting empty string.
          proxy.port = Integer(html_node.content_at("td[2]"), 10)
          proxy.country = html_node.content_at("td[4]")
          proxy.anonymity = html_node.content_at("td[5]")
          proxy.type = parse_type(html_node)
        end
      end

      private

      # Parses HTML node to extract proxy type.
      #
      # @param html_node [Object]
      #   HTML node from the <code>ProxyFetcher::Document</code> DOM model.
      #
      # @return [String]
      #   Proxy type (HTTPS when the "Https" column reads "yes", HTTP otherwise)
      #
      def parse_type(html_node)
        https = html_node.content_at("td[7]")
        https&.casecmp("yes")&.zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
      end
    end

    ProxyFetcher::Configuration.register_provider(:free_proxy_list_us, FreeProxyListUS)
  end
end
# frozen_string_literal: true

require "json"

module ProxyFetcher
  module Providers
    # MTPro provider class.
    #
    # Pulls SOCKS5 proxies from the mtpro.xyz JSON API.
    class MTPro < Base
      # Provider URL to fetch proxy list
      def provider_url
        "https://mtpro.xyz/api/?type=socks"
      end

      # Downloads the provider payload and decodes it as JSON.
      # A payload that fails to parse yields an empty list.
      #
      # @param filters [Hash]
      #   filters for proxy provider (passed through to the HTTP request)
      #
      # @return [Array]
      #   decoded proxy entries, or [] on invalid JSON
      def load_proxy_list(filters = {})
        payload = load_html(provider_url, filters)
        JSON.parse(payload)
      rescue JSON::ParserError
        []
      end

      # Converts a decoded JSON entry to <code>ProxyFetcher::Proxy</code>
      # object.
      #
      # @param node [Object]
      #   single entry from the parsed JSON payload (hash-like: "ip",
      #   "port", "country" keys)
      #
      # @return [ProxyFetcher::Proxy]
      #   Proxy object
      #
      def to_proxy(node)
        proxy = ProxyFetcher::Proxy.new
        proxy.addr = node["ip"]
        proxy.port = Integer(node["port"])
        proxy.country = node["country"]
        proxy.anonymity = "Unknown"
        proxy.type = ProxyFetcher::Proxy::SOCKS5
        proxy
      end
    end

    ProxyFetcher::Configuration.register_provider(:mtpro, MTPro)
  end
end
# frozen_string_literal: true

module ProxyFetcher
  module Providers
    # Proxypedia provider class.
    class Proxypedia < Base
      # Provider URL to fetch proxy list
      def provider_url
        "https://proxypedia.org"
      end

      # XPath selecting proxy entries; position() > 1 skips the first
      # list item, which is not a proxy row.
      # [NOTE] Doesn't support filtering
      def xpath
        "//main/ul/li[position()>1]"
      end

      # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
      # object.
      #
      # @param html_node [Object]
      #   HTML node from the <code>ProxyFetcher::Document</code> DOM model.
      #
      # @return [ProxyFetcher::Proxy]
      #   Proxy object
      #
      def to_proxy(html_node)
        addr, port = html_node.content_at("a").to_s.split(":")

        proxy = ProxyFetcher::Proxy.new
        proxy.addr = addr
        proxy.port = Integer(port)
        proxy.country = parse_country(html_node)
        proxy.anonymity = "Unknown"
        proxy.type = ProxyFetcher::Proxy::HTTP
        proxy
      end

      private

      # Extracts the country from the list item text, which embeds it in
      # parentheses; falls back to "Unknown" when none is present.
      def parse_country(html_node)
        html_node.content.to_s[/\((.+?)\)/, 1] || "Unknown"
      end
    end

    ProxyFetcher::Configuration.register_provider(:proxypedia, Proxypedia)
  end
end
# frozen_string_literal: true

require "csv"

module ProxyFetcher
  module Providers
    # ProxyscrapeHTTP provider class.
    class ProxyscrapeHTTP < Base
      # Provider URL to fetch proxy list
      def provider_url
        "https://api.proxyscrape.com/v2/?request=getproxies&protocol=http"
      end

      # Loads provider HTML and parses it with internal document object.
      #
      # @param url [String]
      #   URL to fetch
      #
      # @param filters [Hash]
      #   filters for proxy provider
      #
      # @return [Array]
      #   Collection of extracted proxies with ports
      #
      def load_document(url, filters = {})
        payload = load_html(url, filters)
        CSV.parse(payload, col_sep: "\t").map(&:first)
      end

      # Fetches HTTP content by sending an HTTP request to the provider URL
      # and parses the txt document to return all the proxy entries
      # (ip addresses and ports).
      #
      # @return [Array]
      #   Collection of extracted proxies with ports
      #
      def load_proxy_list(filters = {})
        load_document(provider_url, filters)
      end

      # Converts an "ip:port" String to a <code>ProxyFetcher::Proxy</code>
      # object.
      #
      # @param node [String]
      #   proxy entry in "ip:port" form
      #
      # @return [ProxyFetcher::Proxy]
      #   Proxy object
      #
      def to_proxy(node)
        addr, port = node.split(":")

        proxy = ProxyFetcher::Proxy.new
        proxy.addr = addr
        proxy.port = Integer(port)
        proxy.country = "Unknown"
        proxy.anonymity = "Unknown"
        proxy.type = ProxyFetcher::Proxy::HTTP
        proxy
      end
    end

    ProxyFetcher::Configuration.register_provider(:proxyscrape_http, ProxyscrapeHTTP)
  end
end
# frozen_string_literal: true

require "csv"

module ProxyFetcher
  module Providers
    # ProxyscrapeSOCKS4 provider class.
    class ProxyscrapeSOCKS4 < Base
      # Provider URL to fetch proxy list
      def provider_url
        "https://api.proxyscrape.com/v2/?request=getproxies&protocol=socks4"
      end

      # Loads provider HTML and parses it with internal document object.
      #
      # @param url [String]
      #   URL to fetch
      #
      # @param filters [Hash]
      #   filters for proxy provider
      #
      # @return [Array]
      #   Collection of extracted proxies with ports
      #
      def load_document(url, filters = {})
        payload = load_html(url, filters)
        CSV.parse(payload, col_sep: "\t").map(&:first)
      end

      # Fetches the provider's txt document and returns all the proxy
      # entries (ip addresses and ports).
      #
      # @return [Array]
      #   Collection of extracted proxies with ports
      #
      def load_proxy_list(filters = {})
        load_document(provider_url, filters)
      end

      # Converts an "ip:port" String to a <code>ProxyFetcher::Proxy</code>
      # object.
      #
      # @param html_node [String]
      #   proxy entry in "ip:port" form
      #
      # @return [ProxyFetcher::Proxy]
      #   Proxy object
      #
      def to_proxy(html_node)
        addr, port = html_node.split(":")

        proxy = ProxyFetcher::Proxy.new
        proxy.addr = addr
        proxy.port = Integer(port)
        proxy.country = "Unknown"
        proxy.anonymity = "Unknown"
        proxy.type = ProxyFetcher::Proxy::SOCKS4
        proxy
      end
    end

    ProxyFetcher::Configuration.register_provider(:proxyscrape_socks4, ProxyscrapeSOCKS4)
  end
end
# frozen_string_literal: true

require "csv"

module ProxyFetcher
  module Providers
    # ProxyscrapeSOCKS5 provider class.
    class ProxyscrapeSOCKS5 < Base
      # Provider URL to fetch proxy list
      def provider_url
        "https://api.proxyscrape.com/v2/?request=getproxies&protocol=socks5"
      end

      # Loads provider HTML and parses it with internal document object.
      #
      # @param url [String]
      #   URL to fetch
      #
      # @param filters [Hash]
      #   filters for proxy provider
      #
      # @return [Array]
      #   Collection of extracted proxies with ports
      #
      def load_document(url, filters = {})
        payload = load_html(url, filters)
        CSV.parse(payload, col_sep: "\t").map(&:first)
      end

      # Fetches the provider's txt document and returns all the proxy
      # entries (ip addresses and ports).
      #
      # @return [Array]
      #   Collection of extracted proxies with ports
      #
      def load_proxy_list(filters = {})
        load_document(provider_url, filters)
      end

      # Converts an "ip:port" String to a <code>ProxyFetcher::Proxy</code>
      # object.
      #
      # @param html_node [String]
      #   proxy entry in "ip:port" form
      #
      # @return [ProxyFetcher::Proxy]
      #   Proxy object
      #
      def to_proxy(html_node)
        addr, port = html_node.split(":")

        proxy = ProxyFetcher::Proxy.new
        proxy.addr = addr
        proxy.port = Integer(port)
        proxy.country = "Unknown"
        proxy.anonymity = "Unknown"
        proxy.type = ProxyFetcher::Proxy::SOCKS5
        proxy
      end
    end

    ProxyFetcher::Configuration.register_provider(:proxyscrape_socks5, ProxyscrapeSOCKS5)
  end
end
@@ -6,11 +6,11 @@ module ProxyFetcher
|
|
6
6
|
class XRoxy < Base
|
7
7
|
# Provider URL to fetch proxy list
|
8
8
|
def provider_url
|
9
|
-
"https://www.xroxy.com/
|
9
|
+
"https://www.xroxy.com/proxylist.htm"
|
10
10
|
end
|
11
11
|
|
12
12
|
def xpath
|
13
|
-
"//
|
13
|
+
"//tr[@class='row1' or @class='row0']"
|
14
14
|
end
|
15
15
|
|
16
16
|
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
data/lib/proxy_fetcher/proxy.rb
CHANGED
@@ -104,5 +104,17 @@ module ProxyFetcher
|
|
104
104
|
URI::Generic.build(host: addr, port: port).to_s
|
105
105
|
end
|
106
106
|
end
|
107
|
+
|
108
|
+
    # Value equality: two proxies are equal only when both address and
    # port match.
    #
    # @param other [Object] candidate for comparison
    # @return [Boolean] true for a Proxy with the same addr and port
    def ==(other)
      other.is_a?(Proxy) && addr == other.addr && port == other.port
    end
|
111
|
+
|
112
|
+
def eql?(other)
|
113
|
+
hash.eql?(other.hash)
|
114
|
+
end
|
115
|
+
|
116
|
+
    # Hash code derived from addr and port so that proxies that are
    # equal by addr/port land in the same Hash/Set bucket.
    #
    # @return [Integer]
    def hash
      [addr.hash, port.hash].hash
    end
|
107
119
|
end
|
108
120
|
end
|
@@ -41,8 +41,8 @@ module ProxyFetcher
|
|
41
41
|
# @return [String]
|
42
42
|
# resource content
|
43
43
|
#
|
44
|
-
def self.fetch(*args)
|
45
|
-
new(*args).fetch
|
44
|
+
    def self.fetch(*args, **kwargs, &block)
      # Keyword args are forwarded explicitly (Ruby 3 keyword separation);
      # builds a throwaway client and performs the request in one call.
      new(*args, **kwargs, &block).fetch
    end
|
47
47
|
|
48
48
|
# Initialize HTTP client instance
|
@@ -51,15 +51,17 @@ module ProxyFetcher
|
|
51
51
|
#
|
52
52
|
    def initialize(url, method: :get, params: {}, headers: {})
      @url = url.to_s
      # Normalize so both :get and "get" validate against HTTP::Request::METHODS.
      @method = method.to_sym
      @params = params
      @headers = headers

      # Fail fast on unknown HTTP verbs instead of erroring at request time.
      unless HTTP::Request::METHODS.include?(@method)
        raise ArgumentError, "'#{@method}' is a wrong HTTP method name"
      end

      # Single timeout value (from global config) used for both connect and read.
      @timeout = ProxyFetcher.config.provider_proxies_load_timeout
      @http = build_http_engine
      @ssl_ctx = build_ssl_context
    end
|
64
66
|
|
65
67
|
# Fetches resource content by sending HTTP request to it.
|
@@ -67,30 +69,32 @@ module ProxyFetcher
|
|
67
69
|
# @return [String]
|
68
70
|
# response body
|
69
71
|
#
|
70
|
-
def fetch
|
71
|
-
response =
|
72
|
+
    def fetch(**options)
      response = perform_http_request
      # raw: true hands back the full response object (status, headers)
      # instead of just the body string.
      return response if options.fetch(:raw, false)

      response.body.to_s
    rescue StandardError => e
      # Best-effort fetch: any failure is logged and reported as empty content.
      ProxyFetcher.config.logger.warn("Failed to process request to #{url} (#{e.message})")
      ""
    end
|
77
81
|
|
78
|
-
def fetch_with_headers
|
79
|
-
process_http_request
|
80
|
-
rescue StandardError => e
|
81
|
-
ProxyFetcher.logger.warn("Failed to process request to #{url} (#{e.message})")
|
82
|
-
HTTP::Response.new(version: "1.1", status: 500, body: "")
|
83
|
-
end
|
84
|
-
|
85
82
|
protected
|
86
83
|
|
87
|
-
def
|
88
|
-
|
89
|
-
|
84
|
+
def build_ssl_context
|
85
|
+
OpenSSL::SSL::SSLContext.new.tap do |context|
|
86
|
+
context.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
90
87
|
end
|
88
|
+
end
|
89
|
+
|
90
|
+
    # Builds the HTTP engine (http.rb client) with default headers merged
    # under any caller-supplied ones, and the configured timeout applied
    # to both connect and read phases.
    def build_http_engine
      HTTP.headers(default_headers.merge(headers)).timeout(connect: timeout, read: timeout)
    end
|
91
93
|
|
94
|
+
def perform_http_request(http_method: method, http_params: params)
|
92
95
|
http.public_send(
|
93
|
-
http_method
|
96
|
+
http_method,
|
97
|
+
url,
|
94
98
|
form: http_params,
|
95
99
|
ssl_context: ssl_ctx
|
96
100
|
)
|