proxy_fetcher 0.11.0 → 0.15.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +32 -1
- data/Gemfile +4 -2
- data/Rakefile +3 -1
- data/gemfiles/nokogiri.gemfile +1 -1
- data/gemfiles/oga.gemfile +2 -2
- data/lib/proxy_fetcher.rb +42 -31
- data/lib/proxy_fetcher/client/request.rb +3 -3
- data/lib/proxy_fetcher/configuration.rb +13 -9
- data/lib/proxy_fetcher/document/node.rb +1 -1
- data/lib/proxy_fetcher/manager.rb +40 -7
- data/lib/proxy_fetcher/providers/base.rb +2 -1
- data/lib/proxy_fetcher/providers/free_proxy_list.rb +0 -21
- data/lib/proxy_fetcher/providers/free_proxy_list_socks.rb +58 -0
- data/lib/proxy_fetcher/providers/free_proxy_list_ssl.rb +1 -0
- data/lib/proxy_fetcher/providers/free_proxy_list_us.rb +54 -0
- data/lib/proxy_fetcher/providers/mtpro.rb +43 -0
- data/lib/proxy_fetcher/providers/proxypedia.rb +48 -0
- data/lib/proxy_fetcher/providers/proxyscrape_http.rb +65 -0
- data/lib/proxy_fetcher/providers/proxyscrape_socks4.rb +65 -0
- data/lib/proxy_fetcher/providers/proxyscrape_socks5.rb +65 -0
- data/lib/proxy_fetcher/providers/xroxy.rb +2 -2
- data/lib/proxy_fetcher/proxy.rb +12 -0
- data/lib/proxy_fetcher/utils/http_client.rb +25 -21
- data/lib/proxy_fetcher/utils/proxy_validator.rb +20 -8
- data/lib/proxy_fetcher/version.rb +2 -2
- data/proxy_fetcher.gemspec +6 -4
- data/spec/fixtures/proxies.txt +14 -0
- data/spec/proxy_fetcher/client/client_spec.rb +10 -5
- data/spec/proxy_fetcher/manager_spec.rb +18 -0
- data/spec/proxy_fetcher/providers/proxy_classes_spec.rb +28 -0
- metadata +15 -12
- data/lib/proxy_fetcher/providers/gather_proxy.rb +0 -50
- data/spec/proxy_fetcher/providers/free_proxy_list_spec.rb +0 -13
- data/spec/proxy_fetcher/providers/free_proxy_list_ssl_spec.rb +0 -11
- data/spec/proxy_fetcher/providers/gather_proxy_spec.rb +0 -11
- data/spec/proxy_fetcher/providers/http_tunnel_spec.rb +0 -11
- data/spec/proxy_fetcher/providers/proxy_list_spec.rb +0 -11
- data/spec/proxy_fetcher/providers/xroxy_spec.rb +0 -11
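The headline change in this range is the provider set: FreeProxyListUS, MTPro, Proxypedia and three Proxyscrape providers are added (registered under the keys shown in the diffs below), while GatherProxy is dropped. A minimal configuration sketch, assuming the gem's usual `ProxyFetcher.configure` API; the particular provider keys chosen here are only an example:

    require "proxy_fetcher"

    # Pick some of the providers registered in this release; the keys match
    # the register_provider calls further down in this diff.
    ProxyFetcher.configure do |config|
      config.providers = %i[free_proxy_list_us proxyscrape_http mtpro]
    end

    # The manager loads and keeps the aggregated proxy list.
    manager = ProxyFetcher::Manager.new
    manager.proxies.each { |proxy| puts "#{proxy.addr}:#{proxy.port} (#{proxy.type})" }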
data/lib/proxy_fetcher/providers/free_proxy_list_us.rb
ADDED
@@ -0,0 +1,54 @@
+# frozen_string_literal: true
+
+module ProxyFetcher
+  module Providers
+    # FreeProxyListUS provider class.
+    class FreeProxyListUS < Base
+      # Provider URL to fetch proxy list
+      def provider_url
+        "https://www.us-proxy.org/"
+      end
+
+      # [NOTE] Doesn't support filtering
+      def xpath
+        '//table[@id="proxylisttable"]/tbody/tr'
+      end
+
+      # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
+      # object.
+      #
+      # @param html_node [Object]
+      #   HTML node from the <code>ProxyFetcher::Document</code> DOM model.
+      #
+      # @return [ProxyFetcher::Proxy]
+      #   Proxy object
+      #
+      def to_proxy(html_node)
+        ProxyFetcher::Proxy.new.tap do |proxy|
+          proxy.addr = html_node.content_at("td[1]")
+          proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, ""))
+          proxy.country = html_node.content_at("td[4]")
+          proxy.anonymity = html_node.content_at("td[5]")
+          proxy.type = parse_type(html_node)
+        end
+      end
+
+      private
+
+      # Parses HTML node to extract proxy type.
+      #
+      # @param html_node [Object]
+      #   HTML node from the <code>ProxyFetcher::Document</code> DOM model.
+      #
+      # @return [String]
+      #   Proxy type
+      #
+      def parse_type(html_node)
+        https = html_node.content_at("td[7]")
+        https&.casecmp("yes")&.zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
+      end
+    end
+
+    ProxyFetcher::Configuration.register_provider(:free_proxy_list_us, FreeProxyListUS)
+  end
+end
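The `gsub(/^0+/, "")` before `Integer(...)` in `to_proxy` above guards against ports rendered with leading zeros: `Kernel#Integer` treats a leading zero as an octal prefix. A standalone Ruby illustration (the port strings are made up):

    Integer("010")                    # => 8, the leading zero makes it octal
    # Integer("080")                  # raises ArgumentError: invalid value for Integer()
    Integer("080".gsub(/^0+/, ""))    # => 80, which is what the port-cell parsing above does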
data/lib/proxy_fetcher/providers/mtpro.rb
ADDED
@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+
+require "json"
+
+module ProxyFetcher
+  module Providers
+    # MTPro provider class.
+    class MTPro < Base
+      # Provider URL to fetch proxy list
+      def provider_url
+        "https://mtpro.xyz/api/?type=socks"
+      end
+
+      def load_proxy_list(filters = {})
+        html = load_html(provider_url, filters)
+        JSON.parse(html)
+      rescue JSON::ParserError
+        []
+      end
+
+      # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
+      # object.
+      #
+      # @param node [Object]
+      #   HTML node from the <code>ProxyFetcher::Document</code> DOM model.
+      #
+      # @return [ProxyFetcher::Proxy]
+      #   Proxy object
+      #
+      def to_proxy(node)
+        ProxyFetcher::Proxy.new.tap do |proxy|
+          proxy.addr = node["ip"]
+          proxy.port = Integer(node["port"])
+          proxy.country = node["country"]
+          proxy.anonymity = "Unknown"
+          proxy.type = ProxyFetcher::Proxy::SOCKS5
+        end
+      end
+    end
+
+    ProxyFetcher::Configuration.register_provider(:mtpro, MTPro)
+  end
+end
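Unlike the HTML providers, MTPro overrides `load_proxy_list` to parse a JSON API response, so `to_proxy` receives plain hashes rather than document nodes. A rough sketch of the mapping it performs, using an invented payload in the same shape (`"ip"`, `"port"`, `"country"` keys):

    require "json"

    body = '[{"ip":"203.0.113.10","port":1080,"country":"DE"}]'  # hypothetical API response

    JSON.parse(body).each do |node|
      # Same fields to_proxy reads above; the result is always typed as SOCKS5.
      puts "#{node["ip"]}:#{Integer(node["port"])} (#{node["country"]})"
    end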
data/lib/proxy_fetcher/providers/proxypedia.rb
ADDED
@@ -0,0 +1,48 @@
+# frozen_string_literal: true
+
+module ProxyFetcher
+  module Providers
+    # Proxypedia provider class.
+    class Proxypedia < Base
+      # Provider URL to fetch proxy list
+      def provider_url
+        "https://proxypedia.org"
+      end
+
+      # [NOTE] Doesn't support filtering
+      def xpath
+        "//main/ul/li[position()>1]"
+      end
+
+      # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
+      # object.
+      #
+      # @param html_node [Object]
+      #   HTML node from the <code>ProxyFetcher::Document</code> DOM model.
+      #
+      # @return [ProxyFetcher::Proxy]
+      #   Proxy object
+      #
+      def to_proxy(html_node)
+        addr, port = html_node.content_at("a").to_s.split(":")
+
+        ProxyFetcher::Proxy.new.tap do |proxy|
+          proxy.addr = addr
+          proxy.port = Integer(port)
+          proxy.country = parse_country(html_node)
+          proxy.anonymity = "Unknown"
+          proxy.type = ProxyFetcher::Proxy::HTTP
+        end
+      end
+
+      private
+
+      def parse_country(html_node)
+        text = html_node.content.to_s
+        text[/\((.+?)\)/, 1] || "Unknown"
+      end
+    end
+
+    ProxyFetcher::Configuration.register_provider(:proxypedia, Proxypedia)
+  end
+end
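`parse_country` relies on the country appearing in parentheses at the end of each list item; anything else falls back to "Unknown". A standalone illustration of that regex (the list-item text is invented):

    text = "203.0.113.10:3128 (Japan)"    # hypothetical <li> content
    text[/\((.+?)\)/, 1]                  # => "Japan"

    "203.0.113.10:3128"[/\((.+?)\)/, 1]   # => nil, so the provider reports "Unknown"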
data/lib/proxy_fetcher/providers/proxyscrape_http.rb
ADDED
@@ -0,0 +1,65 @@
+# frozen_string_literal: true
+
+require "csv"
+
+module ProxyFetcher
+  module Providers
+    # ProxyscrapeHTTP provider class.
+    class ProxyscrapeHTTP < Base
+      # Provider URL to fetch proxy list
+      def provider_url
+        "https://api.proxyscrape.com/v2/?request=getproxies&protocol=http"
+      end
+
+      # Loads provider HTML and parses it with internal document object.
+      #
+      # @param url [String]
+      #   URL to fetch
+      #
+      # @param filters [Hash]
+      #   filters for proxy provider
+      #
+      # @return [Array]
+      #   Collection of extracted proxies with ports
+      #
+      def load_document(url, filters = {})
+        html = load_html(url, filters)
+
+        CSV.parse(html, col_sep: "\t").map(&:first)
+      end
+
+      # Fetches HTML content by sending HTTP request to the provider URL and
+      # parses the txt document to return all the proxy entries (ip addresses
+      # and ports).
+      #
+      # @return [Array]
+      #   Collection of extracted proxies with ports
+      #
+      def load_proxy_list(filters = {})
+        load_document(provider_url, filters)
+      end
+
+      # Converts String to <code>ProxyFetcher::Proxy</code> object.
+      #
+      # @param node [String]
+      #   String
+      #
+      # @return [ProxyFetcher::Proxy]
+      #   Proxy object
+      #
+      def to_proxy(node)
+        addr, port = node.split(":")
+
+        ProxyFetcher::Proxy.new.tap do |proxy|
+          proxy.addr = addr
+          proxy.port = Integer(port)
+          proxy.country = "Unknown"
+          proxy.anonymity = "Unknown"
+          proxy.type = ProxyFetcher::Proxy::HTTP
+        end
+      end
+    end
+
+    ProxyFetcher::Configuration.register_provider(:proxyscrape_http, ProxyscrapeHTTP)
+  end
+end
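The Proxyscrape endpoints return plain `ip:port` pairs, one per line, so `load_document` parses the body with CSV using a tab separator purely as a row splitter. A rough equivalent with an invented response body:

    require "csv"

    body = "203.0.113.10:8080\r\n203.0.113.11:3128\r\n"   # hypothetical API response

    CSV.parse(body, col_sep: "\t").map(&:first)
    # => ["203.0.113.10:8080", "203.0.113.11:3128"]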
data/lib/proxy_fetcher/providers/proxyscrape_socks4.rb
ADDED
@@ -0,0 +1,65 @@
+# frozen_string_literal: true
+
+require "csv"
+
+module ProxyFetcher
+  module Providers
+    # ProxyscrapeSOCKS4 provider class.
+    class ProxyscrapeSOCKS4 < Base
+      # Provider URL to fetch proxy list
+      def provider_url
+        "https://api.proxyscrape.com/v2/?request=getproxies&protocol=socks4"
+      end
+
+      # Loads provider HTML and parses it with internal document object.
+      #
+      # @param url [String]
+      #   URL to fetch
+      #
+      # @param filters [Hash]
+      #   filters for proxy provider
+      #
+      # @return [Array]
+      #   Collection of extracted proxies with ports
+      #
+      def load_document(url, filters = {})
+        html = load_html(url, filters)
+
+        CSV.parse(html, col_sep: "\t").map(&:first)
+      end
+
+      # Fetches HTML content by sending HTTP request to the provider URL and
+      # parses the txt document to return all the proxy entries (ip addresses
+      # and ports).
+      #
+      # @return [Array]
+      #   Collection of extracted proxies with ports
+      #
+      def load_proxy_list(filters = {})
+        load_document(provider_url, filters)
+      end
+
+      # Converts String to <code>ProxyFetcher::Proxy</code> object.
+      #
+      # @param node [String]
+      #   String
+      #
+      # @return [ProxyFetcher::Proxy]
+      #   Proxy object
+      #
+      def to_proxy(html_node)
+        addr, port = html_node.split(":")
+
+        ProxyFetcher::Proxy.new.tap do |proxy|
+          proxy.addr = addr
+          proxy.port = Integer(port)
+          proxy.country = "Unknown"
+          proxy.anonymity = "Unknown"
+          proxy.type = ProxyFetcher::Proxy::SOCKS4
+        end
+      end
+    end
+
+    ProxyFetcher::Configuration.register_provider(:proxyscrape_socks4, ProxyscrapeSOCKS4)
+  end
+end
data/lib/proxy_fetcher/providers/proxyscrape_socks5.rb
ADDED
@@ -0,0 +1,65 @@
+# frozen_string_literal: true
+
+require "csv"
+
+module ProxyFetcher
+  module Providers
+    # ProxyscrapeSOCKS5 provider class.
+    class ProxyscrapeSOCKS5 < Base
+      # Provider URL to fetch proxy list
+      def provider_url
+        "https://api.proxyscrape.com/v2/?request=getproxies&protocol=socks5"
+      end
+
+      # Loads provider HTML and parses it with internal document object.
+      #
+      # @param url [String]
+      #   URL to fetch
+      #
+      # @param filters [Hash]
+      #   filters for proxy provider
+      #
+      # @return [Array]
+      #   Collection of extracted proxies with ports
+      #
+      def load_document(url, filters = {})
+        html = load_html(url, filters)
+
+        CSV.parse(html, col_sep: "\t").map(&:first)
+      end
+
+      # Fetches HTML content by sending HTTP request to the provider URL and
+      # parses the txt document to return all the proxy entries (ip addresses
+      # and ports).
+      #
+      # @return [Array]
+      #   Collection of extracted proxies with ports
+      #
+      def load_proxy_list(filters = {})
+        load_document(provider_url, filters)
+      end
+
+      # Converts String to <code>ProxyFetcher::Proxy</code> object.
+      #
+      # @param node [String]
+      #   String
+      #
+      # @return [ProxyFetcher::Proxy]
+      #   Proxy object
+      #
+      def to_proxy(html_node)
+        addr, port = html_node.split(":")
+
+        ProxyFetcher::Proxy.new.tap do |proxy|
+          proxy.addr = addr
+          proxy.port = Integer(port)
+          proxy.country = "Unknown"
+          proxy.anonymity = "Unknown"
+          proxy.type = ProxyFetcher::Proxy::SOCKS5
+        end
+      end
+    end
+
+    ProxyFetcher::Configuration.register_provider(:proxyscrape_socks5, ProxyscrapeSOCKS5)
+  end
+end
data/lib/proxy_fetcher/providers/xroxy.rb
CHANGED
@@ -6,11 +6,11 @@ module ProxyFetcher
     class XRoxy < Base
       # Provider URL to fetch proxy list
      def provider_url
-        "https://www.xroxy.com/
+        "https://www.xroxy.com/proxylist.htm"
      end
 
      def xpath
-        "//
+        "//tr[@class='row1' or @class='row0']"
      end
 
      # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
data/lib/proxy_fetcher/proxy.rb
CHANGED
@@ -104,5 +104,17 @@ module ProxyFetcher
         URI::Generic.build(host: addr, port: port).to_s
       end
     end
+
+    def ==(other)
+      other.is_a?(Proxy) && addr == other.addr && port == other.port
+    end
+
+    def eql?(other)
+      hash.eql?(other.hash)
+    end
+
+    def hash
+      [addr.hash, port.hash].hash
+    end
   end
 end
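With `==`, `eql?`, and `hash` now defined over address and port, proxies gathered from different providers compare by value, so de-duplication with `Array#uniq` works. A minimal sketch (addresses are placeholders), building proxies the same way the providers above do:

    a = ProxyFetcher::Proxy.new.tap { |proxy| proxy.addr = "203.0.113.10"; proxy.port = 8080 }
    b = ProxyFetcher::Proxy.new.tap { |proxy| proxy.addr = "203.0.113.10"; proxy.port = 8080 }

    a == b            # => true, same addr and port
    [a, b].uniq.size  # => 1, since eql?/hash agree with ==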
data/lib/proxy_fetcher/utils/http_client.rb
CHANGED
@@ -41,8 +41,8 @@ module ProxyFetcher
     # @return [String]
     #   resource content
     #
-    def self.fetch(*args)
-      new(*args).fetch
+    def self.fetch(*args, **kwargs, &block)
+      new(*args, **kwargs, &block).fetch
     end
 
     # Initialize HTTP client instance
@@ -51,15 +51,17 @@ module ProxyFetcher
     #
     def initialize(url, method: :get, params: {}, headers: {})
       @url = url.to_s
-      @method = method
+      @method = method.to_sym
       @params = params
       @headers = headers
 
-
-
+      unless HTTP::Request::METHODS.include?(@method)
+        raise ArgumentError, "'#{@method}' is a wrong HTTP method name"
+      end
 
-      @
-      @
+      @timeout = ProxyFetcher.config.provider_proxies_load_timeout
+      @http = build_http_engine
+      @ssl_ctx = build_ssl_context
     end
 
     # Fetches resource content by sending HTTP request to it.
@@ -67,30 +69,32 @@ module ProxyFetcher
     # @return [String]
     #   response body
     #
-    def fetch
-      response =
+    def fetch(**options)
+      response = perform_http_request
+      return response if options.fetch(:raw, false)
+
       response.body.to_s
     rescue StandardError => e
-      ProxyFetcher.logger.warn("Failed to process request to #{url} (#{e.message})")
+      ProxyFetcher.config.logger.warn("Failed to process request to #{url} (#{e.message})")
       ""
     end
 
-    def fetch_with_headers
-      process_http_request
-    rescue StandardError => e
-      ProxyFetcher.logger.warn("Failed to process request to #{url} (#{e.message})")
-      HTTP::Response.new(version: "1.1", status: 500, body: "")
-    end
-
     protected
 
-    def
-
-
+    def build_ssl_context
+      OpenSSL::SSL::SSLContext.new.tap do |context|
+        context.verify_mode = OpenSSL::SSL::VERIFY_NONE
       end
+    end
+
+    def build_http_engine
+      HTTP.headers(default_headers.merge(headers)).timeout(connect: timeout, read: timeout)
+    end
 
+    def perform_http_request(http_method: method, http_params: params)
       http.public_send(
-        http_method
+        http_method,
+        url,
         form: http_params,
         ssl_context: ssl_ctx
       )
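`fetch_with_headers` is removed; the new `raw:` option on `#fetch` covers it by returning the `HTTP::Response` itself instead of just the body. A hedged usage sketch, assuming the client class keeps its `ProxyFetcher::HTTPClient` name from earlier releases:

    client = ProxyFetcher::HTTPClient.new("https://example.com", method: :get)

    body     = client.fetch             # => response body as a String ("" on errors)
    response = client.fetch(raw: true)  # => full HTTP::Response, headers and status included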