proxy_fetcher 0.9.0 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/gemfiles/nokogiri.gemfile +1 -1
- data/gemfiles/oga.gemfile +1 -1
- data/lib/proxy_fetcher.rb +1 -0
- data/lib/proxy_fetcher/providers/base.rb +48 -8
- data/lib/proxy_fetcher/providers/free_proxy_list.rb +5 -3
- data/lib/proxy_fetcher/providers/free_proxy_list_ssl.rb +5 -3
- data/lib/proxy_fetcher/providers/gather_proxy.rb +4 -2
- data/lib/proxy_fetcher/providers/http_tunnel.rb +5 -3
- data/lib/proxy_fetcher/providers/proxy_docker.rb +53 -13
- data/lib/proxy_fetcher/providers/proxy_list.rb +4 -2
- data/lib/proxy_fetcher/providers/xroxy.rb +4 -2
- data/lib/proxy_fetcher/utils/http_client.rb +30 -9
- data/lib/proxy_fetcher/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '01894f07ae28eafbb09934aa7a4e52188fb0bf02db7ab458df5c91a93b3e32af'
|
4
|
+
data.tar.gz: b487f3aeb6b833ab6b6395d0d4c9311294f7894e68f6a51c2e9943ccf2904d09
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aef23de20b41467dc2e1cadf65e6e728555f62e3cba0006df55ec62c8b877f8ca924dff9a81441897dc8e4f3efbf346ecefad62cf0b1e5a3bbcbd78272e1ea34
|
7
|
+
data.tar.gz: de41ae50f5bff9c8b8ae309057695661e76c47d2df703d369df6a2b3b40bffcfc881edc21b9833ca0f303392a480b7562b53b39d3f8c79e8e47cd0595a690366
|
data/gemfiles/nokogiri.gemfile
CHANGED
data/gemfiles/oga.gemfile
CHANGED
data/lib/proxy_fetcher.rb
CHANGED
@@ -7,7 +7,25 @@ module ProxyFetcher
|
|
7
7
|
# Loads proxy provider page content, extracts the proxy list from it
|
8
8
|
# and converts every entry to a proxy object.
|
9
9
|
def fetch_proxies!(filters = {})
|
10
|
-
load_proxy_list(filters)
|
10
|
+
raw_proxies = load_proxy_list(filters)
|
11
|
+
proxies = raw_proxies.map { |html_node| build_proxy(html_node) }.compact
|
12
|
+
proxies.reject { |proxy| proxy.addr.nil? }
|
13
|
+
end
|
14
|
+
|
15
|
+
def provider_url
|
16
|
+
raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
|
17
|
+
end
|
18
|
+
|
19
|
+
def provider_method
|
20
|
+
:get
|
21
|
+
end
|
22
|
+
|
23
|
+
def provider_params
|
24
|
+
{}
|
25
|
+
end
|
26
|
+
|
27
|
+
def provider_headers
|
28
|
+
{}
|
11
29
|
end
|
12
30
|
|
13
31
|
# Just syntactic sugar to make it easier to call the #fetch_proxies! method.
|
@@ -17,7 +35,27 @@ module ProxyFetcher
|
|
17
35
|
|
18
36
|
protected
|
19
37
|
|
20
|
-
# Loads
|
38
|
+
# Loads raw provider HTML with proxies.
|
39
|
+
#
|
40
|
+
# @return [String]
|
41
|
+
# HTML body
|
42
|
+
#
|
43
|
+
def load_html(url, filters = {})
|
44
|
+
raise ArgumentError, 'filters must be a Hash' if filters && !filters.is_a?(Hash)
|
45
|
+
|
46
|
+
uri = URI.parse(url)
|
47
|
+
# TODO: query for post request?
|
48
|
+
uri.query = URI.encode_www_form(provider_params.merge(filters)) if filters && filters.any?
|
49
|
+
|
50
|
+
ProxyFetcher.config.http_client.fetch(
|
51
|
+
uri.to_s,
|
52
|
+
method: provider_method,
|
53
|
+
headers: provider_headers,
|
54
|
+
params: provider_params
|
55
|
+
)
|
56
|
+
end
|
57
|
+
|
58
|
+
# Loads provider HTML and parses it with internal document object.
|
21
59
|
#
|
22
60
|
# @param url [String]
|
23
61
|
# URL to fetch
|
@@ -29,15 +67,17 @@ module ProxyFetcher
|
|
29
67
|
# ProxyFetcher document object
|
30
68
|
#
|
31
69
|
def load_document(url, filters = {})
|
32
|
-
|
33
|
-
|
34
|
-
uri = URI.parse(url)
|
35
|
-
uri.query = URI.encode_www_form(filters) if filters && filters.any?
|
36
|
-
|
37
|
-
html = ProxyFetcher.config.http_client.fetch(uri.to_s)
|
70
|
+
html = load_html(url, filters)
|
38
71
|
ProxyFetcher::Document.parse(html)
|
39
72
|
end
|
40
73
|
|
74
|
+
def build_proxy(*args)
|
75
|
+
to_proxy(*args)
|
76
|
+
rescue StandardError => error
|
77
|
+
ProxyFetcher.logger.warn("Failed to build Proxy object due to error: #{error.message}")
|
78
|
+
nil
|
79
|
+
end
|
80
|
+
|
41
81
|
# Fetches HTML content by sending HTTP request to the provider URL and
|
42
82
|
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
43
83
|
# to return all the proxy entries (HTML nodes).
|
@@ -5,11 +5,13 @@ module ProxyFetcher
|
|
5
5
|
# FreeProxyList provider class.
|
6
6
|
class FreeProxyList < Base
|
7
7
|
# Provider URL to fetch proxy list
|
8
|
-
|
8
|
+
def provider_url
|
9
|
+
'https://free-proxy-list.net/'
|
10
|
+
end
|
9
11
|
|
10
12
|
# [NOTE] Doesn't support filtering
|
11
|
-
def load_proxy_list(
|
12
|
-
doc = load_document(
|
13
|
+
def load_proxy_list(_filters = {})
|
14
|
+
doc = load_document(provider_url, {})
|
13
15
|
doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
|
14
16
|
end
|
15
17
|
|
@@ -5,7 +5,9 @@ module ProxyFetcher
|
|
5
5
|
# FreeProxyListSSL provider class.
|
6
6
|
class FreeProxyListSSL < Base
|
7
7
|
# Provider URL to fetch proxy list
|
8
|
-
|
8
|
+
def provider_url
|
9
|
+
'https://www.sslproxies.org/'
|
10
|
+
end
|
9
11
|
|
10
12
|
# Fetches HTML content by sending HTTP request to the provider URL and
|
11
13
|
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
@@ -15,8 +17,8 @@ module ProxyFetcher
|
|
15
17
|
# Collection of extracted HTML nodes with full proxy info
|
16
18
|
#
|
17
19
|
# [NOTE] Doesn't support filtering
|
18
|
-
def load_proxy_list(
|
19
|
-
doc = load_document(
|
20
|
+
def load_proxy_list(_filters = {})
|
21
|
+
doc = load_document(provider_url, {})
|
20
22
|
doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
|
21
23
|
end
|
22
24
|
|
@@ -7,7 +7,9 @@ module ProxyFetcher
|
|
7
7
|
# GatherProxy provider class.
|
8
8
|
class GatherProxy < Base
|
9
9
|
# Provider URL to fetch proxy list
|
10
|
-
|
10
|
+
def provider_url
|
11
|
+
'http://www.gatherproxy.com/'
|
12
|
+
end
|
11
13
|
|
12
14
|
# Fetches HTML content by sending HTTP request to the provider URL and
|
13
15
|
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
@@ -17,7 +19,7 @@ module ProxyFetcher
|
|
17
19
|
# Collection of extracted HTML nodes with full proxy info
|
18
20
|
#
|
19
21
|
def load_proxy_list(*)
|
20
|
-
doc = load_document(
|
22
|
+
doc = load_document(provider_url)
|
21
23
|
doc.xpath('//div[@class="proxy-list"]/table/script')
|
22
24
|
end
|
23
25
|
|
@@ -5,7 +5,9 @@ module ProxyFetcher
|
|
5
5
|
# HTTPTunnel provider class.
|
6
6
|
class HTTPTunnel < Base
|
7
7
|
# Provider URL to fetch proxy list
|
8
|
-
|
8
|
+
def provider_url
|
9
|
+
'http://www.httptunnel.ge/ProxyListForFree.aspx'
|
10
|
+
end
|
9
11
|
|
10
12
|
# Fetches HTML content by sending HTTP request to the provider URL and
|
11
13
|
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
@@ -14,8 +16,8 @@ module ProxyFetcher
|
|
14
16
|
# @return [Array<ProxyFetcher::Document::Node>]
|
15
17
|
# Collection of extracted HTML nodes with full proxy info
|
16
18
|
#
|
17
|
-
def load_proxy_list(
|
18
|
-
doc = load_document(
|
19
|
+
def load_proxy_list(_filters = {})
|
20
|
+
doc = load_document(provider_url)
|
19
21
|
doc.xpath('//table[contains(@id, "GridView")]/tr[(count(td)>2)]')
|
20
22
|
end
|
21
23
|
|
@@ -1,11 +1,39 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require 'json'
|
4
|
+
|
3
5
|
module ProxyFetcher
|
4
6
|
module Providers
|
5
7
|
# ProxyDocker provider class.
|
6
8
|
class ProxyDocker < Base
|
7
9
|
# Provider URL to fetch proxy list
|
8
|
-
|
10
|
+
def provider_url
|
11
|
+
'https://www.proxydocker.com/en/api/proxylist/'
|
12
|
+
end
|
13
|
+
|
14
|
+
def provider_method
|
15
|
+
:post
|
16
|
+
end
|
17
|
+
|
18
|
+
def provider_params
|
19
|
+
{
|
20
|
+
token: 'GmZyl0OJmmgrWakdzO7AFf6AWfkdledR6xmKvGmwmJg',
|
21
|
+
country: 'all',
|
22
|
+
city: 'all',
|
23
|
+
state: 'all',
|
24
|
+
port: 'all',
|
25
|
+
type: 'all',
|
26
|
+
anonymity: 'all',
|
27
|
+
need: 'all',
|
28
|
+
page: '1'
|
29
|
+
}
|
30
|
+
end
|
31
|
+
|
32
|
+
def provider_headers
|
33
|
+
{
|
34
|
+
cookie: 'PHPSESSID=7f59558ee58b1e4352c4ab4c2f1a3c11'
|
35
|
+
}
|
36
|
+
end
|
9
37
|
|
10
38
|
# Fetches HTML content by sending HTTP request to the provider URL and
|
11
39
|
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
@@ -16,30 +44,42 @@ module ProxyFetcher
|
|
16
44
|
#
|
17
45
|
# [NOTE] Doesn't support direct filters
|
18
46
|
def load_proxy_list(*)
|
19
|
-
|
20
|
-
|
47
|
+
json = JSON.parse(load_html(provider_url, {}))
|
48
|
+
json.fetch('proxies', [])
|
49
|
+
rescue JSON::ParserError
|
50
|
+
[]
|
21
51
|
end
|
22
52
|
|
23
|
-
# Converts
|
53
|
+
# Converts JSON node to <code>ProxyFetcher::Proxy</code>
|
24
54
|
# object.
|
25
55
|
#
|
26
|
-
# @param
|
27
|
-
#
|
56
|
+
# @param node [Hash]
|
57
|
+
# JSON entry from the API response
|
28
58
|
#
|
29
59
|
# @return [ProxyFetcher::Proxy]
|
30
60
|
# Proxy object
|
31
61
|
#
|
32
|
-
def to_proxy(
|
62
|
+
def to_proxy(node)
|
33
63
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
34
|
-
|
35
|
-
proxy.
|
36
|
-
proxy.port = uri.port
|
64
|
+
proxy.addr = node['ip']
|
65
|
+
proxy.port = node['port']
|
37
66
|
|
38
|
-
proxy.type =
|
39
|
-
proxy.anonymity =
|
40
|
-
proxy.country =
|
67
|
+
proxy.type = types_mapping.fetch(node['type'], ProxyFetcher::Proxy::HTTP)
|
68
|
+
proxy.anonymity = "Lvl#{node['anonymity']}"
|
69
|
+
proxy.country = node['country']
|
41
70
|
end
|
42
71
|
end
|
72
|
+
|
73
|
+
def types_mapping
|
74
|
+
{
|
75
|
+
'16' => ProxyFetcher::Proxy::HTTP,
|
76
|
+
'26' => ProxyFetcher::Proxy::HTTPS,
|
77
|
+
'3' => ProxyFetcher::Proxy::SOCKS4,
|
78
|
+
'4' => ProxyFetcher::Proxy::SOCKS5,
|
79
|
+
'56' => ProxyFetcher::Proxy::HTTP, # CON25
|
80
|
+
'6' => ProxyFetcher::Proxy::HTTP # CON80
|
81
|
+
}
|
82
|
+
end
|
43
83
|
end
|
44
84
|
|
45
85
|
ProxyFetcher::Configuration.register_provider(:proxy_docker, ProxyDocker)
|
@@ -7,7 +7,9 @@ module ProxyFetcher
|
|
7
7
|
# ProxyList provider class.
|
8
8
|
class ProxyList < Base
|
9
9
|
# Provider URL to fetch proxy list
|
10
|
-
|
10
|
+
def provider_url
|
11
|
+
'https://proxy-list.org/english/index.php'
|
12
|
+
end
|
11
13
|
|
12
14
|
# Fetches HTML content by sending HTTP request to the provider URL and
|
13
15
|
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
@@ -17,7 +19,7 @@ module ProxyFetcher
|
|
17
19
|
# Collection of extracted HTML nodes with full proxy info
|
18
20
|
#
|
19
21
|
def load_proxy_list(filters = {})
|
20
|
-
doc = load_document(
|
22
|
+
doc = load_document(provider_url, filters)
|
21
23
|
doc.css('.table-wrap .table ul')
|
22
24
|
end
|
23
25
|
|
@@ -5,7 +5,9 @@ module ProxyFetcher
|
|
5
5
|
# XRoxy provider class.
|
6
6
|
class XRoxy < Base
|
7
7
|
# Provider URL to fetch proxy list
|
8
|
-
|
8
|
+
def provider_url
|
9
|
+
'https://www.xroxy.com/free-proxy-lists/'
|
10
|
+
end
|
9
11
|
|
10
12
|
# Fetches HTML content by sending HTTP request to the provider URL and
|
11
13
|
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
@@ -15,7 +17,7 @@ module ProxyFetcher
|
|
15
17
|
# Collection of extracted HTML nodes with full proxy info
|
16
18
|
#
|
17
19
|
def load_proxy_list(filters = { type: 'All_http' })
|
18
|
-
doc = load_document(
|
20
|
+
doc = load_document(provider_url, filters)
|
19
21
|
doc.xpath('//div/table/tbody/tr')
|
20
22
|
end
|
21
23
|
|
@@ -9,6 +9,18 @@ module ProxyFetcher
|
|
9
9
|
# @return [String] URL
|
10
10
|
attr_reader :url
|
11
11
|
|
12
|
+
# @!attribute [r] HTTP method
|
13
|
+
# @return [String] HTTP method verb
|
14
|
+
attr_reader :method
|
15
|
+
|
16
|
+
# @!attribute [r] HTTP params
|
17
|
+
# @return [Hash] params
|
18
|
+
attr_reader :params
|
19
|
+
|
20
|
+
# @!attribute [r] HTTP headers
|
21
|
+
# @return [Hash] headers
|
22
|
+
attr_reader :headers
|
23
|
+
|
12
24
|
# @!attribute [r] http
|
13
25
|
# @return [Net::HTTP] HTTP client
|
14
26
|
attr_reader :http
|
@@ -29,17 +41,21 @@ module ProxyFetcher
|
|
29
41
|
# @return [String]
|
30
42
|
# resource content
|
31
43
|
#
|
32
|
-
def self.fetch(
|
33
|
-
new(
|
44
|
+
def self.fetch(*args)
|
45
|
+
new(*args).fetch
|
34
46
|
end
|
35
47
|
|
36
48
|
# Initialize HTTP client instance
|
37
49
|
#
|
38
50
|
# @return [HTTPClient]
|
39
51
|
#
|
40
|
-
def initialize(url)
|
52
|
+
def initialize(url, method: :get, params: {}, headers: {})
|
41
53
|
@url = url.to_s
|
42
|
-
@
|
54
|
+
@method = method
|
55
|
+
@params = params
|
56
|
+
@headers = headers
|
57
|
+
|
58
|
+
@http = HTTP.headers(default_headers.merge(headers)).timeout(connect: timeout, read: timeout)
|
43
59
|
@timeout = ProxyFetcher.config.provider_proxies_load_timeout
|
44
60
|
|
45
61
|
@ssl_ctx = OpenSSL::SSL::SSLContext.new
|
@@ -52,11 +68,16 @@ module ProxyFetcher
|
|
52
68
|
# response body
|
53
69
|
#
|
54
70
|
def fetch
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
71
|
+
# TODO: must be more generic
|
72
|
+
response = if method == :post
|
73
|
+
http.post(url, form: params, ssl_context: ssl_ctx)
|
74
|
+
else
|
75
|
+
http.get(url, ssl_context: ssl_ctx)
|
76
|
+
end
|
77
|
+
|
78
|
+
response.body.to_s
|
79
|
+
rescue StandardError => error
|
80
|
+
ProxyFetcher.logger.warn("Failed to load proxy list for #{url} (#{error.message})")
|
60
81
|
''
|
61
82
|
end
|
62
83
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proxy_fetcher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.10.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nikita Bulai
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-03-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: http
|