proxy_fetcher 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d5fee8c7b429b32d608388ddf15885d2354721d6de693396ad228248925cf2d6
4
- data.tar.gz: '0250291040c511a723cdae4a22a78d4f765a0ca7b10325f331a8d119f8d25157'
3
+ metadata.gz: '01894f07ae28eafbb09934aa7a4e52188fb0bf02db7ab458df5c91a93b3e32af'
4
+ data.tar.gz: b487f3aeb6b833ab6b6395d0d4c9311294f7894e68f6a51c2e9943ccf2904d09
5
5
  SHA512:
6
- metadata.gz: b7d12b75f0e80075d31832f8add9d1269ae48ff32714a0c0b481a85eb275094ccec62020911d55f74a2920211fc445f11ee3c31f43fdc7cc1f5ae2d175278eec
7
- data.tar.gz: 138c464c0531fc12773b995d3101f5e952e2a4868e7e13d18274b27f7b0ea310a515d76e2066f36b25c2586c2046b9497f15a74e943cfbc8fdc8f0204796f93e
6
+ metadata.gz: aef23de20b41467dc2e1cadf65e6e728555f62e3cba0006df55ec62c8b877f8ca924dff9a81441897dc8e4f3efbf346ecefad62cf0b1e5a3bbcbd78272e1ea34
7
+ data.tar.gz: de41ae50f5bff9c8b8ae309057695661e76c47d2df703d369df6a2b3b40bffcfc881edc21b9833ca0f303392a480b7562b53b39d3f8c79e8e47cd0595a690366
@@ -7,5 +7,5 @@ gem 'nokogiri', '~> 1.8'
7
7
  group :test do
8
8
  gem 'coveralls', require: false
9
9
  gem 'evil-proxy', '~> 0.2'
10
- gem 'rspec-rails', '~> 3.6'
10
+ gem 'rspec', '~> 3.6'
11
11
  end
data/gemfiles/oga.gemfile CHANGED
@@ -7,5 +7,5 @@ gem 'oga', '~> 2.0'
7
7
  group :test do
8
8
  gem 'coveralls', require: false
9
9
  gem 'evil-proxy', '~> 0.2'
10
- gem 'rspec-rails', '~> 3.6'
10
+ gem 'rspec', '~> 3.6'
11
11
  end
data/lib/proxy_fetcher.rb CHANGED
@@ -80,6 +80,7 @@ module ProxyFetcher
80
80
  #
81
81
  def logger
82
82
  return @logger if defined?(@logger)
83
+
83
84
  @logger = config.logger || NullLogger.new
84
85
  end
85
86
 
@@ -7,7 +7,25 @@ module ProxyFetcher
7
7
  # Loads proxy provider page content, extracts the proxy list from it
8
8
  # and converts every entry to a proxy object.
9
9
  def fetch_proxies!(filters = {})
10
- load_proxy_list(filters).map { |html_node| to_proxy(html_node) }
10
+ raw_proxies = load_proxy_list(filters)
11
+ proxies = raw_proxies.map { |html_node| build_proxy(html_node) }.compact
12
+ proxies.reject { |proxy| proxy.addr.nil? }
13
+ end
14
+
15
+ def provider_url
16
+ raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
17
+ end
18
+
19
+ def provider_method
20
+ :get
21
+ end
22
+
23
+ def provider_params
24
+ {}
25
+ end
26
+
27
+ def provider_headers
28
+ {}
11
29
  end
12
30
 
13
31
  # Just syntactic sugar to make it easier to call the #fetch_proxies! method.
@@ -17,7 +35,27 @@ module ProxyFetcher
17
35
 
18
36
  protected
19
37
 
20
- # Loads HTML document with Nokogiri by the URL combined with custom filters
38
+ # Loads raw provider HTML with proxies.
39
+ #
40
+ # @return [String]
41
+ # HTML body
42
+ #
43
+ def load_html(url, filters = {})
44
+ raise ArgumentError, 'filters must be a Hash' if filters && !filters.is_a?(Hash)
45
+
46
+ uri = URI.parse(url)
47
+ # TODO: query for post request?
48
+ uri.query = URI.encode_www_form(provider_params.merge(filters)) if filters && filters.any?
49
+
50
+ ProxyFetcher.config.http_client.fetch(
51
+ uri.to_s,
52
+ method: provider_method,
53
+ headers: provider_headers,
54
+ params: provider_params
55
+ )
56
+ end
57
+
58
+ # Loads provider HTML and parses it with internal document object.
21
59
  #
22
60
  # @param url [String]
23
61
  # URL to fetch
@@ -29,15 +67,17 @@ module ProxyFetcher
29
67
  # ProxyFetcher document object
30
68
  #
31
69
  def load_document(url, filters = {})
32
- raise ArgumentError, 'filters must be a Hash' if filters && !filters.is_a?(Hash)
33
-
34
- uri = URI.parse(url)
35
- uri.query = URI.encode_www_form(filters) if filters && filters.any?
36
-
37
- html = ProxyFetcher.config.http_client.fetch(uri.to_s)
70
+ html = load_html(url, filters)
38
71
  ProxyFetcher::Document.parse(html)
39
72
  end
40
73
 
74
+ def build_proxy(*args)
75
+ to_proxy(*args)
76
+ rescue StandardError => error
77
+ ProxyFetcher.logger.warn("Failed to build Proxy object due to error: #{error.message}")
78
+ nil
79
+ end
80
+
41
81
  # Fetches HTML content by sending HTTP request to the provider URL and
42
82
  # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
43
83
  # to return all the proxy entries (HTML nodes).
@@ -5,11 +5,13 @@ module ProxyFetcher
5
5
  # FreeProxyList provider class.
6
6
  class FreeProxyList < Base
7
7
  # Provider URL to fetch proxy list
8
- PROVIDER_URL = 'https://free-proxy-list.net/'.freeze
8
+ def provider_url
9
+ 'https://free-proxy-list.net/'
10
+ end
9
11
 
10
12
  # [NOTE] Doesn't support filtering
11
- def load_proxy_list(*)
12
- doc = load_document(PROVIDER_URL, {})
13
+ def load_proxy_list(_filters = {})
14
+ doc = load_document(provider_url, {})
13
15
  doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
14
16
  end
15
17
 
@@ -5,7 +5,9 @@ module ProxyFetcher
5
5
  # FreeProxyListSSL provider class.
6
6
  class FreeProxyListSSL < Base
7
7
  # Provider URL to fetch proxy list
8
- PROVIDER_URL = 'https://www.sslproxies.org/'.freeze
8
+ def provider_url
9
+ 'https://www.sslproxies.org/'
10
+ end
9
11
 
10
12
  # Fetches HTML content by sending HTTP request to the provider URL and
11
13
  # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
@@ -15,8 +17,8 @@ module ProxyFetcher
15
17
  # Collection of extracted HTML nodes with full proxy info
16
18
  #
17
19
  # [NOTE] Doesn't support filtering
18
- def load_proxy_list(*)
19
- doc = load_document(PROVIDER_URL, {})
20
+ def load_proxy_list(_filters = {})
21
+ doc = load_document(provider_url, {})
20
22
  doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
21
23
  end
22
24
 
@@ -7,7 +7,9 @@ module ProxyFetcher
7
7
  # GatherProxy provider class.
8
8
  class GatherProxy < Base
9
9
  # Provider URL to fetch proxy list
10
- PROVIDER_URL = 'http://www.gatherproxy.com/'.freeze
10
+ def provider_url
11
+ 'http://www.gatherproxy.com/'
12
+ end
11
13
 
12
14
  # Fetches HTML content by sending HTTP request to the provider URL and
13
15
  # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
@@ -17,7 +19,7 @@ module ProxyFetcher
17
19
  # Collection of extracted HTML nodes with full proxy info
18
20
  #
19
21
  def load_proxy_list(*)
20
- doc = load_document(PROVIDER_URL)
22
+ doc = load_document(provider_url)
21
23
  doc.xpath('//div[@class="proxy-list"]/table/script')
22
24
  end
23
25
 
@@ -5,7 +5,9 @@ module ProxyFetcher
5
5
  # HTTPTunnel provider class.
6
6
  class HTTPTunnel < Base
7
7
  # Provider URL to fetch proxy list
8
- PROVIDER_URL = 'http://www.httptunnel.ge/ProxyListForFree.aspx'.freeze
8
+ def provider_url
9
+ 'http://www.httptunnel.ge/ProxyListForFree.aspx'
10
+ end
9
11
 
10
12
  # Fetches HTML content by sending HTTP request to the provider URL and
11
13
  # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
@@ -14,8 +16,8 @@ module ProxyFetcher
14
16
  # @return [Array<ProxyFetcher::Document::Node>]
15
17
  # Collection of extracted HTML nodes with full proxy info
16
18
  #
17
- def load_proxy_list(*)
18
- doc = load_document(PROVIDER_URL)
19
+ def load_proxy_list(_filters = {})
20
+ doc = load_document(provider_url)
19
21
  doc.xpath('//table[contains(@id, "GridView")]/tr[(count(td)>2)]')
20
22
  end
21
23
 
@@ -1,11 +1,39 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'json'
4
+
3
5
  module ProxyFetcher
4
6
  module Providers
5
7
  # ProxyDocker provider class.
6
8
  class ProxyDocker < Base
7
9
  # Provider URL to fetch proxy list
8
- PROVIDER_URL = 'https://www.proxydocker.com/en/proxylist/'.freeze
10
+ def provider_url
11
+ 'https://www.proxydocker.com/en/api/proxylist/'
12
+ end
13
+
14
+ def provider_method
15
+ :post
16
+ end
17
+
18
+ def provider_params
19
+ {
20
+ token: 'GmZyl0OJmmgrWakdzO7AFf6AWfkdledR6xmKvGmwmJg',
21
+ country: 'all',
22
+ city: 'all',
23
+ state: 'all',
24
+ port: 'all',
25
+ type: 'all',
26
+ anonymity: 'all',
27
+ need: 'all',
28
+ page: '1'
29
+ }
30
+ end
31
+
32
+ def provider_headers
33
+ {
34
+ cookie: 'PHPSESSID=7f59558ee58b1e4352c4ab4c2f1a3c11'
35
+ }
36
+ end
9
37
 
10
38
  # Fetches HTML content by sending HTTP request to the provider URL and
11
39
  # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
@@ -16,30 +44,42 @@ module ProxyFetcher
16
44
  #
17
45
  # [NOTE] Doesn't support direct filters
18
46
  def load_proxy_list(*)
19
- doc = load_document(PROVIDER_URL, {})
20
- doc.xpath('//table[contains(@class, "table")]/tbody/tr[(count(td)>2)]')
47
+ json = JSON.parse(load_html(provider_url, {}))
48
+ json.fetch('proxies', [])
49
+ rescue JSON::ParserError
50
+ []
21
51
  end
22
52
 
23
- # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
53
+ # Converts JSON node to <code>ProxyFetcher::Proxy</code>
24
54
  # object.
25
55
  #
26
- # @param html_node [Object]
27
- # HTML node from the <code>ProxyFetcher::Document</code> DOM model.
56
+ # @param node [Hash]
57
+ # JSON entry from the API response
28
58
  #
29
59
  # @return [ProxyFetcher::Proxy]
30
60
  # Proxy object
31
61
  #
32
- def to_proxy(html_node)
62
+ def to_proxy(node)
33
63
  ProxyFetcher::Proxy.new.tap do |proxy|
34
- uri = URI("//#{html_node.content_at('td[1]')}")
35
- proxy.addr = uri.host
36
- proxy.port = uri.port
64
+ proxy.addr = node['ip']
65
+ proxy.port = node['port']
37
66
 
38
- proxy.type = html_node.content_at('td[2]')
39
- proxy.anonymity = html_node.content_at('td[3]')
40
- proxy.country = html_node.content_at('td[5]')
67
+ proxy.type = types_mapping.fetch(node['type'], ProxyFetcher::Proxy::HTTP)
68
+ proxy.anonymity = "Lvl#{node['anonymity']}"
69
+ proxy.country = node['country']
41
70
  end
42
71
  end
72
+
73
+ def types_mapping
74
+ {
75
+ '16' => ProxyFetcher::Proxy::HTTP,
76
+ '26' => ProxyFetcher::Proxy::HTTPS,
77
+ '3' => ProxyFetcher::Proxy::SOCKS4,
78
+ '4' => ProxyFetcher::Proxy::SOCKS5,
79
+ '56' => ProxyFetcher::Proxy::HTTP, # CON25
80
+ '6' => ProxyFetcher::Proxy::HTTP # CON80
81
+ }
82
+ end
43
83
  end
44
84
 
45
85
  ProxyFetcher::Configuration.register_provider(:proxy_docker, ProxyDocker)
@@ -7,7 +7,9 @@ module ProxyFetcher
7
7
  # ProxyList provider class.
8
8
  class ProxyList < Base
9
9
  # Provider URL to fetch proxy list
10
- PROVIDER_URL = 'https://proxy-list.org/english/index.php'.freeze
10
+ def provider_url
11
+ 'https://proxy-list.org/english/index.php'
12
+ end
11
13
 
12
14
  # Fetches HTML content by sending HTTP request to the provider URL and
13
15
  # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
@@ -17,7 +19,7 @@ module ProxyFetcher
17
19
  # Collection of extracted HTML nodes with full proxy info
18
20
  #
19
21
  def load_proxy_list(filters = {})
20
- doc = load_document(PROVIDER_URL, filters)
22
+ doc = load_document(provider_url, filters)
21
23
  doc.css('.table-wrap .table ul')
22
24
  end
23
25
 
@@ -5,7 +5,9 @@ module ProxyFetcher
5
5
  # XRoxy provider class.
6
6
  class XRoxy < Base
7
7
  # Provider URL to fetch proxy list
8
- PROVIDER_URL = 'https://www.xroxy.com/free-proxy-lists/'.freeze
8
+ def provider_url
9
+ 'https://www.xroxy.com/free-proxy-lists/'
10
+ end
9
11
 
10
12
  # Fetches HTML content by sending HTTP request to the provider URL and
11
13
  # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
@@ -15,7 +17,7 @@ module ProxyFetcher
15
17
  # Collection of extracted HTML nodes with full proxy info
16
18
  #
17
19
  def load_proxy_list(filters = { type: 'All_http' })
18
- doc = load_document(PROVIDER_URL, filters)
20
+ doc = load_document(provider_url, filters)
19
21
  doc.xpath('//div/table/tbody/tr')
20
22
  end
21
23
 
@@ -9,6 +9,18 @@ module ProxyFetcher
9
9
  # @return [String] URL
10
10
  attr_reader :url
11
11
 
12
+ # @!attribute [r] HTTP method
13
+ # @return [String] HTTP method verb
14
+ attr_reader :method
15
+
16
+ # @!attribute [r] HTTP params
17
+ # @return [Hash] params
18
+ attr_reader :params
19
+
20
+ # @!attribute [r] HTTP headers
21
+ # @return [Hash] headers
22
+ attr_reader :headers
23
+
12
24
  # @!attribute [r] http
13
25
  # @return [Net::HTTP] HTTP client
14
26
  attr_reader :http
@@ -29,17 +41,21 @@ module ProxyFetcher
29
41
  # @return [String]
30
42
  # resource content
31
43
  #
32
- def self.fetch(url)
33
- new(url).fetch
44
+ def self.fetch(*args)
45
+ new(*args).fetch
34
46
  end
35
47
 
36
48
  # Initialize HTTP client instance
37
49
  #
38
50
  # @return [HTTPClient]
39
51
  #
40
- def initialize(url)
52
+ def initialize(url, method: :get, params: {}, headers: {})
41
53
  @url = url.to_s
42
- @http = HTTP.headers(default_headers)
54
+ @method = method
55
+ @params = params
56
+ @headers = headers
57
+
58
+ @http = HTTP.headers(default_headers.merge(headers)).timeout(connect: timeout, read: timeout)
43
59
  @timeout = ProxyFetcher.config.provider_proxies_load_timeout
44
60
 
45
61
  @ssl_ctx = OpenSSL::SSL::SSLContext.new
@@ -52,11 +68,16 @@ module ProxyFetcher
52
68
  # response body
53
69
  #
54
70
  def fetch
55
- @http.timeout(connect: timeout, read: timeout)
56
- .get(url, ssl_context: ssl_ctx)
57
- .body.to_s
58
- rescue StandardError
59
- ProxyFetcher.logger.warn("Failed to load proxy list for #{url}")
71
+ # TODO: must be more generic
72
+ response = if method == :post
73
+ http.post(url, form: params, ssl_context: ssl_ctx)
74
+ else
75
+ http.get(url, ssl_context: ssl_ctx)
76
+ end
77
+
78
+ response.body.to_s
79
+ rescue StandardError => error
80
+ ProxyFetcher.logger.warn("Failed to load proxy list for #{url} (#{error.message})")
60
81
  ''
61
82
  end
62
83
 
@@ -13,7 +13,7 @@ module ProxyFetcher
13
13
  # Major version number
14
14
  MAJOR = 0
15
15
  # Minor version number
16
- MINOR = 9
16
+ MINOR = 10
17
17
  # Smallest version number
18
18
  TINY = 0
19
19
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: proxy_fetcher
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nikita Bulai
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-01-22 00:00:00.000000000 Z
11
+ date: 2019-03-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: http