proxy_fetcher 0.9.0 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d5fee8c7b429b32d608388ddf15885d2354721d6de693396ad228248925cf2d6
4
- data.tar.gz: '0250291040c511a723cdae4a22a78d4f765a0ca7b10325f331a8d119f8d25157'
3
+ metadata.gz: '01894f07ae28eafbb09934aa7a4e52188fb0bf02db7ab458df5c91a93b3e32af'
4
+ data.tar.gz: b487f3aeb6b833ab6b6395d0d4c9311294f7894e68f6a51c2e9943ccf2904d09
5
5
  SHA512:
6
- metadata.gz: b7d12b75f0e80075d31832f8add9d1269ae48ff32714a0c0b481a85eb275094ccec62020911d55f74a2920211fc445f11ee3c31f43fdc7cc1f5ae2d175278eec
7
- data.tar.gz: 138c464c0531fc12773b995d3101f5e952e2a4868e7e13d18274b27f7b0ea310a515d76e2066f36b25c2586c2046b9497f15a74e943cfbc8fdc8f0204796f93e
6
+ metadata.gz: aef23de20b41467dc2e1cadf65e6e728555f62e3cba0006df55ec62c8b877f8ca924dff9a81441897dc8e4f3efbf346ecefad62cf0b1e5a3bbcbd78272e1ea34
7
+ data.tar.gz: de41ae50f5bff9c8b8ae309057695661e76c47d2df703d369df6a2b3b40bffcfc881edc21b9833ca0f303392a480b7562b53b39d3f8c79e8e47cd0595a690366
@@ -7,5 +7,5 @@ gem 'nokogiri', '~> 1.8'
7
7
  group :test do
8
8
  gem 'coveralls', require: false
9
9
  gem 'evil-proxy', '~> 0.2'
10
- gem 'rspec-rails', '~> 3.6'
10
+ gem 'rspec', '~> 3.6'
11
11
  end
data/gemfiles/oga.gemfile CHANGED
@@ -7,5 +7,5 @@ gem 'oga', '~> 2.0'
7
7
  group :test do
8
8
  gem 'coveralls', require: false
9
9
  gem 'evil-proxy', '~> 0.2'
10
- gem 'rspec-rails', '~> 3.6'
10
+ gem 'rspec', '~> 3.6'
11
11
  end
data/lib/proxy_fetcher.rb CHANGED
@@ -80,6 +80,7 @@ module ProxyFetcher
80
80
  #
81
81
  def logger
82
82
  return @logger if defined?(@logger)
83
+
83
84
  @logger = config.logger || NullLogger.new
84
85
  end
85
86
 
@@ -7,7 +7,25 @@ module ProxyFetcher
7
7
  # Loads proxy provider page content, extract proxy list from it
8
8
  # and convert every entry to proxy object.
9
9
  def fetch_proxies!(filters = {})
10
- load_proxy_list(filters).map { |html_node| to_proxy(html_node) }
10
+ raw_proxies = load_proxy_list(filters)
11
+ proxies = raw_proxies.map { |html_node| build_proxy(html_node) }.compact
12
+ proxies.reject { |proxy| proxy.addr.nil? }
13
+ end
14
+
15
+ def provider_url
16
+ raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
17
+ end
18
+
19
+ def provider_method
20
+ :get
21
+ end
22
+
23
+ def provider_params
24
+ {}
25
+ end
26
+
27
+ def provider_headers
28
+ {}
11
29
  end
12
30
 
13
31
  # Just synthetic sugar to make it easier to call #fetch_proxies! method.
@@ -17,7 +35,27 @@ module ProxyFetcher
17
35
 
18
36
  protected
19
37
 
20
- # Loads HTML document with Nokogiri by the URL combined with custom filters
38
+ # Loads raw provider HTML with proxies.
39
+ #
40
+ # @return [String]
41
+ # HTML body
42
+ #
43
+ def load_html(url, filters = {})
44
+ raise ArgumentError, 'filters must be a Hash' if filters && !filters.is_a?(Hash)
45
+
46
+ uri = URI.parse(url)
47
+ # TODO: query for post request?
48
+ uri.query = URI.encode_www_form(provider_params.merge(filters)) if filters && filters.any?
49
+
50
+ ProxyFetcher.config.http_client.fetch(
51
+ uri.to_s,
52
+ method: provider_method,
53
+ headers: provider_headers,
54
+ params: provider_params
55
+ )
56
+ end
57
+
58
+ # Loads provider HTML and parses it with internal document object.
21
59
  #
22
60
  # @param url [String]
23
61
  # URL to fetch
@@ -29,15 +67,17 @@ module ProxyFetcher
29
67
  # ProxyFetcher document object
30
68
  #
31
69
  def load_document(url, filters = {})
32
- raise ArgumentError, 'filters must be a Hash' if filters && !filters.is_a?(Hash)
33
-
34
- uri = URI.parse(url)
35
- uri.query = URI.encode_www_form(filters) if filters && filters.any?
36
-
37
- html = ProxyFetcher.config.http_client.fetch(uri.to_s)
70
+ html = load_html(url, filters)
38
71
  ProxyFetcher::Document.parse(html)
39
72
  end
40
73
 
74
+ def build_proxy(*args)
75
+ to_proxy(*args)
76
+ rescue StandardError => error
77
+ ProxyFetcher.logger.warn("Failed to build Proxy object due to error: #{error.message}")
78
+ nil
79
+ end
80
+
41
81
  # Fetches HTML content by sending HTTP request to the provider URL and
42
82
  # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
43
83
  # to return all the proxy entries (HTML nodes).
@@ -5,11 +5,13 @@ module ProxyFetcher
5
5
  # FreeProxyList provider class.
6
6
  class FreeProxyList < Base
7
7
  # Provider URL to fetch proxy list
8
- PROVIDER_URL = 'https://free-proxy-list.net/'.freeze
8
+ def provider_url
9
+ 'https://free-proxy-list.net/'
10
+ end
9
11
 
10
12
  # [NOTE] Doesn't support filtering
11
- def load_proxy_list(*)
12
- doc = load_document(PROVIDER_URL, {})
13
+ def load_proxy_list(_filters = {})
14
+ doc = load_document(provider_url, {})
13
15
  doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
14
16
  end
15
17
 
@@ -5,7 +5,9 @@ module ProxyFetcher
5
5
  # FreeProxyListSSL provider class.
6
6
  class FreeProxyListSSL < Base
7
7
  # Provider URL to fetch proxy list
8
- PROVIDER_URL = 'https://www.sslproxies.org/'.freeze
8
+ def provider_url
9
+ 'https://www.sslproxies.org/'
10
+ end
9
11
 
10
12
  # Fetches HTML content by sending HTTP request to the provider URL and
11
13
  # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
@@ -15,8 +17,8 @@ module ProxyFetcher
15
17
  # Collection of extracted HTML nodes with full proxy info
16
18
  #
17
19
  # [NOTE] Doesn't support filtering
18
- def load_proxy_list(*)
19
- doc = load_document(PROVIDER_URL, {})
20
+ def load_proxy_list(_filters = {})
21
+ doc = load_document(provider_url, {})
20
22
  doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
21
23
  end
22
24
 
@@ -7,7 +7,9 @@ module ProxyFetcher
7
7
  # GatherProxy provider class.
8
8
  class GatherProxy < Base
9
9
  # Provider URL to fetch proxy list
10
- PROVIDER_URL = 'http://www.gatherproxy.com/'.freeze
10
+ def provider_url
11
+ 'http://www.gatherproxy.com/'
12
+ end
11
13
 
12
14
  # Fetches HTML content by sending HTTP request to the provider URL and
13
15
  # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
@@ -17,7 +19,7 @@ module ProxyFetcher
17
19
  # Collection of extracted HTML nodes with full proxy info
18
20
  #
19
21
  def load_proxy_list(*)
20
- doc = load_document(PROVIDER_URL)
22
+ doc = load_document(provider_url)
21
23
  doc.xpath('//div[@class="proxy-list"]/table/script')
22
24
  end
23
25
 
@@ -5,7 +5,9 @@ module ProxyFetcher
5
5
  # HTTPTunnel provider class.
6
6
  class HTTPTunnel < Base
7
7
  # Provider URL to fetch proxy list
8
- PROVIDER_URL = 'http://www.httptunnel.ge/ProxyListForFree.aspx'.freeze
8
+ def provider_url
9
+ 'http://www.httptunnel.ge/ProxyListForFree.aspx'
10
+ end
9
11
 
10
12
  # Fetches HTML content by sending HTTP request to the provider URL and
11
13
  # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
@@ -14,8 +16,8 @@ module ProxyFetcher
14
16
  # @return [Array<ProxyFetcher::Document::Node>]
15
17
  # Collection of extracted HTML nodes with full proxy info
16
18
  #
17
- def load_proxy_list(*)
18
- doc = load_document(PROVIDER_URL)
19
+ def load_proxy_list(_filters = {})
20
+ doc = load_document(provider_url)
19
21
  doc.xpath('//table[contains(@id, "GridView")]/tr[(count(td)>2)]')
20
22
  end
21
23
 
@@ -1,11 +1,39 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'json'
4
+
3
5
  module ProxyFetcher
4
6
  module Providers
5
7
  # ProxyDocker provider class.
6
8
  class ProxyDocker < Base
7
9
  # Provider URL to fetch proxy list
8
- PROVIDER_URL = 'https://www.proxydocker.com/en/proxylist/'.freeze
10
+ def provider_url
11
+ 'https://www.proxydocker.com/en/api/proxylist/'
12
+ end
13
+
14
+ def provider_method
15
+ :post
16
+ end
17
+
18
+ def provider_params
19
+ {
20
+ token: 'GmZyl0OJmmgrWakdzO7AFf6AWfkdledR6xmKvGmwmJg',
21
+ country: 'all',
22
+ city: 'all',
23
+ state: 'all',
24
+ port: 'all',
25
+ type: 'all',
26
+ anonymity: 'all',
27
+ need: 'all',
28
+ page: '1'
29
+ }
30
+ end
31
+
32
+ def provider_headers
33
+ {
34
+ cookie: 'PHPSESSID=7f59558ee58b1e4352c4ab4c2f1a3c11'
35
+ }
36
+ end
9
37
 
10
38
  # Fetches HTML content by sending HTTP request to the provider URL and
11
39
  # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
@@ -16,30 +44,42 @@ module ProxyFetcher
16
44
  #
17
45
  # [NOTE] Doesn't support direct filters
18
46
  def load_proxy_list(*)
19
- doc = load_document(PROVIDER_URL, {})
20
- doc.xpath('//table[contains(@class, "table")]/tbody/tr[(count(td)>2)]')
47
+ json = JSON.parse(load_html(provider_url, {}))
48
+ json.fetch('proxies', [])
49
+ rescue JSON::ParserError
50
+ []
21
51
  end
22
52
 
23
- # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
53
+ # Converts JSON node to <code>ProxyFetcher::Proxy</code>
24
54
  # object.
25
55
  #
26
- # @param html_node [Object]
27
- # HTML node from the <code>ProxyFetcher::Document</code> DOM model.
56
+ # @param node [Hash]
57
+ # JSON entry from the API response
28
58
  #
29
59
  # @return [ProxyFetcher::Proxy]
30
60
  # Proxy object
31
61
  #
32
- def to_proxy(html_node)
62
+ def to_proxy(node)
33
63
  ProxyFetcher::Proxy.new.tap do |proxy|
34
- uri = URI("//#{html_node.content_at('td[1]')}")
35
- proxy.addr = uri.host
36
- proxy.port = uri.port
64
+ proxy.addr = node['ip']
65
+ proxy.port = node['port']
37
66
 
38
- proxy.type = html_node.content_at('td[2]')
39
- proxy.anonymity = html_node.content_at('td[3]')
40
- proxy.country = html_node.content_at('td[5]')
67
+ proxy.type = types_mapping.fetch(node['type'], ProxyFetcher::Proxy::HTTP)
68
+ proxy.anonymity = "Lvl#{node['anonymity']}"
69
+ proxy.country = node['country']
41
70
  end
42
71
  end
72
+
73
+ def types_mapping
74
+ {
75
+ '16' => ProxyFetcher::Proxy::HTTP,
76
+ '26' => ProxyFetcher::Proxy::HTTPS,
77
+ '3' => ProxyFetcher::Proxy::SOCKS4,
78
+ '4' => ProxyFetcher::Proxy::SOCKS5,
79
+ '56' => ProxyFetcher::Proxy::HTTP, # CON25
80
+ '6' => ProxyFetcher::Proxy::HTTP # CON80
81
+ }
82
+ end
43
83
  end
44
84
 
45
85
  ProxyFetcher::Configuration.register_provider(:proxy_docker, ProxyDocker)
@@ -7,7 +7,9 @@ module ProxyFetcher
7
7
  # ProxyList provider class.
8
8
  class ProxyList < Base
9
9
  # Provider URL to fetch proxy list
10
- PROVIDER_URL = 'https://proxy-list.org/english/index.php'.freeze
10
+ def provider_url
11
+ 'https://proxy-list.org/english/index.php'
12
+ end
11
13
 
12
14
  # Fetches HTML content by sending HTTP request to the provider URL and
13
15
  # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
@@ -17,7 +19,7 @@ module ProxyFetcher
17
19
  # Collection of extracted HTML nodes with full proxy info
18
20
  #
19
21
  def load_proxy_list(filters = {})
20
- doc = load_document(PROVIDER_URL, filters)
22
+ doc = load_document(provider_url, filters)
21
23
  doc.css('.table-wrap .table ul')
22
24
  end
23
25
 
@@ -5,7 +5,9 @@ module ProxyFetcher
5
5
  # XRoxy provider class.
6
6
  class XRoxy < Base
7
7
  # Provider URL to fetch proxy list
8
- PROVIDER_URL = 'https://www.xroxy.com/free-proxy-lists/'.freeze
8
+ def provider_url
9
+ 'https://www.xroxy.com/free-proxy-lists/'
10
+ end
9
11
 
10
12
  # Fetches HTML content by sending HTTP request to the provider URL and
11
13
  # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
@@ -15,7 +17,7 @@ module ProxyFetcher
15
17
  # Collection of extracted HTML nodes with full proxy info
16
18
  #
17
19
  def load_proxy_list(filters = { type: 'All_http' })
18
- doc = load_document(PROVIDER_URL, filters)
20
+ doc = load_document(provider_url, filters)
19
21
  doc.xpath('//div/table/tbody/tr')
20
22
  end
21
23
 
@@ -9,6 +9,18 @@ module ProxyFetcher
9
9
  # @return [String] URL
10
10
  attr_reader :url
11
11
 
12
+ # @!attribute [r] HTTP method
13
+ # @return [String] HTTP method verb
14
+ attr_reader :method
15
+
16
+ # @!attribute [r] HTTP params
17
+ # @return [Hash] params
18
+ attr_reader :params
19
+
20
+ # @!attribute [r] HTTP headers
21
+ # @return [Hash] headers
22
+ attr_reader :headers
23
+
12
24
  # @!attribute [r] http
13
25
  # @return [Net::HTTP] HTTP client
14
26
  attr_reader :http
@@ -29,17 +41,21 @@ module ProxyFetcher
29
41
  # @return [String]
30
42
  # resource content
31
43
  #
32
- def self.fetch(url)
33
- new(url).fetch
44
+ def self.fetch(*args)
45
+ new(*args).fetch
34
46
  end
35
47
 
36
48
  # Initialize HTTP client instance
37
49
  #
38
50
  # @return [HTTPClient]
39
51
  #
40
- def initialize(url)
52
+ def initialize(url, method: :get, params: {}, headers: {})
41
53
  @url = url.to_s
42
- @http = HTTP.headers(default_headers)
54
+ @method = method
55
+ @params = params
56
+ @headers = headers
57
+
58
+ @http = HTTP.headers(default_headers.merge(headers)).timeout(connect: timeout, read: timeout)
43
59
  @timeout = ProxyFetcher.config.provider_proxies_load_timeout
44
60
 
45
61
  @ssl_ctx = OpenSSL::SSL::SSLContext.new
@@ -52,11 +68,16 @@ module ProxyFetcher
52
68
  # response body
53
69
  #
54
70
  def fetch
55
- @http.timeout(connect: timeout, read: timeout)
56
- .get(url, ssl_context: ssl_ctx)
57
- .body.to_s
58
- rescue StandardError
59
- ProxyFetcher.logger.warn("Failed to load proxy list for #{url}")
71
+ # TODO: must be more generic
72
+ response = if method == :post
73
+ http.post(url, form: params, ssl_context: ssl_ctx)
74
+ else
75
+ http.get(url, ssl_context: ssl_ctx)
76
+ end
77
+
78
+ response.body.to_s
79
+ rescue StandardError => error
80
+ ProxyFetcher.logger.warn("Failed to load proxy list for #{url} (#{error.message})")
60
81
  ''
61
82
  end
62
83
 
@@ -13,7 +13,7 @@ module ProxyFetcher
13
13
  # Major version number
14
14
  MAJOR = 0
15
15
  # Minor version number
16
- MINOR = 9
16
+ MINOR = 10
17
17
  # Smallest version number
18
18
  TINY = 0
19
19
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: proxy_fetcher
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nikita Bulai
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-01-22 00:00:00.000000000 Z
11
+ date: 2019-03-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: http