proxy_fetcher 0.10.2 → 0.15.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +22 -1
  3. data/Gemfile +8 -5
  4. data/Rakefile +7 -3
  5. data/gemfiles/nokogiri.gemfile +8 -6
  6. data/gemfiles/oga.gemfile +8 -6
  7. data/lib/proxy_fetcher.rb +46 -35
  8. data/lib/proxy_fetcher/client/client.rb +10 -3
  9. data/lib/proxy_fetcher/client/request.rb +4 -4
  10. data/lib/proxy_fetcher/configuration.rb +24 -19
  11. data/lib/proxy_fetcher/document.rb +0 -9
  12. data/lib/proxy_fetcher/document/adapters.rb +1 -1
  13. data/lib/proxy_fetcher/document/adapters/abstract_adapter.rb +3 -12
  14. data/lib/proxy_fetcher/document/adapters/nokogiri_adapter.rb +1 -1
  15. data/lib/proxy_fetcher/document/adapters/oga_adapter.rb +1 -1
  16. data/lib/proxy_fetcher/document/node.rb +2 -2
  17. data/lib/proxy_fetcher/exceptions.rb +6 -6
  18. data/lib/proxy_fetcher/manager.rb +42 -9
  19. data/lib/proxy_fetcher/providers/base.rb +43 -22
  20. data/lib/proxy_fetcher/providers/free_proxy_list.rb +9 -10
  21. data/lib/proxy_fetcher/providers/free_proxy_list_socks.rb +58 -0
  22. data/lib/proxy_fetcher/providers/free_proxy_list_ssl.rb +7 -15
  23. data/lib/proxy_fetcher/providers/free_proxy_list_us.rb +54 -0
  24. data/lib/proxy_fetcher/providers/http_tunnel.rb +11 -19
  25. data/lib/proxy_fetcher/providers/mtpro.rb +43 -0
  26. data/lib/proxy_fetcher/providers/proxy_list.rb +8 -16
  27. data/lib/proxy_fetcher/providers/proxypedia.rb +48 -0
  28. data/lib/proxy_fetcher/providers/proxyscrape_http.rb +65 -0
  29. data/lib/proxy_fetcher/providers/proxyscrape_socks4.rb +65 -0
  30. data/lib/proxy_fetcher/providers/proxyscrape_socks5.rb +65 -0
  31. data/lib/proxy_fetcher/providers/xroxy.rb +9 -17
  32. data/lib/proxy_fetcher/proxy.rb +16 -4
  33. data/lib/proxy_fetcher/utils/http_client.rb +7 -12
  34. data/lib/proxy_fetcher/utils/proxy_list_validator.rb +3 -1
  35. data/lib/proxy_fetcher/utils/proxy_validator.rb +21 -9
  36. data/lib/proxy_fetcher/version.rb +3 -3
  37. data/proxy_fetcher.gemspec +21 -16
  38. data/spec/fixtures/proxies.txt +14 -0
  39. data/spec/proxy_fetcher/client/client_spec.rb +72 -57
  40. data/spec/proxy_fetcher/configuration_spec.rb +11 -11
  41. data/spec/proxy_fetcher/document/adapters_spec.rb +8 -8
  42. data/spec/proxy_fetcher/document/node_spec.rb +4 -4
  43. data/spec/proxy_fetcher/manager_spec.rb +18 -0
  44. data/spec/proxy_fetcher/providers/base_spec.rb +9 -9
  45. data/spec/proxy_fetcher/providers/multiple_providers_spec.rb +4 -4
  46. data/spec/proxy_fetcher/providers/proxy_classes_spec.rb +28 -0
  47. data/spec/proxy_fetcher/proxy_spec.rb +14 -14
  48. data/spec/proxy_fetcher/version_spec.rb +2 -0
  49. data/spec/spec_helper.rb +10 -10
  50. data/spec/support/manager_examples.rb +21 -21
  51. metadata +27 -17
  52. data/lib/proxy_fetcher/providers/gather_proxy.rb +0 -58
  53. data/spec/proxy_fetcher/providers/free_proxy_list_spec.rb +0 -13
  54. data/spec/proxy_fetcher/providers/free_proxy_list_ssl_spec.rb +0 -11
  55. data/spec/proxy_fetcher/providers/gather_proxy_spec.rb +0 -11
  56. data/spec/proxy_fetcher/providers/http_tunnel_spec.rb +0 -11
  57. data/spec/proxy_fetcher/providers/proxy_list_spec.rb +0 -11
  58. data/spec/proxy_fetcher/providers/xroxy_spec.rb +0 -11
@@ -6,7 +6,7 @@ module ProxyFetcher
6
6
  class NokogiriAdapter < AbstractAdapter
7
7
  # Loads the Nokogiri gem into the application.
8
8
  def self.install_requirements!
9
- require 'nokogiri'
9
+ require "nokogiri"
10
10
  end
11
11
 
12
12
  # Parses raw HTML content with specific gem.
@@ -6,7 +6,7 @@ module ProxyFetcher
6
6
  class OgaAdapter < AbstractAdapter
7
7
  # Loads the Oga gem into the application.
8
8
  def self.install_requirements!
9
- require 'oga'
9
+ require "oga"
10
10
  end
11
11
 
12
12
  # Parses raw HTML content with specific gem.
@@ -81,9 +81,9 @@ module ProxyFetcher
81
81
  # clean text
82
82
  #
83
83
  def clear(text)
84
- return '' if text.nil? || text.empty?
84
+ return "" if text.nil? || text.empty?
85
85
 
86
- text.strip.gsub(/[ \t]/i, '')
86
+ text.strip.gsub(/[\t]/i, "")
87
87
  end
88
88
  end
89
89
  end
@@ -13,7 +13,7 @@ module ProxyFetcher
13
13
  # @return [WrongCustomClass]
14
14
  #
15
15
  def initialize(klass, methods)
16
- required_methods = Array(methods).join(', ')
16
+ required_methods = Array(methods).join(", ")
17
17
  super("#{klass} must respond to [#{required_methods}] class methods!")
18
18
  end
19
19
  end
@@ -53,7 +53,7 @@ module ProxyFetcher
53
53
  # @return [MaximumRedirectsReached]
54
54
  #
55
55
  def initialize(*)
56
- super('maximum redirects reached')
56
+ super("maximum redirects reached")
57
57
  end
58
58
  end
59
59
 
@@ -66,7 +66,7 @@ module ProxyFetcher
66
66
  # @return [MaximumRetriesReached]
67
67
  #
68
68
  def initialize(*)
69
- super('reached the maximum number of retries')
69
+ super("reached the maximum number of retries")
70
70
  end
71
71
  end
72
72
 
@@ -95,7 +95,7 @@ module ProxyFetcher
95
95
  super(<<-MSG.strip.squeeze
96
96
  you need to specify adapter for HTML parsing: ProxyFetcher.config.adapter = :nokogiri.
97
97
  You can use one of the predefined adapters (:nokogiri or :oga) or your own implementation.
98
- MSG
98
+ MSG
99
99
  )
100
100
  end
101
101
  end
@@ -111,7 +111,7 @@ module ProxyFetcher
111
111
  # @return [AdapterSetupError]
112
112
  #
113
113
  def initialize(adapter_name, error)
114
- adapter = demodulize(adapter_name.gsub('Adapter', ''))
114
+ adapter = demodulize(adapter_name.gsub("Adapter", ""))
115
115
 
116
116
  super("can't setup '#{adapter}' adapter during the following error:\n\t#{error}'")
117
117
  end
@@ -127,7 +127,7 @@ module ProxyFetcher
127
127
  #
128
128
  def demodulize(path)
129
129
  path = path.to_s
130
- index = path.rindex('::')
130
+ index = path.rindex("::")
131
131
 
132
132
  index ? path[(index + 2)..-1] : path
133
133
  end
@@ -3,6 +3,16 @@
3
3
  module ProxyFetcher
4
4
  # ProxyFetcher Manager class for interacting with proxy lists from various providers.
5
5
  class Manager
6
+ REFRESHER_LOCK = Mutex.new
7
+
8
+ class << self
9
+ def from_files(files, **options)
10
+ new(**options.merge(files: Array(files)))
11
+ end
12
+
13
+ alias from_file from_files
14
+ end
15
+
6
16
  # @!attribute [r] proxies
7
17
  # @return [Array<ProxyFetcher::Proxy>] An array of proxies
8
18
  attr_reader :proxies
@@ -14,14 +24,17 @@ module ProxyFetcher
14
24
  #
15
25
  # @return [Manager]
16
26
  #
17
- def initialize(refresh: true, validate: false, filters: {})
18
- if refresh
19
- refresh_list!(filters)
27
+ def initialize(**options)
28
+ if options.fetch(:refresh, true)
29
+ refresh_list!(options.fetch(:filters, {}))
20
30
  else
21
31
  @proxies = []
22
32
  end
23
33
 
24
- cleanup! if validate
34
+ files = Array(options.fetch(:file, options.fetch(:files, [])))
35
+ load_proxies_from_files!(files) if files&.any?
36
+
37
+ cleanup! if options.fetch(:validate, false)
25
38
  end
26
39
 
27
40
  # Update current proxy list using configured providers.
@@ -30,17 +43,17 @@ module ProxyFetcher
30
43
  #
31
44
  def refresh_list!(filters = nil)
32
45
  @proxies = []
33
-
34
46
  threads = []
35
- lock = Mutex.new
36
47
 
37
48
  ProxyFetcher.config.providers.each do |provider_name|
38
49
  threads << Thread.new do
50
+ Thread.current.report_on_exception = false
51
+
39
52
  provider = ProxyFetcher::Configuration.providers_registry.class_for(provider_name)
40
53
  provider_filters = filters && filters.fetch(provider_name.to_sym, filters)
41
54
  provider_proxies = provider.fetch_proxies!(provider_filters)
42
55
 
43
- lock.synchronize do
56
+ REFRESHER_LOCK.synchronize do
44
57
  @proxies.concat(provider_proxies)
45
58
  end
46
59
  end
@@ -55,7 +68,7 @@ module ProxyFetcher
55
68
 
56
69
  # Pop the first proxy (and move it back to the end of the proxy list).
57
70
  #
58
- # @return [Proxy]
71
+ # @return [ProxyFetcher::Proxy, NilClass]
59
72
  # proxy object from the list
60
73
  #
61
74
  def get
@@ -72,7 +85,7 @@ module ProxyFetcher
72
85
  # Pop the first valid proxy (and move it back to the end of the proxy list)
73
86
  # Invalid proxies will be removed from the list
74
87
  #
75
- # @return [Proxy]
88
+ # @return [ProxyFetcher::Proxy, NilClass]
76
89
  # proxy object from the list
77
90
  #
78
91
  def get!
@@ -89,6 +102,26 @@ module ProxyFetcher
89
102
 
90
103
  alias pop! get!
91
104
 
105
+ # Loads proxies from files.
106
+ #
107
+ # @param proxy_files [String, Array<String,Pathname>]
108
+ # file path or list of files to load
109
+ #
110
+ def load_proxies_from_files!(proxy_files)
111
+ proxy_files = Array(proxy_files)
112
+ return if proxy_files.empty?
113
+
114
+ proxy_files.each do |proxy_file|
115
+ File.foreach(proxy_file, chomp: true) do |proxy_string|
116
+ addr, port = proxy_string.split(":", 2)
117
+ port = Integer(port) if port
118
+ @proxies << Proxy.new(addr: addr, port: port)
119
+ end
120
+ end
121
+
122
+ @proxies.uniq!
123
+ end
124
+
92
125
  # Clean current proxy list from dead proxies (those that don't respond within the timeout)
93
126
  #
94
127
  # @return [Array<ProxyFetcher::Proxy>]
@@ -6,12 +6,15 @@ module ProxyFetcher
6
6
  class Base
7
7
  # Loads proxy provider page content, extract proxy list from it
8
8
  # and convert every entry to proxy object.
9
- def fetch_proxies!(filters = {})
9
+ def fetch_proxies(filters = {})
10
10
  raw_proxies = load_proxy_list(filters)
11
11
  proxies = raw_proxies.map { |html_node| build_proxy(html_node) }.compact
12
12
  proxies.reject { |proxy| proxy.addr.nil? }
13
13
  end
14
14
 
15
+ # For retro-compatibility
16
+ alias fetch_proxies! fetch_proxies
17
+
15
18
  def provider_url
16
19
  raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
17
20
  end
@@ -24,10 +27,17 @@ module ProxyFetcher
24
27
  {}
25
28
  end
26
29
 
30
+ # @return [Hash]
31
+ # Provider headers required to fetch the proxy list
32
+ #
27
33
  def provider_headers
28
34
  {}
29
35
  end
30
36
 
37
+ def xpath
38
+ raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
39
+ end
40
+
31
41
  # Just syntactic sugar to make it easier to call #fetch_proxies! method.
32
42
  def self.fetch_proxies!(*args)
33
43
  new.fetch_proxies!(*args)
@@ -37,18 +47,29 @@ module ProxyFetcher
37
47
 
38
48
  # Loads raw provider HTML with proxies.
39
49
  #
50
+ # @param url [String]
51
+ # Provider URL
52
+ #
53
+ # @param filters [#to_h]
54
+ # Provider filters (Hash-like object)
55
+ #
40
56
  # @return [String]
41
- # HTML body
57
+ # HTML body from the response
42
58
  #
43
59
  def load_html(url, filters = {})
44
- raise ArgumentError, 'filters must be a Hash' if filters && !filters.is_a?(Hash)
60
+ unless filters.respond_to?(:to_h)
61
+ raise ArgumentError, "filters must be a Hash or respond to #to_h"
62
+ end
45
63
 
46
- uri = URI.parse(url)
47
- # TODO: query for post request?
48
- uri.query = URI.encode_www_form(provider_params.merge(filters)) if filters && filters.any?
64
+ if filters&.any?
65
+ # TODO: query for post request?
66
+ uri = URI.parse(url)
67
+ uri.query = URI.encode_www_form(provider_params.merge(filters.to_h))
68
+ url = uri.to_s
69
+ end
49
70
 
50
71
  ProxyFetcher.config.http_client.fetch(
51
- uri.to_s,
72
+ url,
52
73
  method: provider_method,
53
74
  headers: provider_headers,
54
75
  params: provider_params
@@ -71,29 +92,29 @@ module ProxyFetcher
71
92
  ProxyFetcher::Document.parse(html)
72
93
  end
73
94
 
95
+ # Fetches HTML content by sending HTTP request to the provider URL and
96
+ # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
97
+ # to return all the proxy entries (HTML nodes).
98
+ #
99
+ # @return [Array<ProxyFetcher::Document::Node>]
100
+ # Collection of extracted HTML nodes with full proxy info
101
+ #
102
+ def load_proxy_list(filters = {})
103
+ doc = load_document(provider_url, filters)
104
+ doc.xpath(xpath)
105
+ end
106
+
74
107
  def build_proxy(*args)
75
108
  to_proxy(*args)
76
- rescue StandardError => error
109
+ rescue StandardError => e
77
110
  ProxyFetcher.logger.warn(
78
- "Failed to build Proxy object for #{self.class.name} due to error: #{error.message}"
111
+ "Failed to build Proxy for #{self.class.name.split("::").last} " \
112
+ "due to error: #{e.message}"
79
113
  )
80
114
 
81
115
  nil
82
116
  end
83
117
 
84
- # Fetches HTML content by sending HTTP request to the provider URL and
85
- # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
86
- # to return all the proxy entries (HTML nodes).
87
- #
88
- # Abstract method. Must be implemented in a descendant class
89
- #
90
- # @return [Array<Document::Node>]
91
- # list of proxy elements from the providers HTML content
92
- #
93
- def load_proxy_list(*)
94
- raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
95
- end
96
-
97
118
  # Convert HTML element with proxy info to ProxyFetcher::Proxy instance.
98
119
  #
99
120
  # Abstract method. Must be implemented in a descendant class
@@ -6,13 +6,12 @@ module ProxyFetcher
6
6
  class FreeProxyList < Base
7
7
  # Provider URL to fetch proxy list
8
8
  def provider_url
9
- 'https://free-proxy-list.net/'
9
+ "https://free-proxy-list.net/"
10
10
  end
11
11
 
12
12
  # [NOTE] Doesn't support filtering
13
- def load_proxy_list(_filters = {})
14
- doc = load_document(provider_url, {})
15
- doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
13
+ def xpath
14
+ '//table[@id="proxylisttable"]/tbody/tr'
16
15
  end
17
16
 
18
17
  # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
@@ -26,10 +25,10 @@ module ProxyFetcher
26
25
  #
27
26
  def to_proxy(html_node)
28
27
  ProxyFetcher::Proxy.new.tap do |proxy|
29
- proxy.addr = html_node.content_at('td[1]')
30
- proxy.port = Integer(html_node.content_at('td[2]').gsub(/^0+/, ''))
31
- proxy.country = html_node.content_at('td[4]')
32
- proxy.anonymity = html_node.content_at('td[5]')
28
+ proxy.addr = html_node.content_at("td[1]")
29
+ proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, ""))
30
+ proxy.country = html_node.content_at("td[4]")
31
+ proxy.anonymity = html_node.content_at("td[5]")
33
32
  proxy.type = parse_type(html_node)
34
33
  end
35
34
  end
@@ -45,8 +44,8 @@ module ProxyFetcher
45
44
  # Proxy type
46
45
  #
47
46
  def parse_type(html_node)
48
- https = html_node.content_at('td[6]')
49
- https && https.casecmp('yes').zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
47
+ https = html_node.content_at("td[6]")
48
+ https&.casecmp("yes")&.zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
50
49
  end
51
50
  end
52
51
 
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ProxyFetcher
4
+ module Providers
5
+ # FreeProxyListSocks provider class.
6
+ class FreeProxyListSocks < Base
7
+ # Provider URL to fetch proxy list
8
+ def provider_url
9
+ "https://www.socks-proxy.net/"
10
+ end
11
+
12
+ # [NOTE] Doesn't support filtering
13
+ def xpath
14
+ '//table[@id="proxylisttable"]/tbody/tr'
15
+ end
16
+
17
+ # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
18
+ # object.
19
+ #
20
+ # @param html_node [Object]
21
+ # HTML node from the <code>ProxyFetcher::Document</code> DOM model.
22
+ #
23
+ # @return [ProxyFetcher::Proxy]
24
+ # Proxy object
25
+ #
26
+ def to_proxy(html_node)
27
+ ProxyFetcher::Proxy.new.tap do |proxy|
28
+ proxy.addr = html_node.content_at("td[1]")
29
+ proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, ""))
30
+ proxy.country = html_node.content_at("td[4]")
31
+ proxy.type = parse_type(html_node)
32
+ proxy.anonymity = html_node.content_at("td[6]")
33
+ end
34
+ end
35
+
36
+ private
37
+
38
+ # Parses HTML node to extract proxy type.
39
+ #
40
+ # @param html_node [Object]
41
+ # HTML node from the <code>ProxyFetcher::Document</code> DOM model.
42
+ #
43
+ # @return [String]
44
+ # Proxy type
45
+ #
46
+ def parse_type(html_node)
47
+ https = html_node.content_at("td[5]")
48
+
49
+ return ProxyFetcher::Proxy::SOCKS4 if https&.casecmp("socks4")&.zero?
50
+ return ProxyFetcher::Proxy::SOCKS5 if https&.casecmp("socks5")&.zero?
51
+
52
+ "Unknown"
53
+ end
54
+ end
55
+
56
+ ProxyFetcher::Configuration.register_provider(:free_proxy_list_socks, FreeProxyListSocks)
57
+ end
58
+ end
@@ -6,20 +6,12 @@ module ProxyFetcher
6
6
  class FreeProxyListSSL < Base
7
7
  # Provider URL to fetch proxy list
8
8
  def provider_url
9
- 'https://www.sslproxies.org/'
9
+ "https://www.sslproxies.org/"
10
10
  end
11
11
 
12
- # Fetches HTML content by sending HTTP request to the provider URL and
13
- # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
14
- # to return all the proxy entries (HTML nodes).
15
- #
16
- # @return [Array<ProxyFetcher::Document::Node>]
17
- # Collection of extracted HTML nodes with full proxy info
18
- #
19
12
  # [NOTE] Doesn't support filtering
20
- def load_proxy_list(_filters = {})
21
- doc = load_document(provider_url, {})
22
- doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
13
+ def xpath
14
+ '//table[@id="proxylisttable"]/tbody/tr'
23
15
  end
24
16
 
25
17
  # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
@@ -33,10 +25,10 @@ module ProxyFetcher
33
25
  #
34
26
  def to_proxy(html_node)
35
27
  ProxyFetcher::Proxy.new.tap do |proxy|
36
- proxy.addr = html_node.content_at('td[1]')
37
- proxy.port = Integer(html_node.content_at('td[2]').gsub(/^0+/, ''))
38
- proxy.country = html_node.content_at('td[4]')
39
- proxy.anonymity = html_node.content_at('td[5]')
28
+ proxy.addr = html_node.content_at("td[1]")
29
+ proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, ""))
30
+ proxy.country = html_node.content_at("td[4]")
31
+ proxy.anonymity = html_node.content_at("td[5]")
40
32
  proxy.type = ProxyFetcher::Proxy::HTTPS
41
33
  end
42
34
  end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ProxyFetcher
4
+ module Providers
5
+ # FreeProxyListUS provider class.
6
+ class FreeProxyListUS < Base
7
+ # Provider URL to fetch proxy list
8
+ def provider_url
9
+ "https://www.us-proxy.org/"
10
+ end
11
+
12
+ # [NOTE] Doesn't support filtering
13
+ def xpath
14
+ '//table[@id="proxylisttable"]/tbody/tr'
15
+ end
16
+
17
+ # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
18
+ # object.
19
+ #
20
+ # @param html_node [Object]
21
+ # HTML node from the <code>ProxyFetcher::Document</code> DOM model.
22
+ #
23
+ # @return [ProxyFetcher::Proxy]
24
+ # Proxy object
25
+ #
26
+ def to_proxy(html_node)
27
+ ProxyFetcher::Proxy.new.tap do |proxy|
28
+ proxy.addr = html_node.content_at("td[1]")
29
+ proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, ""))
30
+ proxy.country = html_node.content_at("td[4]")
31
+ proxy.anonymity = html_node.content_at("td[5]")
32
+ proxy.type = parse_type(html_node)
33
+ end
34
+ end
35
+
36
+ private
37
+
38
+ # Parses HTML node to extract proxy type.
39
+ #
40
+ # @param html_node [Object]
41
+ # HTML node from the <code>ProxyFetcher::Document</code> DOM model.
42
+ #
43
+ # @return [String]
44
+ # Proxy type
45
+ #
46
+ def parse_type(html_node)
47
+ https = html_node.content_at("td[7]")
48
+ https&.casecmp("yes")&.zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
49
+ end
50
+ end
51
+
52
+ ProxyFetcher::Configuration.register_provider(:free_proxy_list_us, FreeProxyListUS)
53
+ end
54
+ end