proxy_fetcher 0.10.2 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +22 -1
  3. data/Gemfile +8 -5
  4. data/Rakefile +7 -3
  5. data/gemfiles/nokogiri.gemfile +8 -6
  6. data/gemfiles/oga.gemfile +8 -6
  7. data/lib/proxy_fetcher.rb +46 -35
  8. data/lib/proxy_fetcher/client/client.rb +10 -3
  9. data/lib/proxy_fetcher/client/request.rb +4 -4
  10. data/lib/proxy_fetcher/configuration.rb +24 -19
  11. data/lib/proxy_fetcher/document.rb +0 -9
  12. data/lib/proxy_fetcher/document/adapters.rb +1 -1
  13. data/lib/proxy_fetcher/document/adapters/abstract_adapter.rb +3 -12
  14. data/lib/proxy_fetcher/document/adapters/nokogiri_adapter.rb +1 -1
  15. data/lib/proxy_fetcher/document/adapters/oga_adapter.rb +1 -1
  16. data/lib/proxy_fetcher/document/node.rb +2 -2
  17. data/lib/proxy_fetcher/exceptions.rb +6 -6
  18. data/lib/proxy_fetcher/manager.rb +42 -9
  19. data/lib/proxy_fetcher/providers/base.rb +43 -22
  20. data/lib/proxy_fetcher/providers/free_proxy_list.rb +9 -10
  21. data/lib/proxy_fetcher/providers/free_proxy_list_socks.rb +58 -0
  22. data/lib/proxy_fetcher/providers/free_proxy_list_ssl.rb +7 -15
  23. data/lib/proxy_fetcher/providers/free_proxy_list_us.rb +54 -0
  24. data/lib/proxy_fetcher/providers/http_tunnel.rb +11 -19
  25. data/lib/proxy_fetcher/providers/mtpro.rb +43 -0
  26. data/lib/proxy_fetcher/providers/proxy_list.rb +8 -16
  27. data/lib/proxy_fetcher/providers/proxypedia.rb +48 -0
  28. data/lib/proxy_fetcher/providers/proxyscrape_http.rb +65 -0
  29. data/lib/proxy_fetcher/providers/proxyscrape_socks4.rb +65 -0
  30. data/lib/proxy_fetcher/providers/proxyscrape_socks5.rb +65 -0
  31. data/lib/proxy_fetcher/providers/xroxy.rb +9 -17
  32. data/lib/proxy_fetcher/proxy.rb +16 -4
  33. data/lib/proxy_fetcher/utils/http_client.rb +7 -12
  34. data/lib/proxy_fetcher/utils/proxy_list_validator.rb +3 -1
  35. data/lib/proxy_fetcher/utils/proxy_validator.rb +21 -9
  36. data/lib/proxy_fetcher/version.rb +3 -3
  37. data/proxy_fetcher.gemspec +21 -16
  38. data/spec/fixtures/proxies.txt +14 -0
  39. data/spec/proxy_fetcher/client/client_spec.rb +72 -57
  40. data/spec/proxy_fetcher/configuration_spec.rb +11 -11
  41. data/spec/proxy_fetcher/document/adapters_spec.rb +8 -8
  42. data/spec/proxy_fetcher/document/node_spec.rb +4 -4
  43. data/spec/proxy_fetcher/manager_spec.rb +18 -0
  44. data/spec/proxy_fetcher/providers/base_spec.rb +9 -9
  45. data/spec/proxy_fetcher/providers/multiple_providers_spec.rb +4 -4
  46. data/spec/proxy_fetcher/providers/proxy_classes_spec.rb +28 -0
  47. data/spec/proxy_fetcher/proxy_spec.rb +14 -14
  48. data/spec/proxy_fetcher/version_spec.rb +2 -0
  49. data/spec/spec_helper.rb +10 -10
  50. data/spec/support/manager_examples.rb +21 -21
  51. metadata +27 -17
  52. data/lib/proxy_fetcher/providers/gather_proxy.rb +0 -58
  53. data/spec/proxy_fetcher/providers/free_proxy_list_spec.rb +0 -13
  54. data/spec/proxy_fetcher/providers/free_proxy_list_ssl_spec.rb +0 -11
  55. data/spec/proxy_fetcher/providers/gather_proxy_spec.rb +0 -11
  56. data/spec/proxy_fetcher/providers/http_tunnel_spec.rb +0 -11
  57. data/spec/proxy_fetcher/providers/proxy_list_spec.rb +0 -11
  58. data/spec/proxy_fetcher/providers/xroxy_spec.rb +0 -11
@@ -6,7 +6,7 @@ module ProxyFetcher
6
6
  class NokogiriAdapter < AbstractAdapter
7
7
  # Requires Nokogiri gem to the application.
8
8
  def self.install_requirements!
9
- require 'nokogiri'
9
+ require "nokogiri"
10
10
  end
11
11
 
12
12
  # Parses raw HTML content with specific gem.
@@ -6,7 +6,7 @@ module ProxyFetcher
6
6
  class OgaAdapter < AbstractAdapter
7
7
  # Requires Oga gem to the application.
8
8
  def self.install_requirements!
9
- require 'oga'
9
+ require "oga"
10
10
  end
11
11
 
12
12
  # Parses raw HTML content with specific gem.
@@ -81,9 +81,9 @@ module ProxyFetcher
81
81
  # clean text
82
82
  #
83
83
  def clear(text)
84
- return '' if text.nil? || text.empty?
84
+ return "" if text.nil? || text.empty?
85
85
 
86
- text.strip.gsub(/[ \t]/i, '')
86
+ text.strip.gsub(/[\t]/i, "")
87
87
  end
88
88
  end
89
89
  end
@@ -13,7 +13,7 @@ module ProxyFetcher
13
13
  # @return [WrongCustomClass]
14
14
  #
15
15
  def initialize(klass, methods)
16
- required_methods = Array(methods).join(', ')
16
+ required_methods = Array(methods).join(", ")
17
17
  super("#{klass} must respond to [#{required_methods}] class methods!")
18
18
  end
19
19
  end
@@ -53,7 +53,7 @@ module ProxyFetcher
53
53
  # @return [MaximumRedirectsReached]
54
54
  #
55
55
  def initialize(*)
56
- super('maximum redirects reached')
56
+ super("maximum redirects reached")
57
57
  end
58
58
  end
59
59
 
@@ -66,7 +66,7 @@ module ProxyFetcher
66
66
  # @return [MaximumRetriesReached]
67
67
  #
68
68
  def initialize(*)
69
- super('reached the maximum number of retries')
69
+ super("reached the maximum number of retries")
70
70
  end
71
71
  end
72
72
 
@@ -95,7 +95,7 @@ module ProxyFetcher
95
95
  super(<<-MSG.strip.squeeze
96
96
  you need to specify adapter for HTML parsing: ProxyFetcher.config.adapter = :nokogiri.
97
97
  You can use one of the predefined adapters (:nokogiri or :oga) or your own implementation.
98
- MSG
98
+ MSG
99
99
  )
100
100
  end
101
101
  end
@@ -111,7 +111,7 @@ module ProxyFetcher
111
111
  # @return [AdapterSetupError]
112
112
  #
113
113
  def initialize(adapter_name, error)
114
- adapter = demodulize(adapter_name.gsub('Adapter', ''))
114
+ adapter = demodulize(adapter_name.gsub("Adapter", ""))
115
115
 
116
116
  super("can't setup '#{adapter}' adapter during the following error:\n\t#{error}'")
117
117
  end
@@ -127,7 +127,7 @@ module ProxyFetcher
127
127
  #
128
128
  def demodulize(path)
129
129
  path = path.to_s
130
- index = path.rindex('::')
130
+ index = path.rindex("::")
131
131
 
132
132
  index ? path[(index + 2)..-1] : path
133
133
  end
@@ -3,6 +3,16 @@
3
3
  module ProxyFetcher
4
4
  # ProxyFetcher Manager class for interacting with proxy lists from various providers.
5
5
  class Manager
6
+ REFRESHER_LOCK = Mutex.new
7
+
8
+ class << self
9
+ def from_files(files, **options)
10
+ new(**options.merge(files: Array(files)))
11
+ end
12
+
13
+ alias from_file from_files
14
+ end
15
+
6
16
  # @!attribute [r] proxies
7
17
  # @return [Array<ProxyFetcher::Proxy>] An array of proxies
8
18
  attr_reader :proxies
@@ -14,14 +24,17 @@ module ProxyFetcher
14
24
  #
15
25
  # @return [Manager]
16
26
  #
17
- def initialize(refresh: true, validate: false, filters: {})
18
- if refresh
19
- refresh_list!(filters)
27
+ def initialize(**options)
28
+ if options.fetch(:refresh, true)
29
+ refresh_list!(options.fetch(:filters, {}))
20
30
  else
21
31
  @proxies = []
22
32
  end
23
33
 
24
- cleanup! if validate
34
+ files = Array(options.fetch(:file, options.fetch(:files, [])))
35
+ load_proxies_from_files!(files) if files&.any?
36
+
37
+ cleanup! if options.fetch(:validate, false)
25
38
  end
26
39
 
27
40
  # Update current proxy list using configured providers.
@@ -30,17 +43,17 @@ module ProxyFetcher
30
43
  #
31
44
  def refresh_list!(filters = nil)
32
45
  @proxies = []
33
-
34
46
  threads = []
35
- lock = Mutex.new
36
47
 
37
48
  ProxyFetcher.config.providers.each do |provider_name|
38
49
  threads << Thread.new do
50
+ Thread.current.report_on_exception = false
51
+
39
52
  provider = ProxyFetcher::Configuration.providers_registry.class_for(provider_name)
40
53
  provider_filters = filters && filters.fetch(provider_name.to_sym, filters)
41
54
  provider_proxies = provider.fetch_proxies!(provider_filters)
42
55
 
43
- lock.synchronize do
56
+ REFRESHER_LOCK.synchronize do
44
57
  @proxies.concat(provider_proxies)
45
58
  end
46
59
  end
@@ -55,7 +68,7 @@ module ProxyFetcher
55
68
 
56
69
  # Pop just first proxy (and back it to the end of the proxy list).
57
70
  #
58
- # @return [Proxy]
71
+ # @return [ProxyFetcher::Proxy, NilClass]
59
72
  # proxy object from the list
60
73
  #
61
74
  def get
@@ -72,7 +85,7 @@ module ProxyFetcher
72
85
  # Pop first valid proxy (and back it to the end of the proxy list)
73
86
  # Invalid proxies will be removed from the list
74
87
  #
75
- # @return [Proxy]
88
+ # @return [ProxyFetcher::Proxy, NilClass]
76
89
  # proxy object from the list
77
90
  #
78
91
  def get!
@@ -89,6 +102,26 @@ module ProxyFetcher
89
102
 
90
103
  alias pop! get!
91
104
 
105
+ # Loads proxies from files.
106
+ #
107
+ # @param proxy_files [String, Array<String,Pathname>]
108
+ # file path of list of files to load
109
+ #
110
+ def load_proxies_from_files!(proxy_files)
111
+ proxy_files = Array(proxy_files)
112
+ return if proxy_files.empty?
113
+
114
+ proxy_files.each do |proxy_file|
115
+ File.foreach(proxy_file, chomp: true) do |proxy_string|
116
+ addr, port = proxy_string.split(":", 2)
117
+ port = Integer(port) if port
118
+ @proxies << Proxy.new(addr: addr, port: port)
119
+ end
120
+ end
121
+
122
+ @proxies.uniq!
123
+ end
124
+
92
125
  # Clean current proxy list from dead proxies (that doesn't respond by timeout)
93
126
  #
94
127
  # @return [Array<ProxyFetcher::Proxy>]
@@ -6,12 +6,15 @@ module ProxyFetcher
6
6
  class Base
7
7
  # Loads proxy provider page content, extract proxy list from it
8
8
  # and convert every entry to proxy object.
9
- def fetch_proxies!(filters = {})
9
+ def fetch_proxies(filters = {})
10
10
  raw_proxies = load_proxy_list(filters)
11
11
  proxies = raw_proxies.map { |html_node| build_proxy(html_node) }.compact
12
12
  proxies.reject { |proxy| proxy.addr.nil? }
13
13
  end
14
14
 
15
+ # For retro-compatibility
16
+ alias fetch_proxies! fetch_proxies
17
+
15
18
  def provider_url
16
19
  raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
17
20
  end
@@ -24,10 +27,17 @@ module ProxyFetcher
24
27
  {}
25
28
  end
26
29
 
30
+ # @return [Hash]
31
+ # Provider headers required to fetch the proxy list
32
+ #
27
33
  def provider_headers
28
34
  {}
29
35
  end
30
36
 
37
+ def xpath
38
+ raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
39
+ end
40
+
31
41
  # Just synthetic sugar to make it easier to call #fetch_proxies! method.
32
42
  def self.fetch_proxies!(*args)
33
43
  new.fetch_proxies!(*args)
@@ -37,18 +47,29 @@ module ProxyFetcher
37
47
 
38
48
  # Loads raw provider HTML with proxies.
39
49
  #
50
+ # @param url [String]
51
+ # Provider URL
52
+ #
53
+ # @param filters [#to_h]
54
+ # Provider filters (Hash-like object)
55
+ #
40
56
  # @return [String]
41
- # HTML body
57
+ # HTML body from the response
42
58
  #
43
59
  def load_html(url, filters = {})
44
- raise ArgumentError, 'filters must be a Hash' if filters && !filters.is_a?(Hash)
60
+ unless filters.respond_to?(:to_h)
61
+ raise ArgumentError, "filters must be a Hash or respond to #to_h"
62
+ end
45
63
 
46
- uri = URI.parse(url)
47
- # TODO: query for post request?
48
- uri.query = URI.encode_www_form(provider_params.merge(filters)) if filters && filters.any?
64
+ if filters&.any?
65
+ # TODO: query for post request?
66
+ uri = URI.parse(url)
67
+ uri.query = URI.encode_www_form(provider_params.merge(filters.to_h))
68
+ url = uri.to_s
69
+ end
49
70
 
50
71
  ProxyFetcher.config.http_client.fetch(
51
- uri.to_s,
72
+ url,
52
73
  method: provider_method,
53
74
  headers: provider_headers,
54
75
  params: provider_params
@@ -71,29 +92,29 @@ module ProxyFetcher
71
92
  ProxyFetcher::Document.parse(html)
72
93
  end
73
94
 
95
+ # Fetches HTML content by sending HTTP request to the provider URL and
96
+ # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
97
+ # to return all the proxy entries (HTML nodes).
98
+ #
99
+ # @return [Array<ProxyFetcher::Document::Node>]
100
+ # Collection of extracted HTML nodes with full proxy info
101
+ #
102
+ def load_proxy_list(filters = {})
103
+ doc = load_document(provider_url, filters)
104
+ doc.xpath(xpath)
105
+ end
106
+
74
107
  def build_proxy(*args)
75
108
  to_proxy(*args)
76
- rescue StandardError => error
109
+ rescue StandardError => e
77
110
  ProxyFetcher.logger.warn(
78
- "Failed to build Proxy object for #{self.class.name} due to error: #{error.message}"
111
+ "Failed to build Proxy for #{self.class.name.split("::").last} " \
112
+ "due to error: #{e.message}"
79
113
  )
80
114
 
81
115
  nil
82
116
  end
83
117
 
84
- # Fetches HTML content by sending HTTP request to the provider URL and
85
- # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
86
- # to return all the proxy entries (HTML nodes).
87
- #
88
- # Abstract method. Must be implemented in a descendant class
89
- #
90
- # @return [Array<Document::Node>]
91
- # list of proxy elements from the providers HTML content
92
- #
93
- def load_proxy_list(*)
94
- raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
95
- end
96
-
97
118
  # Convert HTML element with proxy info to ProxyFetcher::Proxy instance.
98
119
  #
99
120
  # Abstract method. Must be implemented in a descendant class
@@ -6,13 +6,12 @@ module ProxyFetcher
6
6
  class FreeProxyList < Base
7
7
  # Provider URL to fetch proxy list
8
8
  def provider_url
9
- 'https://free-proxy-list.net/'
9
+ "https://free-proxy-list.net/"
10
10
  end
11
11
 
12
12
  # [NOTE] Doesn't support filtering
13
- def load_proxy_list(_filters = {})
14
- doc = load_document(provider_url, {})
15
- doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
13
+ def xpath
14
+ '//table[@id="proxylisttable"]/tbody/tr'
16
15
  end
17
16
 
18
17
  # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
@@ -26,10 +25,10 @@ module ProxyFetcher
26
25
  #
27
26
  def to_proxy(html_node)
28
27
  ProxyFetcher::Proxy.new.tap do |proxy|
29
- proxy.addr = html_node.content_at('td[1]')
30
- proxy.port = Integer(html_node.content_at('td[2]').gsub(/^0+/, ''))
31
- proxy.country = html_node.content_at('td[4]')
32
- proxy.anonymity = html_node.content_at('td[5]')
28
+ proxy.addr = html_node.content_at("td[1]")
29
+ proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, ""))
30
+ proxy.country = html_node.content_at("td[4]")
31
+ proxy.anonymity = html_node.content_at("td[5]")
33
32
  proxy.type = parse_type(html_node)
34
33
  end
35
34
  end
@@ -45,8 +44,8 @@ module ProxyFetcher
45
44
  # Proxy type
46
45
  #
47
46
  def parse_type(html_node)
48
- https = html_node.content_at('td[6]')
49
- https && https.casecmp('yes').zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
47
+ https = html_node.content_at("td[6]")
48
+ https&.casecmp("yes")&.zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
50
49
  end
51
50
  end
52
51
 
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ProxyFetcher
4
+ module Providers
5
+ # FreeProxyListSocks provider class.
6
+ class FreeProxyListSocks < Base
7
+ # Provider URL to fetch proxy list
8
+ def provider_url
9
+ "https://www.socks-proxy.net/"
10
+ end
11
+
12
+ # [NOTE] Doesn't support filtering
13
+ def xpath
14
+ '//table[@id="proxylisttable"]/tbody/tr'
15
+ end
16
+
17
+ # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
18
+ # object.
19
+ #
20
+ # @param html_node [Object]
21
+ # HTML node from the <code>ProxyFetcher::Document</code> DOM model.
22
+ #
23
+ # @return [ProxyFetcher::Proxy]
24
+ # Proxy object
25
+ #
26
+ def to_proxy(html_node)
27
+ ProxyFetcher::Proxy.new.tap do |proxy|
28
+ proxy.addr = html_node.content_at("td[1]")
29
+ proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, ""))
30
+ proxy.country = html_node.content_at("td[4]")
31
+ proxy.type = parse_type(html_node)
32
+ proxy.anonymity = html_node.content_at("td[6]")
33
+ end
34
+ end
35
+
36
+ private
37
+
38
+ # Parses HTML node to extract proxy type.
39
+ #
40
+ # @param html_node [Object]
41
+ # HTML node from the <code>ProxyFetcher::Document</code> DOM model.
42
+ #
43
+ # @return [String]
44
+ # Proxy type
45
+ #
46
+ def parse_type(html_node)
47
+ https = html_node.content_at("td[5]")
48
+
49
+ return ProxyFetcher::Proxy::SOCKS4 if https&.casecmp("socks4")&.zero?
50
+ return ProxyFetcher::Proxy::SOCKS5 if https&.casecmp("socks5")&.zero?
51
+
52
+ "Unknown"
53
+ end
54
+ end
55
+
56
+ ProxyFetcher::Configuration.register_provider(:free_proxy_list_socks, FreeProxyListSocks)
57
+ end
58
+ end
@@ -6,20 +6,12 @@ module ProxyFetcher
6
6
  class FreeProxyListSSL < Base
7
7
  # Provider URL to fetch proxy list
8
8
  def provider_url
9
- 'https://www.sslproxies.org/'
9
+ "https://www.sslproxies.org/"
10
10
  end
11
11
 
12
- # Fetches HTML content by sending HTTP request to the provider URL and
13
- # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
14
- # to return all the proxy entries (HTML nodes).
15
- #
16
- # @return [Array<ProxyFetcher::Document::Node>]
17
- # Collection of extracted HTML nodes with full proxy info
18
- #
19
12
  # [NOTE] Doesn't support filtering
20
- def load_proxy_list(_filters = {})
21
- doc = load_document(provider_url, {})
22
- doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
13
+ def xpath
14
+ '//table[@id="proxylisttable"]/tbody/tr'
23
15
  end
24
16
 
25
17
  # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
@@ -33,10 +25,10 @@ module ProxyFetcher
33
25
  #
34
26
  def to_proxy(html_node)
35
27
  ProxyFetcher::Proxy.new.tap do |proxy|
36
- proxy.addr = html_node.content_at('td[1]')
37
- proxy.port = Integer(html_node.content_at('td[2]').gsub(/^0+/, ''))
38
- proxy.country = html_node.content_at('td[4]')
39
- proxy.anonymity = html_node.content_at('td[5]')
28
+ proxy.addr = html_node.content_at("td[1]")
29
+ proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, ""))
30
+ proxy.country = html_node.content_at("td[4]")
31
+ proxy.anonymity = html_node.content_at("td[5]")
40
32
  proxy.type = ProxyFetcher::Proxy::HTTPS
41
33
  end
42
34
  end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ProxyFetcher
4
+ module Providers
5
+ # FreeProxyListUS provider class.
6
+ class FreeProxyListUS < Base
7
+ # Provider URL to fetch proxy list
8
+ def provider_url
9
+ "https://www.us-proxy.org/"
10
+ end
11
+
12
+ # [NOTE] Doesn't support filtering
13
+ def xpath
14
+ '//table[@id="proxylisttable"]/tbody/tr'
15
+ end
16
+
17
+ # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
18
+ # object.
19
+ #
20
+ # @param html_node [Object]
21
+ # HTML node from the <code>ProxyFetcher::Document</code> DOM model.
22
+ #
23
+ # @return [ProxyFetcher::Proxy]
24
+ # Proxy object
25
+ #
26
+ def to_proxy(html_node)
27
+ ProxyFetcher::Proxy.new.tap do |proxy|
28
+ proxy.addr = html_node.content_at("td[1]")
29
+ proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, ""))
30
+ proxy.country = html_node.content_at("td[4]")
31
+ proxy.anonymity = html_node.content_at("td[5]")
32
+ proxy.type = parse_type(html_node)
33
+ end
34
+ end
35
+
36
+ private
37
+
38
+ # Parses HTML node to extract proxy type.
39
+ #
40
+ # @param html_node [Object]
41
+ # HTML node from the <code>ProxyFetcher::Document</code> DOM model.
42
+ #
43
+ # @return [String]
44
+ # Proxy type
45
+ #
46
+ def parse_type(html_node)
47
+ https = html_node.content_at("td[7]")
48
+ https&.casecmp("yes")&.zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
49
+ end
50
+ end
51
+
52
+ ProxyFetcher::Configuration.register_provider(:free_proxy_list_us, FreeProxyListUS)
53
+ end
54
+ end