proxy_fetcher 0.6.2 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -1
  3. data/LICENSE +1 -1
  4. data/README.md +12 -1
  5. data/lib/proxy_fetcher.rb +5 -0
  6. data/lib/proxy_fetcher/client/client.rb +28 -7
  7. data/lib/proxy_fetcher/client/proxies_registry.rb +27 -0
  8. data/lib/proxy_fetcher/client/request.rb +73 -2
  9. data/lib/proxy_fetcher/configuration.rb +66 -2
  10. data/lib/proxy_fetcher/configuration/providers_registry.rb +31 -2
  11. data/lib/proxy_fetcher/document.rb +29 -6
  12. data/lib/proxy_fetcher/document/adapters.rb +20 -0
  13. data/lib/proxy_fetcher/document/adapters/abstract_adapter.rb +29 -0
  14. data/lib/proxy_fetcher/document/adapters/nokogiri_adapter.rb +26 -0
  15. data/lib/proxy_fetcher/document/adapters/oga_adapter.rb +26 -0
  16. data/lib/proxy_fetcher/document/node.rb +47 -0
  17. data/lib/proxy_fetcher/exceptions.rb +52 -2
  18. data/lib/proxy_fetcher/manager.rb +32 -4
  19. data/lib/proxy_fetcher/providers/base.rb +27 -8
  20. data/lib/proxy_fetcher/providers/free_proxy_list.rb +2 -0
  21. data/lib/proxy_fetcher/providers/free_proxy_list_ssl.rb +2 -0
  22. data/lib/proxy_fetcher/providers/gather_proxy.rb +2 -0
  23. data/lib/proxy_fetcher/providers/http_tunnel.rb +2 -0
  24. data/lib/proxy_fetcher/providers/proxy_docker.rb +2 -0
  25. data/lib/proxy_fetcher/providers/proxy_list.rb +2 -0
  26. data/lib/proxy_fetcher/providers/xroxy.rb +2 -0
  27. data/lib/proxy_fetcher/proxy.rb +36 -5
  28. data/lib/proxy_fetcher/utils/http_client.rb +35 -7
  29. data/lib/proxy_fetcher/utils/proxy_validator.rb +25 -4
  30. data/lib/proxy_fetcher/version.rb +3 -1
  31. data/proxy_fetcher.gemspec +1 -1
  32. data/spec/proxy_fetcher/{client_spec.rb → client/client_spec.rb} +10 -0
  33. data/spec/proxy_fetcher/configuration_spec.rb +2 -0
  34. data/spec/proxy_fetcher/document/adapters_spec.rb +2 -0
  35. data/spec/proxy_fetcher/document/node_spec.rb +2 -0
  36. data/spec/proxy_fetcher/providers/base_spec.rb +2 -0
  37. data/spec/proxy_fetcher/providers/free_proxy_list_spec.rb +2 -0
  38. data/spec/proxy_fetcher/providers/free_proxy_list_ssl_spec.rb +2 -0
  39. data/spec/proxy_fetcher/providers/gather_proxy_spec.rb +2 -0
  40. data/spec/proxy_fetcher/providers/http_tunnel_spec.rb +2 -0
  41. data/spec/proxy_fetcher/providers/multiple_providers_spec.rb +2 -0
  42. data/spec/proxy_fetcher/providers/proxy_docker_spec.rb +2 -0
  43. data/spec/proxy_fetcher/providers/proxy_list_spec.rb +2 -0
  44. data/spec/proxy_fetcher/providers/xroxy_spec.rb +2 -0
  45. data/spec/proxy_fetcher/proxy_spec.rb +2 -0
  46. data/spec/proxy_fetcher/version_spec.rb +3 -0
  47. data/spec/spec_helper.rb +2 -0
  48. data/spec/support/manager_examples.rb +2 -0
  49. metadata +4 -3
@@ -1,24 +1,47 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  # HTML document abstraction class. Used to work with different HTML parser adapters
3
5
  # such as Nokogiri, Oga or a custom one. Stores <i>backend</i> that will handle all
4
6
  # the DOM manipulation logic.
5
7
  class Document
6
- class << self
7
- def parse(data)
8
- new(ProxyFetcher.config.adapter.parse(data))
9
- end
10
- end
11
-
8
+ # @!attribute [r] backend
9
+ # @return [Object] Backend object that handles DOM processing
12
10
  attr_reader :backend
13
11
 
12
+ # Parses raw HTML data to abstract ProxyFetcher document.
13
+ #
14
+ # @param data [String] HTML
15
+ #
16
+ # @return [ProxyFetcher::Document]
17
+ # ProxyFetcher document model
18
+ #
19
+ def self.parse(data)
20
+ new(ProxyFetcher.config.adapter.parse(data))
21
+ end
22
+
23
+ # Initialize abstract ProxyFetcher HTML Document
24
+ #
25
+ # @return [Document]
26
+ #
14
27
  def initialize(backend)
15
28
  @backend = backend
16
29
  end
17
30
 
31
+ # Searches elements by XPath selector.
32
+ #
33
+ # @return [Array<ProxyFetcher::Document::Node>]
34
+ # collection of nodes
35
+ #
18
36
  def xpath(*args)
19
37
  backend.xpath(*args).map { |node| backend.proxy_node.new(node) }
20
38
  end
21
39
 
40
+ # Searches elements by CSS selector.
41
+ #
42
+ # @return [Array<ProxyFetcher::Document::Node>]
43
+ # collection of nodes
44
+ #
22
45
  def css(*args)
23
46
  backend.css(*args).map { |node| backend.proxy_node.new(node) }
24
47
  end
@@ -1,10 +1,30 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  class Document
5
+ # ProxyFetcher HTML parser adapters.
6
+ #
7
+ # ProxyFetcher default supported adapters are:
8
+ #
9
+ # * Nokogiri
10
+ # * Oga
11
+ #
12
+ # Any custom adapter can be used and must be inherited from
13
+ # <code>ProxyFetcher::Document::AbstractAdapter</code>.
3
14
  class Adapters
15
+ # Adapters class name suffix
4
16
  ADAPTER = 'Adapter'.freeze
5
17
  private_constant :ADAPTER
6
18
 
7
19
  class << self
20
+ # Returns HTML parser adapter by its name or class.
21
+ # If name is provided, then it looks for predefined classes
22
+ # in <code>ProxyFetcher::Document</code> namespace. Otherwise
23
+ # it just returns the passed class.
24
+ #
25
+ # @param name_or_class [String, Class]
26
+ # Adapter name or class
27
+ #
8
28
  def lookup(name_or_class)
9
29
  raise Exceptions::BlankAdapter if name_or_class.nil? || name_or_class.to_s.empty?
10
30
 
@@ -1,26 +1,55 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  class Document
5
+ # Abstract HTML parser adapter class.
6
+ # Handles document manipulations.
3
7
  class AbstractAdapter
8
+ # @!attribute [r] document
9
+ # @return [Object] Parsed document that XPath / CSS queries are delegated to
4
10
  attr_reader :document
5
11
 
12
+ # Initialize adapter
13
+ #
14
+ # @return [AbstractAdapter]
15
+ #
6
16
  def initialize(document)
7
17
  @document = document
8
18
  end
9
19
 
10
20
  # You can override this method in your own adapter class
21
+ #
22
+ # @param selector [String]
23
+ # XPath selector
24
+ #
11
25
  def xpath(selector)
12
26
  document.xpath(selector)
13
27
  end
14
28
 
15
29
  # You can override this method in your own adapter class
30
+ #
31
+ # @param selector [String]
32
+ # CSS selector
33
+ #
16
34
  def css(selector)
17
35
  document.css(selector)
18
36
  end
19
37
 
38
+ # Returns <code>Node</code> class that will handle HTML
39
+ # nodes for particular adapter.
40
+ #
41
+ # @return [ProxyFetcher::Document::Node]
42
+ # node
43
+ #
20
44
  def proxy_node
21
45
  self.class.const_get('Node')
22
46
  end
23
47
 
48
+ # Installs adapter requirements.
49
+ #
50
+ # @raise [Exceptions::AdapterSetupError]
51
+ # adapter can't be install due to some error
52
+ #
24
53
  def self.setup!(*args)
25
54
  install_requirements!(*args)
26
55
  rescue LoadError => error
@@ -1,23 +1,49 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  class Document
5
+ # HTML parser adapter that uses Nokogiri as a backend.
3
6
  class NokogiriAdapter < AbstractAdapter
7
+ # Loads the Nokogiri gem into the application.
4
8
  def self.install_requirements!
5
9
  require 'nokogiri'
6
10
  end
7
11
 
12
+ # Parses raw HTML content with specific gem.
13
+ #
14
+ # @param data [String]
15
+ # HTML content
16
+ #
17
+ # @return [ProxyFetcher::Document::NokogiriAdapter]
18
+ # Object with parsed document
19
+ #
8
20
  def self.parse(data)
9
21
  new(::Nokogiri::HTML(data))
10
22
  end
11
23
 
24
+ # Nokogiri DOM node
12
25
  class Node < ProxyFetcher::Document::Node
26
+ # Returns HTML node attribute value.
27
+ #
28
+ # @return [String] attribute value
29
+ #
13
30
  def attr(*args)
14
31
  clear(node.attr(*args))
15
32
  end
16
33
 
34
+ # Returns HTML node inner text value clean from
35
+ # whitespaces, tabs, etc.
36
+ #
37
+ # @return [String] node inner text
38
+ #
17
39
  def content
18
40
  clear(node.content)
19
41
  end
20
42
 
43
+ # Returns node inner HTML.
44
+ #
45
+ # @return [String] inner HTML
46
+ #
21
47
  def html
22
48
  node.inner_html
23
49
  end
@@ -1,23 +1,49 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  class Document
5
+ # HTML parser adapter that uses Oga as a backend.
3
6
  class OgaAdapter < AbstractAdapter
7
+ # Loads the Oga gem into the application.
4
8
  def self.install_requirements!
5
9
  require 'oga'
6
10
  end
7
11
 
12
+ # Parses raw HTML content with specific gem.
13
+ #
14
+ # @param data [String]
15
+ # HTML content
16
+ #
17
+ # @return [ProxyFetcher::Document::OgaAdapter]
18
+ # Object with parsed document
19
+ #
8
20
  def self.parse(data)
9
21
  new(::Oga.parse_html(data))
10
22
  end
11
23
 
24
+ # Oga DOM node
12
25
  class Node < ProxyFetcher::Document::Node
26
+ # Returns HTML node attribute value.
27
+ #
28
+ # @return [String] attribute value
29
+ #
13
30
  def attr(*args)
14
31
  clear(node.attribute(*args).value)
15
32
  end
16
33
 
34
+ # Returns HTML node inner text value clean from
35
+ # whitespaces, tabs, etc.
36
+ #
37
+ # @return [String] node inner text
38
+ #
17
39
  def content
18
40
  clear(node.text)
19
41
  end
20
42
 
43
+ # Returns node inner HTML.
44
+ #
45
+ # @return [String] inner HTML
46
+ #
21
47
  def html
22
48
  node.to_xml
23
49
  end
@@ -1,38 +1,85 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  class Document
5
+ # Abstract class for storing HTML elements that were parsed by
6
+ # one of the <code>ProxyFetcher::Document</code> adapter classes.
3
7
  class Node
8
+ # @!attribute [r] node
9
+ # @return [Object] original DOM node, parsed by adapter backend
4
10
  attr_reader :node
5
11
 
12
+ # Initialize new HTML node
13
+ #
14
+ # @return [Node]
15
+ #
6
16
  def initialize(node)
7
17
  @node = node
8
18
  end
9
19
 
20
+ # Searches for node in children using some selector (CSS or XPath).
21
+ #
22
+ # @param selector [String] selector (CSS or XPath)
23
+ #
24
+ # @return [Node] child node
25
+ #
10
26
  def find(selector, method = :at_xpath)
11
27
  self.class.new(node.public_send(method, selector))
12
28
  end
13
29
 
30
+ # Searches exact HTML element by XPath. Returns only one element.
31
+ #
32
+ # @return [ProxyFetcher::Document::Node]
33
+ # node
34
+ #
14
35
  def at_xpath(*args)
15
36
  self.class.new(node.at_xpath(*args))
16
37
  end
17
38
 
39
+ # Searches exact HTML element by CSS. Returns only one element.
40
+ #
41
+ # @return [ProxyFetcher::Document::Node]
42
+ # node
43
+ #
18
44
  def at_css(*args)
19
45
  self.class.new(node.at_css(*args))
20
46
  end
21
47
 
48
+ # Returns clean content (text) for the specific element.
49
+ #
50
+ # @return [String]
51
+ # HTML node content
52
+ #
22
53
  def content_at(*args)
23
54
  clear(find(*args).content)
24
55
  end
25
56
 
57
+ # Returns HTML node content.
58
+ #
59
+ # Abstract method, must be implemented for specific adapter class.
60
+ #
26
61
  def content
27
62
  raise "`#{__method__}` must be implemented for specific adapter class!"
28
63
  end
29
64
 
65
+ # Returns HTML node inner HTML.
66
+ #
67
+ # Abstract method, must be implemented for specific adapter class.
68
+ #
30
69
  def html
31
70
  raise "`#{__method__}` must be implemented for specific adapter class!"
32
71
  end
33
72
 
34
73
  protected
35
74
 
75
+ # Removes whitespaces, tabulation and other "garbage" from the text.
76
+ #
77
+ # @param text [String]
78
+ # text to clear
79
+ #
80
+ # @return [String]
81
+ # clean text
82
+ #
36
83
  def clear(text)
37
84
  return if text.nil? || text.empty?
38
85
 
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  # Base exception class for all the ProxyFetcher exceptions.
3
5
  Error = Class.new(StandardError)
@@ -6,6 +8,10 @@ module ProxyFetcher
6
8
  module Exceptions
7
9
  # Exception for wrong custom classes (such as ProxyValidator or HTTP Client).
8
10
  class WrongCustomClass < Error
11
+ # Initialize new exception
12
+ #
13
+ # @return [WrongCustomClass]
14
+ #
9
15
  def initialize(klass, methods)
10
16
  required_methods = Array(methods).join(', ')
11
17
  super("#{klass} must respond to [#{required_methods}] class methods!")
@@ -15,6 +21,12 @@ module ProxyFetcher
15
21
  # Exception for wrong provider name, that raises when configured provider
16
22
  # that is not registered via <code>register_provider</code> interface.
17
23
  class UnknownProvider < Error
24
+ # Initialize new exception
25
+ #
26
+ # @param provider_name [String] provider name
27
+ #
28
+ # @return [UnknownProvider]
29
+ #
18
30
  def initialize(provider_name)
19
31
  super("unregistered proxy provider `#{provider_name}`")
20
32
  end
@@ -22,6 +34,12 @@ module ProxyFetcher
22
34
 
23
35
  # Exception for cases when user tries to register already existing provider.
24
36
  class RegisteredProvider < Error
37
+ # Initialize new exception
38
+ #
39
+ # @param name [String, Symbol] provider name
40
+ #
41
+ # @return [RegisteredProvider]
42
+ #
25
43
  def initialize(name)
26
44
  super("`#{name}` provider already registered!")
27
45
  end
@@ -30,6 +48,10 @@ module ProxyFetcher
30
48
  # Exception for cases when HTTP client reached maximum count of redirects
31
49
  # trying to process HTTP request.
32
50
  class MaximumRedirectsReached < Error
51
+ # Initialize new exception
52
+ #
53
+ # @return [MaximumRedirectsReached]
54
+ #
33
55
  def initialize(*)
34
56
  super('maximum redirects reached')
35
57
  end
@@ -39,6 +61,10 @@ module ProxyFetcher
39
61
  # trying to process HTTP request. Can occur when request failed by timeout
40
62
  # multiple times.
41
63
  class MaximumRetriesReached < Error
64
+ # Initialize new exception
65
+ #
66
+ # @return [MaximumRetriesReached]
67
+ #
42
68
  def initialize(*)
43
69
  super('reached the maximum number of retries')
44
70
  end
@@ -47,6 +73,12 @@ module ProxyFetcher
47
73
  # Exception for cases when user tries to set wrong HTML parser adapter
48
74
  # in the configuration.
49
75
  class UnknownAdapter < Error
76
+ # Initialize new exception
77
+ #
78
+ # @param name [String] configured adapter name
79
+ #
80
+ # @return [UnknownAdapter]
81
+ #
50
82
  def initialize(name)
51
83
  super("unknown adapter '#{name}'")
52
84
  end
@@ -55,6 +87,10 @@ module ProxyFetcher
55
87
  # Exception for cases when user tries to set <code>nil</code> HTML parser adapter
56
88
  # in the configuration (or just forget to change it).
57
89
  class BlankAdapter < Error
90
+ # Initialize new exception
91
+ #
92
+ # @return [BlankAdapter]
93
+ #
58
94
  def initialize(*)
59
95
  super(<<-MSG.strip.squeeze
60
96
  you need to specify adapter for HTML parsing: ProxyFetcher.config.adapter = :nokogiri.
@@ -67,14 +103,28 @@ module ProxyFetcher
67
103
  # Exception for cases when HTML parser adapter can't be installed.
68
104
  # It will print the reason (backtrace) of the exception that caused an error.
69
105
  class AdapterSetupError < Error
70
- def initialize(adapter_name, reason)
106
+ # Initialize new exception
107
+ #
108
+ # @param adapter_name [String] configured adapter name
109
+ # @param error [String] full setup error (backtrace)
110
+ #
111
+ # @return [AdapterSetupError]
112
+ #
113
+ def initialize(adapter_name, error)
71
114
  adapter = demodulize(adapter_name.gsub('Adapter', ''))
72
115
 
73
- super("can't setup '#{adapter}' adapter during the following error:\n\t#{reason}'")
116
+ super("can't setup '#{adapter}' adapter during the following error:\n\t#{error}'")
74
117
  end
75
118
 
76
119
  private
77
120
 
121
+ # Returns just the class name, removing its namespace.
122
+ #
123
+ # @param path [String]
124
+ # full class name
125
+ #
126
+ # @return [String] demodulized class name
127
+ #
78
128
  def demodulize(path)
79
129
  path = path.to_s
80
130
  index = path.rindex('::')
@@ -1,10 +1,19 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  # ProxyFetcher Manager class for interacting with proxy lists from various providers.
3
5
  class Manager
6
+ # @!attribute [r] proxies
7
+ # @return [Array<ProxyFetcher::Proxy>] An array of proxies
4
8
  attr_reader :proxies
5
9
 
10
+ # Initialize ProxyFetcher Manager instance for managing proxies
11
+ #
6
12
  # refresh: true - load proxy list from the remote server on initialization
7
13
  # refresh: false - just initialize the class, proxy list will be empty ([])
14
+ #
15
+ # @return [Manager]
16
+ #
8
17
  def initialize(refresh: true, validate: false, filters: {})
9
18
  if refresh
10
19
  refresh_list!(filters)
@@ -15,7 +24,10 @@ module ProxyFetcher
15
24
  cleanup! if validate
16
25
  end
17
26
 
18
- # Update current proxy list from the provider
27
+ # Update current proxy list using configured providers.
28
+ #
29
+ # @param filters [Hash] providers filters
30
+ #
19
31
  def refresh_list!(filters = nil)
20
32
  @proxies = []
21
33
 
@@ -29,7 +41,11 @@ module ProxyFetcher
29
41
 
30
42
  alias fetch! refresh_list!
31
43
 
32
- # Pop just first proxy (and back it to the end of the proxy list)
44
+ # Pops the first proxy (and moves it back to the end of the proxy list).
45
+ #
46
+ # @return [Proxy]
47
+ # proxy object from the list
48
+ #
33
49
  def get
34
50
  return if @proxies.empty?
35
51
 
@@ -43,6 +59,10 @@ module ProxyFetcher
43
59
 
44
60
  # Pops the first valid proxy (and moves it back to the end of the proxy list).
45
61
  # Invalid proxies will be removed from the list
62
+ #
63
+ # @return [Proxy]
64
+ # proxy object from the list
65
+ #
46
66
  def get!
47
67
  index = proxies.find_index(&:connectable?)
48
68
  return if index.nil?
@@ -76,7 +96,11 @@ module ProxyFetcher
76
96
 
77
97
  alias validate! cleanup!
78
98
 
79
- # Return random proxy
99
+ # Returns random proxy
100
+ #
101
+ # @return [Proxy]
102
+ # random proxy from the loaded list
103
+ #
80
104
  def random_proxy
81
105
  proxies.sample
82
106
  end
@@ -84,11 +108,15 @@ module ProxyFetcher
84
108
  alias random random_proxy
85
109
 
86
110
  # Returns array of proxy URLs (just schema + host + port)
111
+ #
112
+ # @return [Array<String>]
113
+ # collection of proxies
114
+ #
87
115
  def raw_proxies
88
116
  proxies.map(&:url)
89
117
  end
90
118
 
91
- # No need to put all the attr_readers
119
+ # @private No need to put all the attr_readers to the output
92
120
  def inspect
93
121
  to_s
94
122
  end