proxy_fetcher 0.6.2 → 0.6.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -1
  3. data/LICENSE +1 -1
  4. data/README.md +12 -1
  5. data/lib/proxy_fetcher.rb +5 -0
  6. data/lib/proxy_fetcher/client/client.rb +28 -7
  7. data/lib/proxy_fetcher/client/proxies_registry.rb +27 -0
  8. data/lib/proxy_fetcher/client/request.rb +73 -2
  9. data/lib/proxy_fetcher/configuration.rb +66 -2
  10. data/lib/proxy_fetcher/configuration/providers_registry.rb +31 -2
  11. data/lib/proxy_fetcher/document.rb +29 -6
  12. data/lib/proxy_fetcher/document/adapters.rb +20 -0
  13. data/lib/proxy_fetcher/document/adapters/abstract_adapter.rb +29 -0
  14. data/lib/proxy_fetcher/document/adapters/nokogiri_adapter.rb +26 -0
  15. data/lib/proxy_fetcher/document/adapters/oga_adapter.rb +26 -0
  16. data/lib/proxy_fetcher/document/node.rb +47 -0
  17. data/lib/proxy_fetcher/exceptions.rb +52 -2
  18. data/lib/proxy_fetcher/manager.rb +32 -4
  19. data/lib/proxy_fetcher/providers/base.rb +27 -8
  20. data/lib/proxy_fetcher/providers/free_proxy_list.rb +2 -0
  21. data/lib/proxy_fetcher/providers/free_proxy_list_ssl.rb +2 -0
  22. data/lib/proxy_fetcher/providers/gather_proxy.rb +2 -0
  23. data/lib/proxy_fetcher/providers/http_tunnel.rb +2 -0
  24. data/lib/proxy_fetcher/providers/proxy_docker.rb +2 -0
  25. data/lib/proxy_fetcher/providers/proxy_list.rb +2 -0
  26. data/lib/proxy_fetcher/providers/xroxy.rb +2 -0
  27. data/lib/proxy_fetcher/proxy.rb +36 -5
  28. data/lib/proxy_fetcher/utils/http_client.rb +35 -7
  29. data/lib/proxy_fetcher/utils/proxy_validator.rb +25 -4
  30. data/lib/proxy_fetcher/version.rb +3 -1
  31. data/proxy_fetcher.gemspec +1 -1
  32. data/spec/proxy_fetcher/{client_spec.rb → client/client_spec.rb} +10 -0
  33. data/spec/proxy_fetcher/configuration_spec.rb +2 -0
  34. data/spec/proxy_fetcher/document/adapters_spec.rb +2 -0
  35. data/spec/proxy_fetcher/document/node_spec.rb +2 -0
  36. data/spec/proxy_fetcher/providers/base_spec.rb +2 -0
  37. data/spec/proxy_fetcher/providers/free_proxy_list_spec.rb +2 -0
  38. data/spec/proxy_fetcher/providers/free_proxy_list_ssl_spec.rb +2 -0
  39. data/spec/proxy_fetcher/providers/gather_proxy_spec.rb +2 -0
  40. data/spec/proxy_fetcher/providers/http_tunnel_spec.rb +2 -0
  41. data/spec/proxy_fetcher/providers/multiple_providers_spec.rb +2 -0
  42. data/spec/proxy_fetcher/providers/proxy_docker_spec.rb +2 -0
  43. data/spec/proxy_fetcher/providers/proxy_list_spec.rb +2 -0
  44. data/spec/proxy_fetcher/providers/xroxy_spec.rb +2 -0
  45. data/spec/proxy_fetcher/proxy_spec.rb +2 -0
  46. data/spec/proxy_fetcher/version_spec.rb +3 -0
  47. data/spec/spec_helper.rb +2 -0
  48. data/spec/support/manager_examples.rb +2 -0
  49. metadata +4 -3
@@ -1,24 +1,47 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  # HTML document abstraction class. Used to work with different HTML parser adapters
3
5
  # such as Nokogiri, Oga or a custom one. Stores <i>backend</i> that will handle all
4
6
  # the DOM manipulation logic.
5
7
  class Document
6
- class << self
7
- def parse(data)
8
- new(ProxyFetcher.config.adapter.parse(data))
9
- end
10
- end
11
-
8
+ # @!attribute [r] backend
9
+ # @return [Object] Backend object that handles DOM processing
12
10
  attr_reader :backend
13
11
 
12
+ # Parses raw HTML data to abstract ProxyFetcher document.
13
+ #
14
+ # @param data [String] HTML
15
+ #
16
+ # @return [ProxyFetcher::Document]
17
+ # ProxyFetcher document model
18
+ #
19
+ def self.parse(data)
20
+ new(ProxyFetcher.config.adapter.parse(data))
21
+ end
22
+
23
+ # Initialize abstract ProxyFetcher HTML Document
24
+ #
25
+ # @return [Document]
26
+ #
14
27
  def initialize(backend)
15
28
  @backend = backend
16
29
  end
17
30
 
31
+ # Searches elements by XPath selector.
32
+ #
33
+ # @return [Array<ProxyFetcher::Document::Node>]
34
+ # collection of nodes
35
+ #
18
36
  def xpath(*args)
19
37
  backend.xpath(*args).map { |node| backend.proxy_node.new(node) }
20
38
  end
21
39
 
40
+ # Searches elements by CSS selector.
41
+ #
42
+ # @return [Array<ProxyFetcher::Document::Node>]
43
+ # collection of nodes
44
+ #
22
45
  def css(*args)
23
46
  backend.css(*args).map { |node| backend.proxy_node.new(node) }
24
47
  end
@@ -1,10 +1,30 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  class Document
5
+ # ProxyFetcher HTML parser adapters.
6
+ #
7
+ # ProxyFetcher default supported adapters are:
8
+ #
9
+ # * Nokogiri
10
+ # * Oga
11
+ #
12
+ # Any custom adapter can be used and must be inherited from
13
+ # <code>ProxyFetcher::Document::AbstractAdapter</code>.
3
14
  class Adapters
15
+ # Adapters class name suffix
4
16
  ADAPTER = 'Adapter'.freeze
5
17
  private_constant :ADAPTER
6
18
 
7
19
  class << self
20
+ # Returns HTML parser adapter by its name or class.
21
+ # If name is provided, then it looks for predefined classes
22
+ # in <code>ProxyFetcher::Document</code> namespace. Otherwise
23
+ # it just returns the passed class.
24
+ #
25
+ # @param name_or_class [String, Class]
26
+ # Adapter name or class
27
+ #
8
28
  def lookup(name_or_class)
9
29
  raise Exceptions::BlankAdapter if name_or_class.nil? || name_or_class.to_s.empty?
10
30
 
@@ -1,26 +1,55 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  class Document
5
+ # Abstract HTML parser adapter class.
6
+ # Handles document manipulations.
3
7
  class AbstractAdapter
8
+ # @!attribute [r] document
9
+ # @return [Object] Parsed document object wrapped by the adapter
4
10
  attr_reader :document
5
11
 
12
+ # Initialize adapter
13
+ #
14
+ # @return [AbstractAdapter]
15
+ #
6
16
  def initialize(document)
7
17
  @document = document
8
18
  end
9
19
 
10
20
  # You can override this method in your own adapter class
21
+ #
22
+ # @param selector [String]
23
+ # XPath selector
24
+ #
11
25
  def xpath(selector)
12
26
  document.xpath(selector)
13
27
  end
14
28
 
15
29
  # You can override this method in your own adapter class
30
+ #
31
+ # @param selector [String]
32
+ # CSS selector
33
+ #
16
34
  def css(selector)
17
35
  document.css(selector)
18
36
  end
19
37
 
38
+ # Returns <code>Node</code> class that will handle HTML
39
+ # nodes for particular adapter.
40
+ #
41
+ # @return [ProxyFetcher::Document::Node]
42
+ # node
43
+ #
20
44
  def proxy_node
21
45
  self.class.const_get('Node')
22
46
  end
23
47
 
48
+ # Installs adapter requirements.
49
+ #
50
+ # @raise [Exceptions::AdapterSetupError]
51
+ # adapter can't be installed due to some error
52
+ #
24
53
  def self.setup!(*args)
25
54
  install_requirements!(*args)
26
55
  rescue LoadError => error
@@ -1,23 +1,49 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  class Document
5
+ # HTML parser adapter that uses Nokogiri as a backend.
3
6
  class NokogiriAdapter < AbstractAdapter
7
+ # Requires Nokogiri gem to the application.
4
8
  def self.install_requirements!
5
9
  require 'nokogiri'
6
10
  end
7
11
 
12
+ # Parses raw HTML content with specific gem.
13
+ #
14
+ # @param data [String]
15
+ # HTML content
16
+ #
17
+ # @return [ProxyFetcher::Document::NokogiriAdapter]
18
+ # Object with parsed document
19
+ #
8
20
  def self.parse(data)
9
21
  new(::Nokogiri::HTML(data))
10
22
  end
11
23
 
24
+ # Nokogiri DOM node
12
25
  class Node < ProxyFetcher::Document::Node
26
+ # Returns HTML node attribute value.
27
+ #
28
+ # @return [String] attribute value
29
+ #
13
30
  def attr(*args)
14
31
  clear(node.attr(*args))
15
32
  end
16
33
 
34
+ # Returns HTML node inner text value clean from
35
+ # whitespaces, tabs, etc.
36
+ #
37
+ # @return [String] node inner text
38
+ #
17
39
  def content
18
40
  clear(node.content)
19
41
  end
20
42
 
43
+ # Returns node inner HTML.
44
+ #
45
+ # @return [String] inner HTML
46
+ #
21
47
  def html
22
48
  node.inner_html
23
49
  end
@@ -1,23 +1,49 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  class Document
5
+ # HTML parser adapter that uses Oga as a backend.
3
6
  class OgaAdapter < AbstractAdapter
7
+ # Requires Oga gem to the application.
4
8
  def self.install_requirements!
5
9
  require 'oga'
6
10
  end
7
11
 
12
+ # Parses raw HTML content with specific gem.
13
+ #
14
+ # @param data [String]
15
+ # HTML content
16
+ #
17
+ # @return [ProxyFetcher::Document::OgaAdapter]
18
+ # Object with parsed document
19
+ #
8
20
  def self.parse(data)
9
21
  new(::Oga.parse_html(data))
10
22
  end
11
23
 
24
+ # Oga DOM node
12
25
  class Node < ProxyFetcher::Document::Node
26
+ # Returns HTML node attribute value.
27
+ #
28
+ # @return [String] attribute value
29
+ #
13
30
  def attr(*args)
14
31
  clear(node.attribute(*args).value)
15
32
  end
16
33
 
34
+ # Returns HTML node inner text value clean from
35
+ # whitespaces, tabs, etc.
36
+ #
37
+ # @return [String] node inner text
38
+ #
17
39
  def content
18
40
  clear(node.text)
19
41
  end
20
42
 
43
+ # Returns node inner HTML.
44
+ #
45
+ # @return [String] inner HTML
46
+ #
21
47
  def html
22
48
  node.to_xml
23
49
  end
@@ -1,38 +1,85 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  class Document
5
+ # Abstract class for storing HTML elements that were parsed by
6
+ # one of the <code>ProxyFetcher::Document</code> adapters class.
3
7
  class Node
8
+ # @!attribute [r] node
9
+ # @return [Object] original DOM node, parsed by adapter backend
4
10
  attr_reader :node
5
11
 
12
+ # Initialize new HTML node
13
+ #
14
+ # @return [Node]
15
+ #
6
16
  def initialize(node)
7
17
  @node = node
8
18
  end
9
19
 
20
+ # Searches for node in children using some selector (CSS or XPath).
21
+ #
22
+ # @param selector [String] selector (CSS or XPath)
23
+ #
24
+ # @return [Node] child node
25
+ #
10
26
  def find(selector, method = :at_xpath)
11
27
  self.class.new(node.public_send(method, selector))
12
28
  end
13
29
 
30
+ # Searches exact HTML element by XPath. Returns only one element.
31
+ #
32
+ # @return [ProxyFetcher::Document::Node]
33
+ # node
34
+ #
14
35
  def at_xpath(*args)
15
36
  self.class.new(node.at_xpath(*args))
16
37
  end
17
38
 
39
+ # Searches exact HTML element by CSS. Returns only one element.
40
+ #
41
+ # @return [ProxyFetcher::Document::Node]
42
+ # node
43
+ #
18
44
  def at_css(*args)
19
45
  self.class.new(node.at_css(*args))
20
46
  end
21
47
 
48
+ # Returns clean content (text) for the specific element.
49
+ #
50
+ # @return [String]
51
+ # HTML node content
52
+ #
22
53
  def content_at(*args)
23
54
  clear(find(*args).content)
24
55
  end
25
56
 
57
+ # Returns HTML node content.
58
+ #
59
+ # Abstract method, must be implemented for specific adapter class.
60
+ #
26
61
  def content
27
62
  raise "`#{__method__}` must be implemented for specific adapter class!"
28
63
  end
29
64
 
65
+ # Returns HTML node inner HTML.
66
+ #
67
+ # Abstract method, must be implemented for specific adapter class.
68
+ #
30
69
  def html
31
70
  raise "`#{__method__}` must be implemented for specific adapter class!"
32
71
  end
33
72
 
34
73
  protected
35
74
 
75
+ # Removes whitespaces, tabulation and other "garbage" from the text.
76
+ #
77
+ # @param text [String]
78
+ # text to clear
79
+ #
80
+ # @return [String]
81
+ # clean text
82
+ #
36
83
  def clear(text)
37
84
  return if text.nil? || text.empty?
38
85
 
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  # Base exception class for all the ProxyFetcher exceptions.
3
5
  Error = Class.new(StandardError)
@@ -6,6 +8,10 @@ module ProxyFetcher
6
8
  module Exceptions
7
9
  # Exception for wrong custom classes (such as ProxyValidator or HTTP Client).
8
10
  class WrongCustomClass < Error
11
+ # Initialize new exception
12
+ #
13
+ # @return [WrongCustomClass]
14
+ #
9
15
  def initialize(klass, methods)
10
16
  required_methods = Array(methods).join(', ')
11
17
  super("#{klass} must respond to [#{required_methods}] class methods!")
@@ -15,6 +21,12 @@ module ProxyFetcher
15
21
  # Exception for wrong provider name, that raises when configured provider
16
22
  # that is not registered via <code>register_provider</code> interface.
17
23
  class UnknownProvider < Error
24
+ # Initialize new exception
25
+ #
26
+ # @param provider_name [String] provider name
27
+ #
28
+ # @return [UnknownProvider]
29
+ #
18
30
  def initialize(provider_name)
19
31
  super("unregistered proxy provider `#{provider_name}`")
20
32
  end
@@ -22,6 +34,12 @@ module ProxyFetcher
22
34
 
23
35
  # Exception for cases when user tries to register already existing provider.
24
36
  class RegisteredProvider < Error
37
+ # Initialize new exception
38
+ #
39
+ # @param name [String, Symbol] provider name
40
+ #
41
+ # @return [RegisteredProvider]
42
+ #
25
43
  def initialize(name)
26
44
  super("`#{name}` provider already registered!")
27
45
  end
@@ -30,6 +48,10 @@ module ProxyFetcher
30
48
  # Exception for cases when HTTP client reached maximum count of redirects
31
49
  # trying to process HTTP request.
32
50
  class MaximumRedirectsReached < Error
51
+ # Initialize new exception
52
+ #
53
+ # @return [MaximumRedirectsReached]
54
+ #
33
55
  def initialize(*)
34
56
  super('maximum redirects reached')
35
57
  end
@@ -39,6 +61,10 @@ module ProxyFetcher
39
61
  # trying to process HTTP request. Can occur when request failed by timeout
40
62
  # multiple times.
41
63
  class MaximumRetriesReached < Error
64
+ # Initialize new exception
65
+ #
66
+ # @return [MaximumRetriesReached]
67
+ #
42
68
  def initialize(*)
43
69
  super('reached the maximum number of retries')
44
70
  end
@@ -47,6 +73,12 @@ module ProxyFetcher
47
73
  # Exception for cases when user tries to set wrong HTML parser adapter
48
74
  # in the configuration.
49
75
  class UnknownAdapter < Error
76
+ # Initialize new exception
77
+ #
78
+ # @param name [String] configured adapter name
79
+ #
80
+ # @return [UnknownAdapter]
81
+ #
50
82
  def initialize(name)
51
83
  super("unknown adapter '#{name}'")
52
84
  end
@@ -55,6 +87,10 @@ module ProxyFetcher
55
87
  # Exception for cases when user tries to set <code>nil</code> HTML parser adapter
56
88
  # in the configuration (or just forget to change it).
57
89
  class BlankAdapter < Error
90
+ # Initialize new exception
91
+ #
92
+ # @return [BlankAdapter]
93
+ #
58
94
  def initialize(*)
59
95
  super(<<-MSG.strip.squeeze
60
96
  you need to specify adapter for HTML parsing: ProxyFetcher.config.adapter = :nokogiri.
@@ -67,14 +103,28 @@ module ProxyFetcher
67
103
  # Exception for cases when HTML parser adapter can't be installed.
68
104
  # It will print the reason (backtrace) of the exception that caused an error.
69
105
  class AdapterSetupError < Error
70
- def initialize(adapter_name, reason)
106
+ # Initialize new exception
107
+ #
108
+ # @param adapter_name [String] configured adapter name
109
+ # @param error [String] full setup error (backtrace)
110
+ #
111
+ # @return [AdapterSetupError]
112
+ #
113
+ def initialize(adapter_name, error)
71
114
  adapter = demodulize(adapter_name.gsub('Adapter', ''))
72
115
 
73
- super("can't setup '#{adapter}' adapter during the following error:\n\t#{reason}'")
116
+ super("can't setup '#{adapter}' adapter during the following error:\n\t#{error}'")
74
117
  end
75
118
 
76
119
  private
77
120
 
121
+ # Returns just class name removing its namespace.
122
+ #
123
+ # @param path [String]
124
+ # full class name
125
+ #
126
+ # @return [String] demodulized class name
127
+ #
78
128
  def demodulize(path)
79
129
  path = path.to_s
80
130
  index = path.rindex('::')
@@ -1,10 +1,19 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  # ProxyFetcher Manager class for interacting with proxy lists from various providers.
3
5
  class Manager
6
+ # @!attribute [r] proxies
7
+ # @return [Array<ProxyFetcher::Proxy>] An array of proxies
4
8
  attr_reader :proxies
5
9
 
10
+ # Initialize ProxyFetcher Manager instance for managing proxies
11
+ #
6
12
  # refresh: true - load proxy list from the remote server on initialization
7
13
  # refresh: false - just initialize the class, proxy list will be empty ([])
14
+ #
15
+ # @return [Manager]
16
+ #
8
17
  def initialize(refresh: true, validate: false, filters: {})
9
18
  if refresh
10
19
  refresh_list!(filters)
@@ -15,7 +24,10 @@ module ProxyFetcher
15
24
  cleanup! if validate
16
25
  end
17
26
 
18
- # Update current proxy list from the provider
27
+ # Update current proxy list using configured providers.
28
+ #
29
+ # @param filters [Hash] providers filters
30
+ #
19
31
  def refresh_list!(filters = nil)
20
32
  @proxies = []
21
33
 
@@ -29,7 +41,11 @@ module ProxyFetcher
29
41
 
30
42
  alias fetch! refresh_list!
31
43
 
32
- # Pop just first proxy (and back it to the end of the proxy list)
44
+ # Pops just the first proxy (and moves it back to the end of the proxy list).
45
+ #
46
+ # @return [Proxy]
47
+ # proxy object from the list
48
+ #
33
49
  def get
34
50
  return if @proxies.empty?
35
51
 
@@ -43,6 +59,10 @@ module ProxyFetcher
43
59
 
44
60
  # Pops the first valid proxy (and moves it back to the end of the proxy list)
45
61
  # Invalid proxies will be removed from the list
62
+ #
63
+ # @return [Proxy]
64
+ # proxy object from the list
65
+ #
46
66
  def get!
47
67
  index = proxies.find_index(&:connectable?)
48
68
  return if index.nil?
@@ -76,7 +96,11 @@ module ProxyFetcher
76
96
 
77
97
  alias validate! cleanup!
78
98
 
79
- # Return random proxy
99
+ # Returns random proxy
100
+ #
101
+ # @return [Proxy]
102
+ # random proxy from the loaded list
103
+ #
80
104
  def random_proxy
81
105
  proxies.sample
82
106
  end
@@ -84,11 +108,15 @@ module ProxyFetcher
84
108
  alias random random_proxy
85
109
 
86
110
  # Returns array of proxy URLs (just schema + host + port)
111
+ #
112
+ # @return [Array<String>]
113
+ # collection of proxies
114
+ #
87
115
  def raw_proxies
88
116
  proxies.map(&:url)
89
117
  end
90
118
 
91
- # No need to put all the attr_readers
119
+ # @private No need to put all the attr_readers to the output
92
120
  def inspect
93
121
  to_s
94
122
  end