proxy_fetcher 0.6.2 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -1
  3. data/LICENSE +1 -1
  4. data/README.md +12 -1
  5. data/lib/proxy_fetcher.rb +5 -0
  6. data/lib/proxy_fetcher/client/client.rb +28 -7
  7. data/lib/proxy_fetcher/client/proxies_registry.rb +27 -0
  8. data/lib/proxy_fetcher/client/request.rb +73 -2
  9. data/lib/proxy_fetcher/configuration.rb +66 -2
  10. data/lib/proxy_fetcher/configuration/providers_registry.rb +31 -2
  11. data/lib/proxy_fetcher/document.rb +29 -6
  12. data/lib/proxy_fetcher/document/adapters.rb +20 -0
  13. data/lib/proxy_fetcher/document/adapters/abstract_adapter.rb +29 -0
  14. data/lib/proxy_fetcher/document/adapters/nokogiri_adapter.rb +26 -0
  15. data/lib/proxy_fetcher/document/adapters/oga_adapter.rb +26 -0
  16. data/lib/proxy_fetcher/document/node.rb +47 -0
  17. data/lib/proxy_fetcher/exceptions.rb +52 -2
  18. data/lib/proxy_fetcher/manager.rb +32 -4
  19. data/lib/proxy_fetcher/providers/base.rb +27 -8
  20. data/lib/proxy_fetcher/providers/free_proxy_list.rb +2 -0
  21. data/lib/proxy_fetcher/providers/free_proxy_list_ssl.rb +2 -0
  22. data/lib/proxy_fetcher/providers/gather_proxy.rb +2 -0
  23. data/lib/proxy_fetcher/providers/http_tunnel.rb +2 -0
  24. data/lib/proxy_fetcher/providers/proxy_docker.rb +2 -0
  25. data/lib/proxy_fetcher/providers/proxy_list.rb +2 -0
  26. data/lib/proxy_fetcher/providers/xroxy.rb +2 -0
  27. data/lib/proxy_fetcher/proxy.rb +36 -5
  28. data/lib/proxy_fetcher/utils/http_client.rb +35 -7
  29. data/lib/proxy_fetcher/utils/proxy_validator.rb +25 -4
  30. data/lib/proxy_fetcher/version.rb +3 -1
  31. data/proxy_fetcher.gemspec +1 -1
  32. data/spec/proxy_fetcher/{client_spec.rb → client/client_spec.rb} +10 -0
  33. data/spec/proxy_fetcher/configuration_spec.rb +2 -0
  34. data/spec/proxy_fetcher/document/adapters_spec.rb +2 -0
  35. data/spec/proxy_fetcher/document/node_spec.rb +2 -0
  36. data/spec/proxy_fetcher/providers/base_spec.rb +2 -0
  37. data/spec/proxy_fetcher/providers/free_proxy_list_spec.rb +2 -0
  38. data/spec/proxy_fetcher/providers/free_proxy_list_ssl_spec.rb +2 -0
  39. data/spec/proxy_fetcher/providers/gather_proxy_spec.rb +2 -0
  40. data/spec/proxy_fetcher/providers/http_tunnel_spec.rb +2 -0
  41. data/spec/proxy_fetcher/providers/multiple_providers_spec.rb +2 -0
  42. data/spec/proxy_fetcher/providers/proxy_docker_spec.rb +2 -0
  43. data/spec/proxy_fetcher/providers/proxy_list_spec.rb +2 -0
  44. data/spec/proxy_fetcher/providers/xroxy_spec.rb +2 -0
  45. data/spec/proxy_fetcher/proxy_spec.rb +2 -0
  46. data/spec/proxy_fetcher/version_spec.rb +3 -0
  47. data/spec/spec_helper.rb +2 -0
  48. data/spec/support/manager_examples.rb +2 -0
  49. metadata +4 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6c2d4076c9f73303b305c364d3dcf672b6e8eba7
4
- data.tar.gz: b692847fab646ec4bd230909ac1cba9eb77c1c7f
3
+ metadata.gz: f7ade86090b2c8dca7aa7d20a47fdf63334507e2
4
+ data.tar.gz: 0256d5fa29f0ec451f2248ab280dc71878c3e05f
5
5
  SHA512:
6
- metadata.gz: cd70341cd18eea64dad58d63b8888f36e403251bb08e9d0289c584de1bc2201b45ff97f48fbac26697b25ee52328611b1e37de8d393b3b50ed7b3885192d0885
7
- data.tar.gz: aed0b7532628796cb2b9f397be8bd8d87108c4d5a4e4efd2d30a468e2b8c9c203fbe7ac1fd4b56c7e571a67748b681297c56e2c880fddddb20c463bc117789e6
6
+ metadata.gz: c598b1a286e45d96e40ee9772365ece447d9dd18054fbdf80e727b086181b8b0cac9c5933898d74364ed95301da35855a368ea6c81cd41a4e26e355a92b1f1b7
7
+ data.tar.gz: 3cc0c7c348d63abb9fde40aadf3730d176fb7de59dbd51036082d605a9bf50614eb7f19a103f9ee448078d73dabf05e666911ee97cd9c7f30e0d1fe64a57ef21
data/CHANGELOG.md CHANGED
@@ -2,10 +2,19 @@
2
2
 
3
3
  Reverse Chronological Order:
4
4
 
5
+ ## `0.6.3` (2018-01-26)
6
+
7
+ * Add ability to use own proxy for `ProxyFetcher::Client`
8
+ * Improve specs
9
+
10
+ ## `0.6.2` (2017-12-27)
11
+
12
+ * Fix ProxyDocker provider.
13
+
5
14
  ## `0.6.1` (2017-12-11)
6
15
 
7
16
  * Fix gem executable to check dependencies for adapters
8
- * Code clean
17
+ * Code cleanup
9
18
  * Some new specs
10
19
 
11
20
  ## `0.6.0` (2017-12-08)
data/LICENSE CHANGED
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2017 Nikita Bulai
3
+ Copyright (c) 2017—2018 Nikita Bulai
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -245,6 +245,17 @@ require 'proxy-fetcher'
245
245
  ProxyFetcher::Client.get 'https://example.com/resource', options: { max_retries: 10_000 }
246
246
  ```
247
247
 
248
+ You can also use your own proxy object when using ProxyFetcher client:
249
+
250
+ ```ruby
251
+ require 'proxy-fetcher'
252
+
253
+ manager = ProxyFetcher::Manager.new # will immediately load proxy list from the server
254
+
255
+ # `#random` returns a random proxy object from the list
256
+ ProxyFetcher::Client.get 'https://example.com/resource', options: { proxy: manager.random }
257
+ ```
258
+
248
259
  Btw, if you need support of JavaScript or some other features, you need to implement your own client using, for example,
249
260
  `selenium-webdriver`.
250
261
 
@@ -469,4 +480,4 @@ Thanks.
469
480
 
470
481
  `proxy_fetcher` gem is released under the [MIT License](http://www.opensource.org/licenses/MIT).
471
482
 
472
- Copyright (c) 2017 Nikita Bulai (bulajnikita@gmail.com).
483
+ Copyright (c) 2017—2018 Nikita Bulai (bulajnikita@gmail.com).
data/lib/proxy_fetcher.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'uri'
2
4
  require 'net/https'
3
5
 
@@ -70,6 +72,9 @@ module ProxyFetcher
70
72
 
71
73
  private
72
74
 
75
+ # Configures default adapter if it isn't defined by the user.
76
+ # @api private
77
+ #
73
78
  def configure_adapter!
74
79
  config.adapter = Configuration::DEFAULT_ADAPTER if config.adapter.nil?
75
80
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  # ProxyFetcher HTTP client that encapsulates all the logic for sending
3
5
  # HTTP(S) requests using proxies, automatically fetched and validated by the gem.
@@ -122,29 +124,48 @@ module ProxyFetcher
122
124
 
123
125
  private
124
126
 
127
+ # Executes HTTP request with user payload.
128
+ #
125
129
  def request_with_payload(method, url, payload, headers, options)
126
- safe_request_to(url, options.fetch(:max_retries, 1000)) do |proxy|
127
- opts = options.merge(url: url, payload: payload, proxy: proxy, headers: default_headers.merge(headers))
130
+ with_proxy_for(url, options.fetch(:max_retries, 1000)) do |proxy|
131
+ opts = options.merge(payload: payload, proxy: options.fetch(:proxy, proxy), headers: default_headers.merge(headers))
128
132
 
129
- Request.execute(method: method, **opts)
133
+ Request.execute(url: url, method: method, **opts)
130
134
  end
131
135
  end
132
136
 
137
+ # Executes HTTP request without user payload.
138
+ #
133
139
  def request_without_payload(method, url, headers, options)
134
- safe_request_to(url, options.fetch(:max_retries, 1000)) do |proxy|
135
- opts = options.merge(url: url, proxy: proxy, headers: default_headers.merge(headers))
140
+ with_proxy_for(url, options.fetch(:max_retries, 1000)) do |proxy|
141
+ opts = options.merge(proxy: options.fetch(:proxy, proxy), headers: default_headers.merge(headers))
136
142
 
137
- Request.execute(method: method, **opts)
143
+ Request.execute(url: url, method: method, **opts)
138
144
  end
139
145
  end
140
146
 
147
+ # Default ProxyFetcher::Client http headers. Uses some options
148
+ # from the configuration object, such as User-Agent string.
149
+ #
150
+ # @return [Hash]
151
+ # headers
152
+ #
141
153
  def default_headers
142
154
  {
143
155
  'User-Agent' => ProxyFetcher.config.user_agent
144
156
  }
145
157
  end
146
158
 
147
- def safe_request_to(url, max_retries = 1000)
159
+ # Searches for valid proxy (suitable for URL type) using <code>ProxyFetcher::Manager</code>
160
+ # instance and executes the block with found proxy with retries (N times, default is 1000) if
161
+ # something goes wrong.
162
+ #
163
+ # @param url [String] request URL
164
+ # @param max_retries [Integer] maximum number of retries
165
+ #
166
+ # @raise [ProxyFetcher::Error] internal error happened during block execution
167
+ #
168
+ def with_proxy_for(url, max_retries = 1000)
148
169
  tries = 0
149
170
 
150
171
  begin
@@ -1,12 +1,33 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  module Client
5
+ # ProxyFetcher proxies registry for managing proxy lists used by the Client.
6
+ # It is used to fetch proxy lists and instantiate Manager object that will
7
+ # handle proxies.
3
8
  class ProxiesRegistry
4
9
  class << self
10
+ # Removes proxy from the list of the current proxy manager
11
+ # instance. If no more proxies are available, refreshes the list.
12
+ #
13
+ # @param proxy [ProxyFetcher::Proxy]
14
+ # proxy object to remove
15
+ #
5
16
  def invalidate_proxy!(proxy)
6
17
  manager.proxies.delete(proxy)
7
18
  manager.refresh_list! if manager.proxies.empty?
8
19
  end
9
20
 
21
+ # Searches for valid proxy of required type (HTTP or secure)
22
+ # for requested URL. If no proxy found, then it refreshes proxy list
23
+ # and tries again.
24
+ #
25
+ # @param url [String]
26
+ # URL to process with proxy
27
+ #
28
+ # @return [ProxyFetcher::Proxy]
29
+ # gem's proxy object
30
+ #
10
31
  def find_proxy_for(url)
11
32
  proxy = if URI.parse(url).is_a?(URI::HTTPS)
12
33
  manager.proxies.detect(&:ssl?)
@@ -20,6 +41,12 @@ module ProxyFetcher
20
41
  find_proxy_for(url)
21
42
  end
22
43
 
44
+ # Instantiates or returns <code>ProxyFetcher::Manager</code> instance
45
+ # for current <code>Thread</code>.
46
+ #
47
+ # @return [ProxyFetcher::Manager]
48
+ # ProxyFetcher manager class
49
+ #
23
50
  def manager
24
51
  manager = Thread.current[:proxy_fetcher_manager]
25
52
  return manager unless manager.nil?
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  module Client
3
5
  # ProxyFetcher::Client HTTP request abstraction.
@@ -14,13 +16,55 @@ module ProxyFetcher
14
16
  verify_mode: OpenSSL::SSL::VERIFY_NONE
15
17
  }.freeze
16
18
 
17
- attr_reader :http, :method, :uri, :headers, :timeout,
18
- :payload, :proxy, :max_redirects, :ssl_options
19
+ # @!attribute [r] http
20
+ # @return [Class] HTTP client
21
+ attr_reader :http
22
+
23
+ # @!attribute [r] method
24
+ # @return [String, Symbol] HTTP request method
25
+ attr_reader :method
26
+
27
+ # @!attribute [r] uri
28
+ # @return [URI] Request URI
29
+ attr_reader :uri
30
+
31
+ # @!attribute [r] headers
32
+ # @return [Hash] HTTP headers
33
+ attr_reader :headers
34
+
35
+ # @!attribute [r] timeout
36
+ # @return [Integer] Request timeout
37
+ attr_reader :timeout
38
+
39
+ # @!attribute [r] payload
40
+ # @return [String, Hash] Request payload
41
+ attr_reader :payload
42
+
43
+ # @!attribute [r] proxy
44
+ # @return [Proxy] Proxy to process the request
45
+ attr_reader :proxy
46
+
47
+ # @!attribute [r] max_redirects
48
+ # @return [Integer] Maximum number of redirects to follow
49
+ attr_reader :max_redirects
50
+
51
+ # @!attribute [r] ssl_options
52
+ # @return [Hash] SSL options
53
+ attr_reader :ssl_options
19
54
 
55
+ # Initializes a new HTTP request and processes it
56
+ #
57
+ # @return [String]
58
+ # response body (requested resource content)
59
+ #
20
60
  def self.execute(args)
21
61
  new(args).execute
22
62
  end
23
63
 
64
+ # Initialize new HTTP request
65
+ #
66
+ # @return [Request]
67
+ #
24
68
  def initialize(args)
25
69
  raise ArgumentError, 'args must be a Hash!' unless args.is_a?(Hash)
26
70
 
@@ -37,6 +81,11 @@ module ProxyFetcher
37
81
  build_http_client
38
82
  end
39
83
 
84
+ # Executes HTTP request with defined options.
85
+ #
86
+ # @return [String]
87
+ # response body (requested resource content)
88
+ #
40
89
  def execute
41
90
  request = request_class_for(method).new(uri, headers)
42
91
 
@@ -47,6 +96,9 @@ module ProxyFetcher
47
96
 
48
97
  private
49
98
 
99
+ # Converts payload to the required format; for example, a <code>Hash</code>
100
+ # is WWW-Form encoded.
101
+ #
50
102
  def preprocess_payload(payload)
51
103
  return if payload.nil?
52
104
 
@@ -58,6 +110,11 @@ module ProxyFetcher
58
110
  end
59
111
  end
60
112
 
113
+ # Builds HTTP client based on stdlib Net::HTTP.
114
+ #
115
+ # @return [Net::HTTP]
116
+ # HTTP client
117
+ #
61
118
  def build_http_client
62
119
  @http = Net::HTTP.new(uri.host, uri.port, proxy.addr, proxy.port)
63
120
 
@@ -67,6 +124,15 @@ module ProxyFetcher
67
124
  @http.read_timeout = timeout
68
125
  end
69
126
 
127
+ # Processes HTTP response: checks its status and follows redirect if required.
128
+ # If response returned an error, then raises it.
129
+ #
130
+ # @param http_response [Net::HTTPResponse]
131
+ # HTTP response object
132
+ #
133
+ # @return [String]
134
+ # requested resource content
135
+ #
70
136
  def process_response!(http_response)
71
137
  case http_response
72
138
  when Net::HTTPSuccess then http_response.read_body
@@ -76,6 +142,8 @@ module ProxyFetcher
76
142
  end
77
143
  end
78
144
 
145
+ # Follows redirection for response.
146
+ #
79
147
  def follow_redirection(http_response)
80
148
  raise ProxyFetcher::Exceptions::MaximumRedirectsReached if max_redirects <= 0
81
149
 
@@ -85,6 +153,9 @@ module ProxyFetcher
85
153
  Request.execute(method: :get, url: url, proxy: proxy, headers: headers, timeout: timeout, max_redirects: max_redirects - 1)
86
154
  end
87
155
 
156
+ # Returns particular Net::HTTP method object
157
+ # for processing required request.
158
+ #
88
159
  def request_class_for(method)
89
160
  Net::HTTP.const_get(method, false)
90
161
  end
@@ -1,10 +1,37 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
2
4
  # ProxyFetcher configuration. Stores all the options for dealing
3
5
  # with HTTP requests, adapters, custom classes.
4
6
  #
5
7
  class Configuration
6
- attr_accessor :timeout, :pool_size, :user_agent
7
- attr_reader :adapter, :http_client, :proxy_validator, :providers
8
+ # @!attribute timeout
9
+ # @return [Integer] HTTP request connection / open timeout
10
+ attr_accessor :timeout
11
+
12
+ # @!attribute pool_size
13
+ # @return [Integer] proxy validator pool size (max number of threads)
14
+ attr_accessor :pool_size
15
+
16
+ # @!attribute user_agent
17
+ # @return [String] User-Agent string
18
+ attr_accessor :user_agent
19
+
20
+ # @!attribute [r] adapter
21
+ # @return [Object] HTML parser adapter
22
+ attr_reader :adapter
23
+
24
+ # @!attribute [r] http_client
25
+ # @return [Class] HTTP client class
26
+ attr_reader :http_client
27
+
28
+ # @!attribute [r] proxy_validator
29
+ # @return [Class] proxy validator class
30
+ attr_reader :proxy_validator
31
+
32
+ # @!attribute [r] providers
33
+ # @return [Array<String>, Array<Symbol>] proxy providers list to be used
34
+ attr_reader :providers
8
35
 
9
36
  # User-Agent string that will be used by the ProxyFetcher HTTP client (to
10
37
  # send requests via proxy) and to fetch proxy lists from the sources.
@@ -21,6 +48,11 @@ module ProxyFetcher
21
48
  DEFAULT_ADAPTER = :nokogiri
22
49
 
23
50
  class << self
51
+ # Registry for handling proxy providers.
52
+ #
53
+ # @return [ProxyFetcher::ProvidersRegistry]
54
+ # providers registry
55
+ #
24
56
  def providers_registry
25
57
  @registry ||= ProvidersRegistry.new
26
58
  end
@@ -38,11 +70,21 @@ module ProxyFetcher
38
70
  providers_registry.register(name, klass)
39
71
  end
40
72
 
73
+ # Returns registered providers names.
74
+ #
75
+ # @return [Array<String>, Array<Symbol>]
76
+ # registered providers names
77
+ #
41
78
  def registered_providers
42
79
  providers_registry.providers.keys
43
80
  end
44
81
  end
45
82
 
83
+ # Initialize ProxyFetcher configuration with default options.
84
+ #
85
+ # @return [ProxyFetcher::Configuration]
86
+ # ProxyFetcher gem configuration object
87
+ #
46
88
  def initialize
47
89
  reset!
48
90
  end
@@ -58,11 +100,21 @@ module ProxyFetcher
58
100
  self.providers = self.class.registered_providers
59
101
  end
60
102
 
103
+ # Sets up HTML parser adapter for all the proxy providers.
104
+ #
105
+ # @param name_or_class [String, Symbol, Class]
106
+ # name of the adapter or its class
107
+ #
61
108
  def adapter=(name_or_class)
62
109
  @adapter = ProxyFetcher::Document::Adapters.lookup(name_or_class)
63
110
  @adapter.setup!
64
111
  end
65
112
 
113
+ # Sets up collection of providers that will be used to fetch proxies.
114
+ #
115
+ # @param value [String, Symbol, Array<String>, Array<Symbol>]
116
+ # provider names
117
+ #
66
118
  def providers=(value)
67
119
  @providers = Array(value)
68
120
  end
@@ -70,10 +122,22 @@ module ProxyFetcher
70
122
  alias provider providers
71
123
  alias provider= providers=
72
124
 
125
+ # Sets up HTTP client class that will be used to fetch proxy lists.
126
+ # Validates class for the required methods to be defined.
127
+ #
128
+ # @param klass [Class]
129
+ # HTTP client class
130
+ #
73
131
  def http_client=(klass)
74
132
  @http_client = setup_custom_class(klass, required_methods: :fetch)
75
133
  end
76
134
 
135
+ # Sets up class that will be used to validate proxy lists.
136
+ # Validates class for the required methods to be defined.
137
+ #
138
+ # @param klass [Class]
139
+ # Proxy validator class
140
+ #
77
141
  def proxy_validator=(klass)
78
142
  @proxy_validator = setup_custom_class(klass, required_methods: :connectable?)
79
143
  end
@@ -1,5 +1,14 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module ProxyFetcher
4
+ # ProxyFetcher providers registry that stores all registered proxy providers.
2
5
  class ProvidersRegistry
6
+ # Returns providers hash where <i>key</i> is the name of the provider
7
+ # and <i>value</i> is an associated class.
8
+ #
9
+ # @return [Hash]
10
+ # registered providers
11
+ #
3
12
  def providers
4
13
  @providers ||= {}
5
14
  end
@@ -7,14 +16,34 @@ module ProxyFetcher
7
16
  # Add custom provider to common registry.
8
17
  # Requires proxy provider name ('proxy_docker' for example) and a class
9
18
  # that implements the parsing logic.
19
+ #
20
+ # @param name [String, Symbol]
21
+ # provider name
22
+ #
23
+ # @param klass [Class]
24
+ # provider class
25
+ #
26
+ # @raise [ProxyFetcher::Exceptions::RegisteredProvider]
27
+ # provider already registered
28
+ #
10
29
  def register(name, klass)
11
30
  raise ProxyFetcher::Exceptions::RegisteredProvider, name if providers.key?(name.to_sym)
12
31
 
13
32
  providers[name.to_sym] = klass
14
33
  end
15
34
 
16
- # Returns a class for specific provider if it is
17
- # registered in the registry. Otherwise throws an exception.
35
+ # Returns a class for specific provider if it is registered
36
+ # in the registry. Otherwise raises an exception.
37
+ #
38
+ # @param provider_name [String, Symbol]
39
+ # provider name
40
+ #
41
+ # @return [Class]
42
+ # provider class
43
+ #
44
+ # @raise [ProxyFetcher::Exceptions::UnknownProvider]
45
+ # provider is unknown
46
+ #
18
47
  def class_for(provider_name)
19
48
  provider_name = provider_name.to_sym
20
49