proxy_fetcher 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rubocop.yml +1 -1
- data/.travis.yml +9 -0
- data/Gemfile +4 -1
- data/README.md +58 -10
- data/gemfiles/nokogiri.gemfile +11 -0
- data/gemfiles/oga.gemfile +11 -0
- data/lib/proxy_fetcher/configuration.rb +9 -2
- data/lib/proxy_fetcher/document/adapters/abstract_adapter.rb +31 -0
- data/lib/proxy_fetcher/document/adapters/nokogiri_adapter.rb +35 -0
- data/lib/proxy_fetcher/document/adapters/oga_adapter.rb +35 -0
- data/lib/proxy_fetcher/document/adapters.rb +24 -0
- data/lib/proxy_fetcher/document/node.rb +35 -0
- data/lib/proxy_fetcher/document.rb +23 -0
- data/lib/proxy_fetcher/exceptions.rb +33 -0
- data/lib/proxy_fetcher/providers/base.rb +5 -15
- data/lib/proxy_fetcher/providers/free_proxy_list.rb +8 -8
- data/lib/proxy_fetcher/providers/free_proxy_list_ssl.rb +5 -5
- data/lib/proxy_fetcher/providers/gather_proxy.rb +4 -4
- data/lib/proxy_fetcher/providers/http_tunnel.rb +11 -11
- data/lib/proxy_fetcher/providers/proxy_docker.rb +5 -5
- data/lib/proxy_fetcher/providers/proxy_list.rb +7 -7
- data/lib/proxy_fetcher/providers/xroxy.rb +9 -9
- data/lib/proxy_fetcher/version.rb +2 -2
- data/lib/proxy_fetcher.rb +15 -3
- data/proxy_fetcher.gemspec +2 -4
- data/spec/proxy_fetcher/client_spec.rb +1 -1
- data/spec/proxy_fetcher/configuration_spec.rb +19 -2
- data/spec/proxy_fetcher/document/adapters_spec.rb +24 -0
- data/spec/spec_helper.rb +7 -0
- metadata +13 -26
- data/lib/proxy_fetcher/utils/html.rb +0 -15
- data/spec/support/evil_proxy_patch.rb +0 -26
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6a8a3fe3140e235a3b46ecdb64322c77c8ce69d4
|
4
|
+
data.tar.gz: 9ace00e654e55832242e050ee42d01642b26338c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 961dd103ae502f947a7417b248ba7cbfe8e5907880bbe2b2b123b568de6f703f39ba6a2da281dbb139a62703d9f2a26f4bb02be6128df6b09457542aa7235ba3
|
7
|
+
data.tar.gz: d80b53cdfb9f67c76edf4176ab6bd3daa8151d814f67e82405b53495ecab4be0d963232e010efe333a5ef1dfd177251367b9d620d9cc78306fccfc2ef137b837
|
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
data/.travis.yml
CHANGED
@@ -2,6 +2,12 @@ language: ruby
|
|
2
2
|
before_install: gem install bundler
|
3
3
|
bundler_args: --without yard guard benchmarks
|
4
4
|
script: "rake spec"
|
5
|
+
env:
|
6
|
+
global:
|
7
|
+
- "JRUBY_OPTS='$JRUBY_OPTS --debug'"
|
8
|
+
gemfile:
|
9
|
+
- gemfiles/oga.gemfile
|
10
|
+
- gemfiles/nokogiri.gemfile
|
5
11
|
rvm:
|
6
12
|
- 2.0
|
7
13
|
- 2.1
|
@@ -12,3 +18,6 @@ rvm:
|
|
12
18
|
matrix:
|
13
19
|
allow_failures:
|
14
20
|
- rvm: ruby-head
|
21
|
+
exclude:
|
22
|
+
- rvm: 2.0
|
23
|
+
gemfile: gemfiles/nokogiri.gemfile # Nokogiri doesn't support Ruby 2.0
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -5,17 +5,19 @@
|
|
5
5
|
[](https://codeclimate.com/github/nbulaj/proxy_fetcher)
|
6
6
|
[](#license)
|
7
7
|
|
8
|
-
This gem can help your Ruby application to make HTTP(S) requests
|
8
|
+
This gem can help your Ruby application to make HTTP(S) requests using proxy by fetching and validating actual
|
9
9
|
proxy lists from multiple providers.
|
10
10
|
|
11
|
-
It gives you a `Manager` class that can load proxy lists, validate them and return random or specific proxies.
|
12
|
-
|
11
|
+
It gives you a special `Manager` class that can load proxy lists, validate them and return random or specific proxies.
|
12
|
+
It also has a `Client` class that encapsulates all the logic for sending HTTP requests using proxies.
|
13
|
+
Take a look at the documentation below to find all the gem features.
|
13
14
|
|
14
15
|
Also this gem can be used with any other programming language (Go / Python / etc) as standalone solution for downloading and
|
15
16
|
validating proxy lists from the different providers. [Checkout examples](#standalone) of usage below.
|
16
17
|
|
17
18
|
## Table of Contents
|
18
19
|
|
20
|
+
- [Dependencies](#dependencies)
|
19
21
|
- [Installation](#installation)
|
20
22
|
- [Example of usage](#example-of-usage)
|
21
23
|
- [In Ruby application](#in-ruby-application)
|
@@ -28,12 +30,24 @@ validating proxy lists from the different providers. [Checkout examples](#standa
|
|
28
30
|
- [Contributing](#contributing)
|
29
31
|
- [License](#license)
|
30
32
|
|
33
|
+
## Dependencies
|
34
|
+
|
35
|
+
ProxyFetcher gem itself requires only Ruby `>= 2.0.0`.
|
36
|
+
|
37
|
+
However, it requires an adapter to parse HTML. If you do not specify any specific adapter, then it will use
|
38
|
+
default one - [Nokogiri](https://github.com/sparklemotion/nokogiri). It's OK for any Ruby on Rails project
|
39
|
+
(because they use it by default).
|
40
|
+
|
41
|
+
But if you want to use some specific adapter (for example, your Ruby application uses [Oga](https://gitlab.com/yorickpeterse/oga)),
|
42
|
+
then you need to manually add your dependencies to your project and configure ProxyFetcher to use another adapter. Moreover,
|
43
|
+
you can implement your own adapter if it fits your use case. Take a look at the [Configuration](#configuration) section for more details.
|
44
|
+
|
31
45
|
## Installation
|
32
46
|
|
33
47
|
If using bundler, first add 'proxy_fetcher' to your Gemfile:
|
34
48
|
|
35
49
|
```ruby
|
36
|
-
gem 'proxy_fetcher', '~> 0.
|
50
|
+
gem 'proxy_fetcher', '~> 0.6'
|
37
51
|
```
|
38
52
|
|
39
53
|
or if you want to use the latest version (from `master` branch), then:
|
@@ -234,7 +248,25 @@ Btw, if you need support of JavaScript or some other features, you need to imple
|
|
234
248
|
|
235
249
|
## Configuration
|
236
250
|
|
237
|
-
|
251
|
+
ProxyFetcher is a very flexible gem. You can configure the most important parts of the library and use your own solutions.
|
252
|
+
|
253
|
+
Default configuration looks as follows:
|
254
|
+
|
255
|
+
```ruby
|
256
|
+
ProxyFetcher.configure do |config|
|
257
|
+
config.user_agent = ProxyFetcher::Configuration::DEFAULT_USER_AGENT
|
258
|
+
config.pool_size = 10
|
259
|
+
config.timeout = 3
|
260
|
+
config.http_client = ProxyFetcher::HTTPClient
|
261
|
+
config.proxy_validator = ProxyFetcher::ProxyValidator
|
262
|
+
config.providers = ProxyFetcher::Configuration.registered_providers
|
263
|
+
config.adapter = ProxyFetcher::Configuration::DEFAULT_ADAPTER # :nokogiri by default
|
264
|
+
end
|
265
|
+
```
|
266
|
+
|
267
|
+
You can change any of the options above. Let's look at this deeper.
|
268
|
+
|
269
|
+
To change open/read timeout for `cleanup!` and `connectable?` methods you need to change the `timeout` option:
|
238
270
|
|
239
271
|
```ruby
|
240
272
|
ProxyFetcher.configure do |config|
|
@@ -245,7 +277,7 @@ manager = ProxyFetcher::Manager.new
|
|
245
277
|
manager.cleanup!
|
246
278
|
```
|
247
279
|
|
248
|
-
Also you can set your custom User-Agent:
|
280
|
+
Also you can set your custom User-Agent string:
|
249
281
|
|
250
282
|
```ruby
|
251
283
|
ProxyFetcher.configure do |config|
|
@@ -253,10 +285,11 @@ ProxyFetcher.configure do |config|
|
|
253
285
|
end
|
254
286
|
```
|
255
287
|
|
256
|
-
ProxyFetcher uses
|
257
|
-
was developed as a Single Page Application (SPA) with some JavaScript,
|
258
|
-
|
259
|
-
|
288
|
+
ProxyFetcher uses the standard Ruby solution for dealing with HTTP(S) requests - `net/http` library from the Ruby core.
|
289
|
+
If you want to add, for example, your custom provider that was developed as a Single Page Application (SPA) with some JavaScript,
|
290
|
+
then you will need something like [selenium-webdriver](https://github.com/SeleniumHQ/selenium/tree/master/rb) to properly
|
291
|
+
load the content of the website. For those and other cases you can write your own class for fetching HTML content by
|
292
|
+
the URL and set it up in the ProxyFetcher config:
|
260
293
|
|
261
294
|
```ruby
|
262
295
|
class MyHTTPClient
|
@@ -300,6 +333,21 @@ manager.validate!
|
|
300
333
|
#=> [ ... ]
|
301
334
|
```
|
302
335
|
|
336
|
+
By default, the ProxyFetcher gem uses [Nokogiri](https://github.com/sparklemotion/nokogiri) for parsing HTML. If you want
|
337
|
+
to use [Oga](https://gitlab.com/yorickpeterse/oga) instead, then you need to add `gem 'oga'` to your Gemfile and configure
|
338
|
+
ProxyFetcher as follows:
|
339
|
+
|
340
|
+
```ruby
|
341
|
+
ProxyFetcher.config.adapter = :oga
|
342
|
+
```
|
343
|
+
|
344
|
+
Also you can write your own HTML parser implementation and use it, take a look at the [abstract class and implementations](lib/proxy_fetcher/document).
|
345
|
+
Configure it as:
|
346
|
+
|
347
|
+
```ruby
|
348
|
+
ProxyFetcher.config.adapter = MyHTMLParserClass
|
349
|
+
```
|
350
|
+
|
303
351
|
### Proxy validation speed
|
304
352
|
|
305
353
|
There are some tricks to increase proxy list validation performance.
|
@@ -1,11 +1,13 @@
|
|
1
1
|
module ProxyFetcher
|
2
2
|
class Configuration
|
3
|
-
attr_accessor :
|
4
|
-
|
3
|
+
attr_accessor :timeout, :pool_size, :user_agent
|
4
|
+
attr_reader :adapter, :http_client, :proxy_validator, :providers
|
5
5
|
|
6
6
|
# rubocop:disable Metrics/LineLength
|
7
7
|
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112 Safari/537.36'.freeze
|
8
8
|
|
9
|
+
DEFAULT_ADAPTER = :nokogiri
|
10
|
+
|
9
11
|
class << self
|
10
12
|
def providers_registry
|
11
13
|
@registry ||= ProvidersRegistry.new
|
@@ -35,6 +37,11 @@ module ProxyFetcher
|
|
35
37
|
self.providers = self.class.registered_providers
|
36
38
|
end
|
37
39
|
|
40
|
+
def adapter=(name_or_class)
|
41
|
+
@adapter = ProxyFetcher::Document::Adapters.lookup(name_or_class)
|
42
|
+
@adapter.setup!
|
43
|
+
end
|
44
|
+
|
38
45
|
def providers=(value)
|
39
46
|
@providers = Array(value)
|
40
47
|
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module ProxyFetcher
|
2
|
+
class Document
|
3
|
+
class AbstractAdapter
|
4
|
+
attr_reader :document
|
5
|
+
|
6
|
+
def initialize(document)
|
7
|
+
@document = document
|
8
|
+
end
|
9
|
+
|
10
|
+
# You can override this method in your own adapter class
|
11
|
+
def xpath(selector)
|
12
|
+
document.xpath(selector)
|
13
|
+
end
|
14
|
+
|
15
|
+
# You can override this method in your own adapter class
|
16
|
+
def css(selector)
|
17
|
+
document.css(selector)
|
18
|
+
end
|
19
|
+
|
20
|
+
def proxy_node
|
21
|
+
self.class.const_get('Node')
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.setup!(*args)
|
25
|
+
install_requirements!(*args)
|
26
|
+
rescue LoadError => error
|
27
|
+
raise Exceptions::AdapterSetupError.new(name, error.message)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module ProxyFetcher
|
2
|
+
class Document
|
3
|
+
class NokogiriAdapter < AbstractAdapter
|
4
|
+
def self.install_requirements!
|
5
|
+
require 'nokogiri'
|
6
|
+
end
|
7
|
+
|
8
|
+
def self.parse(data)
|
9
|
+
new(::Nokogiri::HTML(data))
|
10
|
+
end
|
11
|
+
|
12
|
+
class Node < ProxyFetcher::Document::Node
|
13
|
+
def at_xpath(*args)
|
14
|
+
self.class.new(node.at_xpath(*args))
|
15
|
+
end
|
16
|
+
|
17
|
+
def at_css(*args)
|
18
|
+
self.class.new(node.at_css(*args))
|
19
|
+
end
|
20
|
+
|
21
|
+
def attr(*args)
|
22
|
+
clear(node.attr(*args))
|
23
|
+
end
|
24
|
+
|
25
|
+
def content
|
26
|
+
clear(node.content)
|
27
|
+
end
|
28
|
+
|
29
|
+
def html
|
30
|
+
node.inner_html
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module ProxyFetcher
|
2
|
+
class Document
|
3
|
+
class OgaAdapter < AbstractAdapter
|
4
|
+
def self.install_requirements!
|
5
|
+
require 'oga'
|
6
|
+
end
|
7
|
+
|
8
|
+
def self.parse(data)
|
9
|
+
new(::Oga.parse_html(data))
|
10
|
+
end
|
11
|
+
|
12
|
+
class Node < ProxyFetcher::Document::Node
|
13
|
+
def at_xpath(*args)
|
14
|
+
self.class.new(node.at_xpath(*args))
|
15
|
+
end
|
16
|
+
|
17
|
+
def at_css(*args)
|
18
|
+
self.class.new(node.at_css(*args))
|
19
|
+
end
|
20
|
+
|
21
|
+
def attr(*args)
|
22
|
+
clear(node.attribute(*args).value)
|
23
|
+
end
|
24
|
+
|
25
|
+
def content
|
26
|
+
clear(node.text)
|
27
|
+
end
|
28
|
+
|
29
|
+
def html
|
30
|
+
node.to_xml
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module ProxyFetcher
|
2
|
+
class Document
|
3
|
+
class Adapters
|
4
|
+
ADAPTER = 'Adapter'.freeze
|
5
|
+
private_constant :ADAPTER
|
6
|
+
|
7
|
+
class << self
|
8
|
+
def lookup(name_or_class)
|
9
|
+
raise Exceptions::BlankAdapter if name_or_class.nil? || name_or_class.to_s.empty?
|
10
|
+
|
11
|
+
case name_or_class
|
12
|
+
when Symbol, String
|
13
|
+
adapter_name = name_or_class.to_s.capitalize << ADAPTER
|
14
|
+
ProxyFetcher::Document.const_get(adapter_name)
|
15
|
+
else
|
16
|
+
name_or_class
|
17
|
+
end
|
18
|
+
rescue NameError
|
19
|
+
raise Exceptions::UnknownAdapter, name_or_class
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module ProxyFetcher
|
2
|
+
class Document
|
3
|
+
class Node
|
4
|
+
attr_reader :node
|
5
|
+
|
6
|
+
def initialize(node)
|
7
|
+
@node = node
|
8
|
+
end
|
9
|
+
|
10
|
+
def find(selector, method = :at_xpath)
|
11
|
+
self.class.new(node.public_send(method, selector))
|
12
|
+
end
|
13
|
+
|
14
|
+
def content_at(*args)
|
15
|
+
clear(find(*args).content)
|
16
|
+
end
|
17
|
+
|
18
|
+
def content
|
19
|
+
raise "#{__method__} must be implemented in descendant class!"
|
20
|
+
end
|
21
|
+
|
22
|
+
def html
|
23
|
+
raise "#{__method__} must be implemented in descendant class!"
|
24
|
+
end
|
25
|
+
|
26
|
+
protected
|
27
|
+
|
28
|
+
def clear(text)
|
29
|
+
return if text.nil? || text.empty?
|
30
|
+
|
31
|
+
text.strip.gsub(/[ \t]/i, '')
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module ProxyFetcher
|
2
|
+
class Document
|
3
|
+
class << self
|
4
|
+
def parse(data)
|
5
|
+
new(ProxyFetcher.config.adapter.parse(data))
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
attr_reader :backend
|
10
|
+
|
11
|
+
def initialize(backend)
|
12
|
+
@backend = backend
|
13
|
+
end
|
14
|
+
|
15
|
+
def xpath(*args)
|
16
|
+
backend.xpath(*args).map { |node| backend.proxy_node.new(node) }
|
17
|
+
end
|
18
|
+
|
19
|
+
def css(*args)
|
20
|
+
backend.css(*args).map { |node| backend.proxy_node.new(node) }
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -32,5 +32,38 @@ module ProxyFetcher
|
|
32
32
|
super('reached the maximum number of retries')
|
33
33
|
end
|
34
34
|
end
|
35
|
+
|
36
|
+
class UnknownAdapter < Error
|
37
|
+
def initialize(name)
|
38
|
+
super("unknown adapter '#{name}'")
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
class BlankAdapter < Error
|
43
|
+
def initialize(*)
|
44
|
+
super(<<-MSG.strip.squeeze
|
45
|
+
you need to specify adapter for HTML parsing: ProxyFetcher.config.adapter = :nokogiri.
|
46
|
+
You can use one of the predefined adapters (:nokogiri or :oga) or your own implementation.
|
47
|
+
MSG
|
48
|
+
)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
class AdapterSetupError < Error
|
53
|
+
def initialize(adapter_name, reason)
|
54
|
+
adapter = demodulize(adapter_name.gsub('Adapter', ''))
|
55
|
+
|
56
|
+
super("can't setup '#{adapter}' adapter during the following error:\n\t#{reason}'")
|
57
|
+
end
|
58
|
+
|
59
|
+
private
|
60
|
+
|
61
|
+
def demodulize(path)
|
62
|
+
path = path.to_s
|
63
|
+
index = path.rindex('::')
|
64
|
+
|
65
|
+
index ? path[(index + 2)..-1] : path
|
66
|
+
end
|
67
|
+
end
|
35
68
|
end
|
36
69
|
end
|
@@ -1,12 +1,6 @@
|
|
1
|
-
require 'forwardable'
|
2
|
-
|
3
1
|
module ProxyFetcher
|
4
2
|
module Providers
|
5
3
|
class Base
|
6
|
-
extend Forwardable
|
7
|
-
|
8
|
-
def_delegators ProxyFetcher::HTML, :clear, :convert_to_int
|
9
|
-
|
10
4
|
# Loads proxy provider page content, extract proxy list from it
|
11
5
|
# and convert every entry to proxy object.
|
12
6
|
def fetch_proxies!(filters = {})
|
@@ -14,8 +8,8 @@ module ProxyFetcher
|
|
14
8
|
end
|
15
9
|
|
16
10
|
class << self
|
17
|
-
def fetch_proxies!(
|
18
|
-
new.fetch_proxies!(
|
11
|
+
def fetch_proxies!(*args)
|
12
|
+
new.fetch_proxies!(*args)
|
19
13
|
end
|
20
14
|
end
|
21
15
|
|
@@ -23,12 +17,13 @@ module ProxyFetcher
|
|
23
17
|
|
24
18
|
# Loads HTML document with Nokogiri by the URL combined with custom filters
|
25
19
|
def load_document(url, filters = {})
|
26
|
-
raise ArgumentError, 'filters must be a Hash'
|
20
|
+
raise ArgumentError, 'filters must be a Hash' unless filters.is_a?(Hash)
|
27
21
|
|
28
22
|
uri = URI.parse(url)
|
29
23
|
uri.query = URI.encode_www_form(filters) if filters && filters.any?
|
30
24
|
|
31
|
-
|
25
|
+
html = ProxyFetcher.config.http_client.fetch(uri.to_s)
|
26
|
+
ProxyFetcher::Document.parse(html)
|
32
27
|
end
|
33
28
|
|
34
29
|
# Get HTML elements with proxy info
|
@@ -40,11 +35,6 @@ module ProxyFetcher
|
|
40
35
|
def to_proxy(*)
|
41
36
|
raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
|
42
37
|
end
|
43
|
-
|
44
|
-
# Return normalized HTML element content by selector
|
45
|
-
def parse_element(parent, selector, method = :at_xpath)
|
46
|
-
clear(parent.public_send(method, selector).content)
|
47
|
-
end
|
48
38
|
end
|
49
39
|
end
|
50
40
|
end
|
@@ -9,20 +9,20 @@ module ProxyFetcher
|
|
9
9
|
doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
|
10
10
|
end
|
11
11
|
|
12
|
-
def to_proxy(
|
12
|
+
def to_proxy(html_node)
|
13
13
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
14
|
-
proxy.addr =
|
15
|
-
proxy.port =
|
16
|
-
proxy.country =
|
17
|
-
proxy.anonymity =
|
18
|
-
proxy.type = parse_type(
|
14
|
+
proxy.addr = html_node.content_at('td[1]')
|
15
|
+
proxy.port = Integer(html_node.content_at('td[2]'))
|
16
|
+
proxy.country = html_node.content_at('td[4]')
|
17
|
+
proxy.anonymity = html_node.content_at('td[5]')
|
18
|
+
proxy.type = parse_type(html_node)
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
22
22
|
private
|
23
23
|
|
24
|
-
def parse_type(
|
25
|
-
https =
|
24
|
+
def parse_type(html_node)
|
25
|
+
https = html_node.content_at('td[6]')
|
26
26
|
https && https.casecmp('yes').zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
|
27
27
|
end
|
28
28
|
end
|
@@ -9,12 +9,12 @@ module ProxyFetcher
|
|
9
9
|
doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
|
10
10
|
end
|
11
11
|
|
12
|
-
def to_proxy(
|
12
|
+
def to_proxy(html_node)
|
13
13
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
14
|
-
proxy.addr =
|
15
|
-
proxy.port =
|
16
|
-
proxy.country =
|
17
|
-
proxy.anonymity =
|
14
|
+
proxy.addr = html_node.content_at('td[1]')
|
15
|
+
proxy.port = Integer(html_node.content_at('td[2]'))
|
16
|
+
proxy.country = html_node.content_at('td[4]')
|
17
|
+
proxy.anonymity = html_node.content_at('td[5]')
|
18
18
|
proxy.type = ProxyFetcher::Proxy::HTTPS
|
19
19
|
end
|
20
20
|
end
|
@@ -10,8 +10,8 @@ module ProxyFetcher
|
|
10
10
|
doc.xpath('//div[@class="proxy-list"]/table/script')
|
11
11
|
end
|
12
12
|
|
13
|
-
def to_proxy(
|
14
|
-
json = parse_json(
|
13
|
+
def to_proxy(html_node)
|
14
|
+
json = parse_json(html_node)
|
15
15
|
|
16
16
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
17
17
|
proxy.addr = json['PROXY_IP']
|
@@ -25,8 +25,8 @@ module ProxyFetcher
|
|
25
25
|
|
26
26
|
private
|
27
27
|
|
28
|
-
def parse_json(
|
29
|
-
javascript =
|
28
|
+
def parse_json(html_node)
|
29
|
+
javascript = html_node.content[/{.+}/im]
|
30
30
|
JSON.parse(javascript)
|
31
31
|
end
|
32
32
|
end
|
@@ -8,34 +8,34 @@ module ProxyFetcher
|
|
8
8
|
doc.xpath('//table[contains(@id, "GridView")]/tr[(count(td)>2)]')
|
9
9
|
end
|
10
10
|
|
11
|
-
def to_proxy(
|
11
|
+
def to_proxy(html_node)
|
12
12
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
13
|
-
uri = parse_proxy_uri(
|
13
|
+
uri = parse_proxy_uri(html_node)
|
14
14
|
proxy.addr = uri.host
|
15
15
|
proxy.port = uri.port
|
16
16
|
|
17
|
-
proxy.country = parse_country(
|
18
|
-
proxy.anonymity = parse_anonymity(
|
17
|
+
proxy.country = parse_country(html_node)
|
18
|
+
proxy.anonymity = parse_anonymity(html_node)
|
19
19
|
proxy.type = ProxyFetcher::Proxy::HTTP
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
23
|
private
|
24
24
|
|
25
|
-
def parse_proxy_uri(
|
26
|
-
full_addr =
|
25
|
+
def parse_proxy_uri(html_node)
|
26
|
+
full_addr = html_node.content_at('td[1]')
|
27
27
|
URI.parse("http://#{full_addr}")
|
28
28
|
end
|
29
29
|
|
30
|
-
def parse_country(
|
31
|
-
|
30
|
+
def parse_country(html_node)
|
31
|
+
html_node.find('.//img').attr('title')
|
32
32
|
end
|
33
33
|
|
34
|
-
def parse_anonymity(
|
35
|
-
transparency =
|
34
|
+
def parse_anonymity(html_node)
|
35
|
+
transparency = html_node.content_at('td[5]').to_sym
|
36
36
|
|
37
37
|
{
|
38
|
-
A: '
|
38
|
+
A: 'Anonymous',
|
39
39
|
E: 'Elite',
|
40
40
|
T: 'Transparent',
|
41
41
|
U: 'Unknown'
|
@@ -9,15 +9,15 @@ module ProxyFetcher
|
|
9
9
|
doc.xpath('//table[contains(@class, "table")]/tr[(not(@id="proxy-table-header")) and (count(td)>2)]')
|
10
10
|
end
|
11
11
|
|
12
|
-
def to_proxy(
|
12
|
+
def to_proxy(html_node)
|
13
13
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
14
|
-
uri = URI("//#{
|
14
|
+
uri = URI("//#{html_node.content_at('td[1]')}")
|
15
15
|
proxy.addr = uri.host
|
16
16
|
proxy.port = uri.port
|
17
17
|
|
18
|
-
proxy.type =
|
19
|
-
proxy.anonymity =
|
20
|
-
proxy.country =
|
18
|
+
proxy.type = html_node.content_at('td[2]')
|
19
|
+
proxy.anonymity = html_node.content_at('td[3]')
|
20
|
+
proxy.country = html_node.content_at('td[5]')
|
21
21
|
end
|
22
22
|
end
|
23
23
|
end
|
@@ -10,22 +10,22 @@ module ProxyFetcher
|
|
10
10
|
doc.css('.table-wrap .table ul')
|
11
11
|
end
|
12
12
|
|
13
|
-
def to_proxy(
|
13
|
+
def to_proxy(html_node)
|
14
14
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
15
|
-
uri = parse_proxy_uri(
|
15
|
+
uri = parse_proxy_uri(html_node)
|
16
16
|
proxy.addr = uri.host
|
17
17
|
proxy.port = uri.port
|
18
18
|
|
19
|
-
proxy.type =
|
20
|
-
proxy.anonymity =
|
21
|
-
proxy.country =
|
19
|
+
proxy.type = html_node.content_at('li[2]')
|
20
|
+
proxy.anonymity = html_node.content_at('li[4]')
|
21
|
+
proxy.country = html_node.find("li[5]//span[@class='country']").attr('title')
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
25
25
|
private
|
26
26
|
|
27
|
-
def parse_proxy_uri(
|
28
|
-
full_addr = ::Base64.decode64(
|
27
|
+
def parse_proxy_uri(html_node)
|
28
|
+
full_addr = ::Base64.decode64(html_node.at_css('li script').html.match(/'(.+)'/)[1])
|
29
29
|
URI.parse("http://#{full_addr}")
|
30
30
|
end
|
31
31
|
end
|
@@ -8,21 +8,21 @@ module ProxyFetcher
|
|
8
8
|
doc.xpath('//div[@id="content"]/table[1]/tr[contains(@class, "row")]')
|
9
9
|
end
|
10
10
|
|
11
|
-
def to_proxy(
|
11
|
+
def to_proxy(html_node)
|
12
12
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
13
|
-
proxy.addr =
|
14
|
-
proxy.port =
|
15
|
-
proxy.anonymity =
|
16
|
-
proxy.country =
|
17
|
-
proxy.response_time =
|
18
|
-
proxy.type = parse_type(
|
13
|
+
proxy.addr = html_node.content_at('td[2]')
|
14
|
+
proxy.port = Integer(html_node.content_at('td[3]'))
|
15
|
+
proxy.anonymity = html_node.content_at('td[4]')
|
16
|
+
proxy.country = html_node.content_at('td[6]')
|
17
|
+
proxy.response_time = Integer(html_node.content_at('td[7]'))
|
18
|
+
proxy.type = parse_type(html_node)
|
19
19
|
end
|
20
20
|
end
|
21
21
|
|
22
22
|
private
|
23
23
|
|
24
|
-
def parse_type(
|
25
|
-
https =
|
24
|
+
def parse_type(html_node)
|
25
|
+
https = html_node.content_at('td[5]')
|
26
26
|
https.casecmp('true').zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
|
27
27
|
end
|
28
28
|
end
|
data/lib/proxy_fetcher.rb
CHANGED
@@ -1,7 +1,5 @@
|
|
1
1
|
require 'uri'
|
2
2
|
require 'net/https'
|
3
|
-
require 'nokogiri'
|
4
|
-
require 'thread'
|
5
3
|
|
6
4
|
require File.dirname(__FILE__) + '/proxy_fetcher/exceptions'
|
7
5
|
require File.dirname(__FILE__) + '/proxy_fetcher/configuration'
|
@@ -10,12 +8,18 @@ require File.dirname(__FILE__) + '/proxy_fetcher/proxy'
|
|
10
8
|
require File.dirname(__FILE__) + '/proxy_fetcher/manager'
|
11
9
|
|
12
10
|
require File.dirname(__FILE__) + '/proxy_fetcher/utils/http_client'
|
13
|
-
require File.dirname(__FILE__) + '/proxy_fetcher/utils/html'
|
14
11
|
require File.dirname(__FILE__) + '/proxy_fetcher/utils/proxy_validator'
|
15
12
|
require File.dirname(__FILE__) + '/proxy_fetcher/client/client'
|
16
13
|
require File.dirname(__FILE__) + '/proxy_fetcher/client/request'
|
17
14
|
require File.dirname(__FILE__) + '/proxy_fetcher/client/proxies_registry'
|
18
15
|
|
16
|
+
require File.dirname(__FILE__) + '/proxy_fetcher/document'
|
17
|
+
require File.dirname(__FILE__) + '/proxy_fetcher/document/adapters'
|
18
|
+
require File.dirname(__FILE__) + '/proxy_fetcher/document/node'
|
19
|
+
require File.dirname(__FILE__) + '/proxy_fetcher/document/adapters/abstract_adapter'
|
20
|
+
require File.dirname(__FILE__) + '/proxy_fetcher/document/adapters/nokogiri_adapter'
|
21
|
+
require File.dirname(__FILE__) + '/proxy_fetcher/document/adapters/oga_adapter'
|
22
|
+
|
19
23
|
module ProxyFetcher
|
20
24
|
module Providers
|
21
25
|
require File.dirname(__FILE__) + '/proxy_fetcher/providers/base'
|
@@ -36,5 +40,13 @@ module ProxyFetcher
|
|
36
40
|
def configure
|
37
41
|
yield config
|
38
42
|
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
def configure_adapter!
|
47
|
+
config.adapter = Configuration::DEFAULT_ADAPTER if config.adapter.nil?
|
48
|
+
end
|
39
49
|
end
|
50
|
+
|
51
|
+
configure_adapter!
|
40
52
|
end
|
data/proxy_fetcher.gemspec
CHANGED
@@ -5,10 +5,10 @@ require 'proxy_fetcher/version'
|
|
5
5
|
Gem::Specification.new do |gem|
|
6
6
|
gem.name = 'proxy_fetcher'
|
7
7
|
gem.version = ProxyFetcher.gem_version
|
8
|
-
gem.date = '2017-
|
8
|
+
gem.date = '2017-12-08'
|
9
9
|
gem.summary = 'Ruby gem for dealing with proxy lists from different providers'
|
10
10
|
gem.description = 'This gem can help your Ruby application to make HTTP(S) requests ' \
|
11
|
-
'
|
11
|
+
'using proxies by fetching and validating proxy lists from the different providers.'
|
12
12
|
gem.authors = ['Nikita Bulai']
|
13
13
|
gem.email = 'bulajnikita@gmail.com'
|
14
14
|
gem.require_paths = ['lib']
|
@@ -19,7 +19,5 @@ Gem::Specification.new do |gem|
|
|
19
19
|
gem.license = 'MIT'
|
20
20
|
gem.required_ruby_version = '>= 2.0.0'
|
21
21
|
|
22
|
-
gem.add_runtime_dependency 'nokogiri', '~> 1.6', '>= 1.6'
|
23
|
-
|
24
22
|
gem.add_development_dependency 'rspec', '~> 3.5'
|
25
23
|
end
|
@@ -118,7 +118,7 @@ describe ProxyFetcher::Client do
|
|
118
118
|
it 'refreshes proxy lists if no proxy found' do
|
119
119
|
ProxyFetcher::Client::ProxiesRegistry.manager.instance_variable_set(:'@proxies', [])
|
120
120
|
|
121
|
-
expect { ProxyFetcher::Client.get('http://httpbin.org') }.not_to raise_error
|
121
|
+
expect { ProxyFetcher::Client.get('http://httpbin.org') }.not_to raise_error
|
122
122
|
end
|
123
123
|
end
|
124
124
|
|
@@ -43,16 +43,33 @@ describe ProxyFetcher::Configuration do
|
|
43
43
|
end
|
44
44
|
|
45
45
|
context 'custom provider' do
|
46
|
-
it '
|
46
|
+
it 'fails on registration if provider class already registered' do
|
47
47
|
expect { ProxyFetcher::Configuration.register_provider(:xroxy, Class.new) }
|
48
48
|
.to raise_error(ProxyFetcher::Exceptions::RegisteredProvider)
|
49
49
|
end
|
50
50
|
|
51
|
-
it "
|
51
|
+
it "fails on proxy list fetching if provider doesn't registered" do
|
52
52
|
ProxyFetcher.config.provider = :not_existing_provider
|
53
53
|
|
54
54
|
expect { ProxyFetcher::Manager.new }
|
55
55
|
.to raise_error(ProxyFetcher::Exceptions::UnknownProvider)
|
56
56
|
end
|
57
57
|
end
|
58
|
+
|
59
|
+
context 'custom HTML parsing adapter' do
|
60
|
+
it "fails if adapter can't be installed" do
|
61
|
+
old_config = ProxyFetcher.config.dup
|
62
|
+
|
63
|
+
class CustomAdapter < ProxyFetcher::Document::AbstractAdapter
|
64
|
+
def self.install_requirements!
|
65
|
+
require 'not_existing_gem'
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
expect { ProxyFetcher.config.adapter = CustomAdapter }
|
70
|
+
.to raise_error(ProxyFetcher::Exceptions::AdapterSetupError)
|
71
|
+
|
72
|
+
ProxyFetcher.instance_variable_set('@config', old_config)
|
73
|
+
end
|
74
|
+
end
|
58
75
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe ProxyFetcher::Document::Adapters do
|
4
|
+
describe '#lookup' do
|
5
|
+
it 'returns predefined adapters if symbol or string passed' do
|
6
|
+
expect(described_class.lookup('nokogiri')).to eq(ProxyFetcher::Document::NokogiriAdapter)
|
7
|
+
|
8
|
+
expect(described_class.lookup(:oga)).to eq(ProxyFetcher::Document::OgaAdapter)
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'returns self if class passed' do
|
12
|
+
expect(described_class.lookup(Struct)).to eq(Struct)
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'raises an exception if passed value is blank' do
|
16
|
+
expect { described_class.lookup(nil) }.to raise_error(ProxyFetcher::Exceptions::BlankAdapter)
|
17
|
+
expect { described_class.lookup('') }.to raise_error(ProxyFetcher::Exceptions::BlankAdapter)
|
18
|
+
end
|
19
|
+
|
20
|
+
it "raises an exception if adapter doesn't exist" do
|
21
|
+
expect { described_class.lookup('wrong') }.to raise_error(ProxyFetcher::Exceptions::UnknownAdapter)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -15,6 +15,13 @@ require 'proxy_fetcher'
|
|
15
15
|
|
16
16
|
Dir['./spec/support/**/*.rb'].sort.each { |f| require f }
|
17
17
|
|
18
|
+
adapter = ENV['BUNDLE_GEMFILE'][/.+\/(.+)\.gemfile/i, 1]
|
19
|
+
puts "Configured adapter: '#{adapter}'"
|
20
|
+
|
21
|
+
ProxyFetcher.configure do |config|
|
22
|
+
config.adapter = adapter
|
23
|
+
end
|
24
|
+
|
18
25
|
RSpec.configure do |config|
|
19
26
|
config.order = 'random'
|
20
27
|
end
|
metadata
CHANGED
@@ -1,35 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proxy_fetcher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nikita Bulai
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-12-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: nokogiri
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '1.6'
|
20
|
-
- - ">="
|
21
|
-
- !ruby/object:Gem::Version
|
22
|
-
version: '1.6'
|
23
|
-
type: :runtime
|
24
|
-
prerelease: false
|
25
|
-
version_requirements: !ruby/object:Gem::Requirement
|
26
|
-
requirements:
|
27
|
-
- - "~>"
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
version: '1.6'
|
30
|
-
- - ">="
|
31
|
-
- !ruby/object:Gem::Version
|
32
|
-
version: '1.6'
|
33
13
|
- !ruby/object:Gem::Dependency
|
34
14
|
name: rspec
|
35
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -44,8 +24,8 @@ dependencies:
|
|
44
24
|
- - "~>"
|
45
25
|
- !ruby/object:Gem::Version
|
46
26
|
version: '3.5'
|
47
|
-
description: This gem can help your Ruby application to make HTTP(S) requests
|
48
|
-
|
27
|
+
description: This gem can help your Ruby application to make HTTP(S) requests using
|
28
|
+
proxies by fetching and validating proxy lists from the different providers.
|
49
29
|
email: bulajnikita@gmail.com
|
50
30
|
executables:
|
51
31
|
- proxy_fetcher
|
@@ -62,12 +42,20 @@ files:
|
|
62
42
|
- README.md
|
63
43
|
- Rakefile
|
64
44
|
- bin/proxy_fetcher
|
45
|
+
- gemfiles/nokogiri.gemfile
|
46
|
+
- gemfiles/oga.gemfile
|
65
47
|
- lib/proxy_fetcher.rb
|
66
48
|
- lib/proxy_fetcher/client/client.rb
|
67
49
|
- lib/proxy_fetcher/client/proxies_registry.rb
|
68
50
|
- lib/proxy_fetcher/client/request.rb
|
69
51
|
- lib/proxy_fetcher/configuration.rb
|
70
52
|
- lib/proxy_fetcher/configuration/providers_registry.rb
|
53
|
+
- lib/proxy_fetcher/document.rb
|
54
|
+
- lib/proxy_fetcher/document/adapters.rb
|
55
|
+
- lib/proxy_fetcher/document/adapters/abstract_adapter.rb
|
56
|
+
- lib/proxy_fetcher/document/adapters/nokogiri_adapter.rb
|
57
|
+
- lib/proxy_fetcher/document/adapters/oga_adapter.rb
|
58
|
+
- lib/proxy_fetcher/document/node.rb
|
71
59
|
- lib/proxy_fetcher/exceptions.rb
|
72
60
|
- lib/proxy_fetcher/manager.rb
|
73
61
|
- lib/proxy_fetcher/providers/base.rb
|
@@ -79,13 +67,13 @@ files:
|
|
79
67
|
- lib/proxy_fetcher/providers/proxy_list.rb
|
80
68
|
- lib/proxy_fetcher/providers/xroxy.rb
|
81
69
|
- lib/proxy_fetcher/proxy.rb
|
82
|
-
- lib/proxy_fetcher/utils/html.rb
|
83
70
|
- lib/proxy_fetcher/utils/http_client.rb
|
84
71
|
- lib/proxy_fetcher/utils/proxy_validator.rb
|
85
72
|
- lib/proxy_fetcher/version.rb
|
86
73
|
- proxy_fetcher.gemspec
|
87
74
|
- spec/proxy_fetcher/client_spec.rb
|
88
75
|
- spec/proxy_fetcher/configuration_spec.rb
|
76
|
+
- spec/proxy_fetcher/document/adapters_spec.rb
|
89
77
|
- spec/proxy_fetcher/providers/base_spec.rb
|
90
78
|
- spec/proxy_fetcher/providers/free_proxy_list_spec.rb
|
91
79
|
- spec/proxy_fetcher/providers/free_proxy_list_ssl_spec.rb
|
@@ -97,7 +85,6 @@ files:
|
|
97
85
|
- spec/proxy_fetcher/providers/xroxy_spec.rb
|
98
86
|
- spec/proxy_fetcher/proxy_spec.rb
|
99
87
|
- spec/spec_helper.rb
|
100
|
-
- spec/support/evil_proxy_patch.rb
|
101
88
|
- spec/support/manager_examples.rb
|
102
89
|
homepage: http://github.com/nbulaj/proxy_fetcher
|
103
90
|
licenses:
|
@@ -1,26 +0,0 @@
|
|
1
|
-
require 'evil-proxy'
|
2
|
-
|
3
|
-
EvilProxy::HTTPProxyServer.class_eval do
|
4
|
-
def do_PUT(req, res)
|
5
|
-
perform_proxy_request(req, res) do |http, path, header|
|
6
|
-
http.put(path, req.body || '', header)
|
7
|
-
end
|
8
|
-
end
|
9
|
-
|
10
|
-
def do_DELETE(req, res)
|
11
|
-
perform_proxy_request(req, res) do |http, path, header|
|
12
|
-
http.delete(path, header)
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
def do_PATCH(req, res)
|
17
|
-
perform_proxy_request(req, res) do |http, path, header|
|
18
|
-
http.patch(path, req.body || '', header)
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
# This method is not needed for PUT but I added for completeness
|
23
|
-
def do_OPTIONS(_req, res)
|
24
|
-
res['allow'] = 'GET,HEAD,POST,OPTIONS,CONNECT,PUT,PATCH,DELETE'
|
25
|
-
end
|
26
|
-
end
|