proxy_fetcher 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -10
- data/.travis.yml +7 -4
- data/README.md +11 -9
- data/bin/proxy_fetcher +0 -0
- data/lib/proxy_fetcher/client/client.rb +95 -0
- data/lib/proxy_fetcher/client/request.rb +6 -1
- data/lib/proxy_fetcher/configuration.rb +24 -3
- data/lib/proxy_fetcher/document.rb +3 -0
- data/lib/proxy_fetcher/exceptions.rb +17 -0
- data/lib/proxy_fetcher/manager.rb +1 -0
- data/lib/proxy_fetcher/providers/base.rb +8 -1
- data/lib/proxy_fetcher/providers/free_proxy_list.rb +19 -0
- data/lib/proxy_fetcher/providers/free_proxy_list_ssl.rb +18 -0
- data/lib/proxy_fetcher/providers/gather_proxy.rb +18 -0
- data/lib/proxy_fetcher/providers/http_tunnel.rb +42 -0
- data/lib/proxy_fetcher/providers/proxy_docker.rb +19 -1
- data/lib/proxy_fetcher/providers/proxy_list.rb +26 -0
- data/lib/proxy_fetcher/providers/xroxy.rb +26 -0
- data/lib/proxy_fetcher/proxy.rb +28 -0
- data/lib/proxy_fetcher/utils/http_client.rb +3 -0
- data/lib/proxy_fetcher/utils/proxy_validator.rb +4 -0
- data/lib/proxy_fetcher/version.rb +5 -1
- data/lib/proxy_fetcher.rb +25 -0
- data/proxy_fetcher.gemspec +1 -1
- data/spec/proxy_fetcher/proxy_spec.rb +10 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6c2d4076c9f73303b305c364d3dcf672b6e8eba7
|
4
|
+
data.tar.gz: b692847fab646ec4bd230909ac1cba9eb77c1c7f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cd70341cd18eea64dad58d63b8888f36e403251bb08e9d0289c584de1bc2201b45ff97f48fbac26697b25ee52328611b1e37de8d393b3b50ed7b3885192d0885
|
7
|
+
data.tar.gz: aed0b7532628796cb2b9f397be8bd8d87108c4d5a4e4efd2d30a468e2b8c9c203fbe7ac1fd4b56c7e571a67748b681297c56e2c880fddddb20c463bc117789e6
|
data/.gitignore
CHANGED
@@ -17,16 +17,6 @@ Gemfile.lock
|
|
17
17
|
certs
|
18
18
|
gemfiles/*.gemfile.lock
|
19
19
|
|
20
|
-
# TODO Comment out this rule if you are OK with secrets being uploaded to the repo
|
21
|
-
config/initializers/secret_token.rb
|
22
|
-
|
23
|
-
# Only include if you have production secrets in this file, which is no longer a Rails default
|
24
|
-
# config/secrets.yml
|
25
|
-
|
26
|
-
# dotenv
|
27
|
-
# TODO Comment out this rule if environment variables can be committed
|
28
|
-
.env
|
29
|
-
|
30
20
|
## Environment normalization:
|
31
21
|
/.bundle
|
32
22
|
/vendor/bundle
|
@@ -47,3 +37,5 @@ bower.json
|
|
47
37
|
|
48
38
|
# Ignore Byebug command history file.
|
49
39
|
.byebug_history
|
40
|
+
.yardoc/
|
41
|
+
doc/
|
data/.travis.yml
CHANGED
@@ -2,9 +2,10 @@ language: ruby
|
|
2
2
|
before_install: gem install bundler
|
3
3
|
bundler_args: --without yard guard benchmarks
|
4
4
|
script: "rake spec"
|
5
|
-
env:
|
6
|
-
|
7
|
-
|
5
|
+
env: JRUBY_OPTS="$JRUBY_OPTS --debug"
|
6
|
+
gemfile:
|
7
|
+
- gemfiles/oga.gemfile
|
8
|
+
- gemfiles/nokogiri.gemfile
|
8
9
|
gemfile:
|
9
10
|
- gemfiles/oga.gemfile
|
10
11
|
- gemfiles/nokogiri.gemfile
|
@@ -13,8 +14,10 @@ rvm:
|
|
13
14
|
- 2.1
|
14
15
|
- 2.2.4
|
15
16
|
- 2.3.3
|
16
|
-
- 2.4.
|
17
|
+
- 2.4.3
|
18
|
+
- 2.5.0
|
17
19
|
- ruby-head
|
20
|
+
- jruby-9.1.15.0
|
18
21
|
matrix:
|
19
22
|
allow_failures:
|
20
23
|
- rvm: ruby-head
|
data/README.md
CHANGED
@@ -1,16 +1,17 @@
|
|
1
|
-
# Ruby lib for managing proxies
|
1
|
+
# Ruby / JRuby lib for managing proxies
|
2
2
|
[](http://badge.fury.io/rb/proxy_fetcher)
|
3
3
|
[](https://travis-ci.org/nbulaj/proxy_fetcher)
|
4
4
|
[](https://coveralls.io/github/nbulaj/proxy_fetcher)
|
5
5
|
[](https://codeclimate.com/github/nbulaj/proxy_fetcher)
|
6
|
+
[](http://inch-ci.org/github/nbulaj/proxy_fetcher)
|
6
7
|
[](#license)
|
7
8
|
|
8
|
-
This gem can help your Ruby application to make HTTP(S) requests using
|
9
|
-
proxy lists from multiple providers.
|
9
|
+
This gem can help your Ruby / JRuby application to make HTTP(S) requests using
|
10
|
+
proxy by fetching and validating actual proxy lists from multiple providers.
|
10
11
|
|
11
12
|
It gives you a special `Manager` class that can load proxy lists, validate them and return random or specific proxies.
|
12
|
-
It also has a `Client` class that encapsulates all the logic for
|
13
|
-
Take a look at the documentation below to find all the gem features.
|
13
|
+
It also has a `Client` class that encapsulates all the logic for sending HTTP requests using proxies, automatically
|
14
|
+
fetched and validated by the gem. Take a look at the documentation below to find all the gem features.
|
14
15
|
|
15
16
|
Also this gem can be used with any other programming language (Go / Python / etc) as standalone solution for downloading and
|
16
17
|
validating proxy lists from the different providers. [Checkout examples](#standalone) of usage below.
|
@@ -32,13 +33,14 @@ validating proxy lists from the different providers. [Checkout examples](#standa
|
|
32
33
|
|
33
34
|
## Dependencies
|
34
35
|
|
35
|
-
ProxyFetcher gem itself requires only Ruby `>= 2.0.0
|
36
|
+
ProxyFetcher gem itself requires only Ruby `>= 2.0.0` (or [JRuby](http://jruby.org/) `> 9.0`, but maybe earlier too,
|
37
|
+
[see Travis build matrix](.travis.yml)).
|
36
38
|
|
37
39
|
However, it requires an adapter to parse HTML. If you do not specify any specific adapter, then it will use
|
38
40
|
default one - [Nokogiri](https://github.com/sparklemotion/nokogiri). It's OK for any Ruby on Rails project
|
39
|
-
(because they
|
41
|
+
(because they use it by default).
|
40
42
|
|
41
|
-
But if you want to use some specific adapter (for example your
|
43
|
+
But if you want to use some specific adapter (for example your application uses [Oga](https://gitlab.com/yorickpeterse/oga)),
|
42
44
|
then you need to manually add your dependencies to your project and configure ProxyFetcher to use another adapter. Moreover,
|
43
45
|
you can implement your own adapter if it fits your use-case. Take a look at the [Configuration](#configuration) section for more details.
|
44
46
|
|
@@ -65,7 +67,7 @@ bundle install
|
|
65
67
|
Otherwise simply install the gem:
|
66
68
|
|
67
69
|
```sh
|
68
|
-
gem install proxy_fetcher -v '0.
|
70
|
+
gem install proxy_fetcher -v '0.6'
|
69
71
|
```
|
70
72
|
|
71
73
|
## Example of usage
|
data/bin/proxy_fetcher
CHANGED
File without changes
|
@@ -1,26 +1,121 @@
|
|
1
1
|
module ProxyFetcher
|
2
|
+
# ProxyFetcher HTTP client that encapsulates all the logic for sending
|
3
|
+
# HTTP(S) requests using proxies, automatically fetched and validated by the gem.
|
2
4
|
module Client
|
3
5
|
class << self
|
6
|
+
# Sends HTTP GET request.
|
7
|
+
#
|
8
|
+
# @param url [String]
|
9
|
+
# Requested URL
|
10
|
+
#
|
11
|
+
# @param headers [Hash]
|
12
|
+
# HTTP headers that will be used in the request
|
13
|
+
#
|
14
|
+
# @param options [Hash]
|
15
|
+
# Additional options used by <code>ProxyFetcher::Client</code>
|
16
|
+
#
|
17
|
+
# @return [String]
|
18
|
+
# HTML body from the URL.
|
19
|
+
#
|
4
20
|
def get(url, headers: {}, options: {})
|
5
21
|
request_without_payload(:get, url, headers, options)
|
6
22
|
end
|
7
23
|
|
24
|
+
# Sends HTTP HEAD request.
|
25
|
+
#
|
26
|
+
# @param url [String]
|
27
|
+
# Requested URL
|
28
|
+
#
|
29
|
+
# @param headers [Hash]
|
30
|
+
# HTTP headers that will be used in the request
|
31
|
+
#
|
32
|
+
# @param options [Hash]
|
33
|
+
# Additional options used by <code>ProxyFetcher::Client</code>
|
34
|
+
#
|
35
|
+
# @return [String]
|
36
|
+
# HTML body from the URL.
|
37
|
+
#
|
8
38
|
def head(url, headers: {}, options: {})
|
9
39
|
request_without_payload(:head, url, headers, options)
|
10
40
|
end
|
11
41
|
|
42
|
+
# Sends HTTP POST request.
|
43
|
+
#
|
44
|
+
# @param url [String]
|
45
|
+
# Requested URL
|
46
|
+
#
|
47
|
+
# @param payload [String, Hash]
|
48
|
+
# HTTP payload
|
49
|
+
#
|
50
|
+
# @param headers [Hash]
|
51
|
+
# HTTP headers that will be used in the request
|
52
|
+
#
|
53
|
+
# @param options [Hash]
|
54
|
+
# Additional options used by <code>ProxyFetcher::Client</code>
|
55
|
+
#
|
56
|
+
# @return [String]
|
57
|
+
# HTML body from the URL.
|
58
|
+
#
|
12
59
|
def post(url, payload, headers: {}, options: {})
|
13
60
|
request_with_payload(:post, url, payload, headers, options)
|
14
61
|
end
|
15
62
|
|
63
|
+
# Sends HTTP DELETE request.
|
64
|
+
#
|
65
|
+
# @param url [String]
|
66
|
+
# Requested URL
|
67
|
+
#
|
68
|
+
# @param headers [Hash]
|
69
|
+
# HTTP headers that will be used in the request
|
70
|
+
#
|
71
|
+
# @param options [Hash]
|
72
|
+
# Additional options used by <code>ProxyFetcher::Client</code>
|
73
|
+
#
|
74
|
+
# @return [String]
|
75
|
+
# HTML body from the URL.
|
76
|
+
#
|
16
77
|
def delete(url, headers: {}, options: {})
|
17
78
|
request_without_payload(:delete, url, headers, options)
|
18
79
|
end
|
19
80
|
|
81
|
+
# Sends HTTP PUT request.
|
82
|
+
#
|
83
|
+
# @param url [String]
|
84
|
+
# Requested URL
|
85
|
+
#
|
86
|
+
# @param payload [String, Hash]
|
87
|
+
# HTTP payload
|
88
|
+
#
|
89
|
+
# @param headers [Hash]
|
90
|
+
# HTTP headers that will be used in the request
|
91
|
+
#
|
92
|
+
# @param options [Hash]
|
93
|
+
# Additional options used by <code>ProxyFetcher::Client</code>
|
94
|
+
#
|
95
|
+
# @return [String]
|
96
|
+
# HTML body from the URL.
|
97
|
+
#
|
20
98
|
def put(url, payload, headers: {}, options: {})
|
21
99
|
request_with_payload(:put, url, payload, headers, options)
|
22
100
|
end
|
23
101
|
|
102
|
+
# Sends HTTP PATCH request.
|
103
|
+
#
|
104
|
+
# @param url [String]
|
105
|
+
# Requested URL
|
106
|
+
#
|
107
|
+
# @param payload [String, Hash]
|
108
|
+
# HTTP payload
|
109
|
+
#
|
110
|
+
# @param headers [Hash]
|
111
|
+
# HTTP headers that will be used in the request
|
112
|
+
#
|
113
|
+
# @param options [Hash]
|
114
|
+
# Additional options used by <code>ProxyFetcher::Client</code>
|
115
|
+
#
|
116
|
+
# @return [String]
|
117
|
+
# HTML body from the URL.
|
118
|
+
#
|
24
119
|
def patch(url, payload, headers: {}, options: {})
|
25
120
|
request_with_payload(:patch, url, payload, headers, options)
|
26
121
|
end
|
@@ -1,10 +1,15 @@
|
|
1
1
|
module ProxyFetcher
|
2
2
|
module Client
|
3
|
+
# ProxyFetcher::Client HTTP request abstraction.
|
3
4
|
class Request
|
5
|
+
# URL encoding HTTP headers.
|
4
6
|
URL_ENCODED = {
|
5
7
|
'Content-Type' => 'application/x-www-form-urlencoded'
|
6
8
|
}.freeze
|
7
9
|
|
10
|
+
# Default SSL options that will be used for connecting to resources
|
11
|
+
# that use a secure connection. By default ProxyFetcher doesn't verify
|
12
|
+
# SSL certs.
|
8
13
|
DEFAULT_SSL_OPTIONS = {
|
9
14
|
verify_mode: OpenSSL::SSL::VERIFY_NONE
|
10
15
|
}.freeze
|
@@ -46,7 +51,7 @@ module ProxyFetcher
|
|
46
51
|
return if payload.nil?
|
47
52
|
|
48
53
|
if payload.is_a?(Hash)
|
49
|
-
headers.merge(URL_ENCODED)
|
54
|
+
headers.merge!(URL_ENCODED)
|
50
55
|
URI.encode_www_form(payload)
|
51
56
|
else
|
52
57
|
payload
|
@@ -1,11 +1,23 @@
|
|
1
1
|
module ProxyFetcher
|
2
|
+
# ProxyFetcher configuration. Stores all the options for dealing
|
3
|
+
# with HTTP requests, adapters, custom classes.
|
4
|
+
#
|
2
5
|
class Configuration
|
3
6
|
attr_accessor :timeout, :pool_size, :user_agent
|
4
7
|
attr_reader :adapter, :http_client, :proxy_validator, :providers
|
5
8
|
|
6
|
-
#
|
7
|
-
|
8
|
-
|
9
|
+
# User-Agent string that will be used by the ProxyFetcher HTTP client (to
|
10
|
+
# send requests via proxy) and to fetch proxy lists from the sources.
|
11
|
+
#
|
12
|
+
# Default is Google Chrome 60, but can be changed in <code>ProxyFetcher.config</code>.
|
13
|
+
#
|
14
|
+
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 ' \
|
15
|
+
'(KHTML, like Gecko) Chrome/60.0.3112 Safari/537.36'.freeze
|
16
|
+
|
17
|
+
# HTML parser adapter name.
|
18
|
+
#
|
19
|
+
# Default is Nokogiri, but can be changed in <code>ProxyFetcher.config</code>.
|
20
|
+
#
|
9
21
|
DEFAULT_ADAPTER = :nokogiri
|
10
22
|
|
11
23
|
class << self
|
@@ -13,6 +25,15 @@ module ProxyFetcher
|
|
13
25
|
@registry ||= ProvidersRegistry.new
|
14
26
|
end
|
15
27
|
|
28
|
+
# Register new proxy provider. Requires provider name and class
|
29
|
+
# that will process proxy list.
|
30
|
+
#
|
31
|
+
# @param name [String, Symbol]
|
32
|
+
# name of the provider
|
33
|
+
#
|
34
|
+
# @param klass [Class]
|
35
|
+
# Class that will fetch and process proxy list
|
36
|
+
#
|
16
37
|
def register_provider(name, klass)
|
17
38
|
providers_registry.register(name, klass)
|
18
39
|
end
|
@@ -1,4 +1,7 @@
|
|
1
1
|
module ProxyFetcher
|
2
|
+
# HTML document abstraction class. Used to work with different HTML parser adapters
|
3
|
+
# such as Nokogiri, Oga or a custom one. Stores <i>backend</i> that will handle all
|
4
|
+
# the DOM manipulation logic.
|
2
5
|
class Document
|
3
6
|
class << self
|
4
7
|
def parse(data)
|
@@ -1,7 +1,10 @@
|
|
1
1
|
module ProxyFetcher
|
2
|
+
# Base exception class for all the ProxyFetcher exceptions.
|
2
3
|
Error = Class.new(StandardError)
|
3
4
|
|
5
|
+
# ProxyFetcher exceptions namespace
|
4
6
|
module Exceptions
|
7
|
+
# Exception for wrong custom classes (such as ProxyValidator or HTTP Client).
|
5
8
|
class WrongCustomClass < Error
|
6
9
|
def initialize(klass, methods)
|
7
10
|
required_methods = Array(methods).join(', ')
|
@@ -9,36 +12,48 @@ module ProxyFetcher
|
|
9
12
|
end
|
10
13
|
end
|
11
14
|
|
15
|
+
# Exception for wrong provider name, that raises when configured provider
|
16
|
+
# is not registered via <code>register_provider</code> interface.
|
12
17
|
class UnknownProvider < Error
|
13
18
|
def initialize(provider_name)
|
14
19
|
super("unregistered proxy provider `#{provider_name}`")
|
15
20
|
end
|
16
21
|
end
|
17
22
|
|
23
|
+
# Exception for cases when user tries to register already existing provider.
|
18
24
|
class RegisteredProvider < Error
|
19
25
|
def initialize(name)
|
20
26
|
super("`#{name}` provider already registered!")
|
21
27
|
end
|
22
28
|
end
|
23
29
|
|
30
|
+
# Exception for cases when HTTP client reached maximum count of redirects
|
31
|
+
# trying to process HTTP request.
|
24
32
|
class MaximumRedirectsReached < Error
|
25
33
|
def initialize(*)
|
26
34
|
super('maximum redirects reached')
|
27
35
|
end
|
28
36
|
end
|
29
37
|
|
38
|
+
# Exception for cases when HTTP client reached maximum count of retries
|
39
|
+
# trying to process HTTP request. Can occur when request failed by timeout
|
40
|
+
# multiple times.
|
30
41
|
class MaximumRetriesReached < Error
|
31
42
|
def initialize(*)
|
32
43
|
super('reached the maximum number of retries')
|
33
44
|
end
|
34
45
|
end
|
35
46
|
|
47
|
+
# Exception for cases when user tries to set wrong HTML parser adapter
|
48
|
+
# in the configuration.
|
36
49
|
class UnknownAdapter < Error
|
37
50
|
def initialize(name)
|
38
51
|
super("unknown adapter '#{name}'")
|
39
52
|
end
|
40
53
|
end
|
41
54
|
|
55
|
+
# Exception for cases when user tries to set <code>nil</code> HTML parser adapter
|
56
|
+
# in the configuration (or just forget to change it).
|
42
57
|
class BlankAdapter < Error
|
43
58
|
def initialize(*)
|
44
59
|
super(<<-MSG.strip.squeeze
|
@@ -49,6 +64,8 @@ module ProxyFetcher
|
|
49
64
|
end
|
50
65
|
end
|
51
66
|
|
67
|
+
# Exception for cases when HTML parser adapter can't be installed.
|
68
|
+
# It will print the reason (backtrace) of the exception that caused an error.
|
52
69
|
class AdapterSetupError < Error
|
53
70
|
def initialize(adapter_name, reason)
|
54
71
|
adapter = demodulize(adapter_name.gsub('Adapter', ''))
|
@@ -1,5 +1,6 @@
|
|
1
1
|
module ProxyFetcher
|
2
2
|
module Providers
|
3
|
+
# Base class for all the ProxyFetcher providers.
|
3
4
|
class Base
|
4
5
|
# Loads proxy provider page content, extract proxy list from it
|
5
6
|
# and convert every entry to proxy object.
|
@@ -8,6 +9,7 @@ module ProxyFetcher
|
|
8
9
|
end
|
9
10
|
|
10
11
|
class << self
|
12
|
+
# Just syntactic sugar to make it easier to call #fetch_proxies! method.
|
11
13
|
def fetch_proxies!(*args)
|
12
14
|
new.fetch_proxies!(*args)
|
13
15
|
end
|
@@ -26,7 +28,12 @@ module ProxyFetcher
|
|
26
28
|
ProxyFetcher::Document.parse(html)
|
27
29
|
end
|
28
30
|
|
29
|
-
#
|
31
|
+
# Fetches HTML content by sending HTTP request to the provider URL and
|
32
|
+
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
33
|
+
# to return all the proxy entries (HTML nodes).
|
34
|
+
#
|
35
|
+
# Abstract method.
|
36
|
+
#
|
30
37
|
def load_proxy_list(*)
|
31
38
|
raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
|
32
39
|
end
|
@@ -1,6 +1,8 @@
|
|
1
1
|
module ProxyFetcher
|
2
2
|
module Providers
|
3
|
+
# FreeProxyList provider class.
|
3
4
|
class FreeProxyList < Base
|
5
|
+
# Provider URL to fetch proxy list
|
4
6
|
PROVIDER_URL = 'https://free-proxy-list.net/'.freeze
|
5
7
|
|
6
8
|
# [NOTE] Doesn't support filtering
|
@@ -9,6 +11,15 @@ module ProxyFetcher
|
|
9
11
|
doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
|
10
12
|
end
|
11
13
|
|
14
|
+
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
15
|
+
# object.
|
16
|
+
#
|
17
|
+
# @param html_node [Object]
|
18
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
19
|
+
#
|
20
|
+
# @return [ProxyFetcher::Proxy]
|
21
|
+
# Proxy object
|
22
|
+
#
|
12
23
|
def to_proxy(html_node)
|
13
24
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
14
25
|
proxy.addr = html_node.content_at('td[1]')
|
@@ -21,6 +32,14 @@ module ProxyFetcher
|
|
21
32
|
|
22
33
|
private
|
23
34
|
|
35
|
+
# Parses HTML node to extract proxy type.
|
36
|
+
#
|
37
|
+
# @param html_node [Object]
|
38
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
39
|
+
#
|
40
|
+
# @return [String]
|
41
|
+
# Proxy type
|
42
|
+
#
|
24
43
|
def parse_type(html_node)
|
25
44
|
https = html_node.content_at('td[6]')
|
26
45
|
https && https.casecmp('yes').zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
|
@@ -1,14 +1,32 @@
|
|
1
1
|
module ProxyFetcher
|
2
2
|
module Providers
|
3
|
+
# FreeProxyListSSL provider class.
|
3
4
|
class FreeProxyListSSL < Base
|
5
|
+
# Provider URL to fetch proxy list
|
4
6
|
PROVIDER_URL = 'https://www.sslproxies.org/'.freeze
|
5
7
|
|
8
|
+
# Fetches HTML content by sending HTTP request to the provider URL and
|
9
|
+
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
10
|
+
# to return all the proxy entries (HTML nodes).
|
11
|
+
#
|
12
|
+
# @return [Array<ProxyFetcher::Document::Node>]
|
13
|
+
# Collection of extracted HTML nodes with full proxy info
|
14
|
+
#
|
6
15
|
# [NOTE] Doesn't support filtering
|
7
16
|
def load_proxy_list(*)
|
8
17
|
doc = load_document(PROVIDER_URL, {})
|
9
18
|
doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
|
10
19
|
end
|
11
20
|
|
21
|
+
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
22
|
+
# object.
|
23
|
+
#
|
24
|
+
# @param html_node [Object]
|
25
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
26
|
+
#
|
27
|
+
# @return [ProxyFetcher::Proxy]
|
28
|
+
# Proxy object
|
29
|
+
#
|
12
30
|
def to_proxy(html_node)
|
13
31
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
14
32
|
proxy.addr = html_node.content_at('td[1]')
|
@@ -2,14 +2,32 @@ require 'json'
|
|
2
2
|
|
3
3
|
module ProxyFetcher
|
4
4
|
module Providers
|
5
|
+
# GatherProxy provider class.
|
5
6
|
class GatherProxy < Base
|
7
|
+
# Provider URL to fetch proxy list
|
6
8
|
PROVIDER_URL = 'http://www.gatherproxy.com/'.freeze
|
7
9
|
|
10
|
+
# Fetches HTML content by sending HTTP request to the provider URL and
|
11
|
+
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
12
|
+
# to return all the proxy entries (HTML nodes).
|
13
|
+
#
|
14
|
+
# @return [Array<ProxyFetcher::Document::Node>]
|
15
|
+
# Collection of extracted HTML nodes with full proxy info
|
16
|
+
#
|
8
17
|
def load_proxy_list(*)
|
9
18
|
doc = load_document(PROVIDER_URL)
|
10
19
|
doc.xpath('//div[@class="proxy-list"]/table/script')
|
11
20
|
end
|
12
21
|
|
22
|
+
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
23
|
+
# object.
|
24
|
+
#
|
25
|
+
# @param html_node [Object]
|
26
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
27
|
+
#
|
28
|
+
# @return [ProxyFetcher::Proxy]
|
29
|
+
# Proxy object
|
30
|
+
#
|
13
31
|
def to_proxy(html_node)
|
14
32
|
json = parse_json(html_node)
|
15
33
|
|
@@ -1,13 +1,31 @@
|
|
1
1
|
module ProxyFetcher
|
2
2
|
module Providers
|
3
|
+
# HTTPTunnel provider class.
|
3
4
|
class HTTPTunnel < Base
|
5
|
+
# Provider URL to fetch proxy list
|
4
6
|
PROVIDER_URL = 'http://www.httptunnel.ge/ProxyListForFree.aspx'.freeze
|
5
7
|
|
8
|
+
# Fetches HTML content by sending HTTP request to the provider URL and
|
9
|
+
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
10
|
+
# to return all the proxy entries (HTML nodes).
|
11
|
+
#
|
12
|
+
# @return [Array<ProxyFetcher::Document::Node>]
|
13
|
+
# Collection of extracted HTML nodes with full proxy info
|
14
|
+
#
|
6
15
|
def load_proxy_list(*)
|
7
16
|
doc = load_document(PROVIDER_URL)
|
8
17
|
doc.xpath('//table[contains(@id, "GridView")]/tr[(count(td)>2)]')
|
9
18
|
end
|
10
19
|
|
20
|
+
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
21
|
+
# object.
|
22
|
+
#
|
23
|
+
# @param html_node [Object]
|
24
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
25
|
+
#
|
26
|
+
# @return [ProxyFetcher::Proxy]
|
27
|
+
# Proxy object
|
28
|
+
#
|
11
29
|
def to_proxy(html_node)
|
12
30
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
13
31
|
uri = parse_proxy_uri(html_node)
|
@@ -22,15 +40,39 @@ module ProxyFetcher
|
|
22
40
|
|
23
41
|
private
|
24
42
|
|
43
|
+
# Parses HTML node to extract URI object with proxy host and port.
|
44
|
+
#
|
45
|
+
# @param html_node [Object]
|
46
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
47
|
+
#
|
48
|
+
# @return [URI]
|
49
|
+
# URI object
|
50
|
+
#
|
25
51
|
def parse_proxy_uri(html_node)
|
26
52
|
full_addr = html_node.content_at('td[1]')
|
27
53
|
URI.parse("http://#{full_addr}")
|
28
54
|
end
|
29
55
|
|
56
|
+
# Parses HTML node to extract proxy country.
|
57
|
+
#
|
58
|
+
# @param html_node [Object]
|
59
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
60
|
+
#
|
61
|
+
# @return [String]
|
62
|
+
# Country code
|
63
|
+
#
|
30
64
|
def parse_country(html_node)
|
31
65
|
html_node.find('.//img').attr('title')
|
32
66
|
end
|
33
67
|
|
68
|
+
# Parses HTML node to extract proxy anonymity level.
|
69
|
+
#
|
70
|
+
# @param html_node [Object]
|
71
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
72
|
+
#
|
73
|
+
# @return [String]
|
74
|
+
# Anonymity level
|
75
|
+
#
|
34
76
|
def parse_anonymity(html_node)
|
35
77
|
transparency = html_node.content_at('td[5]').to_sym
|
36
78
|
|
@@ -1,14 +1,32 @@
|
|
1
1
|
module ProxyFetcher
|
2
2
|
module Providers
|
3
|
+
# ProxyDocker provider class.
|
3
4
|
class ProxyDocker < Base
|
4
|
-
|
5
|
+
# Provider URL to fetch proxy list
|
6
|
+
PROVIDER_URL = 'https://www.proxydocker.com/en/proxylist/'.freeze
|
5
7
|
|
8
|
+
# Fetches HTML content by sending HTTP request to the provider URL and
|
9
|
+
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
10
|
+
# to return all the proxy entries (HTML nodes).
|
11
|
+
#
|
12
|
+
# @return [Array<ProxyFetcher::Document::Node>]
|
13
|
+
# Collection of extracted HTML nodes with full proxy info
|
14
|
+
#
|
6
15
|
# [NOTE] Doesn't support direct filters
|
7
16
|
def load_proxy_list(*)
|
8
17
|
doc = load_document(PROVIDER_URL, {})
|
9
18
|
doc.xpath('//table[contains(@class, "table")]/tr[(not(@id="proxy-table-header")) and (count(td)>2)]')
|
10
19
|
end
|
11
20
|
|
21
|
+
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
22
|
+
# object.
|
23
|
+
#
|
24
|
+
# @param html_node [Object]
|
25
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
26
|
+
#
|
27
|
+
# @return [ProxyFetcher::Proxy]
|
28
|
+
# Proxy object
|
29
|
+
#
|
12
30
|
def to_proxy(html_node)
|
13
31
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
14
32
|
uri = URI("//#{html_node.content_at('td[1]')}")
|
@@ -2,14 +2,32 @@ require 'base64'
|
|
2
2
|
|
3
3
|
module ProxyFetcher
|
4
4
|
module Providers
|
5
|
+
# ProxyList provider class.
|
5
6
|
class ProxyList < Base
|
7
|
+
# Provider URL to fetch proxy list
|
6
8
|
PROVIDER_URL = 'https://proxy-list.org/english/index.php'.freeze
|
7
9
|
|
10
|
+
# Fetches HTML content by sending HTTP request to the provider URL and
|
11
|
+
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
12
|
+
# to return all the proxy entries (HTML nodes).
|
13
|
+
#
|
14
|
+
# @return [Array<ProxyFetcher::Document::Node>]
|
15
|
+
# Collection of extracted HTML nodes with full proxy info
|
16
|
+
#
|
8
17
|
def load_proxy_list(filters = {})
|
9
18
|
doc = load_document(PROVIDER_URL, filters)
|
10
19
|
doc.css('.table-wrap .table ul')
|
11
20
|
end
|
12
21
|
|
22
|
+
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
23
|
+
# object.
|
24
|
+
#
|
25
|
+
# @param html_node [Object]
|
26
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
27
|
+
#
|
28
|
+
# @return [ProxyFetcher::Proxy]
|
29
|
+
# Proxy object
|
30
|
+
#
|
13
31
|
def to_proxy(html_node)
|
14
32
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
15
33
|
uri = parse_proxy_uri(html_node)
|
@@ -24,6 +42,14 @@ module ProxyFetcher
|
|
24
42
|
|
25
43
|
private
|
26
44
|
|
45
|
+
# Parses HTML node to extract URI object with proxy host and port.
|
46
|
+
#
|
47
|
+
# @param html_node [Object]
|
48
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
49
|
+
#
|
50
|
+
# @return [URI]
|
51
|
+
# URI object
|
52
|
+
#
|
27
53
|
def parse_proxy_uri(html_node)
|
28
54
|
full_addr = ::Base64.decode64(html_node.at_css('li script').html.match(/'(.+)'/)[1])
|
29
55
|
URI.parse("http://#{full_addr}")
|
@@ -1,13 +1,31 @@
|
|
1
1
|
module ProxyFetcher
|
2
2
|
module Providers
|
3
|
+
# XRoxy provider class.
|
3
4
|
class XRoxy < Base
|
5
|
+
# Provider URL to fetch proxy list
|
4
6
|
PROVIDER_URL = 'http://www.xroxy.com/proxylist.php'.freeze
|
5
7
|
|
8
|
+
# Fetches HTML content by sending HTTP request to the provider URL and
|
9
|
+
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
10
|
+
# to return all the proxy entries (HTML nodes).
|
11
|
+
#
|
12
|
+
# @return [Array<ProxyFetcher::Document::Node>]
|
13
|
+
# Collection of extracted HTML nodes with full proxy info
|
14
|
+
#
|
6
15
|
def load_proxy_list(filters = { type: 'All_http' })
|
7
16
|
doc = load_document(PROVIDER_URL, filters)
|
8
17
|
doc.xpath('//div[@id="content"]/table[1]/tr[contains(@class, "row")]')
|
9
18
|
end
|
10
19
|
|
20
|
+
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
21
|
+
# object.
|
22
|
+
#
|
23
|
+
# @param html_node [Object]
|
24
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
25
|
+
#
|
26
|
+
# @return [ProxyFetcher::Proxy]
|
27
|
+
# Proxy object
|
28
|
+
#
|
11
29
|
def to_proxy(html_node)
|
12
30
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
13
31
|
proxy.addr = html_node.content_at('td[2]')
|
@@ -21,6 +39,14 @@ module ProxyFetcher
|
|
21
39
|
|
22
40
|
private
|
23
41
|
|
42
|
+
# Parses HTML node to extract proxy type.
|
43
|
+
#
|
44
|
+
# @param html_node [Object]
|
45
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
46
|
+
#
|
47
|
+
# @return [String]
|
48
|
+
# Proxy type
|
49
|
+
#
|
24
50
|
def parse_type(html_node)
|
25
51
|
https = html_node.content_at('td[5]')
|
26
52
|
https.casecmp('true').zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
|
data/lib/proxy_fetcher/proxy.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
module ProxyFetcher
|
2
|
+
# Proxy object
|
2
3
|
class Proxy
|
3
4
|
attr_accessor :addr, :port, :type, :country, :response_time, :anonymity
|
4
5
|
|
6
|
+
# Proxy type
|
5
7
|
TYPES = [
|
6
8
|
HTTP = 'HTTP'.freeze,
|
7
9
|
HTTPS = 'HTTPS'.freeze,
|
@@ -9,12 +11,22 @@ module ProxyFetcher
|
|
9
11
|
SOCKS5 = 'SOCKS5'.freeze
|
10
12
|
].freeze
|
11
13
|
|
14
|
+
# Proxy type predicates (#socks4?, #https?)
|
15
|
+
#
|
16
|
+
# @return [Boolean]
|
17
|
+
# true if proxy of requested type, otherwise false.
|
18
|
+
#
|
12
19
|
TYPES.each do |proxy_type|
|
13
20
|
define_method "#{proxy_type.downcase}?" do
|
14
21
|
!type.nil? && type.upcase.include?(proxy_type)
|
15
22
|
end
|
16
23
|
end
|
17
24
|
|
25
|
+
# Returns true if proxy is secure (works through https, socks4 or socks5).
|
26
|
+
#
|
27
|
+
# @return [Boolean]
|
28
|
+
# true if proxy is secure, otherwise false.
|
29
|
+
#
|
18
30
|
def ssl?
|
19
31
|
https? || socks4? || socks5?
|
20
32
|
end
|
@@ -25,16 +37,32 @@ module ProxyFetcher
|
|
25
37
|
end
|
26
38
|
end
|
27
39
|
|
40
|
+
# Checks if proxy object is connectable? (can be used as a proxy for
|
41
|
+
# network requests).
|
42
|
+
#
|
43
|
+
# @return [Boolean]
|
44
|
+
# true if proxy connectable, otherwise false.
|
45
|
+
#
|
28
46
|
def connectable?
|
29
47
|
ProxyFetcher.config.proxy_validator.connectable?(addr, port)
|
30
48
|
end
|
31
49
|
|
32
50
|
alias valid? connectable?
|
33
51
|
|
52
|
+
# Returns <code>URI::Generic</code> object with host and port values of the proxy.
|
53
|
+
#
|
54
|
+
# @return [URI::Generic]
|
55
|
+
# URI object.
|
56
|
+
#
|
34
57
|
def uri
|
35
58
|
URI::Generic.build(host: addr, port: port)
|
36
59
|
end
|
37
60
|
|
61
|
+
# Returns <code>String</code> object with <i>addr:port</i> values of the proxy.
|
62
|
+
#
|
63
|
+
# @return [String]
|
64
|
+
# proxy address and port joined as <i>addr:port</i>.
|
65
|
+
#
|
38
66
|
def url
|
39
67
|
"#{addr}:#{port}"
|
40
68
|
end
|
@@ -1,5 +1,9 @@
|
|
1
1
|
module ProxyFetcher
|
2
|
+
# Default ProxyFetcher proxy validator that checks whether a proxy is
|
3
|
+
# connectable or not. It tries to send HEAD request to default
|
4
|
+
# URL to check if proxy can be used (aka connectable?).
|
2
5
|
class ProxyValidator
|
6
|
+
# Default URL that will be used to check if proxy can be used.
|
3
7
|
URL_TO_CHECK = 'https://google.com'.freeze
|
4
8
|
|
5
9
|
def initialize(proxy_addr, proxy_port)
|
@@ -1,15 +1,19 @@
|
|
1
1
|
module ProxyFetcher
|
2
|
+
##
|
3
|
+
# ProxyFetcher gem version.
|
2
4
|
def self.gem_version
|
3
5
|
Gem::Version.new VERSION::STRING
|
4
6
|
end
|
5
7
|
|
8
|
+
##
|
9
|
+
# ProxyFetcher gem semantic versioning.
|
6
10
|
module VERSION
|
7
11
|
# Major version number
|
8
12
|
MAJOR = 0
|
9
13
|
# Minor version number
|
10
14
|
MINOR = 6
|
11
15
|
# Smallest version number
|
12
|
-
TINY =
|
16
|
+
TINY = 2
|
13
17
|
|
14
18
|
# Full version number
|
15
19
|
STRING = [MAJOR, MINOR, TINY].compact.join('.')
|
data/lib/proxy_fetcher.rb
CHANGED
@@ -22,7 +22,10 @@ require File.dirname(__FILE__) + '/proxy_fetcher/document/adapters/abstract_adap
|
|
22
22
|
require File.dirname(__FILE__) + '/proxy_fetcher/document/adapters/nokogiri_adapter'
|
23
23
|
require File.dirname(__FILE__) + '/proxy_fetcher/document/adapters/oga_adapter'
|
24
24
|
|
25
|
+
##
|
26
|
+
# Ruby / JRuby lib for managing proxies
|
25
27
|
module ProxyFetcher
|
28
|
+
# ProxyFetcher providers namespace
|
26
29
|
module Providers
|
27
30
|
require File.dirname(__FILE__) + '/proxy_fetcher/providers/base'
|
28
31
|
require File.dirname(__FILE__) + '/proxy_fetcher/providers/free_proxy_list'
|
@@ -34,11 +37,33 @@ module ProxyFetcher
|
|
34
37
|
require File.dirname(__FILE__) + '/proxy_fetcher/providers/xroxy'
|
35
38
|
end
|
36
39
|
|
40
|
+
# Main ProxyFetcher module.
|
37
41
|
class << self
|
42
|
+
##
|
43
|
+
# Returns ProxyFetcher configuration.
|
44
|
+
#
|
45
|
+
# @return [ProxyFetcher::Configuration]
|
46
|
+
# Configuration object.
|
47
|
+
#
|
48
|
+
# @example
|
49
|
+
# ProxyFetcher.config
|
50
|
+
#
|
51
|
+
# #=> #<ProxyFetcher::Configuration:0x0000000241eec8 @user_agent="Mozilla/5.0, ...", @pool_size=10,
|
52
|
+
# @timeout=3, @http_client=ProxyFetcher::HTTPClient, @proxy_validator=ProxyFetcher::ProxyValidator,
|
53
|
+
# @providers=[:free_proxy_list, ...], @adapter=ProxyFetcher::Document::NokogiriAdapter>
|
54
|
+
#
|
38
55
|
def config
|
39
56
|
@config ||= ProxyFetcher::Configuration.new
|
40
57
|
end
|
41
58
|
|
59
|
+
##
|
60
|
+
# Configures ProxyFetcher and yields config object for additional manipulations.
|
61
|
+
|
62
|
+
# @yieldparam config [ProxyFetcher::Configuration] configuration object to modify
|
63
|
+
#
|
64
|
+
# @return [ProxyFetcher::Configuration]
|
65
|
+
# Configuration object.
|
66
|
+
#
|
42
67
|
def configure
|
43
68
|
yield config
|
44
69
|
end
|
data/proxy_fetcher.gemspec
CHANGED
@@ -5,7 +5,7 @@ require 'proxy_fetcher/version'
|
|
5
5
|
Gem::Specification.new do |gem|
|
6
6
|
gem.name = 'proxy_fetcher'
|
7
7
|
gem.version = ProxyFetcher.gem_version
|
8
|
-
gem.date = '2017-12-
|
8
|
+
gem.date = '2017-12-27'
|
9
9
|
gem.summary = 'Ruby gem for dealing with proxy lists from different providers'
|
10
10
|
gem.description = 'This gem can help your Ruby application to make HTTP(S) requests ' \
|
11
11
|
'using proxies by fetching and validating proxy lists from the different providers.'
|
@@ -24,13 +24,20 @@ describe ProxyFetcher::Proxy do
|
|
24
24
|
proxy.type = ProxyFetcher::Proxy::HTTP
|
25
25
|
expect(proxy.http?).to be_truthy
|
26
26
|
expect(proxy.https?).to be_falsey
|
27
|
+
expect(proxy.ssl?).to be_falsey
|
27
28
|
|
28
29
|
proxy.type = ProxyFetcher::Proxy::HTTPS
|
29
30
|
expect(proxy.https?).to be_truthy
|
30
31
|
expect(proxy.http?).to be_truthy
|
32
|
+
expect(proxy.ssl?).to be_truthy
|
33
|
+
|
34
|
+
proxy.type = ProxyFetcher::Proxy::SOCKS4
|
35
|
+
expect(proxy.socks4?).to be_truthy
|
36
|
+
expect(proxy.ssl?).to be_truthy
|
31
37
|
|
32
38
|
proxy.type = ProxyFetcher::Proxy::SOCKS5
|
33
39
|
expect(proxy.socks5?).to be_truthy
|
40
|
+
expect(proxy.ssl?).to be_truthy
|
34
41
|
end
|
35
42
|
|
36
43
|
it 'not connectable if IP addr is wrong' do
|
@@ -51,6 +58,9 @@ describe ProxyFetcher::Proxy do
|
|
51
58
|
|
52
59
|
it 'returns URI::Generic' do
|
53
60
|
expect(proxy.uri).to be_a(URI::Generic)
|
61
|
+
|
62
|
+
expect(proxy.uri.host).not_to be_empty
|
63
|
+
expect(proxy.uri.port).not_to be_nil
|
54
64
|
end
|
55
65
|
|
56
66
|
it 'returns URL' do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proxy_fetcher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nikita Bulai
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-12-
|
11
|
+
date: 2017-12-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -107,7 +107,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
107
|
version: '0'
|
108
108
|
requirements: []
|
109
109
|
rubyforge_project:
|
110
|
-
rubygems_version: 2.
|
110
|
+
rubygems_version: 2.6.11
|
111
111
|
signing_key:
|
112
112
|
specification_version: 4
|
113
113
|
summary: Ruby gem for dealing with proxy lists from different providers
|