proxy_fetcher 0.6.1 → 0.6.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -10
- data/.travis.yml +7 -4
- data/README.md +11 -9
- data/bin/proxy_fetcher +0 -0
- data/lib/proxy_fetcher/client/client.rb +95 -0
- data/lib/proxy_fetcher/client/request.rb +6 -1
- data/lib/proxy_fetcher/configuration.rb +24 -3
- data/lib/proxy_fetcher/document.rb +3 -0
- data/lib/proxy_fetcher/exceptions.rb +17 -0
- data/lib/proxy_fetcher/manager.rb +1 -0
- data/lib/proxy_fetcher/providers/base.rb +8 -1
- data/lib/proxy_fetcher/providers/free_proxy_list.rb +19 -0
- data/lib/proxy_fetcher/providers/free_proxy_list_ssl.rb +18 -0
- data/lib/proxy_fetcher/providers/gather_proxy.rb +18 -0
- data/lib/proxy_fetcher/providers/http_tunnel.rb +42 -0
- data/lib/proxy_fetcher/providers/proxy_docker.rb +19 -1
- data/lib/proxy_fetcher/providers/proxy_list.rb +26 -0
- data/lib/proxy_fetcher/providers/xroxy.rb +26 -0
- data/lib/proxy_fetcher/proxy.rb +28 -0
- data/lib/proxy_fetcher/utils/http_client.rb +3 -0
- data/lib/proxy_fetcher/utils/proxy_validator.rb +4 -0
- data/lib/proxy_fetcher/version.rb +5 -1
- data/lib/proxy_fetcher.rb +25 -0
- data/proxy_fetcher.gemspec +1 -1
- data/spec/proxy_fetcher/proxy_spec.rb +10 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6c2d4076c9f73303b305c364d3dcf672b6e8eba7
|
4
|
+
data.tar.gz: b692847fab646ec4bd230909ac1cba9eb77c1c7f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cd70341cd18eea64dad58d63b8888f36e403251bb08e9d0289c584de1bc2201b45ff97f48fbac26697b25ee52328611b1e37de8d393b3b50ed7b3885192d0885
|
7
|
+
data.tar.gz: aed0b7532628796cb2b9f397be8bd8d87108c4d5a4e4efd2d30a468e2b8c9c203fbe7ac1fd4b56c7e571a67748b681297c56e2c880fddddb20c463bc117789e6
|
data/.gitignore
CHANGED
@@ -17,16 +17,6 @@ Gemfile.lock
|
|
17
17
|
certs
|
18
18
|
gemfiles/*.gemfile.lock
|
19
19
|
|
20
|
-
# TODO Comment out this rule if you are OK with secrets being uploaded to the repo
|
21
|
-
config/initializers/secret_token.rb
|
22
|
-
|
23
|
-
# Only include if you have production secrets in this file, which is no longer a Rails default
|
24
|
-
# config/secrets.yml
|
25
|
-
|
26
|
-
# dotenv
|
27
|
-
# TODO Comment out this rule if environment variables can be committed
|
28
|
-
.env
|
29
|
-
|
30
20
|
## Environment normalization:
|
31
21
|
/.bundle
|
32
22
|
/vendor/bundle
|
@@ -47,3 +37,5 @@ bower.json
|
|
47
37
|
|
48
38
|
# Ignore Byebug command history file.
|
49
39
|
.byebug_history
|
40
|
+
.yardoc/
|
41
|
+
doc/
|
data/.travis.yml
CHANGED
@@ -2,9 +2,10 @@ language: ruby
|
|
2
2
|
before_install: gem install bundler
|
3
3
|
bundler_args: --without yard guard benchmarks
|
4
4
|
script: "rake spec"
|
5
|
-
env:
|
6
|
-
|
7
|
-
|
5
|
+
env: JRUBY_OPTS="$JRUBY_OPTS --debug"
|
6
|
+
gemfile:
|
7
|
+
- gemfiles/oga.gemfile
|
8
|
+
- gemfiles/nokogiri.gemfile
|
8
9
|
gemfile:
|
9
10
|
- gemfiles/oga.gemfile
|
10
11
|
- gemfiles/nokogiri.gemfile
|
@@ -13,8 +14,10 @@ rvm:
|
|
13
14
|
- 2.1
|
14
15
|
- 2.2.4
|
15
16
|
- 2.3.3
|
16
|
-
- 2.4.
|
17
|
+
- 2.4.3
|
18
|
+
- 2.5.0
|
17
19
|
- ruby-head
|
20
|
+
- jruby-9.1.15.0
|
18
21
|
matrix:
|
19
22
|
allow_failures:
|
20
23
|
- rvm: ruby-head
|
data/README.md
CHANGED
@@ -1,16 +1,17 @@
|
|
1
|
-
# Ruby lib for managing proxies
|
1
|
+
# Ruby / JRuby lib for managing proxies
|
2
2
|
[![Gem Version](https://badge.fury.io/rb/proxy_fetcher.svg)](http://badge.fury.io/rb/proxy_fetcher)
|
3
3
|
[![Build Status](https://travis-ci.org/nbulaj/proxy_fetcher.svg?branch=master)](https://travis-ci.org/nbulaj/proxy_fetcher)
|
4
4
|
[![Coverage Status](https://coveralls.io/repos/github/nbulaj/proxy_fetcher/badge.svg)](https://coveralls.io/github/nbulaj/proxy_fetcher)
|
5
5
|
[![Code Climate](https://codeclimate.com/github/nbulaj/proxy_fetcher/badges/gpa.svg)](https://codeclimate.com/github/nbulaj/proxy_fetcher)
|
6
|
+
[![Inline docs](http://inch-ci.org/github/nbulaj/proxy_fetcher.png?branch=master)](http://inch-ci.org/github/nbulaj/proxy_fetcher)
|
6
7
|
[![License](http://img.shields.io/badge/license-MIT-brightgreen.svg)](#license)
|
7
8
|
|
8
|
-
This gem can help your Ruby application to make HTTP(S) requests using
|
9
|
-
proxy lists from multiple providers.
|
9
|
+
This gem can help your Ruby / JRuby application to make HTTP(S) requests using
|
10
|
+
proxy by fetching and validating actual proxy lists from multiple providers.
|
10
11
|
|
11
12
|
It gives you a special `Manager` class that can load proxy lists, validate them and return random or specific proxies.
|
12
|
-
It also has a `Client` class that encapsulates all the logic for
|
13
|
-
Take a look at the documentation below to find all the gem features.
|
13
|
+
It also has a `Client` class that encapsulates all the logic for sending HTTP requests using proxies, automatically
|
14
|
+
fetched and validated by the gem. Take a look at the documentation below to find all the gem features.
|
14
15
|
|
15
16
|
Also this gem can be used with any other programming language (Go / Python / etc) as standalone solution for downloading and
|
16
17
|
validating proxy lists from the different providers. [Checkout examples](#standalone) of usage below.
|
@@ -32,13 +33,14 @@ validating proxy lists from the different providers. [Checkout examples](#standa
|
|
32
33
|
|
33
34
|
## Dependencies
|
34
35
|
|
35
|
-
ProxyFetcher gem itself requires only Ruby `>= 2.0.0
|
36
|
+
ProxyFetcher gem itself requires only Ruby `>= 2.0.0` (or [JRuby](http://jruby.org/) `> 9.0`, but maybe earlier too,
|
37
|
+
[see Travis build matrix](.travis.yml)).
|
36
38
|
|
37
39
|
However, it requires an adapter to parse HTML. If you do not specify any specific adapter, then it will use
|
38
40
|
default one - [Nokogiri](https://github.com/sparklemotion/nokogiri). It's OK for any Ruby on Rails project
|
39
|
-
(because they
|
41
|
+
(because they use it by default).
|
40
42
|
|
41
|
-
But if you want to use some specific adapter (for example your
|
43
|
+
But if you want to use some specific adapter (for example your application uses [Oga](https://gitlab.com/yorickpeterse/oga)),
|
42
44
|
then you need to manually add your dependencies to your project and configure ProxyFetcher to use another adapter. Moreover,
|
43
45
|
you can implement your own adapter if it fits your use-case. Take a look at the [Configuration](#configuration) section for more details.
|
44
46
|
|
@@ -65,7 +67,7 @@ bundle install
|
|
65
67
|
Otherwise simply install the gem:
|
66
68
|
|
67
69
|
```sh
|
68
|
-
gem install proxy_fetcher -v '0.
|
70
|
+
gem install proxy_fetcher -v '0.6'
|
69
71
|
```
|
70
72
|
|
71
73
|
## Example of usage
|
data/bin/proxy_fetcher
CHANGED
File without changes
|
@@ -1,26 +1,121 @@
|
|
1
1
|
module ProxyFetcher
|
2
|
+
# ProxyFetcher HTTP client that encapsulates all the logic for sending
|
3
|
+
# HTTP(S) requests using proxies, automatically fetched and validated by the gem.
|
2
4
|
module Client
|
3
5
|
class << self
|
6
|
+
# Sends HTTP GET request.
|
7
|
+
#
|
8
|
+
# @param url [String]
|
9
|
+
# Requested URL
|
10
|
+
#
|
11
|
+
# @param headers [Hash]
|
12
|
+
# HTTP headers that will be used in the request
|
13
|
+
#
|
14
|
+
# @param options [Hash]
|
15
|
+
# Additional options used by <code>ProxyFetcher::Client</code>
|
16
|
+
#
|
17
|
+
# @return [String]
|
18
|
+
# HTML body from the URL.
|
19
|
+
#
|
4
20
|
def get(url, headers: {}, options: {})
|
5
21
|
request_without_payload(:get, url, headers, options)
|
6
22
|
end
|
7
23
|
|
24
|
+
# Sends HTTP HEAD request.
|
25
|
+
#
|
26
|
+
# @param url [String]
|
27
|
+
# Requested URL
|
28
|
+
#
|
29
|
+
# @param headers [Hash]
|
30
|
+
# HTTP headers that will be used in the request
|
31
|
+
#
|
32
|
+
# @param options [Hash]
|
33
|
+
# Additional options used by <code>ProxyFetcher::Client</code>
|
34
|
+
#
|
35
|
+
# @return [String]
|
36
|
+
# HTML body from the URL.
|
37
|
+
#
|
8
38
|
def head(url, headers: {}, options: {})
|
9
39
|
request_without_payload(:head, url, headers, options)
|
10
40
|
end
|
11
41
|
|
42
|
+
# Sends HTTP POST request.
|
43
|
+
#
|
44
|
+
# @param url [String]
|
45
|
+
# Requested URL
|
46
|
+
#
|
47
|
+
# @param payload [String, Hash]
|
48
|
+
# HTTP payload
|
49
|
+
#
|
50
|
+
# @param headers [Hash]
|
51
|
+
# HTTP headers that will be used in the request
|
52
|
+
#
|
53
|
+
# @param options [Hash]
|
54
|
+
# Additional options used by <code>ProxyFetcher::Client</code>
|
55
|
+
#
|
56
|
+
# @return [String]
|
57
|
+
# HTML body from the URL.
|
58
|
+
#
|
12
59
|
def post(url, payload, headers: {}, options: {})
|
13
60
|
request_with_payload(:post, url, payload, headers, options)
|
14
61
|
end
|
15
62
|
|
63
|
+
# Sends HTTP DELETE request.
|
64
|
+
#
|
65
|
+
# @param url [String]
|
66
|
+
# Requested URL
|
67
|
+
#
|
68
|
+
# @param headers [Hash]
|
69
|
+
# HTTP headers that will be used in the request
|
70
|
+
#
|
71
|
+
# @param options [Hash]
|
72
|
+
# Additional options used by <code>ProxyFetcher::Client</code>
|
73
|
+
#
|
74
|
+
# @return [String]
|
75
|
+
# HTML body from the URL.
|
76
|
+
#
|
16
77
|
def delete(url, headers: {}, options: {})
|
17
78
|
request_without_payload(:delete, url, headers, options)
|
18
79
|
end
|
19
80
|
|
81
|
+
# Sends HTTP PUT request.
|
82
|
+
#
|
83
|
+
# @param url [String]
|
84
|
+
# Requested URL
|
85
|
+
#
|
86
|
+
# @param payload [String, Hash]
|
87
|
+
# HTTP payload
|
88
|
+
#
|
89
|
+
# @param headers [Hash]
|
90
|
+
# HTTP headers that will be used in the request
|
91
|
+
#
|
92
|
+
# @param options [Hash]
|
93
|
+
# Additional options used by <code>ProxyFetcher::Client</code>
|
94
|
+
#
|
95
|
+
# @return [String]
|
96
|
+
# HTML body from the URL.
|
97
|
+
#
|
20
98
|
def put(url, payload, headers: {}, options: {})
|
21
99
|
request_with_payload(:put, url, payload, headers, options)
|
22
100
|
end
|
23
101
|
|
102
|
+
# Sends HTTP PATCH request.
|
103
|
+
#
|
104
|
+
# @param url [String]
|
105
|
+
# Requested URL
|
106
|
+
#
|
107
|
+
# @param payload [String, Hash]
|
108
|
+
# HTTP payload
|
109
|
+
#
|
110
|
+
# @param headers [Hash]
|
111
|
+
# HTTP headers that will be used in the request
|
112
|
+
#
|
113
|
+
# @param options [Hash]
|
114
|
+
# Additional options used by <code>ProxyFetcher::Client</code>
|
115
|
+
#
|
116
|
+
# @return [String]
|
117
|
+
# HTML body from the URL.
|
118
|
+
#
|
24
119
|
def patch(url, payload, headers: {}, options: {})
|
25
120
|
request_with_payload(:patch, url, payload, headers, options)
|
26
121
|
end
|
@@ -1,10 +1,15 @@
|
|
1
1
|
module ProxyFetcher
|
2
2
|
module Client
|
3
|
+
# ProxyFetcher::Client HTTP request abstraction.
|
3
4
|
class Request
|
5
|
+
# URL encoding HTTP headers.
|
4
6
|
URL_ENCODED = {
|
5
7
|
'Content-Type' => 'application/x-www-form-urlencoded'
|
6
8
|
}.freeze
|
7
9
|
|
10
|
+
# Default SSL options that will be used for connecting to resources
|
11
|
+
# that use a secure connection. By default ProxyFetcher doesn't verify
|
12
|
+
# SSL certs.
|
8
13
|
DEFAULT_SSL_OPTIONS = {
|
9
14
|
verify_mode: OpenSSL::SSL::VERIFY_NONE
|
10
15
|
}.freeze
|
@@ -46,7 +51,7 @@ module ProxyFetcher
|
|
46
51
|
return if payload.nil?
|
47
52
|
|
48
53
|
if payload.is_a?(Hash)
|
49
|
-
headers.merge(URL_ENCODED)
|
54
|
+
headers.merge!(URL_ENCODED)
|
50
55
|
URI.encode_www_form(payload)
|
51
56
|
else
|
52
57
|
payload
|
@@ -1,11 +1,23 @@
|
|
1
1
|
module ProxyFetcher
|
2
|
+
# ProxyFetcher configuration. Stores all the options for dealing
|
3
|
+
# with HTTP requests, adapters, custom classes.
|
4
|
+
#
|
2
5
|
class Configuration
|
3
6
|
attr_accessor :timeout, :pool_size, :user_agent
|
4
7
|
attr_reader :adapter, :http_client, :proxy_validator, :providers
|
5
8
|
|
6
|
-
#
|
7
|
-
|
8
|
-
|
9
|
+
# User-Agent string that will be used by the ProxyFetcher HTTP client (to
|
10
|
+
# send requests via proxy) and to fetch proxy lists from the sources.
|
11
|
+
#
|
12
|
+
# Default is Google Chrome 60, but can be changed in <code>ProxyFetcher.config</code>.
|
13
|
+
#
|
14
|
+
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 ' \
|
15
|
+
'(KHTML, like Gecko) Chrome/60.0.3112 Safari/537.36'.freeze
|
16
|
+
|
17
|
+
# HTML parser adapter name.
|
18
|
+
#
|
19
|
+
# Default is Nokogiri, but can be changed in <code>ProxyFetcher.config</code>.
|
20
|
+
#
|
9
21
|
DEFAULT_ADAPTER = :nokogiri
|
10
22
|
|
11
23
|
class << self
|
@@ -13,6 +25,15 @@ module ProxyFetcher
|
|
13
25
|
@registry ||= ProvidersRegistry.new
|
14
26
|
end
|
15
27
|
|
28
|
+
# Register new proxy provider. Requires provider name and class
|
29
|
+
# that will process proxy list.
|
30
|
+
#
|
31
|
+
# @param name [String, Symbol]
|
32
|
+
# name of the provider
|
33
|
+
#
|
34
|
+
# @param klass [Class]
|
35
|
+
# Class that will fetch and process proxy list
|
36
|
+
#
|
16
37
|
def register_provider(name, klass)
|
17
38
|
providers_registry.register(name, klass)
|
18
39
|
end
|
@@ -1,4 +1,7 @@
|
|
1
1
|
module ProxyFetcher
|
2
|
+
# HTML document abstraction class. Used to work with different HTML parser adapters
|
3
|
+
# such as Nokogiri, Oga or a custom one. Stores <i>backend</i> that will handle all
|
4
|
+
# the DOM manipulation logic.
|
2
5
|
class Document
|
3
6
|
class << self
|
4
7
|
def parse(data)
|
@@ -1,7 +1,10 @@
|
|
1
1
|
module ProxyFetcher
|
2
|
+
# Base exception class for all the ProxyFetcher exceptions.
|
2
3
|
Error = Class.new(StandardError)
|
3
4
|
|
5
|
+
# ProxyFetcher exceptions namespace
|
4
6
|
module Exceptions
|
7
|
+
# Exception for wrong custom classes (such as ProxyValidator or HTTP Client).
|
5
8
|
class WrongCustomClass < Error
|
6
9
|
def initialize(klass, methods)
|
7
10
|
required_methods = Array(methods).join(', ')
|
@@ -9,36 +12,48 @@ module ProxyFetcher
|
|
9
12
|
end
|
10
13
|
end
|
11
14
|
|
15
|
+
# Exception for wrong provider name, raised for a configured provider
|
16
|
+
# that is not registered via <code>register_provider</code> interface.
|
12
17
|
class UnknownProvider < Error
|
13
18
|
def initialize(provider_name)
|
14
19
|
super("unregistered proxy provider `#{provider_name}`")
|
15
20
|
end
|
16
21
|
end
|
17
22
|
|
23
|
+
# Exception for cases when user tries to register already existing provider.
|
18
24
|
class RegisteredProvider < Error
|
19
25
|
def initialize(name)
|
20
26
|
super("`#{name}` provider already registered!")
|
21
27
|
end
|
22
28
|
end
|
23
29
|
|
30
|
+
# Exception for cases when HTTP client reached maximum count of redirects
|
31
|
+
# trying to process HTTP request.
|
24
32
|
class MaximumRedirectsReached < Error
|
25
33
|
def initialize(*)
|
26
34
|
super('maximum redirects reached')
|
27
35
|
end
|
28
36
|
end
|
29
37
|
|
38
|
+
# Exception for cases when HTTP client reached maximum count of retries
|
39
|
+
# trying to process HTTP request. Can occur when request failed by timeout
|
40
|
+
# multiple times.
|
30
41
|
class MaximumRetriesReached < Error
|
31
42
|
def initialize(*)
|
32
43
|
super('reached the maximum number of retries')
|
33
44
|
end
|
34
45
|
end
|
35
46
|
|
47
|
+
# Exception for cases when user tries to set wrong HTML parser adapter
|
48
|
+
# in the configuration.
|
36
49
|
class UnknownAdapter < Error
|
37
50
|
def initialize(name)
|
38
51
|
super("unknown adapter '#{name}'")
|
39
52
|
end
|
40
53
|
end
|
41
54
|
|
55
|
+
# Exception for cases when user tries to set <code>nil</code> HTML parser adapter
|
56
|
+
# in the configuration (or just forget to change it).
|
42
57
|
class BlankAdapter < Error
|
43
58
|
def initialize(*)
|
44
59
|
super(<<-MSG.strip.squeeze
|
@@ -49,6 +64,8 @@ module ProxyFetcher
|
|
49
64
|
end
|
50
65
|
end
|
51
66
|
|
67
|
+
# Exception for cases when HTML parser adapter can't be installed.
|
68
|
+
# It will print the reason (backtrace) of the exception that caused an error.
|
52
69
|
class AdapterSetupError < Error
|
53
70
|
def initialize(adapter_name, reason)
|
54
71
|
adapter = demodulize(adapter_name.gsub('Adapter', ''))
|
@@ -1,5 +1,6 @@
|
|
1
1
|
module ProxyFetcher
|
2
2
|
module Providers
|
3
|
+
# Base class for all the ProxyFetcher providers.
|
3
4
|
class Base
|
4
5
|
# Loads proxy provider page content, extract proxy list from it
|
5
6
|
# and convert every entry to proxy object.
|
@@ -8,6 +9,7 @@ module ProxyFetcher
|
|
8
9
|
end
|
9
10
|
|
10
11
|
class << self
|
12
|
+
# Just syntactic sugar to make it easier to call #fetch_proxies! method.
|
11
13
|
def fetch_proxies!(*args)
|
12
14
|
new.fetch_proxies!(*args)
|
13
15
|
end
|
@@ -26,7 +28,12 @@ module ProxyFetcher
|
|
26
28
|
ProxyFetcher::Document.parse(html)
|
27
29
|
end
|
28
30
|
|
29
|
-
#
|
31
|
+
# Fetches HTML content by sending HTTP request to the provider URL and
|
32
|
+
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
33
|
+
# to return all the proxy entries (HTML nodes).
|
34
|
+
#
|
35
|
+
# Abstract method.
|
36
|
+
#
|
30
37
|
def load_proxy_list(*)
|
31
38
|
raise NotImplementedError, "#{__method__} must be implemented in a descendant class!"
|
32
39
|
end
|
@@ -1,6 +1,8 @@
|
|
1
1
|
module ProxyFetcher
|
2
2
|
module Providers
|
3
|
+
# FreeProxyList provider class.
|
3
4
|
class FreeProxyList < Base
|
5
|
+
# Provider URL to fetch proxy list
|
4
6
|
PROVIDER_URL = 'https://free-proxy-list.net/'.freeze
|
5
7
|
|
6
8
|
# [NOTE] Doesn't support filtering
|
@@ -9,6 +11,15 @@ module ProxyFetcher
|
|
9
11
|
doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
|
10
12
|
end
|
11
13
|
|
14
|
+
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
15
|
+
# object.
|
16
|
+
#
|
17
|
+
# @param html_node [Object]
|
18
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
19
|
+
#
|
20
|
+
# @return [ProxyFetcher::Proxy]
|
21
|
+
# Proxy object
|
22
|
+
#
|
12
23
|
def to_proxy(html_node)
|
13
24
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
14
25
|
proxy.addr = html_node.content_at('td[1]')
|
@@ -21,6 +32,14 @@ module ProxyFetcher
|
|
21
32
|
|
22
33
|
private
|
23
34
|
|
35
|
+
# Parses HTML node to extract proxy type.
|
36
|
+
#
|
37
|
+
# @param html_node [Object]
|
38
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
39
|
+
#
|
40
|
+
# @return [String]
|
41
|
+
# Proxy type
|
42
|
+
#
|
24
43
|
def parse_type(html_node)
|
25
44
|
https = html_node.content_at('td[6]')
|
26
45
|
https && https.casecmp('yes').zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
|
@@ -1,14 +1,32 @@
|
|
1
1
|
module ProxyFetcher
|
2
2
|
module Providers
|
3
|
+
# FreeProxyListSSL provider class.
|
3
4
|
class FreeProxyListSSL < Base
|
5
|
+
# Provider URL to fetch proxy list
|
4
6
|
PROVIDER_URL = 'https://www.sslproxies.org/'.freeze
|
5
7
|
|
8
|
+
# Fetches HTML content by sending HTTP request to the provider URL and
|
9
|
+
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
10
|
+
# to return all the proxy entries (HTML nodes).
|
11
|
+
#
|
12
|
+
# @return [Array<ProxyFetcher::Document::Node>]
|
13
|
+
# Collection of extracted HTML nodes with full proxy info
|
14
|
+
#
|
6
15
|
# [NOTE] Doesn't support filtering
|
7
16
|
def load_proxy_list(*)
|
8
17
|
doc = load_document(PROVIDER_URL, {})
|
9
18
|
doc.xpath('//table[@id="proxylisttable"]/tbody/tr')
|
10
19
|
end
|
11
20
|
|
21
|
+
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
22
|
+
# object.
|
23
|
+
#
|
24
|
+
# @param html_node [Object]
|
25
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
26
|
+
#
|
27
|
+
# @return [ProxyFetcher::Proxy]
|
28
|
+
# Proxy object
|
29
|
+
#
|
12
30
|
def to_proxy(html_node)
|
13
31
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
14
32
|
proxy.addr = html_node.content_at('td[1]')
|
@@ -2,14 +2,32 @@ require 'json'
|
|
2
2
|
|
3
3
|
module ProxyFetcher
|
4
4
|
module Providers
|
5
|
+
# GatherProxy provider class.
|
5
6
|
class GatherProxy < Base
|
7
|
+
# Provider URL to fetch proxy list
|
6
8
|
PROVIDER_URL = 'http://www.gatherproxy.com/'.freeze
|
7
9
|
|
10
|
+
# Fetches HTML content by sending HTTP request to the provider URL and
|
11
|
+
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
12
|
+
# to return all the proxy entries (HTML nodes).
|
13
|
+
#
|
14
|
+
# @return [Array<ProxyFetcher::Document::Node>]
|
15
|
+
# Collection of extracted HTML nodes with full proxy info
|
16
|
+
#
|
8
17
|
def load_proxy_list(*)
|
9
18
|
doc = load_document(PROVIDER_URL)
|
10
19
|
doc.xpath('//div[@class="proxy-list"]/table/script')
|
11
20
|
end
|
12
21
|
|
22
|
+
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
23
|
+
# object.
|
24
|
+
#
|
25
|
+
# @param html_node [Object]
|
26
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
27
|
+
#
|
28
|
+
# @return [ProxyFetcher::Proxy]
|
29
|
+
# Proxy object
|
30
|
+
#
|
13
31
|
def to_proxy(html_node)
|
14
32
|
json = parse_json(html_node)
|
15
33
|
|
@@ -1,13 +1,31 @@
|
|
1
1
|
module ProxyFetcher
|
2
2
|
module Providers
|
3
|
+
# HTTPTunnel provider class.
|
3
4
|
class HTTPTunnel < Base
|
5
|
+
# Provider URL to fetch proxy list
|
4
6
|
PROVIDER_URL = 'http://www.httptunnel.ge/ProxyListForFree.aspx'.freeze
|
5
7
|
|
8
|
+
# Fetches HTML content by sending HTTP request to the provider URL and
|
9
|
+
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
10
|
+
# to return all the proxy entries (HTML nodes).
|
11
|
+
#
|
12
|
+
# @return [Array<ProxyFetcher::Document::Node>]
|
13
|
+
# Collection of extracted HTML nodes with full proxy info
|
14
|
+
#
|
6
15
|
def load_proxy_list(*)
|
7
16
|
doc = load_document(PROVIDER_URL)
|
8
17
|
doc.xpath('//table[contains(@id, "GridView")]/tr[(count(td)>2)]')
|
9
18
|
end
|
10
19
|
|
20
|
+
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
21
|
+
# object.
|
22
|
+
#
|
23
|
+
# @param html_node [Object]
|
24
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
25
|
+
#
|
26
|
+
# @return [ProxyFetcher::Proxy]
|
27
|
+
# Proxy object
|
28
|
+
#
|
11
29
|
def to_proxy(html_node)
|
12
30
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
13
31
|
uri = parse_proxy_uri(html_node)
|
@@ -22,15 +40,39 @@ module ProxyFetcher
|
|
22
40
|
|
23
41
|
private
|
24
42
|
|
43
|
+
# Parses HTML node to extract URI object with proxy host and port.
|
44
|
+
#
|
45
|
+
# @param html_node [Object]
|
46
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
47
|
+
#
|
48
|
+
# @return [URI]
|
49
|
+
# URI object
|
50
|
+
#
|
25
51
|
def parse_proxy_uri(html_node)
|
26
52
|
full_addr = html_node.content_at('td[1]')
|
27
53
|
URI.parse("http://#{full_addr}")
|
28
54
|
end
|
29
55
|
|
56
|
+
# Parses HTML node to extract proxy country.
|
57
|
+
#
|
58
|
+
# @param html_node [Object]
|
59
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
60
|
+
#
|
61
|
+
# @return [String]
|
62
|
+
# Country code
|
63
|
+
#
|
30
64
|
def parse_country(html_node)
|
31
65
|
html_node.find('.//img').attr('title')
|
32
66
|
end
|
33
67
|
|
68
|
+
# Parses HTML node to extract proxy anonymity level.
|
69
|
+
#
|
70
|
+
# @param html_node [Object]
|
71
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
72
|
+
#
|
73
|
+
# @return [String]
|
74
|
+
# Anonymity level
|
75
|
+
#
|
34
76
|
def parse_anonymity(html_node)
|
35
77
|
transparency = html_node.content_at('td[5]').to_sym
|
36
78
|
|
@@ -1,14 +1,32 @@
|
|
1
1
|
module ProxyFetcher
|
2
2
|
module Providers
|
3
|
+
# ProxyDocker provider class.
|
3
4
|
class ProxyDocker < Base
|
4
|
-
|
5
|
+
# Provider URL to fetch proxy list
|
6
|
+
PROVIDER_URL = 'https://www.proxydocker.com/en/proxylist/'.freeze
|
5
7
|
|
8
|
+
# Fetches HTML content by sending HTTP request to the provider URL and
|
9
|
+
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
10
|
+
# to return all the proxy entries (HTML nodes).
|
11
|
+
#
|
12
|
+
# @return [Array<ProxyFetcher::Document::Node>]
|
13
|
+
# Collection of extracted HTML nodes with full proxy info
|
14
|
+
#
|
6
15
|
# [NOTE] Doesn't support direct filters
|
7
16
|
def load_proxy_list(*)
|
8
17
|
doc = load_document(PROVIDER_URL, {})
|
9
18
|
doc.xpath('//table[contains(@class, "table")]/tr[(not(@id="proxy-table-header")) and (count(td)>2)]')
|
10
19
|
end
|
11
20
|
|
21
|
+
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
22
|
+
# object.
|
23
|
+
#
|
24
|
+
# @param html_node [Object]
|
25
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
26
|
+
#
|
27
|
+
# @return [ProxyFetcher::Proxy]
|
28
|
+
# Proxy object
|
29
|
+
#
|
12
30
|
def to_proxy(html_node)
|
13
31
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
14
32
|
uri = URI("//#{html_node.content_at('td[1]')}")
|
@@ -2,14 +2,32 @@ require 'base64'
|
|
2
2
|
|
3
3
|
module ProxyFetcher
|
4
4
|
module Providers
|
5
|
+
# ProxyList provider class.
|
5
6
|
class ProxyList < Base
|
7
|
+
# Provider URL to fetch proxy list
|
6
8
|
PROVIDER_URL = 'https://proxy-list.org/english/index.php'.freeze
|
7
9
|
|
10
|
+
# Fetches HTML content by sending HTTP request to the provider URL and
|
11
|
+
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
12
|
+
# to return all the proxy entries (HTML nodes).
|
13
|
+
#
|
14
|
+
# @return [Array<ProxyFetcher::Document::Node>]
|
15
|
+
# Collection of extracted HTML nodes with full proxy info
|
16
|
+
#
|
8
17
|
def load_proxy_list(filters = {})
|
9
18
|
doc = load_document(PROVIDER_URL, filters)
|
10
19
|
doc.css('.table-wrap .table ul')
|
11
20
|
end
|
12
21
|
|
22
|
+
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
23
|
+
# object.
|
24
|
+
#
|
25
|
+
# @param html_node [Object]
|
26
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
27
|
+
#
|
28
|
+
# @return [ProxyFetcher::Proxy]
|
29
|
+
# Proxy object
|
30
|
+
#
|
13
31
|
def to_proxy(html_node)
|
14
32
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
15
33
|
uri = parse_proxy_uri(html_node)
|
@@ -24,6 +42,14 @@ module ProxyFetcher
|
|
24
42
|
|
25
43
|
private
|
26
44
|
|
45
|
+
# Parses HTML node to extract URI object with proxy host and port.
|
46
|
+
#
|
47
|
+
# @param html_node [Object]
|
48
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
49
|
+
#
|
50
|
+
# @return [URI]
|
51
|
+
# URI object
|
52
|
+
#
|
27
53
|
def parse_proxy_uri(html_node)
|
28
54
|
full_addr = ::Base64.decode64(html_node.at_css('li script').html.match(/'(.+)'/)[1])
|
29
55
|
URI.parse("http://#{full_addr}")
|
@@ -1,13 +1,31 @@
|
|
1
1
|
module ProxyFetcher
|
2
2
|
module Providers
|
3
|
+
# XRoxy provider class.
|
3
4
|
class XRoxy < Base
|
5
|
+
# Provider URL to fetch proxy list
|
4
6
|
PROVIDER_URL = 'http://www.xroxy.com/proxylist.php'.freeze
|
5
7
|
|
8
|
+
# Fetches HTML content by sending HTTP request to the provider URL and
|
9
|
+
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
10
|
+
# to return all the proxy entries (HTML nodes).
|
11
|
+
#
|
12
|
+
# @return [Array<ProxyFetcher::Document::Node>]
|
13
|
+
# Collection of extracted HTML nodes with full proxy info
|
14
|
+
#
|
6
15
|
def load_proxy_list(filters = { type: 'All_http' })
|
7
16
|
doc = load_document(PROVIDER_URL, filters)
|
8
17
|
doc.xpath('//div[@id="content"]/table[1]/tr[contains(@class, "row")]')
|
9
18
|
end
|
10
19
|
|
20
|
+
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
21
|
+
# object.
|
22
|
+
#
|
23
|
+
# @param html_node [Object]
|
24
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
25
|
+
#
|
26
|
+
# @return [ProxyFetcher::Proxy]
|
27
|
+
# Proxy object
|
28
|
+
#
|
11
29
|
def to_proxy(html_node)
|
12
30
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
13
31
|
proxy.addr = html_node.content_at('td[2]')
|
@@ -21,6 +39,14 @@ module ProxyFetcher
|
|
21
39
|
|
22
40
|
private
|
23
41
|
|
42
|
+
# Parses HTML node to extract proxy type.
|
43
|
+
#
|
44
|
+
# @param html_node [Object]
|
45
|
+
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
46
|
+
#
|
47
|
+
# @return [String]
|
48
|
+
# Proxy type
|
49
|
+
#
|
24
50
|
def parse_type(html_node)
|
25
51
|
https = html_node.content_at('td[5]')
|
26
52
|
https.casecmp('true').zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
|
data/lib/proxy_fetcher/proxy.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
module ProxyFetcher
|
2
|
+
# Proxy object
|
2
3
|
class Proxy
|
3
4
|
attr_accessor :addr, :port, :type, :country, :response_time, :anonymity
|
4
5
|
|
6
|
+
# Proxy type
|
5
7
|
TYPES = [
|
6
8
|
HTTP = 'HTTP'.freeze,
|
7
9
|
HTTPS = 'HTTPS'.freeze,
|
@@ -9,12 +11,22 @@ module ProxyFetcher
|
|
9
11
|
SOCKS5 = 'SOCKS5'.freeze
|
10
12
|
].freeze
|
11
13
|
|
14
|
+
# Proxy type predicates (#socks4?, #https?)
|
15
|
+
#
|
16
|
+
# @return [Boolean]
|
17
|
+
# true if proxy of requested type, otherwise false.
|
18
|
+
#
|
12
19
|
TYPES.each do |proxy_type|
|
13
20
|
define_method "#{proxy_type.downcase}?" do
|
14
21
|
!type.nil? && type.upcase.include?(proxy_type)
|
15
22
|
end
|
16
23
|
end
|
17
24
|
|
25
|
+
# Returns true if proxy is secure (works through https, socks4 or socks5).
|
26
|
+
#
|
27
|
+
# @return [Boolean]
|
28
|
+
# true if proxy is secure, otherwise false.
|
29
|
+
#
|
18
30
|
def ssl?
|
19
31
|
https? || socks4? || socks5?
|
20
32
|
end
|
@@ -25,16 +37,32 @@ module ProxyFetcher
|
|
25
37
|
end
|
26
38
|
end
|
27
39
|
|
40
|
+
# Checks if proxy object is connectable (can be used as a proxy for
|
41
|
+
# network requests).
|
42
|
+
#
|
43
|
+
# @return [Boolean]
|
44
|
+
# true if proxy connectable, otherwise false.
|
45
|
+
#
|
28
46
|
def connectable?
|
29
47
|
ProxyFetcher.config.proxy_validator.connectable?(addr, port)
|
30
48
|
end
|
31
49
|
|
32
50
|
alias valid? connectable?
|
33
51
|
|
52
|
+
# Returns <code>URI::Generic</code> object with host and port values of the proxy.
|
53
|
+
#
|
54
|
+
# @return [URI::Generic]
|
55
|
+
# URI object.
|
56
|
+
#
|
34
57
|
def uri
|
35
58
|
URI::Generic.build(host: addr, port: port)
|
36
59
|
end
|
37
60
|
|
61
|
+
# Returns <code>String</code> object with <i>addr:port</i> values of the proxy.
|
62
|
+
#
|
63
|
+
# @return [String]
|
64
|
+
# proxy address and port joined with a colon (<i>addr:port</i>).
|
65
|
+
#
|
38
66
|
def url
|
39
67
|
"#{addr}:#{port}"
|
40
68
|
end
|
@@ -1,5 +1,9 @@
|
|
1
1
|
module ProxyFetcher
|
2
|
+
# Default ProxyFetcher proxy validator that checks whether a proxy is
|
3
|
+
# connectable or not. It tries to send HEAD request to default
|
4
|
+
# URL to check if proxy can be used (aka connectable?).
|
2
5
|
class ProxyValidator
|
6
|
+
# Default URL that will be used to check if proxy can be used.
|
3
7
|
URL_TO_CHECK = 'https://google.com'.freeze
|
4
8
|
|
5
9
|
def initialize(proxy_addr, proxy_port)
|
@@ -1,15 +1,19 @@
|
|
1
1
|
module ProxyFetcher
|
2
|
+
##
|
3
|
+
# ProxyFetcher gem version.
|
2
4
|
def self.gem_version
|
3
5
|
Gem::Version.new VERSION::STRING
|
4
6
|
end
|
5
7
|
|
8
|
+
##
|
9
|
+
# ProxyFetcher gem semantic versioning.
|
6
10
|
module VERSION
|
7
11
|
# Major version number
|
8
12
|
MAJOR = 0
|
9
13
|
# Minor version number
|
10
14
|
MINOR = 6
|
11
15
|
# Smallest version number
|
12
|
-
TINY = 1
|
16
|
+
TINY = 2
|
13
17
|
|
14
18
|
# Full version number
|
15
19
|
STRING = [MAJOR, MINOR, TINY].compact.join('.')
|
data/lib/proxy_fetcher.rb
CHANGED
@@ -22,7 +22,10 @@ require File.dirname(__FILE__) + '/proxy_fetcher/document/adapters/abstract_adap
|
|
22
22
|
require File.dirname(__FILE__) + '/proxy_fetcher/document/adapters/nokogiri_adapter'
|
23
23
|
require File.dirname(__FILE__) + '/proxy_fetcher/document/adapters/oga_adapter'
|
24
24
|
|
25
|
+
##
|
26
|
+
# Ruby / JRuby lib for managing proxies
|
25
27
|
module ProxyFetcher
|
28
|
+
# ProxyFetcher providers namespace
|
26
29
|
module Providers
|
27
30
|
require File.dirname(__FILE__) + '/proxy_fetcher/providers/base'
|
28
31
|
require File.dirname(__FILE__) + '/proxy_fetcher/providers/free_proxy_list'
|
@@ -34,11 +37,33 @@ module ProxyFetcher
|
|
34
37
|
require File.dirname(__FILE__) + '/proxy_fetcher/providers/xroxy'
|
35
38
|
end
|
36
39
|
|
40
|
+
# Main ProxyFetcher module.
|
37
41
|
class << self
|
42
|
+
##
|
43
|
+
# Returns ProxyFetcher configuration.
|
44
|
+
#
|
45
|
+
# @return [ProxyFetcher::Configuration]
|
46
|
+
# Configuration object.
|
47
|
+
#
|
48
|
+
# @example
|
49
|
+
# ProxyFetcher.config
|
50
|
+
#
|
51
|
+
# #=> #<ProxyFetcher::Configuration:0x0000000241eec8 @user_agent="Mozilla/5.0, ...", @pool_size=10,
|
52
|
+
# @timeout=3, @http_client=ProxyFetcher::HTTPClient, @proxy_validator=ProxyFetcher::ProxyValidator,
|
53
|
+
# @providers=[:free_proxy_list, ...], @adapter=ProxyFetcher::Document::NokogiriAdapter>
|
54
|
+
#
|
38
55
|
def config
|
39
56
|
@config ||= ProxyFetcher::Configuration.new
|
40
57
|
end
|
41
58
|
|
59
|
+
##
|
60
|
+
# Configures ProxyFetcher and yields config object for additional manipulations.
|
61
|
+
|
62
|
+
# @yieldparam config [ProxyFetcher::Configuration] configuration object for additional manipulations
|
63
|
+
#
|
64
|
+
# @return [ProxyFetcher::Configuration]
|
65
|
+
# Configuration object.
|
66
|
+
#
|
42
67
|
def configure
|
43
68
|
yield config
|
44
69
|
end
|
data/proxy_fetcher.gemspec
CHANGED
@@ -5,7 +5,7 @@ require 'proxy_fetcher/version'
|
|
5
5
|
Gem::Specification.new do |gem|
|
6
6
|
gem.name = 'proxy_fetcher'
|
7
7
|
gem.version = ProxyFetcher.gem_version
|
8
|
-
gem.date = '2017-12-
|
8
|
+
gem.date = '2017-12-27'
|
9
9
|
gem.summary = 'Ruby gem for dealing with proxy lists from different providers'
|
10
10
|
gem.description = 'This gem can help your Ruby application to make HTTP(S) requests ' \
|
11
11
|
'using proxies by fetching and validating proxy lists from the different providers.'
|
@@ -24,13 +24,20 @@ describe ProxyFetcher::Proxy do
|
|
24
24
|
proxy.type = ProxyFetcher::Proxy::HTTP
|
25
25
|
expect(proxy.http?).to be_truthy
|
26
26
|
expect(proxy.https?).to be_falsey
|
27
|
+
expect(proxy.ssl?).to be_falsey
|
27
28
|
|
28
29
|
proxy.type = ProxyFetcher::Proxy::HTTPS
|
29
30
|
expect(proxy.https?).to be_truthy
|
30
31
|
expect(proxy.http?).to be_truthy
|
32
|
+
expect(proxy.ssl?).to be_truthy
|
33
|
+
|
34
|
+
proxy.type = ProxyFetcher::Proxy::SOCKS4
|
35
|
+
expect(proxy.socks4?).to be_truthy
|
36
|
+
expect(proxy.ssl?).to be_truthy
|
31
37
|
|
32
38
|
proxy.type = ProxyFetcher::Proxy::SOCKS5
|
33
39
|
expect(proxy.socks5?).to be_truthy
|
40
|
+
expect(proxy.ssl?).to be_truthy
|
34
41
|
end
|
35
42
|
|
36
43
|
it 'not connectable if IP addr is wrong' do
|
@@ -51,6 +58,9 @@ describe ProxyFetcher::Proxy do
|
|
51
58
|
|
52
59
|
it 'returns URI::Generic' do
|
53
60
|
expect(proxy.uri).to be_a(URI::Generic)
|
61
|
+
|
62
|
+
expect(proxy.uri.host).not_to be_empty
|
63
|
+
expect(proxy.uri.port).not_to be_nil
|
54
64
|
end
|
55
65
|
|
56
66
|
it 'returns URL' do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proxy_fetcher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.1
|
4
|
+
version: 0.6.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nikita Bulai
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-12-
|
11
|
+
date: 2017-12-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -107,7 +107,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
107
|
version: '0'
|
108
108
|
requirements: []
|
109
109
|
rubyforge_project:
|
110
|
-
rubygems_version: 2.
|
110
|
+
rubygems_version: 2.6.11
|
111
111
|
signing_key:
|
112
112
|
specification_version: 4
|
113
113
|
summary: Ruby gem for dealing with proxy lists from different providers
|