proxy_fetcher 0.7.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/lib/proxy_fetcher.rb +1 -0
- data/lib/proxy_fetcher/manager.rb +20 -16
- data/lib/proxy_fetcher/providers/proxy_docker.rb +1 -1
- data/lib/proxy_fetcher/providers/xroxy.rb +8 -23
- data/lib/proxy_fetcher/utils/proxy_list_validator.rb +47 -0
- data/lib/proxy_fetcher/version.rb +2 -2
- data/spec/support/manager_examples.rb +3 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ad122ebb9d6241313981ef764c41a1b940dfdc12
|
4
|
+
data.tar.gz: 2cf2d359bfd74122fbfdf9d97d94c1e70b008cd5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4ebe17a6ebfb52a14267856053ad750676d4b28e85e138aa172d778b85368ec85ccd04af55cdceb53e1aaad3db5cdab09d42ee87767d56b1fc5578c1c6ba114b
|
7
|
+
data.tar.gz: 6133dfc0b9810831ae7e79c2795ad3214a4b1c78df71e7a2a910429dd21d6b67a7a2f22c29b284fc30d68fff4ea9907fc39edefbb630027eb9a1e2cd4eb2ae24
|
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,24 @@
|
|
2
2
|
|
3
3
|
Reverse Chronological Order:
|
4
4
|
|
5
|
+
## `master`
|
6
|
+
|
7
|
+
* Add your changelog here
|
8
|
+
|
9
|
+
## `0.8.0` (2018-11-12)
|
10
|
+
|
11
|
+
* Improve speed of proxy list loading.
|
12
|
+
* Improve speed of proxies cleanup.
|
13
|
+
* Fix ProxyDocker provider
|
14
|
+
|
15
|
+
## `0.7.2` (2018-08-13)
|
16
|
+
|
17
|
+
* Fix XRoxy provider
|
18
|
+
|
19
|
+
## `0.7.1` (2018-07-13)
|
20
|
+
|
21
|
+
* Fix XRoxy provider
|
22
|
+
|
5
23
|
## `0.7.0` (2018-06-04)
|
6
24
|
|
7
25
|
* Migrate to `HTTP.rb` instead of `Net::HTTP`
|
data/lib/proxy_fetcher.rb
CHANGED
@@ -15,6 +15,7 @@ require File.dirname(__FILE__) + '/proxy_fetcher/null_logger'
|
|
15
15
|
|
16
16
|
require File.dirname(__FILE__) + '/proxy_fetcher/utils/http_client'
|
17
17
|
require File.dirname(__FILE__) + '/proxy_fetcher/utils/proxy_validator'
|
18
|
+
require File.dirname(__FILE__) + '/proxy_fetcher/utils/proxy_list_validator'
|
18
19
|
require File.dirname(__FILE__) + '/proxy_fetcher/client/client'
|
19
20
|
require File.dirname(__FILE__) + '/proxy_fetcher/client/request'
|
20
21
|
require File.dirname(__FILE__) + '/proxy_fetcher/client/proxies_registry'
|
@@ -31,12 +31,24 @@ module ProxyFetcher
|
|
31
31
|
def refresh_list!(filters = nil)
|
32
32
|
@proxies = []
|
33
33
|
|
34
|
+
threads = []
|
35
|
+
lock = Mutex.new
|
36
|
+
|
34
37
|
ProxyFetcher.config.providers.each do |provider_name|
|
35
|
-
|
36
|
-
|
38
|
+
threads << Thread.new do
|
39
|
+
provider = ProxyFetcher::Configuration.providers_registry.class_for(provider_name)
|
40
|
+
provider_filters = filters && filters.fetch(provider_name.to_sym, filters)
|
41
|
+
provider_proxies = provider.fetch_proxies!(provider_filters)
|
37
42
|
|
38
|
-
|
43
|
+
lock.synchronize do
|
44
|
+
@proxies.concat(provider_proxies)
|
45
|
+
end
|
46
|
+
end
|
39
47
|
end
|
48
|
+
|
49
|
+
threads.each(&:join)
|
50
|
+
|
51
|
+
@proxies
|
40
52
|
end
|
41
53
|
|
42
54
|
alias fetch! refresh_list!
|
@@ -78,20 +90,12 @@ module ProxyFetcher
|
|
78
90
|
alias pop! get!
|
79
91
|
|
80
92
|
# Clean current proxy list from dead proxies (those that don't respond within the timeout)
|
93
|
+
#
|
94
|
+
# @return [Array<ProxyFetcher::Proxy>]
|
95
|
+
# list of valid proxies
|
81
96
|
def cleanup!
|
82
|
-
|
83
|
-
|
84
|
-
proxies.dup.each_slice(ProxyFetcher.config.pool_size) do |proxy_group|
|
85
|
-
threads = proxy_group.map do |group_proxy|
|
86
|
-
Thread.new(group_proxy, proxies) do |proxy, proxies|
|
87
|
-
lock.synchronize { proxies.delete(proxy) } unless proxy.connectable?
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
threads.each(&:join)
|
92
|
-
end
|
93
|
-
|
94
|
-
@proxies
|
97
|
+
valid_proxies = ProxyListValidator.new(@proxies).validate
|
98
|
+
@proxies &= valid_proxies
|
95
99
|
end
|
96
100
|
|
97
101
|
alias validate! cleanup!
|
@@ -17,7 +17,7 @@ module ProxyFetcher
|
|
17
17
|
# [NOTE] Doesn't support direct filters
|
18
18
|
def load_proxy_list(*)
|
19
19
|
doc = load_document(PROVIDER_URL, {})
|
20
|
-
doc.xpath('//table[contains(@class, "table")]/tr[(
|
20
|
+
doc.xpath('//table[contains(@class, "table")]/tbody/tr[(count(td)>2)]')
|
21
21
|
end
|
22
22
|
|
23
23
|
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
@@ -5,7 +5,7 @@ module ProxyFetcher
|
|
5
5
|
# XRoxy provider class.
|
6
6
|
class XRoxy < Base
|
7
7
|
# Provider URL to fetch proxy list
|
8
|
-
PROVIDER_URL = 'https://www.xroxy.com/
|
8
|
+
PROVIDER_URL = 'https://www.xroxy.com/free-proxy-lists/'.freeze
|
9
9
|
|
10
10
|
# Fetches HTML content by sending HTTP request to the provider URL and
|
11
11
|
# parses the document (built as abstract <code>ProxyFetcher::Document</code>)
|
@@ -16,7 +16,7 @@ module ProxyFetcher
|
|
16
16
|
#
|
17
17
|
def load_proxy_list(filters = { type: 'All_http' })
|
18
18
|
doc = load_document(PROVIDER_URL, filters)
|
19
|
-
doc.xpath('//div
|
19
|
+
doc.xpath('//div/table/tbody/tr')
|
20
20
|
end
|
21
21
|
|
22
22
|
# Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
|
@@ -30,29 +30,14 @@ module ProxyFetcher
|
|
30
30
|
#
|
31
31
|
def to_proxy(html_node)
|
32
32
|
ProxyFetcher::Proxy.new.tap do |proxy|
|
33
|
-
proxy.addr = html_node.content_at('td[
|
34
|
-
proxy.port = Integer(html_node.content_at('td[
|
35
|
-
proxy.anonymity = html_node.content_at('td[
|
36
|
-
proxy.country = html_node.content_at('td[
|
37
|
-
proxy.response_time = Integer(html_node.content_at('td[
|
38
|
-
proxy.type = parse_type(html_node)
|
33
|
+
proxy.addr = html_node.content_at('td[1]')
|
34
|
+
proxy.port = Integer(html_node.content_at('td[2]').gsub(/^0+/, ''))
|
35
|
+
proxy.anonymity = html_node.content_at('td[3]')
|
36
|
+
proxy.country = html_node.content_at('td[5]')
|
37
|
+
proxy.response_time = Integer(html_node.content_at('td[6]'))
|
38
|
+
proxy.type = html_node.content_at('td[3]')
|
39
39
|
end
|
40
40
|
end
|
41
|
-
|
42
|
-
private
|
43
|
-
|
44
|
-
# Parses HTML node to extract proxy type.
|
45
|
-
#
|
46
|
-
# @param html_node [Object]
|
47
|
-
# HTML node from the <code>ProxyFetcher::Document</code> DOM model.
|
48
|
-
#
|
49
|
-
# @return [String]
|
50
|
-
# Proxy type
|
51
|
-
#
|
52
|
-
def parse_type(html_node)
|
53
|
-
https = html_node.content_at('td[5]')
|
54
|
-
https.casecmp('true').zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
|
55
|
-
end
|
56
41
|
end
|
57
42
|
|
58
43
|
ProxyFetcher::Configuration.register_provider(:xroxy, XRoxy)
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module ProxyFetcher
|
4
|
+
# This class validates a list of proxies.
|
5
|
+
# Each proxy is validated using <code>ProxyFetcher::ProxyValidator</code>.
|
6
|
+
class ProxyListValidator
|
7
|
+
# @!attribute [r] proxies
|
8
|
+
# @return [Array<ProxyFetcher::Proxy>] Source array of proxies
|
9
|
+
attr_reader :proxies
|
10
|
+
# @!attribute [r] valid_proxies
|
11
|
+
# @return [Array<ProxyFetcher::Proxy>] Array of valid proxies after validation
|
12
|
+
attr_reader :valid_proxies
|
13
|
+
|
14
|
+
# @param [Array<ProxyFetcher::Proxy>] *proxies
|
15
|
+
# Any number of <code>ProxyFetcher::Proxy</code> to validate
|
16
|
+
def initialize(*proxies)
|
17
|
+
@proxies = proxies.flatten
|
18
|
+
end
|
19
|
+
|
20
|
+
# Performs validation
|
21
|
+
#
|
22
|
+
# @return [Array<ProxyFetcher::Proxy>]
|
23
|
+
# list of valid proxies
|
24
|
+
def validate
|
25
|
+
target_proxies = @proxies.dup
|
26
|
+
target_proxies_lock = Mutex.new
|
27
|
+
connectable_proxies = []
|
28
|
+
connectable_proxies_lock = Mutex.new
|
29
|
+
threads = []
|
30
|
+
|
31
|
+
ProxyFetcher.config.pool_size.times do
|
32
|
+
threads << Thread.new do
|
33
|
+
loop do
|
34
|
+
proxy = target_proxies_lock.synchronize { target_proxies.shift }
|
35
|
+
break unless proxy
|
36
|
+
|
37
|
+
connectable_proxies_lock.synchronize { connectable_proxies << proxy } if proxy.connectable?
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
threads.each(&:join)
|
43
|
+
|
44
|
+
@valid_proxies = connectable_proxies
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
@@ -35,7 +35,9 @@ RSpec.shared_examples 'a manager' do
|
|
35
35
|
|
36
36
|
manager = ProxyFetcher::Manager.new
|
37
37
|
|
38
|
-
expect
|
38
|
+
expect do
|
39
|
+
manager.cleanup!
|
40
|
+
end.to change { manager.proxies }.to([])
|
39
41
|
end
|
40
42
|
|
41
43
|
it "doesn't pollute the output with array of proxies" do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proxy_fetcher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.1
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nikita Bulai
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-11-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: http
|
@@ -80,6 +80,7 @@ files:
|
|
80
80
|
- lib/proxy_fetcher/providers/xroxy.rb
|
81
81
|
- lib/proxy_fetcher/proxy.rb
|
82
82
|
- lib/proxy_fetcher/utils/http_client.rb
|
83
|
+
- lib/proxy_fetcher/utils/proxy_list_validator.rb
|
83
84
|
- lib/proxy_fetcher/utils/proxy_validator.rb
|
84
85
|
- lib/proxy_fetcher/version.rb
|
85
86
|
- proxy_fetcher.gemspec
|