proxy_fetcher 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 59ba9fd7169617766f99b048a1c889e0213ac57c
4
- data.tar.gz: 03e3d51d71fb5c06c0c7c7d5808a8a43c79c50a4
3
+ metadata.gz: ad122ebb9d6241313981ef764c41a1b940dfdc12
4
+ data.tar.gz: 2cf2d359bfd74122fbfdf9d97d94c1e70b008cd5
5
5
  SHA512:
6
- metadata.gz: 495ccb751c9f927f1ef31e1b2536c600aec0f68189ea11d1f8061d59bfe41caf2dd0e818d9b780a5b8279c66b20c05873ffba4976cadb101c4aff2a7a449bc94
7
- data.tar.gz: a2aae32a4a3649b10399e23520fbf0a19df3ca564a3519290352bd51366cda6f8a709aa7a1783de31e5e474ab23bd86ea2fb41fff320a81eb57a16b3ef111b30
6
+ metadata.gz: 4ebe17a6ebfb52a14267856053ad750676d4b28e85e138aa172d778b85368ec85ccd04af55cdceb53e1aaad3db5cdab09d42ee87767d56b1fc5578c1c6ba114b
7
+ data.tar.gz: 6133dfc0b9810831ae7e79c2795ad3214a4b1c78df71e7a2a910429dd21d6b67a7a2f22c29b284fc30d68fff4ea9907fc39edefbb630027eb9a1e2cd4eb2ae24
data/CHANGELOG.md CHANGED
@@ -2,6 +2,24 @@
2
2
 
3
3
  Reverse Chronological Order:
4
4
 
5
+ ## `master`
6
+
7
+ * Add your changelog here
8
+
9
+ ## `0.8.0` (2018-11-12)
10
+
11
+ * Improve speed of proxy list loading.
12
+ * Improve speed of proxies cleanup.
13
+ * Fix ProxyDocker provider
14
+
15
+ ## `0.7.2` (2018-08-13)
16
+
17
+ * Fix XRoxy provider
18
+
19
+ ## `0.7.1` (2018-07-13)
20
+
21
+ * Fix XRoxy provider
22
+
5
23
  ## `0.7.0` (2018-06-04)
6
24
 
7
25
  * Migrate to `HTTP.rb` instead of `Net::HTTP`
data/lib/proxy_fetcher.rb CHANGED
@@ -15,6 +15,7 @@ require File.dirname(__FILE__) + '/proxy_fetcher/null_logger'
15
15
 
16
16
  require File.dirname(__FILE__) + '/proxy_fetcher/utils/http_client'
17
17
  require File.dirname(__FILE__) + '/proxy_fetcher/utils/proxy_validator'
18
+ require File.dirname(__FILE__) + '/proxy_fetcher/utils/proxy_list_validator'
18
19
  require File.dirname(__FILE__) + '/proxy_fetcher/client/client'
19
20
  require File.dirname(__FILE__) + '/proxy_fetcher/client/request'
20
21
  require File.dirname(__FILE__) + '/proxy_fetcher/client/proxies_registry'
@@ -31,12 +31,24 @@ module ProxyFetcher
31
31
  def refresh_list!(filters = nil)
32
32
  @proxies = []
33
33
 
34
+ threads = []
35
+ lock = Mutex.new
36
+
34
37
  ProxyFetcher.config.providers.each do |provider_name|
35
- provider = ProxyFetcher::Configuration.providers_registry.class_for(provider_name)
36
- provider_filters = filters && filters.fetch(provider_name.to_sym, filters)
38
+ threads << Thread.new do
39
+ provider = ProxyFetcher::Configuration.providers_registry.class_for(provider_name)
40
+ provider_filters = filters && filters.fetch(provider_name.to_sym, filters)
41
+ provider_proxies = provider.fetch_proxies!(provider_filters)
37
42
 
38
- @proxies.concat(provider.fetch_proxies!(provider_filters))
43
+ lock.synchronize do
44
+ @proxies.concat(provider_proxies)
45
+ end
46
+ end
39
47
  end
48
+
49
+ threads.each(&:join)
50
+
51
+ @proxies
40
52
  end
41
53
 
42
54
  alias fetch! refresh_list!
@@ -78,20 +90,12 @@ module ProxyFetcher
78
90
  alias pop! get!
79
91
 
80
92
  # Clean current proxy list from dead proxies (that doesn't respond by timeout)
93
+ #
94
+ # @return [Array<ProxyFetcher::Proxy>]
95
+ # list of valid proxies
81
96
  def cleanup!
82
- lock = Mutex.new
83
-
84
- proxies.dup.each_slice(ProxyFetcher.config.pool_size) do |proxy_group|
85
- threads = proxy_group.map do |group_proxy|
86
- Thread.new(group_proxy, proxies) do |proxy, proxies|
87
- lock.synchronize { proxies.delete(proxy) } unless proxy.connectable?
88
- end
89
- end
90
-
91
- threads.each(&:join)
92
- end
93
-
94
- @proxies
97
+ valid_proxies = ProxyListValidator.new(@proxies).validate
98
+ @proxies &= valid_proxies
95
99
  end
96
100
 
97
101
  alias validate! cleanup!
@@ -17,7 +17,7 @@ module ProxyFetcher
17
17
  # [NOTE] Doesn't support direct filters
18
18
  def load_proxy_list(*)
19
19
  doc = load_document(PROVIDER_URL, {})
20
- doc.xpath('//table[contains(@class, "table")]/tr[(not(@id="proxy-table-header")) and (count(td)>2)]')
20
+ doc.xpath('//table[contains(@class, "table")]/tbody/tr[(count(td)>2)]')
21
21
  end
22
22
 
23
23
  # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
@@ -5,7 +5,7 @@ module ProxyFetcher
5
5
  # XRoxy provider class.
6
6
  class XRoxy < Base
7
7
  # Provider URL to fetch proxy list
8
- PROVIDER_URL = 'https://www.xroxy.com/proxylist.php'.freeze
8
+ PROVIDER_URL = 'https://www.xroxy.com/free-proxy-lists/'.freeze
9
9
 
10
10
  # Fetches HTML content by sending HTTP request to the provider URL and
11
11
  # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
@@ -16,7 +16,7 @@ module ProxyFetcher
16
16
  #
17
17
  def load_proxy_list(filters = { type: 'All_http' })
18
18
  doc = load_document(PROVIDER_URL, filters)
19
- doc.xpath('//div[@id="content"]/table[1]/tr[contains(@class, "row")]')
19
+ doc.xpath('//div/table/tbody/tr')
20
20
  end
21
21
 
22
22
  # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
@@ -30,29 +30,14 @@ module ProxyFetcher
30
30
  #
31
31
  def to_proxy(html_node)
32
32
  ProxyFetcher::Proxy.new.tap do |proxy|
33
- proxy.addr = html_node.content_at('td[2]')
34
- proxy.port = Integer(html_node.content_at('td[3]').gsub(/^0+/, ''))
35
- proxy.anonymity = html_node.content_at('td[4]')
36
- proxy.country = html_node.content_at('td[6]')
37
- proxy.response_time = Integer(html_node.content_at('td[7]'))
38
- proxy.type = parse_type(html_node)
33
+ proxy.addr = html_node.content_at('td[1]')
34
+ proxy.port = Integer(html_node.content_at('td[2]').gsub(/^0+/, ''))
35
+ proxy.anonymity = html_node.content_at('td[3]')
36
+ proxy.country = html_node.content_at('td[5]')
37
+ proxy.response_time = Integer(html_node.content_at('td[6]'))
38
+ proxy.type = html_node.content_at('td[3]')
39
39
  end
40
40
  end
41
-
42
- private
43
-
44
- # Parses HTML node to extract proxy type.
45
- #
46
- # @param html_node [Object]
47
- # HTML node from the <code>ProxyFetcher::Document</code> DOM model.
48
- #
49
- # @return [String]
50
- # Proxy type
51
- #
52
- def parse_type(html_node)
53
- https = html_node.content_at('td[5]')
54
- https.casecmp('true').zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP
55
- end
56
41
  end
57
42
 
58
43
  ProxyFetcher::Configuration.register_provider(:xroxy, XRoxy)
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ProxyFetcher
4
+ # This class validates list of proxies.
5
+ # Each proxy is validated using <code>ProxyFetcher::ProxyValidator</code>.
6
+ class ProxyListValidator
7
+ # @!attribute [r] proxies
8
+ # @return [Array<ProxyFetcher::Proxy>] Source array of proxies
9
+ attr_reader :proxies
10
+ # @!attribute [r] valid_proxies
11
+ # @return [Array<ProxyFetcher::Proxy>] Array of valid proxies after validation
12
+ attr_reader :valid_proxies
13
+
14
+ # @param [Array<ProxyFetcher::Proxy>] *proxies
15
+ # Any number of <code>ProxyFetcher::Proxy</code> to validate
16
+ def initialize(*proxies)
17
+ @proxies = proxies.flatten
18
+ end
19
+
20
+ # Performs validation
21
+ #
22
+ # @return [Array<ProxyFetcher::Proxy>]
23
+ # list of valid proxies
24
+ def validate
25
+ target_proxies = @proxies.dup
26
+ target_proxies_lock = Mutex.new
27
+ connectable_proxies = []
28
+ connectable_proxies_lock = Mutex.new
29
+ threads = []
30
+
31
+ ProxyFetcher.config.pool_size.times do
32
+ threads << Thread.new do
33
+ loop do
34
+ proxy = target_proxies_lock.synchronize { target_proxies.shift }
35
+ break unless proxy
36
+
37
+ connectable_proxies_lock.synchronize { connectable_proxies << proxy } if proxy.connectable?
38
+ end
39
+ end
40
+ end
41
+
42
+ threads.each(&:join)
43
+
44
+ @valid_proxies = connectable_proxies
45
+ end
46
+ end
47
+ end
@@ -13,9 +13,9 @@ module ProxyFetcher
13
13
  # Major version number
14
14
  MAJOR = 0
15
15
  # Minor version number
16
- MINOR = 7
16
+ MINOR = 8
17
17
  # Smallest version number
18
- TINY = 1
18
+ TINY = 0
19
19
 
20
20
  # Full version number
21
21
  STRING = [MAJOR, MINOR, TINY].compact.join('.')
@@ -35,7 +35,9 @@ RSpec.shared_examples 'a manager' do
35
35
 
36
36
  manager = ProxyFetcher::Manager.new
37
37
 
38
- expect { manager.cleanup! }.to change { manager.proxies }.to([])
38
+ expect do
39
+ manager.cleanup!
40
+ end.to change { manager.proxies }.to([])
39
41
  end
40
42
 
41
43
  it "doesn't pollute the output with array of proxies" do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: proxy_fetcher
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.1
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nikita Bulai
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-07-13 00:00:00.000000000 Z
11
+ date: 2018-11-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: http
@@ -80,6 +80,7 @@ files:
80
80
  - lib/proxy_fetcher/providers/xroxy.rb
81
81
  - lib/proxy_fetcher/proxy.rb
82
82
  - lib/proxy_fetcher/utils/http_client.rb
83
+ - lib/proxy_fetcher/utils/proxy_list_validator.rb
83
84
  - lib/proxy_fetcher/utils/proxy_validator.rb
84
85
  - lib/proxy_fetcher/version.rb
85
86
  - proxy_fetcher.gemspec