proxy_fetcher 0.10.2 → 0.15.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +22 -1
  3. data/Gemfile +8 -5
  4. data/Rakefile +7 -3
  5. data/gemfiles/nokogiri.gemfile +8 -6
  6. data/gemfiles/oga.gemfile +8 -6
  7. data/lib/proxy_fetcher.rb +46 -35
  8. data/lib/proxy_fetcher/client/client.rb +10 -3
  9. data/lib/proxy_fetcher/client/request.rb +4 -4
  10. data/lib/proxy_fetcher/configuration.rb +24 -19
  11. data/lib/proxy_fetcher/document.rb +0 -9
  12. data/lib/proxy_fetcher/document/adapters.rb +1 -1
  13. data/lib/proxy_fetcher/document/adapters/abstract_adapter.rb +3 -12
  14. data/lib/proxy_fetcher/document/adapters/nokogiri_adapter.rb +1 -1
  15. data/lib/proxy_fetcher/document/adapters/oga_adapter.rb +1 -1
  16. data/lib/proxy_fetcher/document/node.rb +2 -2
  17. data/lib/proxy_fetcher/exceptions.rb +6 -6
  18. data/lib/proxy_fetcher/manager.rb +42 -9
  19. data/lib/proxy_fetcher/providers/base.rb +43 -22
  20. data/lib/proxy_fetcher/providers/free_proxy_list.rb +9 -10
  21. data/lib/proxy_fetcher/providers/free_proxy_list_socks.rb +58 -0
  22. data/lib/proxy_fetcher/providers/free_proxy_list_ssl.rb +7 -15
  23. data/lib/proxy_fetcher/providers/free_proxy_list_us.rb +54 -0
  24. data/lib/proxy_fetcher/providers/http_tunnel.rb +11 -19
  25. data/lib/proxy_fetcher/providers/mtpro.rb +43 -0
  26. data/lib/proxy_fetcher/providers/proxy_list.rb +8 -16
  27. data/lib/proxy_fetcher/providers/proxypedia.rb +48 -0
  28. data/lib/proxy_fetcher/providers/proxyscrape_http.rb +65 -0
  29. data/lib/proxy_fetcher/providers/proxyscrape_socks4.rb +65 -0
  30. data/lib/proxy_fetcher/providers/proxyscrape_socks5.rb +65 -0
  31. data/lib/proxy_fetcher/providers/xroxy.rb +9 -17
  32. data/lib/proxy_fetcher/proxy.rb +16 -4
  33. data/lib/proxy_fetcher/utils/http_client.rb +7 -12
  34. data/lib/proxy_fetcher/utils/proxy_list_validator.rb +3 -1
  35. data/lib/proxy_fetcher/utils/proxy_validator.rb +21 -9
  36. data/lib/proxy_fetcher/version.rb +3 -3
  37. data/proxy_fetcher.gemspec +21 -16
  38. data/spec/fixtures/proxies.txt +14 -0
  39. data/spec/proxy_fetcher/client/client_spec.rb +72 -57
  40. data/spec/proxy_fetcher/configuration_spec.rb +11 -11
  41. data/spec/proxy_fetcher/document/adapters_spec.rb +8 -8
  42. data/spec/proxy_fetcher/document/node_spec.rb +4 -4
  43. data/spec/proxy_fetcher/manager_spec.rb +18 -0
  44. data/spec/proxy_fetcher/providers/base_spec.rb +9 -9
  45. data/spec/proxy_fetcher/providers/multiple_providers_spec.rb +4 -4
  46. data/spec/proxy_fetcher/providers/proxy_classes_spec.rb +28 -0
  47. data/spec/proxy_fetcher/proxy_spec.rb +14 -14
  48. data/spec/proxy_fetcher/version_spec.rb +2 -0
  49. data/spec/spec_helper.rb +10 -10
  50. data/spec/support/manager_examples.rb +21 -21
  51. metadata +27 -17
  52. data/lib/proxy_fetcher/providers/gather_proxy.rb +0 -58
  53. data/spec/proxy_fetcher/providers/free_proxy_list_spec.rb +0 -13
  54. data/spec/proxy_fetcher/providers/free_proxy_list_ssl_spec.rb +0 -11
  55. data/spec/proxy_fetcher/providers/gather_proxy_spec.rb +0 -11
  56. data/spec/proxy_fetcher/providers/http_tunnel_spec.rb +0 -11
  57. data/spec/proxy_fetcher/providers/proxy_list_spec.rb +0 -11
  58. data/spec/proxy_fetcher/providers/xroxy_spec.rb +0 -11
@@ -6,19 +6,11 @@ module ProxyFetcher
6
6
  class HTTPTunnel < Base
7
7
  # Provider URL to fetch proxy list
8
8
  def provider_url
9
- 'http://www.httptunnel.ge/ProxyListForFree.aspx'
9
+ "http://www.httptunnel.ge/ProxyListForFree.aspx"
10
10
  end
11
11
 
12
- # Fetches HTML content by sending HTTP request to the provider URL and
13
- # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
14
- # to return all the proxy entries (HTML nodes).
15
- #
16
- # @return [Array<ProxyFetcher::Document::Node>]
17
- # Collection of extracted HTML nodes with full proxy info
18
- #
19
- def load_proxy_list(_filters = {})
20
- doc = load_document(provider_url)
21
- doc.xpath('//table[contains(@id, "GridView")]/tr[(count(td)>2)]')
12
+ def xpath
13
+ '//table[contains(@id, "GridView")]/tr[(count(td)>2)]'
22
14
  end
23
15
 
24
16
  # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
@@ -53,7 +45,7 @@ module ProxyFetcher
53
45
  # URI object
54
46
  #
55
47
  def parse_proxy_uri(html_node)
56
- full_addr = html_node.content_at('td[1]')
48
+ full_addr = html_node.content_at("td[1]")
57
49
  URI.parse("http://#{full_addr}")
58
50
  end
59
51
 
@@ -66,7 +58,7 @@ module ProxyFetcher
66
58
  # Country code
67
59
  #
68
60
  def parse_country(html_node)
69
- html_node.find('.//img').attr('title')
61
+ html_node.find(".//img").attr("title")
70
62
  end
71
63
 
72
64
  # Parses HTML node to extract proxy anonymity level.
@@ -78,14 +70,14 @@ module ProxyFetcher
78
70
  # Anonymity level
79
71
  #
80
72
  def parse_anonymity(html_node)
81
- transparency = html_node.content_at('td[5]').to_sym
73
+ transparency = html_node.content_at("td[5]").to_sym
82
74
 
83
75
  {
84
- A: 'Anonymous',
85
- E: 'Elite',
86
- T: 'Transparent',
87
- U: 'Unknown'
88
- }.fetch(transparency, 'Unknown')
76
+ A: "Anonymous",
77
+ E: "Elite",
78
+ T: "Transparent",
79
+ U: "Unknown"
80
+ }.fetch(transparency, "Unknown")
89
81
  end
90
82
  end
91
83
 
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module ProxyFetcher
6
+ module Providers
7
+ # MTPro provider class.
8
+ class MTPro < Base
9
+ # Provider URL to fetch proxy list
10
+ def provider_url
11
+ "https://mtpro.xyz/api/?type=socks"
12
+ end
13
+
14
+ def load_proxy_list(filters = {})
15
+ html = load_html(provider_url, filters)
16
+ JSON.parse(html)
17
+ rescue JSON::ParserError
18
+ []
19
+ end
20
+
21
+ # Converts a provider entry (read via its "ip", "port" and "country" keys) to <code>ProxyFetcher::Proxy</code>
22
+ # object.
23
+ #
24
+ # @param node [Object]
25
+ # Single entry of the provider response, indexed by the "ip", "port" and "country" keys.
26
+ #
27
+ # @return [ProxyFetcher::Proxy]
28
+ # Proxy object
29
+ #
30
+ def to_proxy(node)
31
+ ProxyFetcher::Proxy.new.tap do |proxy|
32
+ proxy.addr = node["ip"]
33
+ proxy.port = Integer(node["port"])
34
+ proxy.country = node["country"]
35
+ proxy.anonymity = "Unknown"
36
+ proxy.type = ProxyFetcher::Proxy::SOCKS5
37
+ end
38
+ end
39
+ end
40
+
41
+ ProxyFetcher::Configuration.register_provider(:mtpro, MTPro)
42
+ end
43
+ end
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'base64'
3
+ require "base64"
4
4
 
5
5
  module ProxyFetcher
6
6
  module Providers
@@ -8,19 +8,11 @@ module ProxyFetcher
8
8
  class ProxyList < Base
9
9
  # Provider URL to fetch proxy list
10
10
  def provider_url
11
- 'https://proxy-list.org/english/index.php'
11
+ "https://proxy-list.org/english/index.php"
12
12
  end
13
13
 
14
- # Fetches HTML content by sending HTTP request to the provider URL and
15
- # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
16
- # to return all the proxy entries (HTML nodes).
17
- #
18
- # @return [Array<ProxyFetcher::Document::Node>]
19
- # Collection of extracted HTML nodes with full proxy info
20
- #
21
- def load_proxy_list(filters = {})
22
- doc = load_document(provider_url, filters)
23
- doc.css('.table-wrap .table ul')
14
+ def xpath
15
+ '//div[@class="table-wrap"]/div[@class="table"]/ul'
24
16
  end
25
17
 
26
18
  # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
@@ -38,9 +30,9 @@ module ProxyFetcher
38
30
  proxy.addr = uri.host
39
31
  proxy.port = uri.port
40
32
 
41
- proxy.type = html_node.content_at('li[2]')
42
- proxy.anonymity = html_node.content_at('li[4]')
43
- proxy.country = html_node.find("li[5]//span[@class='country']").attr('title')
33
+ proxy.type = html_node.content_at("li[2]")
34
+ proxy.anonymity = html_node.content_at("li[4]")
35
+ proxy.country = html_node.find("li[5]//span[@class='country']").attr("title")
44
36
  end
45
37
  end
46
38
 
@@ -55,7 +47,7 @@ module ProxyFetcher
55
47
  # URI object
56
48
  #
57
49
  def parse_proxy_uri(html_node)
58
- full_addr = ::Base64.decode64(html_node.at_css('li script').html.match(/'(.+)'/)[1])
50
+ full_addr = ::Base64.decode64(html_node.at_css("li script").html.match(/'(.+)'/)[1])
59
51
  URI.parse("http://#{full_addr}")
60
52
  end
61
53
  end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ProxyFetcher
4
+ module Providers
5
+ # Proxypedia provider class.
6
+ class Proxypedia < Base
7
+ # Provider URL to fetch proxy list
8
+ def provider_url
9
+ "https://proxypedia.org"
10
+ end
11
+
12
+ # [NOTE] Doesn't support filtering
13
+ def xpath
14
+ "//main/ul/li[position()>1]"
15
+ end
16
+
17
+ # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
18
+ # object.
19
+ #
20
+ # @param html_node [Object]
21
+ # HTML node from the <code>ProxyFetcher::Document</code> DOM model.
22
+ #
23
+ # @return [ProxyFetcher::Proxy]
24
+ # Proxy object
25
+ #
26
+ def to_proxy(html_node)
27
+ addr, port = html_node.content_at("a").to_s.split(":")
28
+
29
+ ProxyFetcher::Proxy.new.tap do |proxy|
30
+ proxy.addr = addr
31
+ proxy.port = Integer(port)
32
+ proxy.country = parse_country(html_node)
33
+ proxy.anonymity = "Unknown"
34
+ proxy.type = ProxyFetcher::Proxy::HTTP
35
+ end
36
+ end
37
+
38
+ private
39
+
40
+ def parse_country(html_node)
41
+ text = html_node.content.to_s
42
+ text[/\((.+?)\)/, 1] || "Unknown"
43
+ end
44
+ end
45
+
46
+ ProxyFetcher::Configuration.register_provider(:proxypedia, Proxypedia)
47
+ end
48
+ end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+
5
+ module ProxyFetcher
6
+ module Providers
7
+ # ProxyscrapeHTTP provider class.
8
+ class ProxyscrapeHTTP < Base
9
+ # Provider URL to fetch proxy list
10
+ def provider_url
11
+ "https://api.proxyscrape.com/v2/?request=getproxies&protocol=http"
12
+ end
13
+
14
+ # Loads the provider response and parses it as tab-separated text, keeping the first column of each row.
15
+ #
16
+ # @param url [String]
17
+ # URL to fetch
18
+ #
19
+ # @param filters [Hash]
20
+ # filters for proxy provider
21
+ #
22
+ # @return [Array]
23
+ # Collection of extracted proxies with ports
24
+ #
25
+ def load_document(url, filters = {})
26
+ html = load_html(url, filters)
27
+
28
+ CSV.parse(html, col_sep: "\t").map(&:first)
29
+ end
30
+
31
+ # Fetches HTML content by sending HTTP request to the provider URL and
32
+ # parses the txt document to return all the proxy entries (ip addresses
33
+ # and ports).
34
+ #
35
+ # @return [Array]
36
+ # Collection of extracted proxies with ports
37
+ #
38
+ def load_proxy_list(filters = {})
39
+ load_document(provider_url, filters)
40
+ end
41
+
42
+ # Converts String to <code>ProxyFetcher::Proxy</code> object.
43
+ #
44
+ # @param node [String]
45
+ # String
46
+ #
47
+ # @return [ProxyFetcher::Proxy]
48
+ # Proxy object
49
+ #
50
+ def to_proxy(node)
51
+ addr, port = node.split(":")
52
+
53
+ ProxyFetcher::Proxy.new.tap do |proxy|
54
+ proxy.addr = addr
55
+ proxy.port = Integer(port)
56
+ proxy.country = "Unknown"
57
+ proxy.anonymity = "Unknown"
58
+ proxy.type = ProxyFetcher::Proxy::HTTP
59
+ end
60
+ end
61
+ end
62
+
63
+ ProxyFetcher::Configuration.register_provider(:proxyscrape_http, ProxyscrapeHTTP)
64
+ end
65
+ end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+
5
+ module ProxyFetcher
6
+ module Providers
7
+ # ProxyscrapeSOCKS4 provider class.
8
+ class ProxyscrapeSOCKS4 < Base
9
+ # Provider URL to fetch proxy list
10
+ def provider_url
11
+ "https://api.proxyscrape.com/v2/?request=getproxies&protocol=socks4"
12
+ end
13
+
14
+ # Loads the provider response and parses it as tab-separated text, keeping the first column of each row.
15
+ #
16
+ # @param url [String]
17
+ # URL to fetch
18
+ #
19
+ # @param filters [Hash]
20
+ # filters for proxy provider
21
+ #
22
+ # @return [Array]
23
+ # Collection of extracted proxies with ports
24
+ #
25
+ def load_document(url, filters = {})
26
+ html = load_html(url, filters)
27
+
28
+ CSV.parse(html, col_sep: "\t").map(&:first)
29
+ end
30
+
31
+ # Fetches HTML content by sending HTTP request to the provider URL and
32
+ # parses the txt document to return all the proxy entries (ip addresses
33
+ # and ports).
34
+ #
35
+ # @return [Array]
36
+ # Collection of extracted proxies with ports
37
+ #
38
+ def load_proxy_list(filters = {})
39
+ load_document(provider_url, filters)
40
+ end
41
+
42
+ # Converts String to <code>ProxyFetcher::Proxy</code> object.
43
+ #
44
+ # @param html_node [String]
45
+ # String
46
+ #
47
+ # @return [ProxyFetcher::Proxy]
48
+ # Proxy object
49
+ #
50
+ def to_proxy(html_node)
51
+ addr, port = html_node.split(":")
52
+
53
+ ProxyFetcher::Proxy.new.tap do |proxy|
54
+ proxy.addr = addr
55
+ proxy.port = Integer(port)
56
+ proxy.country = "Unknown"
57
+ proxy.anonymity = "Unknown"
58
+ proxy.type = ProxyFetcher::Proxy::SOCKS4
59
+ end
60
+ end
61
+ end
62
+
63
+ ProxyFetcher::Configuration.register_provider(:proxyscrape_socks4, ProxyscrapeSOCKS4)
64
+ end
65
+ end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+
5
+ module ProxyFetcher
6
+ module Providers
7
+ # ProxyscrapeSOCKS5 provider class.
8
+ class ProxyscrapeSOCKS5 < Base
9
+ # Provider URL to fetch proxy list
10
+ def provider_url
11
+ "https://api.proxyscrape.com/v2/?request=getproxies&protocol=socks5"
12
+ end
13
+
14
+ # Loads the provider response and parses it as tab-separated text, keeping the first column of each row.
15
+ #
16
+ # @param url [String]
17
+ # URL to fetch
18
+ #
19
+ # @param filters [Hash]
20
+ # filters for proxy provider
21
+ #
22
+ # @return [Array]
23
+ # Collection of extracted proxies with ports
24
+ #
25
+ def load_document(url, filters = {})
26
+ html = load_html(url, filters)
27
+
28
+ CSV.parse(html, col_sep: "\t").map(&:first)
29
+ end
30
+
31
+ # Fetches HTML content by sending HTTP request to the provider URL and
32
+ # parses the txt document to return all the proxy entries (ip addresses
33
+ # and ports).
34
+ #
35
+ # @return [Array]
36
+ # Collection of extracted proxies with ports
37
+ #
38
+ def load_proxy_list(filters = {})
39
+ load_document(provider_url, filters)
40
+ end
41
+
42
+ # Converts String to <code>ProxyFetcher::Proxy</code> object.
43
+ #
44
+ # @param html_node [String]
45
+ # String
46
+ #
47
+ # @return [ProxyFetcher::Proxy]
48
+ # Proxy object
49
+ #
50
+ def to_proxy(html_node)
51
+ addr, port = html_node.split(":")
52
+
53
+ ProxyFetcher::Proxy.new.tap do |proxy|
54
+ proxy.addr = addr
55
+ proxy.port = Integer(port)
56
+ proxy.country = "Unknown"
57
+ proxy.anonymity = "Unknown"
58
+ proxy.type = ProxyFetcher::Proxy::SOCKS5
59
+ end
60
+ end
61
+ end
62
+
63
+ ProxyFetcher::Configuration.register_provider(:proxyscrape_socks5, ProxyscrapeSOCKS5)
64
+ end
65
+ end
@@ -6,19 +6,11 @@ module ProxyFetcher
6
6
  class XRoxy < Base
7
7
  # Provider URL to fetch proxy list
8
8
  def provider_url
9
- 'https://www.xroxy.com/free-proxy-lists/'
9
+ "https://www.xroxy.com/proxylist.htm"
10
10
  end
11
11
 
12
- # Fetches HTML content by sending HTTP request to the provider URL and
13
- # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
14
- # to return all the proxy entries (HTML nodes).
15
- #
16
- # @return [Array<ProxyFetcher::Document::Node>]
17
- # Collection of extracted HTML nodes with full proxy info
18
- #
19
- def load_proxy_list(filters = { type: 'All_http' })
20
- doc = load_document(provider_url, filters)
21
- doc.xpath('//div/table/tbody/tr')
12
+ def xpath
13
+ "//tr[@class='row1' or @class='row0']"
22
14
  end
23
15
 
24
16
  # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
@@ -32,12 +24,12 @@ module ProxyFetcher
32
24
  #
33
25
  def to_proxy(html_node)
34
26
  ProxyFetcher::Proxy.new.tap do |proxy|
35
- proxy.addr = html_node.content_at('td[1]')
36
- proxy.port = Integer(html_node.content_at('td[2]').gsub(/^0+/, ''))
37
- proxy.anonymity = html_node.content_at('td[3]')
38
- proxy.country = html_node.content_at('td[5]')
39
- proxy.response_time = Integer(html_node.content_at('td[6]'))
40
- proxy.type = html_node.content_at('td[3]')
27
+ proxy.addr = html_node.content_at("td[1]")
28
+ proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, ""))
29
+ proxy.anonymity = html_node.content_at("td[3]")
30
+ proxy.country = html_node.content_at("td[5]")
31
+ proxy.response_time = Integer(html_node.content_at("td[6]"))
32
+ proxy.type = html_node.content_at("td[3]")
41
33
  end
42
34
  end
43
35
  end
@@ -29,10 +29,10 @@ module ProxyFetcher
29
29
 
30
30
  # Proxy types
31
31
  TYPES = [
32
- HTTP = 'HTTP'.freeze,
33
- HTTPS = 'HTTPS'.freeze,
34
- SOCKS4 = 'SOCKS4'.freeze,
35
- SOCKS5 = 'SOCKS5'.freeze
32
+ HTTP = "HTTP",
33
+ HTTPS = "HTTPS",
34
+ SOCKS4 = "SOCKS4",
35
+ SOCKS5 = "SOCKS5"
36
36
  ].freeze
37
37
 
38
38
  # Proxy type predicates (#socks4?, #https?)
@@ -104,5 +104,17 @@ module ProxyFetcher
104
104
  URI::Generic.build(host: addr, port: port).to_s
105
105
  end
106
106
  end
107
+
108
+ def ==(other)
109
+ other.is_a?(Proxy) && addr == other.addr && port == other.port
110
+ end
111
+
112
+ def eql?(other)
113
+ hash.eql?(other.hash)
114
+ end
115
+
116
+ def hash
117
+ [addr.hash, port.hash].hash
118
+ end
107
119
  end
108
120
  end