proxy_fetcher 0.10.2 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +22 -1
  3. data/Gemfile +8 -5
  4. data/Rakefile +7 -3
  5. data/gemfiles/nokogiri.gemfile +8 -6
  6. data/gemfiles/oga.gemfile +8 -6
  7. data/lib/proxy_fetcher.rb +46 -35
  8. data/lib/proxy_fetcher/client/client.rb +10 -3
  9. data/lib/proxy_fetcher/client/request.rb +4 -4
  10. data/lib/proxy_fetcher/configuration.rb +24 -19
  11. data/lib/proxy_fetcher/document.rb +0 -9
  12. data/lib/proxy_fetcher/document/adapters.rb +1 -1
  13. data/lib/proxy_fetcher/document/adapters/abstract_adapter.rb +3 -12
  14. data/lib/proxy_fetcher/document/adapters/nokogiri_adapter.rb +1 -1
  15. data/lib/proxy_fetcher/document/adapters/oga_adapter.rb +1 -1
  16. data/lib/proxy_fetcher/document/node.rb +2 -2
  17. data/lib/proxy_fetcher/exceptions.rb +6 -6
  18. data/lib/proxy_fetcher/manager.rb +42 -9
  19. data/lib/proxy_fetcher/providers/base.rb +43 -22
  20. data/lib/proxy_fetcher/providers/free_proxy_list.rb +9 -10
  21. data/lib/proxy_fetcher/providers/free_proxy_list_socks.rb +58 -0
  22. data/lib/proxy_fetcher/providers/free_proxy_list_ssl.rb +7 -15
  23. data/lib/proxy_fetcher/providers/free_proxy_list_us.rb +54 -0
  24. data/lib/proxy_fetcher/providers/http_tunnel.rb +11 -19
  25. data/lib/proxy_fetcher/providers/mtpro.rb +43 -0
  26. data/lib/proxy_fetcher/providers/proxy_list.rb +8 -16
  27. data/lib/proxy_fetcher/providers/proxypedia.rb +48 -0
  28. data/lib/proxy_fetcher/providers/proxyscrape_http.rb +65 -0
  29. data/lib/proxy_fetcher/providers/proxyscrape_socks4.rb +65 -0
  30. data/lib/proxy_fetcher/providers/proxyscrape_socks5.rb +65 -0
  31. data/lib/proxy_fetcher/providers/xroxy.rb +9 -17
  32. data/lib/proxy_fetcher/proxy.rb +16 -4
  33. data/lib/proxy_fetcher/utils/http_client.rb +7 -12
  34. data/lib/proxy_fetcher/utils/proxy_list_validator.rb +3 -1
  35. data/lib/proxy_fetcher/utils/proxy_validator.rb +21 -9
  36. data/lib/proxy_fetcher/version.rb +3 -3
  37. data/proxy_fetcher.gemspec +21 -16
  38. data/spec/fixtures/proxies.txt +14 -0
  39. data/spec/proxy_fetcher/client/client_spec.rb +72 -57
  40. data/spec/proxy_fetcher/configuration_spec.rb +11 -11
  41. data/spec/proxy_fetcher/document/adapters_spec.rb +8 -8
  42. data/spec/proxy_fetcher/document/node_spec.rb +4 -4
  43. data/spec/proxy_fetcher/manager_spec.rb +18 -0
  44. data/spec/proxy_fetcher/providers/base_spec.rb +9 -9
  45. data/spec/proxy_fetcher/providers/multiple_providers_spec.rb +4 -4
  46. data/spec/proxy_fetcher/providers/proxy_classes_spec.rb +28 -0
  47. data/spec/proxy_fetcher/proxy_spec.rb +14 -14
  48. data/spec/proxy_fetcher/version_spec.rb +2 -0
  49. data/spec/spec_helper.rb +10 -10
  50. data/spec/support/manager_examples.rb +21 -21
  51. metadata +27 -17
  52. data/lib/proxy_fetcher/providers/gather_proxy.rb +0 -58
  53. data/spec/proxy_fetcher/providers/free_proxy_list_spec.rb +0 -13
  54. data/spec/proxy_fetcher/providers/free_proxy_list_ssl_spec.rb +0 -11
  55. data/spec/proxy_fetcher/providers/gather_proxy_spec.rb +0 -11
  56. data/spec/proxy_fetcher/providers/http_tunnel_spec.rb +0 -11
  57. data/spec/proxy_fetcher/providers/proxy_list_spec.rb +0 -11
  58. data/spec/proxy_fetcher/providers/xroxy_spec.rb +0 -11
@@ -6,19 +6,11 @@ module ProxyFetcher
6
6
  class HTTPTunnel < Base
7
7
  # Provider URL to fetch proxy list
8
8
  def provider_url
9
- 'http://www.httptunnel.ge/ProxyListForFree.aspx'
9
+ "http://www.httptunnel.ge/ProxyListForFree.aspx"
10
10
  end
11
11
 
12
- # Fetches HTML content by sending HTTP request to the provider URL and
13
- # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
14
- # to return all the proxy entries (HTML nodes).
15
- #
16
- # @return [Array<ProxyFetcher::Document::Node>]
17
- # Collection of extracted HTML nodes with full proxy info
18
- #
19
- def load_proxy_list(_filters = {})
20
- doc = load_document(provider_url)
21
- doc.xpath('//table[contains(@id, "GridView")]/tr[(count(td)>2)]')
12
+ def xpath
13
+ '//table[contains(@id, "GridView")]/tr[(count(td)>2)]'
22
14
  end
23
15
 
24
16
  # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
@@ -53,7 +45,7 @@ module ProxyFetcher
53
45
  # URI object
54
46
  #
55
47
  def parse_proxy_uri(html_node)
56
- full_addr = html_node.content_at('td[1]')
48
+ full_addr = html_node.content_at("td[1]")
57
49
  URI.parse("http://#{full_addr}")
58
50
  end
59
51
 
@@ -66,7 +58,7 @@ module ProxyFetcher
66
58
  # Country code
67
59
  #
68
60
  def parse_country(html_node)
69
- html_node.find('.//img').attr('title')
61
+ html_node.find(".//img").attr("title")
70
62
  end
71
63
 
72
64
  # Parses HTML node to extract proxy anonymity level.
@@ -78,14 +70,14 @@ module ProxyFetcher
78
70
  # Anonymity level
79
71
  #
80
72
  def parse_anonymity(html_node)
81
- transparency = html_node.content_at('td[5]').to_sym
73
+ transparency = html_node.content_at("td[5]").to_sym
82
74
 
83
75
  {
84
- A: 'Anonymous',
85
- E: 'Elite',
86
- T: 'Transparent',
87
- U: 'Unknown'
88
- }.fetch(transparency, 'Unknown')
76
+ A: "Anonymous",
77
+ E: "Elite",
78
+ T: "Transparent",
79
+ U: "Unknown"
80
+ }.fetch(transparency, "Unknown")
89
81
  end
90
82
  end
91
83
 
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module ProxyFetcher
6
+ module Providers
7
+ # MTPro provider class.
8
+ class MTPro < Base
9
+ # Provider URL to fetch proxy list
10
+ def provider_url
11
+ "https://mtpro.xyz/api/?type=socks"
12
+ end
13
+
14
+ def load_proxy_list(filters = {})
15
+ html = load_html(provider_url, filters)
16
+ JSON.parse(html)
17
+ rescue JSON::ParserError
18
+ []
19
+ end
20
+
21
+ # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
22
+ # object.
23
+ #
24
+ # @param node [Object]
25
+ # HTML node from the <code>ProxyFetcher::Document</code> DOM model.
26
+ #
27
+ # @return [ProxyFetcher::Proxy]
28
+ # Proxy object
29
+ #
30
+ def to_proxy(node)
31
+ ProxyFetcher::Proxy.new.tap do |proxy|
32
+ proxy.addr = node["ip"]
33
+ proxy.port = Integer(node["port"])
34
+ proxy.country = node["country"]
35
+ proxy.anonymity = "Unknown"
36
+ proxy.type = ProxyFetcher::Proxy::SOCKS5
37
+ end
38
+ end
39
+ end
40
+
41
+ ProxyFetcher::Configuration.register_provider(:mtpro, MTPro)
42
+ end
43
+ end
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'base64'
3
+ require "base64"
4
4
 
5
5
  module ProxyFetcher
6
6
  module Providers
@@ -8,19 +8,11 @@ module ProxyFetcher
8
8
  class ProxyList < Base
9
9
  # Provider URL to fetch proxy list
10
10
  def provider_url
11
- 'https://proxy-list.org/english/index.php'
11
+ "https://proxy-list.org/english/index.php"
12
12
  end
13
13
 
14
- # Fetches HTML content by sending HTTP request to the provider URL and
15
- # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
16
- # to return all the proxy entries (HTML nodes).
17
- #
18
- # @return [Array<ProxyFetcher::Document::Node>]
19
- # Collection of extracted HTML nodes with full proxy info
20
- #
21
- def load_proxy_list(filters = {})
22
- doc = load_document(provider_url, filters)
23
- doc.css('.table-wrap .table ul')
14
+ def xpath
15
+ '//div[@class="table-wrap"]/div[@class="table"]/ul'
24
16
  end
25
17
 
26
18
  # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
@@ -38,9 +30,9 @@ module ProxyFetcher
38
30
  proxy.addr = uri.host
39
31
  proxy.port = uri.port
40
32
 
41
- proxy.type = html_node.content_at('li[2]')
42
- proxy.anonymity = html_node.content_at('li[4]')
43
- proxy.country = html_node.find("li[5]//span[@class='country']").attr('title')
33
+ proxy.type = html_node.content_at("li[2]")
34
+ proxy.anonymity = html_node.content_at("li[4]")
35
+ proxy.country = html_node.find("li[5]//span[@class='country']").attr("title")
44
36
  end
45
37
  end
46
38
 
@@ -55,7 +47,7 @@ module ProxyFetcher
55
47
  # URI object
56
48
  #
57
49
  def parse_proxy_uri(html_node)
58
- full_addr = ::Base64.decode64(html_node.at_css('li script').html.match(/'(.+)'/)[1])
50
+ full_addr = ::Base64.decode64(html_node.at_css("li script").html.match(/'(.+)'/)[1])
59
51
  URI.parse("http://#{full_addr}")
60
52
  end
61
53
  end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ProxyFetcher
4
+ module Providers
5
+ # Proxypedia provider class.
6
+ class Proxypedia < Base
7
+ # Provider URL to fetch proxy list
8
+ def provider_url
9
+ "https://proxypedia.org"
10
+ end
11
+
12
+ # [NOTE] Doesn't support filtering
13
+ def xpath
14
+ "//main/ul/li[position()>1]"
15
+ end
16
+
17
+ # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
18
+ # object.
19
+ #
20
+ # @param html_node [Object]
21
+ # HTML node from the <code>ProxyFetcher::Document</code> DOM model.
22
+ #
23
+ # @return [ProxyFetcher::Proxy]
24
+ # Proxy object
25
+ #
26
+ def to_proxy(html_node)
27
+ addr, port = html_node.content_at("a").to_s.split(":")
28
+
29
+ ProxyFetcher::Proxy.new.tap do |proxy|
30
+ proxy.addr = addr
31
+ proxy.port = Integer(port)
32
+ proxy.country = parse_country(html_node)
33
+ proxy.anonymity = "Unknown"
34
+ proxy.type = ProxyFetcher::Proxy::HTTP
35
+ end
36
+ end
37
+
38
+ private
39
+
40
+ def parse_country(html_node)
41
+ text = html_node.content.to_s
42
+ text[/\((.+?)\)/, 1] || "Unknown"
43
+ end
44
+ end
45
+
46
+ ProxyFetcher::Configuration.register_provider(:proxypedia, Proxypedia)
47
+ end
48
+ end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+
5
+ module ProxyFetcher
6
+ module Providers
7
+ # ProxyscrapeHTTP provider class.
8
+ class ProxyscrapeHTTP < Base
9
+ # Provider URL to fetch proxy list
10
+ def provider_url
11
+ "https://api.proxyscrape.com/v2/?request=getproxies&protocol=http"
12
+ end
13
+
14
+ # Loads provider HTML and parses it with internal document object.
15
+ #
16
+ # @param url [String]
17
+ # URL to fetch
18
+ #
19
+ # @param filters [Hash]
20
+ # filters for proxy provider
21
+ #
22
+ # @return [Array]
23
+ # Collection of extracted proxies with ports
24
+ #
25
+ def load_document(url, filters = {})
26
+ html = load_html(url, filters)
27
+
28
+ CSV.parse(html, col_sep: "\t").map(&:first)
29
+ end
30
+
31
+ # Fetches HTML content by sending HTTP request to the provider URL and
32
+ # parses the txt document to return all the proxy entries (ip addresses
33
+ # and ports).
34
+ #
35
+ # @return [Array]
36
+ # Collection of extracted proxies with ports
37
+ #
38
+ def load_proxy_list(filters = {})
39
+ load_document(provider_url, filters)
40
+ end
41
+
42
+ # Converts String to <code>ProxyFetcher::Proxy</code> object.
43
+ #
44
+ # @param node [String]
45
+ # String
46
+ #
47
+ # @return [ProxyFetcher::Proxy]
48
+ # Proxy object
49
+ #
50
+ def to_proxy(node)
51
+ addr, port = node.split(":")
52
+
53
+ ProxyFetcher::Proxy.new.tap do |proxy|
54
+ proxy.addr = addr
55
+ proxy.port = Integer(port)
56
+ proxy.country = "Unknown"
57
+ proxy.anonymity = "Unknown"
58
+ proxy.type = ProxyFetcher::Proxy::HTTP
59
+ end
60
+ end
61
+ end
62
+
63
+ ProxyFetcher::Configuration.register_provider(:proxyscrape_http, ProxyscrapeHTTP)
64
+ end
65
+ end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+
5
+ module ProxyFetcher
6
+ module Providers
7
+ # ProxyscrapeSOCKS4 provider class.
8
+ class ProxyscrapeSOCKS4 < Base
9
+ # Provider URL to fetch proxy list
10
+ def provider_url
11
+ "https://api.proxyscrape.com/v2/?request=getproxies&protocol=socks4"
12
+ end
13
+
14
+ # Loads provider HTML and parses it with internal document object.
15
+ #
16
+ # @param url [String]
17
+ # URL to fetch
18
+ #
19
+ # @param filters [Hash]
20
+ # filters for proxy provider
21
+ #
22
+ # @return [Array]
23
+ # Collection of extracted proxies with ports
24
+ #
25
+ def load_document(url, filters = {})
26
+ html = load_html(url, filters)
27
+
28
+ CSV.parse(html, col_sep: "\t").map(&:first)
29
+ end
30
+
31
+ # Fetches HTML content by sending HTTP request to the provider URL and
32
+ # parses the txt document to return all the proxy entries (ip addresses
33
+ # and ports).
34
+ #
35
+ # @return [Array]
36
+ # Collection of extracted proxies with ports
37
+ #
38
+ def load_proxy_list(filters = {})
39
+ load_document(provider_url, filters)
40
+ end
41
+
42
+ # Converts String to <code>ProxyFetcher::Proxy</code> object.
43
+ #
44
+ # @param html_node [String]
45
+ # String
46
+ #
47
+ # @return [ProxyFetcher::Proxy]
48
+ # Proxy object
49
+ #
50
+ def to_proxy(html_node)
51
+ addr, port = html_node.split(":")
52
+
53
+ ProxyFetcher::Proxy.new.tap do |proxy|
54
+ proxy.addr = addr
55
+ proxy.port = Integer(port)
56
+ proxy.country = "Unknown"
57
+ proxy.anonymity = "Unknown"
58
+ proxy.type = ProxyFetcher::Proxy::SOCKS4
59
+ end
60
+ end
61
+ end
62
+
63
+ ProxyFetcher::Configuration.register_provider(:proxyscrape_socks4, ProxyscrapeSOCKS4)
64
+ end
65
+ end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+
5
+ module ProxyFetcher
6
+ module Providers
7
+ # ProxyscrapeSOCKS5 provider class.
8
+ class ProxyscrapeSOCKS5 < Base
9
+ # Provider URL to fetch proxy list
10
+ def provider_url
11
+ "https://api.proxyscrape.com/v2/?request=getproxies&protocol=socks5"
12
+ end
13
+
14
+ # Loads provider HTML and parses it with internal document object.
15
+ #
16
+ # @param url [String]
17
+ # URL to fetch
18
+ #
19
+ # @param filters [Hash]
20
+ # filters for proxy provider
21
+ #
22
+ # @return [Array]
23
+ # Collection of extracted proxies with ports
24
+ #
25
+ def load_document(url, filters = {})
26
+ html = load_html(url, filters)
27
+
28
+ CSV.parse(html, col_sep: "\t").map(&:first)
29
+ end
30
+
31
+ # Fetches HTML content by sending HTTP request to the provider URL and
32
+ # parses the txt document to return all the proxy entries (ip addresses
33
+ # and ports).
34
+ #
35
+ # @return [Array]
36
+ # Collection of extracted proxies with ports
37
+ #
38
+ def load_proxy_list(filters = {})
39
+ load_document(provider_url, filters)
40
+ end
41
+
42
+ # Converts String to <code>ProxyFetcher::Proxy</code> object.
43
+ #
44
+ # @param html_node [String]
45
+ # String
46
+ #
47
+ # @return [ProxyFetcher::Proxy]
48
+ # Proxy object
49
+ #
50
+ def to_proxy(html_node)
51
+ addr, port = html_node.split(":")
52
+
53
+ ProxyFetcher::Proxy.new.tap do |proxy|
54
+ proxy.addr = addr
55
+ proxy.port = Integer(port)
56
+ proxy.country = "Unknown"
57
+ proxy.anonymity = "Unknown"
58
+ proxy.type = ProxyFetcher::Proxy::SOCKS5
59
+ end
60
+ end
61
+ end
62
+
63
+ ProxyFetcher::Configuration.register_provider(:proxyscrape_socks5, ProxyscrapeSOCKS5)
64
+ end
65
+ end
@@ -6,19 +6,11 @@ module ProxyFetcher
6
6
  class XRoxy < Base
7
7
  # Provider URL to fetch proxy list
8
8
  def provider_url
9
- 'https://www.xroxy.com/free-proxy-lists/'
9
+ "https://www.xroxy.com/proxylist.htm"
10
10
  end
11
11
 
12
- # Fetches HTML content by sending HTTP request to the provider URL and
13
- # parses the document (built as abstract <code>ProxyFetcher::Document</code>)
14
- # to return all the proxy entries (HTML nodes).
15
- #
16
- # @return [Array<ProxyFetcher::Document::Node>]
17
- # Collection of extracted HTML nodes with full proxy info
18
- #
19
- def load_proxy_list(filters = { type: 'All_http' })
20
- doc = load_document(provider_url, filters)
21
- doc.xpath('//div/table/tbody/tr')
12
+ def xpath
13
+ "//tr[@class='row1' or @class='row0']"
22
14
  end
23
15
 
24
16
  # Converts HTML node (entry of N tags) to <code>ProxyFetcher::Proxy</code>
@@ -32,12 +24,12 @@ module ProxyFetcher
32
24
  #
33
25
  def to_proxy(html_node)
34
26
  ProxyFetcher::Proxy.new.tap do |proxy|
35
- proxy.addr = html_node.content_at('td[1]')
36
- proxy.port = Integer(html_node.content_at('td[2]').gsub(/^0+/, ''))
37
- proxy.anonymity = html_node.content_at('td[3]')
38
- proxy.country = html_node.content_at('td[5]')
39
- proxy.response_time = Integer(html_node.content_at('td[6]'))
40
- proxy.type = html_node.content_at('td[3]')
27
+ proxy.addr = html_node.content_at("td[1]")
28
+ proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, ""))
29
+ proxy.anonymity = html_node.content_at("td[3]")
30
+ proxy.country = html_node.content_at("td[5]")
31
+ proxy.response_time = Integer(html_node.content_at("td[6]"))
32
+ proxy.type = html_node.content_at("td[3]")
41
33
  end
42
34
  end
43
35
  end
@@ -29,10 +29,10 @@ module ProxyFetcher
29
29
 
30
30
  # Proxy types
31
31
  TYPES = [
32
- HTTP = 'HTTP'.freeze,
33
- HTTPS = 'HTTPS'.freeze,
34
- SOCKS4 = 'SOCKS4'.freeze,
35
- SOCKS5 = 'SOCKS5'.freeze
32
+ HTTP = "HTTP",
33
+ HTTPS = "HTTPS",
34
+ SOCKS4 = "SOCKS4",
35
+ SOCKS5 = "SOCKS5"
36
36
  ].freeze
37
37
 
38
38
  # Proxy type predicates (#socks4?, #https?)
@@ -104,5 +104,17 @@ module ProxyFetcher
104
104
  URI::Generic.build(host: addr, port: port).to_s
105
105
  end
106
106
  end
107
+
108
+ def ==(other)
109
+ other.is_a?(Proxy) && addr == other.addr && port == other.port
110
+ end
111
+
112
+ def eql?(other)
113
+ hash.eql?(other.hash)
114
+ end
115
+
116
+ def hash
117
+ [addr.hash, port.hash].hash
118
+ end
107
119
  end
108
120
  end