http_proxy_pool 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. data/README.md +81 -0
  2. data/Rakefile +9 -0
  3. data/bin/proxypool +60 -0
  4. data/lib/http_proxy_pool.rb +25 -0
  5. data/lib/http_proxy_pool/basetask.rb +81 -0
  6. data/lib/http_proxy_pool/error.rb +8 -0
  7. data/lib/http_proxy_pool/example/izmoney_china_hight.site +22 -0
  8. data/lib/http_proxy_pool/example/izmoney_china_normal.site +22 -0
  9. data/lib/http_proxy_pool/example/izmoney_foreign_high.site +22 -0
  10. data/lib/http_proxy_pool/example/izmoney_foreign_normal.site +22 -0
  11. data/lib/http_proxy_pool/example/kuaidaili_inha.site +36 -0
  12. data/lib/http_proxy_pool/example/kuaidaili_intr.site +36 -0
  13. data/lib/http_proxy_pool/example/kuaidaili_outha.site +36 -0
  14. data/lib/http_proxy_pool/example/kuaidaili_outtr.site +36 -0
  15. data/lib/http_proxy_pool/example/proxy360.site +21 -0
  16. data/lib/http_proxy_pool/example/proxy_goubanjia_gngn.site +23 -0
  17. data/lib/http_proxy_pool/example/proxy_goubanjia_gnpt.site +23 -0
  18. data/lib/http_proxy_pool/example/proxy_goubanjia_gwgn.site +23 -0
  19. data/lib/http_proxy_pool/example/proxy_goubanjia_gwpt.site +23 -0
  20. data/lib/http_proxy_pool/example/xicidaili_nn.site +37 -0
  21. data/lib/http_proxy_pool/example/xicidaili_nt.site +37 -0
  22. data/lib/http_proxy_pool/example/xicidaili_qq.site +37 -0
  23. data/lib/http_proxy_pool/example/xicidaili_wn.site +37 -0
  24. data/lib/http_proxy_pool/example/xicidaili_wt.site +37 -0
  25. data/lib/http_proxy_pool/proxy.rb +43 -0
  26. data/lib/http_proxy_pool/proxy_pool.rb +202 -0
  27. data/lib/http_proxy_pool/utils.rb +30 -0
  28. data/lib/http_proxy_pool/version.rb +5 -0
  29. metadata +86 -0
@@ -0,0 +1,81 @@
+ # http-proxy-pool
+
+ When scraping pages, batch voting, liking posts and similar day-to-day jobs, you often need to rotate IPs, which takes a lot of proxies. http-proxy-pool collects free proxies from the web for other scripts to use, and the collection is driven by customizable crawling scripts.
+
+ ## Installation
+
+ `gem install http_proxy_pool`
+
+
+ ## Usage
+
+ ##### 1. Command line
+
+ * Initialize the proxy data
+ `proxypool crawl`
+
+ * Show the current collection status
+ `proxypool status`
+
+ * Get one random usable proxy; by default it is force-checked for availability
+ `proxypool get`
+
+ For more options, see `proxypool help`
+
+ ##### 2. In a script
+
+     require 'http_proxy_pool'
+
+     pool = HttpProxyPool::ProxyPool.new
+     pool.query(:ip => "=~ /^111/", :proxy_type => "== 'HTTP'") do |proxy|
+       # do what you want ...
+     end
+
+ Proxies returned by `query` are not force-checked for availability. Use `checker` to verify one:
+
+     pool.checker(proxy)
+
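Editor's note: putting the two calls together, a minimal sketch (not part of the released README; the `:proxy_type` filter is illustrative) that queries without checking and then keeps only the proxies that pass `checker`:

    require 'http_proxy_pool'

    pool = HttpProxyPool::ProxyPool.new

    # query returns unchecked proxies; checker returns the proxy, or false on failure
    live = pool.query(:proxy_type => "== 'HTTP'").select { |proxy| pool.checker(proxy) }

    live.each { |proxy| puts proxy } # Proxy#to_s prints "ip\tport"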
+ ## Defining crawling scripts
+ The default scripts are installed into **[USER\_PATH]/http\_proxy\_pool/script**. You can edit the existing scripts or drop new ones into that directory. Crawling scripts for the following sites ship with the gem (sites found via search engines):
+
+ * [ip.izmoney.com](http://ip.izmoney.com)
+ * [kuaidaili.com](http://www.kuaidaili.com)
+ * [proxy360.cn](http://www.proxy360.cn)
+ * [goubanjia.com](http://proxy.goubanjia.com)
+
+ ##### An example:
+
+     # the URL crawling starts from
+     sitetask("start_page_url") do
+       nextpage do
+         # nextpage must return the URL of the next page
+         # this block has to decide whether the current page is the last one
+         # if nextpage is not defined, only the first page is crawled
+       end
+
+       parser do
+         # this block must return an array of proxy field hashes
+         # parse the current Mechanize page and build the fields from the DOM
+       end
+     end
+
+ ##### Creating a Proxy:
+
+     HttpProxyPool::Proxy.new(
+       :ip => '127.0.0.1',                # IP address
+       :port => 8080,                     # port
+       :username => 'jiyaping',           # auth user name
+       :password => 'xxxxxx',             # auth password
+       :proxy_level => 'high',            # proxy level (anonymous/transparent)
+       :proxy_type => 'http',             # proxy type (HTTP, HTTPS, SOCKS)
+       :speed => '0.5',                   # proxy speed
+       :added_time => DateTime.now,       # time added
+       :last_access_time => DateTime.now, # time last used
+       :nation => 'cn',                   # country
+       :province => 'guangdong',          # province/state
+       :src_from => 'xxxxxx.com'          # source site
+     )
+
+ ## Finally
+
+ That's all ...
@@ -0,0 +1,9 @@
+ gem "minitest"
+ require 'rake/testtask'
+
+ Rake::TestTask.new do |t|
+   t.libs << 'test'
+ end
+
+ desc 'Run tests'
+ task :default => :test
@@ -0,0 +1,60 @@
+ #! ruby
+
+ lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
+ $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
+
+ require 'thor'
+ require 'http_proxy_pool'
+
+ class HttpProxyPoolApp < Thor
+   @@proxy_pool = HttpProxyPool::ProxyPool.new(
+     :data_path => File.join(HttpProxyPool.home, 'ips.yaml'),
+     :script    => Dir["#{HttpProxyPool.home}/script/*.site"],
+     :logger    => HttpProxyPool.logger
+   )
+
+   desc 'status', 'show proxy pool status.'
+   def status
+     @@proxy_pool.status
+   end
+
+   desc 'crawl [WAY]', 'gather proxies and store them in a local file via WAY.'
+   method_option :lastest, :aliases => '-l',
+                           :type    => :boolean,
+                           :default => true,
+                           :desc    => 'only crawl recently added IPs.'
+   method_option :check,   :aliases => '-c',
+                           :type    => :boolean,
+                           :default => false,
+                           :desc    => 'only store a proxy after checking that it is available.'
+   def crawl(way = 'script')
+     puts "wait...."
+
+     if way == 'script'
+       lastest = options[:lastest]
+       check   = options[:check]
+
+       @@proxy_pool.crawling(lastest, check)
+     end
+
+     puts "done."
+   end
+
+   desc 'get', 'get an IP from local storage.'
+   method_option :force_check, :aliases => '-fc',
+                               :type    => :boolean,
+                               :default => true,
+                               :desc    => 'check that the IP is ready to use.'
+   method_option :thread_num,  :aliases => '-t',
+                               :type    => :numeric,
+                               :default => 10,
+                               :desc    => 'number of search threads.'
+   def get
+     force_check = options[:force_check]
+     thread_num  = options[:thread_num]
+
+     puts @@proxy_pool.get_random_proxy(force_check, thread_num)
+   end
+ end
+
+ HttpProxyPoolApp.start
@@ -0,0 +1,25 @@
+ #encoding : utf-8
+
+ require 'date'
+ require 'logger'
+ require 'yaml'
+ require 'fileutils'
+ require 'mechanize'
+
+ require 'http_proxy_pool/error'
+ require 'http_proxy_pool/utils'
+ require 'http_proxy_pool/basetask'
+ require 'http_proxy_pool/proxy'
+ require 'http_proxy_pool/proxy_pool'
+ require 'http_proxy_pool/version'
+
+ module HttpProxyPool
+   # will support more configuration later
+   @config = {}
+
+   @home = File.join(Dir.home, 'http_proxy_pool')
+   Dir.mkdir(@home) unless Dir.exists? @home
+
+   @script_path = File.join(@home, 'script')
+   Dir.mkdir(@script_path) unless Dir.exists? @script_path
+
+   @logger = Logger.new(File.join(@home, 'proxy.log'), 2_000_000)
+
+   init_default_script
+ end
@@ -0,0 +1,81 @@
+ #encoding : utf-8
+
+ module HttpProxyPool
+   class Basetask
+     attr_accessor :agent,
+                   :url,
+                   :logger,
+                   :page_parser,
+                   :next_page
+
+     def initialize(opts = {})
+       @agent  = opts[:agent]
+       @logger = opts[:logger]
+       @url    = opts[:url]
+     end
+
+     def sitetask(url, opts = {})
+       raise ScriptError.new("script does not specify a url!") unless url
+
+       @url    = url
+       @agent  = opts[:agent] || Mechanize.new
+       @logger ||= opts[:logger]
+
+       # for debugging through a local proxy:
+       #@agent.set_proxy '127.0.0.1', 8888
+
+       yield
+     end
+
+     def ips(lastest = true)
+       uri = @url
+
+       loop do
+         @logger.info("start crawling page [#{uri}] ...")
+         @agent.get(uri)
+         # when fetching all pages, sleep a random interval between requests
+         rand_sleep unless lastest
+
+         begin
+           instance_eval(&page_parser).each do |field|
+             yield field
+           end
+         rescue Exception => e
+           @logger.error("parsing page error [#{uri}]. #{e.to_s}")
+           break
+         end
+
+         begin
+           break unless @next_page
+           uri = instance_eval(&next_page)
+           break unless uri
+         rescue => e
+           @logger.error("error occurred when getting next page [#{uri}]. #{e.to_s}")
+           break
+         end
+
+         break if lastest
+       end
+     end
+
+     def parser(&block)
+       @page_parser = block if block_given?
+     end
+
+     def nextpage(&block)
+       @next_page = block if block_given?
+     end
+
+     def curr_page
+       @agent.page.uri
+     end
+
+     def sitename
+       URI.parse(URI.encode(@url)).host
+     end
+
+     def rand_sleep(max_tick = 2)
+       sleep rand(max_tick)
+     end
+   end
+ end
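Editor's note: to make the DSL above concrete, `ProxyPool#crawling` (later in this diff) `instance_eval`s each `.site` file against a fresh `Basetask` and drains it through `ips`. A standalone sketch of that wiring (editorial illustration; `example.site` is a placeholder path):

    require 'http_proxy_pool'

    task = HttpProxyPool::Basetask.new(:agent  => Mechanize.new,
                                       :logger => HttpProxyPool.logger)

    # a .site file is plain Ruby: sitetask(url) { nextpage { ... }; parser { ... } }
    task.instance_eval(File.read('example.site'))

    # lastest = true crawls only the first page; each yield is a field hash
    task.ips(true) do |fields|
      puts HttpProxyPool::Proxy.new(fields)
    end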
@@ -0,0 +1,8 @@
+ #encoding : utf-8
+
+ module HttpProxyPool
+   class BaseError   < StandardError; end
+   class ScriptError < BaseError; end
+   class TaskError   < BaseError; end
+   class QueryError  < BaseError; end
+ end
@@ -0,0 +1,22 @@
+ sitetask("http://ip.izmoney.com/search/china/high/index.html") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:nation]      = tds[2].text
+       fields[:proxy_level] = tds[4].text
+       fields[:proxy_type]  = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,22 @@
+ sitetask("http://ip.izmoney.com/search/china/normal/index.html") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:nation]      = tds[2].text
+       fields[:proxy_level] = tds[4].text
+       fields[:proxy_type]  = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,22 @@
+ sitetask("http://ip.izmoney.com/search/foreign/high/index.html") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:nation]      = tds[2].text
+       fields[:proxy_level] = tds[4].text
+       fields[:proxy_type]  = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,22 @@
+ sitetask("http://ip.izmoney.com/search/foreign/normal/index.html") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:nation]      = tds[2].text
+       fields[:proxy_level] = tds[4].text
+       fields[:proxy_type]  = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,36 @@
+ sitetask("http://www.kuaidaili.com/free/inha/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.active')
+       curr_idx = agent.page.at('.active').text.to_i
+     end
+
+     last_page = agent.page.at("#listnav").search("a[href*='/free']").last.text.to_i
+     next if curr_idx == last_page
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:speed]       = tds[5].text
+       fields[:added_time]  = tds[6].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,36 @@
+ sitetask("http://www.kuaidaili.com/free/intr/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.active')
+       curr_idx = agent.page.at('.active').text.to_i
+     end
+
+     last_page = agent.page.at("#listnav").search("a[href*='/free']").last.text.to_i
+     next if curr_idx == last_page
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:speed]       = tds[5].text
+       fields[:added_time]  = tds[6].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,36 @@
+ sitetask("http://www.kuaidaili.com/free/outha/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.active')
+       curr_idx = agent.page.at('.active').text.to_i
+     end
+
+     last_page = agent.page.at("#listnav").search("a[href*='/free']").last.text.to_i
+     next if curr_idx == last_page
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:speed]       = tds[5].text
+       fields[:added_time]  = tds[6].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,36 @@
+ sitetask("http://www.kuaidaili.com/free/outtr/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.active')
+       curr_idx = agent.page.at('.active').text.to_i
+     end
+
+     last_page = agent.page.at("#listnav").search("a[href*='/free']").last.text.to_i
+     next if curr_idx == last_page
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:speed]       = tds[5].text
+       fields[:added_time]  = tds[6].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,21 @@
+ sitetask("http://www.proxy360.cn/default.aspx") do
+   parser do
+     ips = []
+
+     agent.page.search(".proxylistitem").each do |node|
+       tds    = node.search('.tbBottomLine')
+       fields = {}
+
+       fields[:ip]          = tds[0].text.strip
+       fields[:port]        = tds[1].text.strip
+       fields[:proxy_level] = tds[2].text.strip
+       fields[:nation]      = tds[3].text.strip
+       fields[:added_time]  = tds[4].text.strip
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,23 @@
+ sitetask("http://proxy.goubanjia.com/free/gngn/index.shtml") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].search(":not(p[style='display: none;'])").text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:nation]      = tds[4].text
+       fields[:province]    = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,23 @@
+ sitetask("http://proxy.goubanjia.com/free/gnpt/index.shtml") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].search(":not(p[style='display: none;'])").text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:nation]      = tds[4].text
+       fields[:province]    = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,23 @@
+ sitetask("http://proxy.goubanjia.com/free/gwgn/index.shtml") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].search(":not(p[style='display: none;'])").text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:nation]      = tds[4].text
+       fields[:province]    = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,23 @@
+ sitetask("http://proxy.goubanjia.com/free/gwpt/index.shtml") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].search(":not(p[style='display: none;'])").text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:nation]      = tds[4].text
+       fields[:province]    = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,37 @@
+ sitetask("http://www.xicidaili.com/nn/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.current')
+       curr_idx = agent.page.at('.current').text.to_i
+     end
+
+     last_page = agent.page.at(".pagination").search("a[href*='/nn/']").last.text.to_i
+     next if curr_idx == last_page
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("#ip_list").search("tr")[1..-1].each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:nation]      = tds[1].at('img')['alt'] if tds[1].at('img')
+       fields[:ip]          = tds[2].text
+       fields[:port]        = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:proxy_level] = tds[5].text
+       fields[:proxy_type]  = tds[6].text
+       fields[:speed]       = tds[7].at('div')["title"] if tds[7].at('div')
+       fields[:added_time]  = tds[9].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,37 @@
+ sitetask("http://www.xicidaili.com/nt/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.current')
+       curr_idx = agent.page.at('.current').text.to_i
+     end
+
+     last_page = agent.page.at(".pagination").search("a[href*='/nt/']").last.text.to_i
+     next if curr_idx == last_page
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("#ip_list").search("tr")[1..-1].each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:nation]      = tds[1].at('img')['alt'] if tds[1].at('img')
+       fields[:ip]          = tds[2].text
+       fields[:port]        = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:proxy_level] = tds[5].text
+       fields[:proxy_type]  = tds[6].text
+       fields[:speed]       = tds[7].at('div')["title"] if tds[7].at('div')
+       fields[:added_time]  = tds[9].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,37 @@
+ sitetask("http://www.xicidaili.com/qq/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.current')
+       curr_idx = agent.page.at('.current').text.to_i
+     end
+
+     last_page = agent.page.at(".pagination").search("a[href*='/qq/']").last.text.to_i
+     next if curr_idx == last_page
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("#ip_list").search("tr")[1..-1].each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:nation]      = tds[1].at('img')['alt'] if tds[1].at('img')
+       fields[:ip]          = tds[2].text
+       fields[:port]        = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:proxy_level] = tds[5].text
+       fields[:proxy_type]  = tds[6].text
+       fields[:speed]       = tds[7].at('div')["title"] if tds[7].at('div')
+       fields[:added_time]  = tds[9].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,37 @@
+ sitetask("http://www.xicidaili.com/wn/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.current')
+       curr_idx = agent.page.at('.current').text.to_i
+     end
+
+     last_page = agent.page.at(".pagination").search("a[href*='/wn/']").last.text.to_i
+     next if curr_idx == last_page
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("#ip_list").search("tr")[1..-1].each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:nation]      = tds[1].at('img')['alt'] if tds[1].at('img')
+       fields[:ip]          = tds[2].text
+       fields[:port]        = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:proxy_level] = tds[5].text
+       fields[:proxy_type]  = tds[6].text
+       fields[:speed]       = tds[7].at('div')["title"] if tds[7].at('div')
+       fields[:added_time]  = tds[9].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,37 @@
+ sitetask("http://www.xicidaili.com/wt/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.current')
+       curr_idx = agent.page.at('.current').text.to_i
+     end
+
+     last_page = agent.page.at(".pagination").search("a[href*='/wt/']").last.text.to_i
+     next if curr_idx == last_page
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("#ip_list").search("tr")[1..-1].each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:nation]      = tds[1].at('img')['alt'] if tds[1].at('img')
+       fields[:ip]          = tds[2].text
+       fields[:port]        = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:proxy_level] = tds[5].text
+       fields[:proxy_type]  = tds[6].text
+       fields[:speed]       = tds[7].at('div')["title"] if tds[7].at('div')
+       fields[:added_time]  = tds[9].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,43 @@
+ # encoding : utf-8
+
+ module HttpProxyPool
+   class Proxy
+     attr_accessor :ip,
+                   :port,
+                   :username,
+                   :password,
+                   :proxy_level,
+                   :proxy_type,
+                   :speed,
+                   :added_time,
+                   :last_access_time,
+                   :nation,
+                   :province,
+                   :src_from,
+                   :try_times
+
+     def initialize(args = {})
+       @ip          = args[:ip]
+       @port        = args[:port]
+       @username    = args[:username] || ''
+       @password    = args[:password] || ''
+       @proxy_type  = args[:proxy_type]
+       @proxy_level = args[:proxy_level]
+       @speed       = args[:speed]
+       @added_time  = args[:added_time]
+       @last_access_time = args[:last_access_time]
+       @nation      = args[:nation]
+       @province    = args[:province]
+       @src_from    = args[:src_from]
+       @try_times   = args[:try_times] || 0
+     end
+
+     def to_arr
+       [@ip, @port, @proxy_type, @proxy_level, @nation, @province]
+     end
+
+     def to_s
+       "#{@ip}\t#{@port}"
+     end
+   end
+ end
@@ -0,0 +1,202 @@
+ #encoding : utf-8
+
+ module HttpProxyPool
+   class ProxyPool
+     attr_accessor :proxys, :logger
+
+     def initialize(args = {})
+       @data_path = args[:data_path] || File.join(HttpProxyPool.home, 'ips.yaml')
+       @script    = args[:script]    || Dir["#{HttpProxyPool.home}/script/*.site"]
+       @logger    = args[:logger]    || HttpProxyPool.logger
+       @proxys    = []
+
+       @agent = Mechanize.new
+       @agent.user_agent_alias = get_agent_alias
+
+       load_proxy if File.exists? @data_path
+     end
+
+     def status
+       puts "proxy count : #{@proxys.size}"
+     end
+
+     # query interface
+     def query(args = {})
+       begin
+         selected_proxy = @proxys.select do |proxy|
+           instance_eval(build_query_parameter('proxy', args))
+         end
+       rescue => e
+         raise QueryError.new("query parameter error!")
+       end
+
+       return selected_proxy unless block_given?
+
+       selected_proxy.each do |proxy|
+         yield proxy
+       end
+     end
+
+     # joins the conditions into a single boolean expression,
+     # e.g. "proxy.ip =~ /^111/ && proxy.proxy_type == 'HTTP'"
+     def build_query_parameter(prefix = 'proxy', args)
+       condition_str = ''
+
+       args = query_key_filter(args)
+
+       args.each do |key, express|
+         condition_str << "#{prefix}.#{key} #{express} && "
+       end
+
+       condition_str.sub!(/\s?&&\s?$/, '')
+
+       condition_str
+     end
+
+     # drop keys that are not Proxy attributes
+     def query_key_filter(args)
+       proxy = Proxy.new
+       args.select{ |k| proxy.respond_to? k }
+     end
+
+     def get_random_proxy(check = true, thread_num = 10)
+       mutex       = Mutex.new
+       result      = nil
+       thread_list = []
+
+       begin
+         thread_num.times do
+           thread_list << Thread.new do
+             until result
+               proxy = @proxys[rand(@proxys.size)]
+               @logger.info("using #{proxy}.")
+               proxy = checker(proxy) if check
+
+               if proxy.is_a? Proxy
+                 mutex.synchronize do
+                   result = proxy
+                 end
+               end
+             end
+           end
+         end
+
+         thread_list.each { |t| t.join }
+       rescue => e
+         @logger.error("find proxy error. #{e.to_s}")
+       ensure
+         save_proxy
+       end
+
+       result
+     end
+
+     def crawling(lastest = true, check = false)
+       @script.each do |file|
+         begin
+           task = Basetask.new(:agent => @agent, :logger => @logger)
+           task.instance_eval(read_taskfile(file))
+
+           task.ips(lastest) do |fields|
+             proxy = Proxy.new(fields)
+             next if check && !checker(proxy)
+             @proxys << proxy unless include?(proxy)
+           end
+         rescue => e
+           @logger.error(e)
+         ensure
+           save_proxy
+         end
+       end
+     end
+
+     def include?(proxy)
+       @proxys.select{ |p| p.ip == proxy.ip }.size > 0
+     end
+
+     def save_proxy
+       file = File.open(@data_path, 'w')
+       YAML.dump(@proxys, file)
+       file.close
+     end
+
+     def load_proxy
+       @proxys = YAML.load_file(@data_path)
+     end
+
+     def read_taskfile(file)
+       cnt = ''
+       File.open(file) do |f|
+         while(line = f.gets)
+           cnt << line
+         end
+       end
+
+       cnt
+     end
+
+     def get_agent_alias
+       agent_arr = [
+         'Linux Firefox',
+         'Linux Mozilla',
+         'Mac Firefox',
+         'Mac Mozilla',
+         'Mac Safari',
+         'Windows Chrome',
+         'Windows IE 7',
+         'Windows IE 8',
+         'Windows IE 9',
+         'Windows Mozilla',
+         'iPhone',
+         'iPad',
+         'Android']
+
+       agent_arr[rand(agent_arr.size)]
+     end
+
+     def checker(proxy)
+       if proxy.is_a? Array
+         checker_batch(proxy)
+       else
+         checker_single(proxy)
+       end
+     end
+
+     def checker_batch(proxys, task_count = 5)
+       result  = []
+       mutex   = Mutex.new
+       threads = []
+       thread_count = (proxys.size / task_count.to_f).ceil
+
+       thread_count.times do |thread_idx|
+         threads << Thread.new do
+           start_idx = thread_idx * task_count
+           end_idx   = (thread_idx + 1) * task_count
+           end_idx   = proxys.size if end_idx > proxys.size
+
+           proxys[start_idx...end_idx].each do |proxy|
+             p = checker_single(proxy)
+
+             mutex.synchronize do
+               result << p if p
+             end
+           end
+         end
+       end
+
+       threads.each { |t| t.join }
+
+       result
+     end
+
+     def checker_single(proxy, timeout = 0.05)
+       http = Net::HTTP.new('baidu.com', 80, proxy.ip, proxy.port)
+       http.open_timeout = timeout
+       http.read_timeout = timeout * 10
+
+       begin
+         return proxy if http.get('/').code =~ /^[1234]/
+       rescue => e
+         @logger.info("can not connect proxy.[#{proxy}].#{e.to_s}")
+         @proxys.delete(proxy)
+         @logger.info("deleted disabled proxy [#{proxy}].")
+       end
+
+       false
+     end
+   end
+ end
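Editor's note: the whole pipeline can also be driven from Ruby instead of the `proxypool` CLI. A minimal sketch (editorial illustration, assuming only the API defined above; the argument values are examples):

    require 'http_proxy_pool'

    pool = HttpProxyPool::ProxyPool.new

    # crawl with the bundled .site scripts: first page only, no per-proxy check
    pool.crawling(true, false)

    # pick one random proxy, force-checked, probing with 5 threads
    proxy = pool.get_random_proxy(true, 5)
    puts proxy  # Proxy#to_s => "ip\tport"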
@@ -0,0 +1,30 @@
+ #encoding : utf-8
+
+ module HttpProxyPool
+   module_function
+
+   def init_default_script
+     src_dir  = File.join(File.dirname(__FILE__), 'example')
+     existing = Dir.entries(@script_path)
+
+     Dir.entries(src_dir).each do |src|
+       next unless src.end_with? '.site'
+
+       FileUtils.cp(File.join(src_dir, src), @script_path) unless existing.include? src
+     end
+   end
+
+   def home
+     @home
+   end
+
+   def script_path
+     @script_path
+   end
+
+   def logger
+     @logger
+   end
+ end
@@ -0,0 +1,5 @@
+ #encoding : utf-8
+
+ module HttpProxyPool
+   VERSION = '0.0.1'
+ end
metadata ADDED
@@ -0,0 +1,86 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: http_proxy_pool
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - jiyaping
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2015-09-06 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: mechanize
16
+ requirement: &10417392 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '2.7'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *10417392
25
+ description: Gather free http proxy data
26
+ email: jiyaping0802@gmail.com
27
+ executables:
28
+ - proxypool
29
+ extensions: []
30
+ extra_rdoc_files: []
31
+ files:
32
+ - lib/http_proxy_pool/basetask.rb
33
+ - lib/http_proxy_pool/error.rb
34
+ - lib/http_proxy_pool/example/izmoney_china_hight.site
35
+ - lib/http_proxy_pool/example/izmoney_china_normal.site
36
+ - lib/http_proxy_pool/example/izmoney_foreign_high.site
37
+ - lib/http_proxy_pool/example/izmoney_foreign_normal.site
38
+ - lib/http_proxy_pool/example/kuaidaili_inha.site
39
+ - lib/http_proxy_pool/example/kuaidaili_intr.site
40
+ - lib/http_proxy_pool/example/kuaidaili_outha.site
41
+ - lib/http_proxy_pool/example/kuaidaili_outtr.site
42
+ - lib/http_proxy_pool/example/proxy360.site
43
+ - lib/http_proxy_pool/example/proxy_goubanjia_gngn.site
44
+ - lib/http_proxy_pool/example/proxy_goubanjia_gnpt.site
45
+ - lib/http_proxy_pool/example/proxy_goubanjia_gwgn.site
46
+ - lib/http_proxy_pool/example/proxy_goubanjia_gwpt.site
47
+ - lib/http_proxy_pool/example/xicidaili_nn.site
48
+ - lib/http_proxy_pool/example/xicidaili_nt.site
49
+ - lib/http_proxy_pool/example/xicidaili_qq.site
50
+ - lib/http_proxy_pool/example/xicidaili_wn.site
51
+ - lib/http_proxy_pool/example/xicidaili_wt.site
52
+ - lib/http_proxy_pool/proxy.rb
53
+ - lib/http_proxy_pool/proxy_pool.rb
54
+ - lib/http_proxy_pool/utils.rb
55
+ - lib/http_proxy_pool/version.rb
56
+ - lib/http_proxy_pool.rb
57
+ - Rakefile
58
+ - README.md
59
+ - !binary |-
60
+ YmluL3Byb3h5cG9vbA==
61
+ homepage: https://github.com/jiyaping/http-proxy-pool
62
+ licenses:
63
+ - MIT
64
+ post_install_message:
65
+ rdoc_options: []
66
+ require_paths:
67
+ - lib
68
+ required_ruby_version: !ruby/object:Gem::Requirement
69
+ none: false
70
+ requirements:
71
+ - - ! '>='
72
+ - !ruby/object:Gem::Version
73
+ version: '0'
74
+ required_rubygems_version: !ruby/object:Gem::Requirement
75
+ none: false
76
+ requirements:
77
+ - - ! '>='
78
+ - !ruby/object:Gem::Version
79
+ version: '0'
80
+ requirements: []
81
+ rubyforge_project:
82
+ rubygems_version: 1.8.16
83
+ signing_key:
84
+ specification_version: 3
85
+ summary: http proxy crawling from web
86
+ test_files: []