http_proxy_pool 0.0.2

Files changed (29)
  1. data/README.md +81 -0
  2. data/Rakefile +9 -0
  3. data/bin/proxypool +60 -0
  4. data/lib/http_proxy_pool.rb +25 -0
  5. data/lib/http_proxy_pool/basetask.rb +81 -0
  6. data/lib/http_proxy_pool/error.rb +8 -0
  7. data/lib/http_proxy_pool/example/izmoney_china_hight.site +22 -0
  8. data/lib/http_proxy_pool/example/izmoney_china_normal.site +22 -0
  9. data/lib/http_proxy_pool/example/izmoney_foreign_high.site +22 -0
  10. data/lib/http_proxy_pool/example/izmoney_foreign_normal.site +22 -0
  11. data/lib/http_proxy_pool/example/kuaidaili_inha.site +36 -0
  12. data/lib/http_proxy_pool/example/kuaidaili_intr.site +36 -0
  13. data/lib/http_proxy_pool/example/kuaidaili_outha.site +36 -0
  14. data/lib/http_proxy_pool/example/kuaidaili_outtr.site +36 -0
  15. data/lib/http_proxy_pool/example/proxy360.site +21 -0
  16. data/lib/http_proxy_pool/example/proxy_goubanjia_gngn.site +23 -0
  17. data/lib/http_proxy_pool/example/proxy_goubanjia_gnpt.site +23 -0
  18. data/lib/http_proxy_pool/example/proxy_goubanjia_gwgn.site +23 -0
  19. data/lib/http_proxy_pool/example/proxy_goubanjia_gwpt.site +23 -0
  20. data/lib/http_proxy_pool/example/xicidaili_nn.site +37 -0
  21. data/lib/http_proxy_pool/example/xicidaili_nt.site +37 -0
  22. data/lib/http_proxy_pool/example/xicidaili_qq.site +37 -0
  23. data/lib/http_proxy_pool/example/xicidaili_wn.site +37 -0
  24. data/lib/http_proxy_pool/example/xicidaili_wt.site +37 -0
  25. data/lib/http_proxy_pool/proxy.rb +43 -0
  26. data/lib/http_proxy_pool/proxy_pool.rb +202 -0
  27. data/lib/http_proxy_pool/utils.rb +30 -0
  28. data/lib/http_proxy_pool/version.rb +5 -0
  29. metadata +86 -0
@@ -0,0 +1,81 @@
+ # http-proxy-pool
+
+ Everyday tasks such as scraping web pages, batch voting, and upvoting often require rotating IPs, which calls for a large pool of proxies. http-proxy-pool collects free proxies from the web for other scripts to use, and it gathers proxy information through user-definable crawling scripts.
+
+ ## Installation
+
+ `gem install http-proxy-pool`
+
+
+ ## Usage
+
+ ##### 1. Command line
+
+ * Initialize the proxy data
+ `proxypool crawl`
+
+ * Show the current collection status
+ `proxypool status`
+
+ * Get a random usable proxy; by default it is force-checked for availability
+ `proxypool get`
+
+ For more options, see `proxypool help`.
+
+ ##### 2. In a script
+
+     require 'http-proxy-pool'
+
+     pool = HttpProxyPool::ProxyPool.new
+     pool.query(:ip => "=~ /^111/", :proxy_type => "== 'HTTP'") do |proxy|
+       # do what you want ...
+     end
+
+ `query` does not force-check whether the returned proxies are usable; verify one with `checker`:
+
+     pool.checker(proxy)
+
+ ## Defining a crawling script
+ The default scripts are installed to **[USER\_PATH]/http\_proxy\_pool/script**. You can modify the existing scripts or add new ones in that directory. Crawling scripts for the following sites (found via search engines) ship with the gem:
+
+ * [ip.izmoney.com](http://ip.izmoney.com)
+ * [kuaidaili.com](http://www.kuaidaili.com)
+ * [proxy360.cn](http://www.proxy360.cn)
+ * [goubanjia.com](http://proxy.goubanjia.com)
+
+ ##### A skeleton (a fuller hypothetical script follows this diff):
+
+     # the URL to start crawling from
+     sitetask("start_page_url") do
+       nextpage do
+         # nextpage must return the URL of the next page
+         # this block has to detect whether the current page is the last one
+         # if nextpage is not defined, only the first page is crawled
+       end
+
+       parser do
+         # this block must return an array of proxy field hashes,
+         # built by parsing the current Mechanize page's DOM
+       end
+     end
+
+ ##### Creating a Proxy:
+
+     HttpProxyPool::Proxy.new(
+       :ip => '127.0.0.1',                # IP address
+       :port => 8080,                     # port
+       :username => 'jiyaping',           # auth username
+       :password => 'xxxxxx',             # auth password
+       :proxy_level => 'high',            # anonymity level (anonymous or transparent)
+       :proxy_type => 'http',             # proxy type (HTTP, HTTPS, SOCKS)
+       :speed => '0.5',                   # proxy speed
+       :added_time => DateTime.now,       # time added
+       :last_access_time => DateTime.now, # time last used
+       :nation => 'cn',                   # country
+       :province => 'guangdong',          # province/state
+       :src_from => 'xxxxxx.com'          # source site
+     )
+
+ ## Finally
+
+ That's all ...
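
To make the skeleton above concrete, here is a hypothetical complete script in the same shape as the bundled ones. The host `proxies.example.com`, the table selectors, and the pager markup are invented for illustration and would need to match a real target site:

    sitetask("http://proxies.example.com/list/1.html") do
      nextpage do
        # assume the pager marks the current page with .current
        curr = agent.page.at('.current') ? agent.page.at('.current').text.to_i : 1
        last = agent.page.search('.pager a').last.text.to_i
        next if curr >= last               # nil ends pagination

        "http://proxies.example.com/list/#{curr + 1}.html"
      end

      parser do
        # return an array of field hashes, one per table row
        agent.page.search('table.proxies tr').map do |row|
          tds = row.search('td')

          { :ip         => tds[0].text.strip,
            :port       => tds[1].text.strip,
            :proxy_type => tds[2].text.strip,
            :added_time => DateTime.now,
            :src_from   => sitename }
        end
      end
    end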
@@ -0,0 +1,9 @@
+ gem "minitest"
+ require 'rake/testtask'
+
+ Rake::TestTask.new do |t|
+   t.libs << 'test'
+ end
+
+ desc 'Run tests'
+ task :default => :test
@@ -0,0 +1,60 @@
+ #!/usr/bin/env ruby
+
+ lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
+ $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
+
+ require 'thor'
+ require "http_proxy_pool"
+
+ class HttpProxyPoolApp < Thor
+   @@proxy_pool = HttpProxyPool::ProxyPool.new(
+     :data_path => File.join(HttpProxyPool.home, 'ips.yaml'),
+     :script    => Dir["#{HttpProxyPool.home}/script/*.site"],
+     :logger    => HttpProxyPool.logger
+   )
+
+   desc 'status', 'show proxy pool status.'
+   def status
+     @@proxy_pool.status
+   end
+
+   desc 'crawl [WAY]', 'gather proxies and store them to a local file through WAY.'
+   method_option :lastest, :aliases => '-l',
+                 :type    => :boolean,
+                 :default => true,
+                 :desc    => 'only crawl the most recently listed proxies.'
+   method_option :check, :aliases => '-c',
+                 :type    => :boolean,
+                 :default => false,
+                 :desc    => 'check availability before storing.'
+   def crawl(way = 'script')
+     puts "wait...."
+
+     if way == 'script'
+       lastest = options[:lastest]
+       check   = options[:check]
+
+       @@proxy_pool.crawling(lastest, check)
+     end
+
+     puts "done."
+   end
+
+   desc 'get', 'get a proxy from local storage.'
+   method_option :force_check, :aliases => '-fc',
+                 :type    => :boolean,
+                 :default => true,
+                 :desc    => 'check that the proxy is ready to use.'
+   method_option :thread_num, :aliases => '-t',
+                 :type    => :numeric,
+                 :default => 10,
+                 :desc    => 'number of search threads.'
+   def get
+     force_check = options[:force_check]
+     thread_num  = options[:thread_num]
+
+     puts @@proxy_pool.get_random_proxy(force_check, thread_num)
+   end
+ end
+
+ HttpProxyPoolApp.start
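
The CLI above is a thin wrapper over ProxyPool. A minimal sketch of the programmatic equivalent, assuming the gem is installed and the default script directory has been populated (the constructor arguments mirror the CLI's defaults):

    require 'http_proxy_pool'

    pool = HttpProxyPool::ProxyPool.new(
      :data_path => File.join(HttpProxyPool.home, 'ips.yaml'),
      :script    => Dir["#{HttpProxyPool.home}/script/*.site"],
      :logger    => HttpProxyPool.logger
    )

    pool.crawling(true, false)            # like `proxypool crawl`: latest pages only, no pre-check
    puts pool.get_random_proxy(true, 10)  # like `proxypool get`: force-check, 10 threads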
@@ -0,0 +1,25 @@
+ #encoding : utf-8
+
+ require 'logger'
+ require 'yaml'
+ require 'fileutils'
+ require 'net/http'
+ require 'mechanize'
+
+ require 'http_proxy_pool/error'
+ require 'http_proxy_pool/utils'
+ require 'http_proxy_pool/basetask'
+ require 'http_proxy_pool/proxy'
+ require 'http_proxy_pool/proxy_pool'
+ require 'http_proxy_pool/version'
+
+ module HttpProxyPool
+   # more configuration options may be supported here later
+   @config = {}
+
+   @home = File.join(Dir.home, 'http_proxy_pool')
+   Dir.mkdir(@home) unless Dir.exists? @home
+
+   @script_path = File.join(@home, 'script')
+   Dir.mkdir(@script_path) unless Dir.exists? @script_path
+
+   @logger = Logger.new(File.join(@home, 'proxy.log'), 2_000_000)
+
+   init_default_script
+ end
@@ -0,0 +1,81 @@
+ #encoding : utf-8
+
+ module HttpProxyPool
+   class Basetask
+     attr_accessor :agent,
+                   :url,
+                   :logger,
+                   :page_parser,
+                   :next_page
+
+     def initialize(opts = {})
+       @agent  = opts[:agent]
+       @logger = opts[:logger]
+       @url    = opts[:url]
+     end
+
+     def sitetask(url, opts = {})
+       raise ScriptError.new("script does not specify a url!") unless url
+
+       @url    = url
+       @agent  = opts[:agent] || Mechanize.new
+       @logger ||= opts[:logger]
+
+       # for debugging
+       #@agent.set_proxy '127.0.0.1', 8888
+
+       yield
+     end
+
+     def ips(lastest = true)
+       uri = @url
+
+       loop do
+         @logger.info("start crawling page [#{uri}] ...")
+         @agent.get(uri)
+         # when crawling all pages, sleep a random interval between requests
+         rand_sleep unless lastest
+
+         begin
+           instance_eval(&page_parser).each do |field|
+             yield field
+           end
+         rescue Exception => e
+           @logger.error("parsing page error [#{uri}]. #{e.to_s}")
+           break
+         end
+
+         begin
+           break unless @next_page
+           uri = instance_eval(&next_page)
+           break unless uri
+         rescue => e
+           @logger.error("error occurred when getting next page [#{uri}]. #{e.to_s}")
+           break
+         end
+
+         break if lastest
+       end
+     end
+
+     def parser(&block)
+       @page_parser = block if block_given?
+     end
+
+     def nextpage(&block)
+       @next_page = block if block_given?
+     end
+
+     def curr_page
+       @agent.page.uri
+     end
+
+     def sitename
+       URI.parse(URI.encode(@url)).host
+     end
+
+     def rand_sleep(max_tick = 2)
+       sleep rand(max_tick)
+     end
+   end
+ end
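
Each .site script below is plain Ruby that gets instance_eval'd against a Basetask; this is exactly how ProxyPool#crawling (later in this diff) drives them. A minimal sketch of running one by hand, where the path 'my.site' is hypothetical:

    require 'http_proxy_pool'

    task = HttpProxyPool::Basetask.new(:agent  => Mechanize.new,
                                       :logger => HttpProxyPool.logger)
    task.instance_eval(File.read('my.site'))  # the script calls sitetask, defining url/parser/nextpage

    task.ips(true) do |fields|                # true => crawl only the first page
      puts "#{fields[:ip]}:#{fields[:port]}"
    end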
@@ -0,0 +1,8 @@
+ #encoding : utf-8
+
+ module HttpProxyPool
+   class BaseError < StandardError; end
+   class ScriptError < BaseError; end
+   class TaskError < BaseError; end
+   class QueryError < BaseError; end
+ end
@@ -0,0 +1,22 @@
+ sitetask("http://ip.izmoney.com/search/china/high/index.html") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:nation]      = tds[2].text
+       fields[:proxy_level] = tds[4].text
+       fields[:proxy_type]  = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,22 @@
+ sitetask("http://ip.izmoney.com/search/china/normal/index.html") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:nation]      = tds[2].text
+       fields[:proxy_level] = tds[4].text
+       fields[:proxy_type]  = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,22 @@
+ sitetask("http://ip.izmoney.com/search/foreign/high/index.html") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:nation]      = tds[2].text
+       fields[:proxy_level] = tds[4].text
+       fields[:proxy_type]  = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,22 @@
+ sitetask("http://ip.izmoney.com/search/foreign/normal/index.html") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:nation]      = tds[2].text
+       fields[:proxy_level] = tds[4].text
+       fields[:proxy_type]  = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,36 @@
+ sitetask("http://www.kuaidaili.com/free/inha/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.active')
+       curr_idx = agent.page.at('.active').text.to_i
+     end
+
+     last_page = agent.page.at("#listnav").search("a[href*='/free']").last.text.to_i
+     next if curr_idx == last_page   # nil ends the pagination loop
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:speed]       = tds[5].text
+       fields[:added_time]  = tds[6].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,36 @@
+ sitetask("http://www.kuaidaili.com/free/intr/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.active')
+       curr_idx = agent.page.at('.active').text.to_i
+     end
+
+     last_page = agent.page.at("#listnav").search("a[href*='/free']").last.text.to_i
+     next if curr_idx == last_page   # nil ends the pagination loop
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:speed]       = tds[5].text
+       fields[:added_time]  = tds[6].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,36 @@
+ sitetask("http://www.kuaidaili.com/free/outha/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.active')
+       curr_idx = agent.page.at('.active').text.to_i
+     end
+
+     last_page = agent.page.at("#listnav").search("a[href*='/free']").last.text.to_i
+     next if curr_idx == last_page   # nil ends the pagination loop
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:speed]       = tds[5].text
+       fields[:added_time]  = tds[6].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,36 @@
+ sitetask("http://www.kuaidaili.com/free/outtr/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.active')
+       curr_idx = agent.page.at('.active').text.to_i
+     end
+
+     last_page = agent.page.at("#listnav").search("a[href*='/free']").last.text.to_i
+     next if curr_idx == last_page   # nil ends the pagination loop
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:ip]          = tds[0].text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:speed]       = tds[5].text
+       fields[:added_time]  = tds[6].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,21 @@
+ sitetask("http://www.proxy360.cn/default.aspx") do
+   parser do
+     ips = []
+
+     agent.page.search(".proxylistitem").each do |node|
+       tds    = node.search('.tbBottomLine')
+       fields = {}
+
+       fields[:ip]          = tds[0].text.strip
+       fields[:port]        = tds[1].text.strip
+       fields[:proxy_level] = tds[2].text.strip
+       fields[:nation]      = tds[3].text.strip
+       fields[:added_time]  = tds[4].text.strip
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,23 @@
+ sitetask("http://proxy.goubanjia.com/free/gngn/index.shtml") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       # skip the decoy <p style="display: none;"> nodes mixed into the IP cell
+       fields[:ip]          = tds[0].search(":not(p[style='display: none;'])").text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:nation]      = tds[4].text
+       fields[:province]    = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,23 @@
+ sitetask("http://proxy.goubanjia.com/free/gnpt/index.shtml") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       # skip the decoy <p style="display: none;"> nodes mixed into the IP cell
+       fields[:ip]          = tds[0].search(":not(p[style='display: none;'])").text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:nation]      = tds[4].text
+       fields[:province]    = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,23 @@
+ sitetask("http://proxy.goubanjia.com/free/gwgn/index.shtml") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       # skip the decoy <p style="display: none;"> nodes mixed into the IP cell
+       fields[:ip]          = tds[0].search(":not(p[style='display: none;'])").text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:nation]      = tds[4].text
+       fields[:province]    = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,23 @@
+ sitetask("http://proxy.goubanjia.com/free/gwpt/index.shtml") do
+   parser do
+     ips = []
+
+     agent.page.search("tbody").search("tr").each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       # skip the decoy <p style="display: none;"> nodes mixed into the IP cell
+       fields[:ip]          = tds[0].search(":not(p[style='display: none;'])").text
+       fields[:port]        = tds[1].text
+       fields[:proxy_level] = tds[2].text
+       fields[:proxy_type]  = tds[3].text
+       fields[:nation]      = tds[4].text
+       fields[:province]    = tds[5].text
+       fields[:added_time]  = DateTime.now
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,37 @@
+ sitetask("http://www.xicidaili.com/nn/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.current')
+       curr_idx = agent.page.at('.current').text.to_i
+     end
+
+     last_page = agent.page.at(".pagination").search("a[href*='/nn/']").last.text.to_i
+     next if curr_idx == last_page   # nil ends the pagination loop
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("#ip_list").search("tr")[1..-1].each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:nation]      = tds[1].at('img')['alt'] if tds[1].at('img')
+       fields[:ip]          = tds[2].text
+       fields[:port]        = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:proxy_level] = tds[5].text
+       fields[:proxy_type]  = tds[6].text
+       fields[:speed]       = tds[7].at('div')["title"] if tds[7].at('div')
+       fields[:added_time]  = tds[9].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,37 @@
+ sitetask("http://www.xicidaili.com/nt/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.current')
+       curr_idx = agent.page.at('.current').text.to_i
+     end
+
+     last_page = agent.page.at(".pagination").search("a[href*='/nt/']").last.text.to_i
+     next if curr_idx == last_page   # nil ends the pagination loop
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("#ip_list").search("tr")[1..-1].each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:nation]      = tds[1].at('img')['alt'] if tds[1].at('img')
+       fields[:ip]          = tds[2].text
+       fields[:port]        = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:proxy_level] = tds[5].text
+       fields[:proxy_type]  = tds[6].text
+       fields[:speed]       = tds[7].at('div')["title"] if tds[7].at('div')
+       fields[:added_time]  = tds[9].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,37 @@
+ sitetask("http://www.xicidaili.com/qq/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.current')
+       curr_idx = agent.page.at('.current').text.to_i
+     end
+
+     last_page = agent.page.at(".pagination").search("a[href*='/qq/']").last.text.to_i
+     next if curr_idx == last_page   # nil ends the pagination loop
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("#ip_list").search("tr")[1..-1].each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:nation]      = tds[1].at('img')['alt'] if tds[1].at('img')
+       fields[:ip]          = tds[2].text
+       fields[:port]        = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:proxy_level] = tds[5].text
+       fields[:proxy_type]  = tds[6].text
+       fields[:speed]       = tds[7].at('div')["title"] if tds[7].at('div')
+       fields[:added_time]  = tds[9].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,37 @@
+ sitetask("http://www.xicidaili.com/wn/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.current')
+       curr_idx = agent.page.at('.current').text.to_i
+     end
+
+     last_page = agent.page.at(".pagination").search("a[href*='/wn/']").last.text.to_i
+     next if curr_idx == last_page   # nil ends the pagination loop
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("#ip_list").search("tr")[1..-1].each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:nation]      = tds[1].at('img')['alt'] if tds[1].at('img')
+       fields[:ip]          = tds[2].text
+       fields[:port]        = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:proxy_level] = tds[5].text
+       fields[:proxy_type]  = tds[6].text
+       fields[:speed]       = tds[7].at('div')["title"] if tds[7].at('div')
+       fields[:added_time]  = tds[9].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,37 @@
+ sitetask("http://www.xicidaili.com/wt/") do
+   nextpage do
+     curr_idx = 0
+
+     if agent.page.at('.current')
+       curr_idx = agent.page.at('.current').text.to_i
+     end
+
+     last_page = agent.page.at(".pagination").search("a[href*='/wt/']").last.text.to_i
+     next if curr_idx == last_page   # nil ends the pagination loop
+
+     File.join(url, (curr_idx + 1).to_s)
+   end
+
+   parser do
+     ips = []
+
+     agent.page.search("#ip_list").search("tr")[1..-1].each do |node|
+       tds    = node.search('td')
+       fields = {}
+
+       fields[:nation]      = tds[1].at('img')['alt'] if tds[1].at('img')
+       fields[:ip]          = tds[2].text
+       fields[:port]        = tds[3].text
+       fields[:province]    = tds[4].at('a').text if tds[4].at('a')
+       fields[:proxy_level] = tds[5].text
+       fields[:proxy_type]  = tds[6].text
+       fields[:speed]       = tds[7].at('div')["title"] if tds[7].at('div')
+       fields[:added_time]  = tds[9].text
+       fields[:src_from]    = sitename
+
+       ips << fields
+     end
+
+     ips
+   end
+ end
@@ -0,0 +1,43 @@
+ # encoding : utf-8
+
+ module HttpProxyPool
+   class Proxy
+     attr_accessor :ip,
+                   :port,
+                   :username,
+                   :password,
+                   :proxy_level,
+                   :proxy_type,
+                   :speed,
+                   :added_time,
+                   :last_access_time,
+                   :nation,
+                   :province,
+                   :src_from,
+                   :try_times
+
+     def initialize(args = {})
+       @ip          = args[:ip]
+       @port        = args[:port]
+       @username    = args[:username] || ''
+       @password    = args[:password] || ''
+       @proxy_type  = args[:proxy_type]
+       @proxy_level = args[:proxy_level]
+       @speed       = args[:speed]
+       @added_time  = args[:added_time]
+       @last_access_time = args[:last_access_time]
+       @nation      = args[:nation]
+       @province    = args[:province]
+       @src_from    = args[:src_from]
+       @try_times   = args[:try_times] || 0
+     end
+
+     def to_arr
+       [@ip, @port, @proxy_type, @proxy_level, @nation, @province]
+     end
+
+     def to_s
+       "#{@ip}\t#{@port}"
+     end
+   end
+ end
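
ProxyPool (next hunk) persists these objects straight to YAML via save_proxy/load_proxy. A quick round-trip sketch; note that Psych 4+ (Ruby 3.1+) would need YAML.unsafe_load for the tagged Ruby objects, while the Psych of this gem's era loads them directly:

    require 'yaml'
    require 'http_proxy_pool'

    proxy = HttpProxyPool::Proxy.new(:ip => '127.0.0.1', :port => 8080)
    yaml  = YAML.dump([proxy])      # the same format save_proxy writes to ips.yaml
    back  = YAML.load(yaml).first   # use YAML.unsafe_load on modern Psych
    puts back                       # => "127.0.0.1\t8080" via Proxy#to_s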
@@ -0,0 +1,202 @@
+ #encoding : utf-8
+
+ module HttpProxyPool
+   class ProxyPool
+     attr_accessor :proxys, :logger
+
+     def initialize(args = {})
+       @data_path = args[:data_path] || File.join(HttpProxyPool.home, 'ips.yaml')
+       @script    = args[:script] || Dir["#{HttpProxyPool.home}/script/*.site"]
+       @logger    = args[:logger] || HttpProxyPool.logger
+       @proxys    = []
+
+       @agent = Mechanize.new
+       @agent.user_agent_alias = get_agent_alias
+
+       load_proxy if File.exists? @data_path
+     end
+
+     def status
+       puts "proxy count : #{@proxys.size}"
+     end
+
+     # query interface
+     def query(args = {})
+       begin
+         selected_proxy = @proxys.select do |proxy|
+           instance_eval(build_query_parameter('proxy', args))
+         end
+       rescue => e
+         raise QueryError.new("query parameter error!")
+       end
+
+       return selected_proxy unless block_given?
+
+       selected_proxy.each do |proxy|
+         yield proxy
+       end
+     end
+
+     def build_query_parameter(prefix = 'proxy', args)
+       condition_str = ''
+
+       args = query_key_filter(args)
+
+       args.each do |key, express|
+         condition_str << "#{prefix}.#{key} #{express} && "
+       end
+
+       condition_str.sub!(/\s?&&\s?$/, '')
+
+       condition_str
+     end
+
+     def query_key_filter(args)
+       proxy = Proxy.new
+       args.select{ |k| proxy.respond_to? k }
+     end
+
+     def get_random_proxy(check = true, thread_num = 10)
+       mutex       = Mutex.new
+       result      = nil
+       thread_list = []
+
+       begin
+         thread_num.times do
+           thread_list << Thread.new do
+             while(!result)
+               proxy = @proxys[rand(@proxys.size)]
+               @logger.info("using #{proxy}.")
+               proxy = checker(proxy) if check
+
+               if proxy.is_a? Proxy
+                 mutex.synchronize do
+                   result = proxy
+                 end
+               end
+             end
+           end
+         end
+
+         thread_list.each { |t| t.join }
+       rescue => e
+         @logger.error("find proxy error. #{e.to_s}")
+       ensure
+         save_proxy
+       end
+
+       result
+     end
+
+     def crawling(lastest = true, check = false)
+       @script.each do |file|
+         begin
+           task = Basetask.new(:agent => @agent, :logger => @logger)
+           task.instance_eval(read_taskfile(file))
+
+           task.ips(lastest) do |fields|
+             proxy = Proxy.new(fields)
+             next if check && !checker(proxy)
+             @proxys << proxy unless include?(proxy)
+           end
+         rescue => e
+           @logger.error(e)
+         ensure
+           save_proxy
+         end
+       end
+     end
+
+     def include?(proxy)
+       @proxys.select{ |p| p.ip == proxy.ip }.size > 0
+     end
+
+     def save_proxy
+       file = File.open(@data_path, 'w')
+       YAML.dump(@proxys, file)
+       file.close
+     end
+
+     def load_proxy
+       @proxys = YAML.load_file(@data_path)
+     end
+
+     def read_taskfile(file)
+       cnt = ''
+       File.open(file) do |f|
+         while(line = f.gets)
+           cnt << line
+         end
+       end
+
+       cnt
+     end
+
+     def get_agent_alias
+       agent_arr = [
+         'Linux Firefox',
+         'Linux Mozilla',
+         'Mac Firefox',
+         'Mac Mozilla',
+         'Mac Safari',
+         'Windows Chrome',
+         'Windows IE 7',
+         'Windows IE 8',
+         'Windows IE 9',
+         'Windows Mozilla',
+         'iPhone',
+         'iPad',
+         'Android']
+
+       agent_arr[rand(agent_arr.size)]
+     end
+
+     def checker(proxy)
+       if proxy.is_a? Array
+         checker_batch(proxy)
+       else
+         checker_single(proxy)
+       end
+     end
+
+     def checker_batch(proxys, task_count = 5)
+       result  = []
+       mutex   = Mutex.new
+       threads = []
+       thread_count = (proxys.size / task_count.to_f).ceil
+
+       thread_count.times do |thread_idx|
+         threads << Thread.new do
+           start_idx = thread_idx * task_count
+           end_idx   = [(thread_idx + 1) * task_count, proxys.size].min
+
+           proxys[start_idx...end_idx].each do |proxy|
+             p = checker_single(proxy)
+
+             mutex.synchronize do
+               result << p if p
+             end
+           end
+         end
+       end
+
+       # join after spawning all threads so the batches actually run concurrently
+       threads.each { |t| t.join }
+
+       result
+     end
+
+     def checker_single(proxy, timeout = 0.05)
+       http = Net::HTTP.new('baidu.com', 80, proxy.ip, proxy.port)
+       http.open_timeout = timeout
+       http.read_timeout = timeout * 10
+
+       begin
+         return proxy if http.get('/').code =~ /^[1234]/
+       rescue => e
+         @logger.info("cannot connect to proxy [#{proxy}]. #{e.to_s}")
+         @proxys.delete(proxy)
+         @logger.info("deleted unusable proxy [#{proxy}].")
+       end
+
+       false
+     end
+   end
+ end
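
query works by string-building: build_query_parameter prefixes each key with `proxy.`, joins the expressions with `&&`, and the result is instance_eval'd once per proxy inside the select. A sketch of what the README's query expands to:

    pool = HttpProxyPool::ProxyPool.new
    pool.build_query_parameter('proxy', :ip => "=~ /^111/", :proxy_type => "== 'HTTP'")
    # => "proxy.ip =~ /^111/ && proxy.proxy_type == 'HTTP'"

    # equivalent hand-written filter over the pool:
    pool.proxys.select { |proxy| proxy.ip =~ /^111/ && proxy.proxy_type == 'HTTP' }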
@@ -0,0 +1,30 @@
+ #encoding : utf-8
+
+ module HttpProxyPool
+   module_function
+
+   def init_default_script
+     # snapshot the existing entries; a live Dir stream can only be enumerated once
+     existing = Dir.entries(@script_path)
+
+     src_dir = File.join(File.dirname(__FILE__), 'example')
+     Dir.entries(src_dir).each do |src|
+       next unless src.end_with? '.site'
+
+       FileUtils.cp File.join(src_dir, src),
+                    @script_path unless existing.include? src
+     end
+   end
+
+   def home
+     @home
+   end
+
+   def script_path
+     @script_path
+   end
+
+   def logger
+     @logger
+   end
+ end
@@ -0,0 +1,5 @@
+ #encoding : utf-8
+
+ module HttpProxyPool
+   VERSION = '0.0.1'
+ end
metadata ADDED
@@ -0,0 +1,86 @@
+ --- !ruby/object:Gem::Specification
+ name: http_proxy_pool
+ version: !ruby/object:Gem::Version
+   version: 0.0.2
+ prerelease:
+ platform: ruby
+ authors:
+ - jiyaping
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-09-06 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: mechanize
+   requirement: &10417392 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '2.7'
+   type: :runtime
+   prerelease: false
+   version_requirements: *10417392
+ description: Gather free http proxy data
+ email: jiyaping0802@gmail.com
+ executables:
+ - proxypool
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/http_proxy_pool/basetask.rb
+ - lib/http_proxy_pool/error.rb
+ - lib/http_proxy_pool/example/izmoney_china_hight.site
+ - lib/http_proxy_pool/example/izmoney_china_normal.site
+ - lib/http_proxy_pool/example/izmoney_foreign_high.site
+ - lib/http_proxy_pool/example/izmoney_foreign_normal.site
+ - lib/http_proxy_pool/example/kuaidaili_inha.site
+ - lib/http_proxy_pool/example/kuaidaili_intr.site
+ - lib/http_proxy_pool/example/kuaidaili_outha.site
+ - lib/http_proxy_pool/example/kuaidaili_outtr.site
+ - lib/http_proxy_pool/example/proxy360.site
+ - lib/http_proxy_pool/example/proxy_goubanjia_gngn.site
+ - lib/http_proxy_pool/example/proxy_goubanjia_gnpt.site
+ - lib/http_proxy_pool/example/proxy_goubanjia_gwgn.site
+ - lib/http_proxy_pool/example/proxy_goubanjia_gwpt.site
+ - lib/http_proxy_pool/example/xicidaili_nn.site
+ - lib/http_proxy_pool/example/xicidaili_nt.site
+ - lib/http_proxy_pool/example/xicidaili_qq.site
+ - lib/http_proxy_pool/example/xicidaili_wn.site
+ - lib/http_proxy_pool/example/xicidaili_wt.site
+ - lib/http_proxy_pool/proxy.rb
+ - lib/http_proxy_pool/proxy_pool.rb
+ - lib/http_proxy_pool/utils.rb
+ - lib/http_proxy_pool/version.rb
+ - lib/http_proxy_pool.rb
+ - Rakefile
+ - README.md
+ - !binary |-
+   YmluL3Byb3h5cG9vbA==
+ homepage: https://github.com/jiyaping/http-proxy-pool
+ licenses:
+ - MIT
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.16
+ signing_key:
+ specification_version: 3
+ summary: http proxy crawling from web
+ test_files: []