grab_epg 0.1.6 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.grabepg.gemspec +2 -2
- data/lib/debug.rb +4 -43
- data/lib/grab_tvmao.rb +595 -0
- data/lib/grabepg/grab_base.rb +173 -0
- data/lib/grabepg/grab_tvsou.rb +170 -0
- data/lib/grabepg.rb +4 -595
- data/lib/test/test_grab_tvsou.rb +52 -0
- metadata +7 -3
@@ -0,0 +1,173 @@
|
|
1
|
+
#encoding:utf-8
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module Grabepg
|
5
|
+
|
6
|
+
class GrabBase
|
7
|
+
# To change this template use File | Settings | File Templates.
|
8
|
+
|
9
|
+
|
10
|
+
# Convert a numeric weekday into its Chinese name ("星期X").
#
# @param whatday [#to_i] weekday number; 1..6 => 一..六, 7 => 七,
#   0 => 日 (Time#wday uses 0 for Sunday)
# @return [String] e.g. "星期一"; bare "星期" for unrecognized values
def conversion_what_day(whatday)
  ret = "星期"
  case whatday.to_i
  when 0
    # Time#wday yields 0 for Sunday; without this branch callers passing
    # time.wday got a bare "星期" for Sundays.
    ret += "日"
  when 1
    ret += "一"
  when 2
    ret += "二"
  when 3
    ret += "三"
  when 4
    ret += "四"
  when 5
    ret += "五"
  when 6
    ret += "六"
  when 7
    ret += "七"
  end
  ret
end
|
30
|
+
|
31
|
+
|
32
|
+
#获取指定访问速度的代理服务器
|
33
|
+
#time为最慢速度的时间 int型 代表秒
|
34
|
+
# Probe the scraped proxy lists and keep only the proxies that can fetch
# baidu.com within +use_time+ seconds (and under a hard 8-second ceiling).
#
# @param use_time [Integer] per-proxy timeout in seconds
# @return [Array<String>] "ip:port" strings that responded quickly enough
def self.get_topfast_list(use_time)
  fast_list = []
  ips_ports = get_proxy_list()
  # Fall back to the China-Telecom list when the fast list is empty.
  ips_ports = get_proxylist_dianxin if ips_ports.size == 0
  ips_ports.each do |ip_port|
    time_start = Time.now.to_i
    time_use = 0
    begin
      # Timeout.timeout is the supported API; the bare Kernel#timeout used
      # before is deprecated/removed in modern Rubies.
      Timeout.timeout(use_time) do
        Nokogiri::HTML(open("http://www.baidu.com", :proxy => "http://#{ip_port}"))
      end
      time_use = Time.now.to_i - time_start
      p "http://#{ip_port} use_time:#{time_use}"
    rescue StandardError => e
      # Errno::* and Timeout::Error are StandardError subclasses, so the
      # original `rescue Exception` (which also swallows SignalException
      # and SystemExit) was unnecessary and unsafe.
      case e
      when Errno::ETIMEDOUT, Timeout::Error
        p "Use http://#{ip_port} timeout"
      when Errno::ECONNREFUSED
        p "Use http://#{ip_port} Error connection"
      else
        p "Use http://#{ip_port} Error:#{e.to_s}"
      end
      time_use = -1
    end
    # Keep proxies that answered in (0, 8) seconds, as before.
    fast_list << ip_port if time_use > 0 && time_use < 8
  end
  fast_list
end
|
67
|
+
|
68
|
+
#获取代理列表
|
69
|
+
# Scrape "ip:port" proxy entries from proxycn.cn's fastest-proxy page,
# falling back to the generic HTTP proxy list when that page is empty.
#
# @return [Array<String>] "ip:port" strings; port 3128 entries excluded
def self.get_proxy_list()
  list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
  list = gg('http://www.proxycn.cn/html_proxy/http-1.html') if list.count == 0
  ips_ports = []
  regex_port = /(?<=<TD class="list">)[0-9]*?(?=<\/TD>)/
  regex_ip = /(?<=a href\=whois.php\?whois\=)[0-9,.]*/
  list.each do |proxy_txt|
    port = proxy_txt[regex_port]
    ip = proxy_txt[regex_ip]
    # String#[] returns nil when the regex does not match; the old check
    # (`ip != ""`) let nil through and emitted malformed ":port" entries.
    # Port 3128 (common transparent squid) stays excluded as before.
    next if ip.nil? || ip.empty? || port.nil? || port.empty?
    next if port.to_s.eql?('3128')
    ips_ports << "#{ip}:#{port}"
  end
  p "Count: #{ips_ports.count}"
  ips_ports
end
|
88
|
+
|
89
|
+
# Download +url+ and return every <TD class="list">...</TD> fragment
# found in the page body.
#
# @param url [String] page to fetch (opened via open-uri)
# @return [Array<String>] matching table-cell fragments (may be empty)
def self.gg(url)
  regex_list = /<TD class="list">.*<\/TD>/
  href = URI.parse(url)
  # Read the body in one call instead of concatenating line by line with
  # `+` (quadratic) and appending a spurious extra "\n" per line.  `.`
  # does not match newlines, so the scan results are unchanged.
  contxt = href.open { |f| f.read }
  contxt.scan(regex_list)
end
|
98
|
+
|
99
|
+
|
100
|
+
|
101
|
+
# Scrape "ip:port" proxy entries from proxycn.cn's China-Telecom page,
# falling back to the generic HTTP proxy list when that page is empty.
# Same parsing rules as get_proxy_list.
#
# @return [Array<String>] "ip:port" strings; port 3128 entries excluded
def self.get_proxylist_dianxin()
  list = gg("http://www.proxycn.cn/countryDX.php")
  list = gg('http://www.proxycn.cn/html_proxy/http-1.html') if list.count == 0
  ips_ports = []
  regex_port = /(?<=<TD class="list">)[0-9]*?(?=<\/TD>)/
  regex_ip = /(?<=a href\=whois.php\?whois\=)[0-9,.]*/
  list.each do |proxy_txt|
    port = proxy_txt[regex_port]
    ip = proxy_txt[regex_ip]
    # String#[] returns nil when the regex does not match; guard so we
    # never emit malformed ":port" or "ip:" halves (the old `ip != ""`
    # check let nil through).  Port 3128 stays excluded as before.
    next if ip.nil? || ip.empty? || port.nil? || port.empty?
    next if port.to_s.eql?('3128')
    ips_ports << "#{ip}:#{port}"
  end
  p "Count: #{ips_ports.count}"
  ips_ports
end
|
120
|
+
|
121
|
+
|
122
|
+
|
123
|
+
|
124
|
+
|
125
|
+
|
126
|
+
#使用代理获取url的html的doc值
|
127
|
+
# Fetch +url+ as a Nokogiri document, rotating round-robin through
# +proxylist+; falls back to a direct request when no proxies are given.
# On failure it advances to the next proxy and retries up to 3 times
# before raising.
#
# @param proxylist [Array<String>, nil] "ip:port" proxy strings
# @param url [String] page to fetch
# @return [Nokogiri::HTML::Document, nil]
# @raise [RuntimeError] when every retry fails
def get_doc_with_proxy(proxylist, url)
  unless proxylist.nil? || proxylist.empty?
    @proxyindex ||= 0
    @proxyindex = @proxyindex % proxylist.size
    proxy = proxylist[@proxyindex] || proxylist[@proxyindex + 1]
    begin
      doc = Nokogiri::HTML(open(url, :proxy => "http://#{proxy}")) unless proxy.nil? || proxy.empty?
      @no_firest = 0
    rescue => err
      # Fixed: the original wrote `proxylist.delete_at[@proxyindex]`,
      # which calls delete_at with no arguments (ArgumentError), and
      # tested `proxy.empty? || proxy.nil?` (NoMethodError on nil).
      # NOTE(review): deleting when the proxy is *blank* looks inverted
      # (the failing proxy is the non-blank one) — kept as written
      # pending confirmation against callers.
      proxylist.delete_at(@proxyindex) if proxy.nil? || proxy.empty?

      @no_firest ||= 0
      @no_firest += 1
      p "*************************Proxy:#{proxy}, url:#{url}"
      # proxylist.delete(proxy)  # drop the failing proxy — disabled: a bad
      # page (rather than a bad proxy) would wrongly evict it
      @proxyindex += 1
      # Fixed: the original discarded the recursive result (no `return`),
      # so every retried fetch came back nil even when the retry worked.
      return get_doc_with_proxy(proxylist, url) if @no_firest < 4
      raise RuntimeError, "Error: #{err.to_s}"
    end
    @proxyindex += 1
    p "*************************Proxy:#{proxy}, url:#{url}" unless doc
  else
    begin
      # No proxy available: fetch directly.
      doc = Nokogiri::HTML(open(url))
    rescue => err
      raise RuntimeError, "Error: #{err.to_s} Method:get_doc_with_proxy"
    end
  end
  doc
end
|
171
|
+
|
172
|
+
end
|
173
|
+
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
#encoding:utf-8
|
2
|
+
|
3
|
+
require File.expand_path("../grab_base.rb", __FILE__)
|
4
|
+
module Grabepg
|
5
|
+
# To change this template use File | Settings | File Templates.
|
6
|
+
|
7
|
+
|
8
|
+
class GrabTvsou
|
9
|
+
include Grabepg
|
10
|
+
#首页
|
11
|
+
attr_reader :home_page
|
12
|
+
|
13
|
+
#代理列表
|
14
|
+
attr_reader :proxy_list
|
15
|
+
|
16
|
+
attr_reader :grabbase
|
17
|
+
|
18
|
+
#频道存储
|
19
|
+
attr_reader :channels
|
20
|
+
|
21
|
+
#时间表存储
|
22
|
+
attr_reader :schedules
|
23
|
+
|
24
|
+
#俩个节目间的最小间隔时间
|
25
|
+
attr_reader :default_min_interval
|
26
|
+
|
27
|
+
|
28
|
+
ChannelTypeMap = {"yg_ys_li"=>"央视","yg_ws_li"=>"卫视","yg_hw_li"=>"海外","yg_df_li"=>"地方"}
|
29
|
+
|
30
|
+
#type 从mobie还是网站接口抓取数据
|
31
|
+
# Build a grabber for tvsou.com.
#
# @param grabtype [String] which interface to scrape; "mobile" selects
#   the m.tvsou.com entry URL (see get_url)
# @param proxy_list [Array<String>] "ip:port" proxies passed to
#   GrabBase#get_doc_with_proxy on every fetch
def initialize(grabtype,proxy_list)
  @home_page = get_url(grabtype)
  @proxy_list = proxy_list
  # Shared helpers: weekday formatting and proxied fetching.
  @grabbase = GrabBase.new
  # channel name => {url:, type:}, filled by dispose_home_page.
  @channels = {}
  # Base used to absolutize relative hrefs found in pages.
  @site="http://m.tvsou.com"
end
|
38
|
+
|
39
|
+
#获取从tvsou的什么网站上获取
|
40
|
+
#type: mobile,webpage
|
41
|
+
# Map a grab type to its tvsou entry URL.
#
# @param type [String] "mobile" is the only supported source so far
# @return [String, nil] the mobile index URL, or nil for anything else
def get_url(type)
  type.eql?("mobile") ? "http://m.tvsou.com/index.asp" : nil
end
|
44
|
+
|
45
|
+
# Format +time+ into the hash shape tvsou URLs and page headers use.
#
# @param time [Time]
# @return [Hash] {time: "YYYY-M-D", date: "星期X(M-D)"} — note months and
#   days are not zero-padded, matching the site's query format
def get_data_year_month_day(time)
  ymd = "#{time.year}-#{time.month}-#{time.day}"
  md = "#{time.month}-#{time.day}"
  { time: ymd, date: "#{@grabbase.conversion_what_day(time.wday)}(#{md})" }
end
|
49
|
+
|
50
|
+
#获取时间
|
51
|
+
#start_time 时间起始点
|
52
|
+
#use_time 天数
|
53
|
+
# Build the per-day date descriptors for a grab window.
#
# @param start_time [Integer] offset in days from today for the first day
# @param use_time [Integer] number of consecutive days to cover
# @return [Array<Hash>] one get_data_year_month_day hash per day
def get_data(start_time, use_time)
  base = Time.now + start_time * 24 * 60 * 60
  (0...use_time).map do |offset|
    get_data_year_month_day(base + offset * 24 * 60 * 60)
  end
end
|
62
|
+
|
63
|
+
|
64
|
+
#对首页进行处理获取部分频道的URL和信息
|
65
|
+
# Parse the mobile home page and collect channel-name => {url:, type:}
# entries for the CCTV ("央视") and satellite ("卫视") sections.  The
# overseas ("海外") and local ("地方") sections are recognized but not
# collected yet.
#
# @return [Hash] the accumulated @channels map
def dispose_home_page
  collect = lambda do |li, type|
    li.css('a').each_with_object({}) do |a, acc|
      # First occurrence of a channel name wins, as before.
      acc[a.content] = { url: a.get_attribute("href"), type: type } unless acc.has_key?(a.content)
    end
  end

  doc = @grabbase.get_doc_with_proxy(@proxy_list, @home_page)
  doc.css("li").each do |li|
    case ChannelTypeMap[li.get_attribute("class")]
    when "央视"
      @channels.merge!(collect.call(li, "CCTV"))
    when "卫视"
      @channels.merge!(collect.call(li, "WTV"))
    when "海外"
      # TODO: overseas channels are not collected yet.
    when "地方"
      # TODO: local channels need dispose_channel_page.
    end
  end
  @channels
end
|
91
|
+
|
92
|
+
|
93
|
+
#获取频道列表
|
94
|
+
#url是获取频道列表的首页
|
95
|
+
#地方需要调用此函数
|
96
|
+
# Fetch the channel list from a section index page.  Needed for the
# "地方" (local) section, whose channels are not on the home page.
#
# NOTE(review): not implemented yet — always returns nil; callers must
# tolerate that until a body is written.
#
# @param url [String] section index page URL
# @param channel_type [String] type tag to record for each channel
def dispose_channel_page(url,channel_type)

end
|
99
|
+
|
100
|
+
|
101
|
+
|
102
|
+
#获取频道时间表URL
|
103
|
+
# Expand one schedule href into per-day URLs by rewriting its
# "programDT=<date>" query parameter for every day in the grab window.
#
# @param href [String] an href containing "&programDT=<date>" and
#   possibly further "&key=value" parameters after the date
# @param start_time [Integer] day offset passed to get_data
# @param use_time [Integer] number of days
# @return [Array<Hash>] {url:, time:, date:} per day
def dispose_href_schedule_data(href, start_time, use_time)
  prefix, tail = href.split("&programDT=")
  # tail is "<date>&rest..." — everything after the date is carried over
  # unchanged.  (The original also stored the date into the parts array,
  # a dead write; removed.)
  trailing = tail.split("&")[1..-1]
  get_data(start_time, use_time).map do |day|
    url = prefix + "&programDT=" + day[:time]
    url += "&" + trailing.join("&") unless trailing.empty?
    { url: url, time: day[:time], date: day[:date] }
  end
end
|
117
|
+
|
118
|
+
#根据URL解析时间表页面
|
119
|
+
# Parse a channel's schedule pages for each requested day.
#
# Listings whose start times are less than 5 apart (times compared as
# HHMM integers, so this mixes minutes and hour boundaries — preserved
# from the original) are merged by dropping the earlier entry.
#
# @param url [String] schedule page path relative to @site
# @param start_time [Integer] day offset
# @param use_time [Integer] number of days
# @return [Hash] date label => [{time:, schedule_name:, url:}, ...]
def dispose_schedule_page(url, start_time, use_time)
  url = @site + "/" + url
  urls = url.split("?")
  doc = @grabbase.get_doc_with_proxy(@proxy_list, url)
  _url = doc.css("div[class='week']")[0].css('a')[0].get_attribute("href")
  _url = urls[0] + _url
  # Renamed from `urls`-shadowing `url` block param in the original loop.
  day_urls = dispose_href_schedule_data(_url, start_time, use_time)
  ret = {}
  last_time = -5
  last_schedule = {}
  day_urls.each do |day|
    p "Grab url: #{day}"
    next unless day
    doc = @grabbase.get_doc_with_proxy(@proxy_list, day[:url])
    schedules = []
    doc.css('div[class="time"]')[0].css("li[class='gray']").each do |schedule|
      begin
        _dispose = schedule.content
        _dispose_show = schedule.css("span")[0].text
        time = _dispose.gsub(_dispose_show, "")
        _url = @site + schedule.css('a')[0].get_attribute("href") if schedule.css('a')[0]
        # Fixed: the original used _dispose_show.delete(" 剧情") here —
        # String#delete treats its argument as a CHARACTER SET, so it
        # stripped every space, 剧 and 情 anywhere in the title.  Use
        # gsub, consistent with the last_schedule line below.
        schedules << { time: time, schedule_name: _dispose_show.gsub(" 剧情", ""), url: _url }
        now = time.gsub(":", "").to_i
        # Drop the previous listing when this one starts <5 "units" later.
        schedules.delete(last_schedule) if (now - last_time) < 5
        last_schedule = { time: time, schedule_name: _dispose_show.gsub(" 剧情", ""), url: _url }
        last_time = now
      rescue => err
        p "Schedule: #{schedule}"
      end
    end
    ret.merge!({ day[:date] => schedules })
  end
  ret
end
|
156
|
+
|
157
|
+
#解析节目详情页面
|
158
|
+
# Scrape a programme detail page.
#
# @param url [String] absolute detail-page URL
# @return [Hash] {show_name:, img_url:, show_info:} — img_url has spaces
#   stripped; show_info has the trailing "[全文]" (read-more) removed
def dispose_show_info(url)
  doc = @grabbase.get_doc_with_proxy(@proxy_list, url)
  info_block = doc.css("div[class='tv_info']")
  {
    show_name: doc.css('div[class="tv_info_top"]')[0].content,
    img_url: info_block.css("img")[0].get_attribute("src").gsub(" ", ""),
    show_info: info_block.css("p")[0].content.gsub("[全文]", "")
  }
end
|
166
|
+
|
167
|
+
|
168
|
+
end
|
169
|
+
|
170
|
+
end
|