grab_epg 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.grabepg.gemspec +2 -2
- data/lib/debug.rb +4 -43
- data/lib/grab_tvmao.rb +595 -0
- data/lib/grabepg/grab_base.rb +173 -0
- data/lib/grabepg/grab_tvsou.rb +170 -0
- data/lib/grabepg.rb +4 -595
- data/lib/test/test_grab_tvsou.rb +52 -0
- metadata +7 -3
@@ -0,0 +1,173 @@
|
|
1
|
+
#encoding:utf-8
require 'nokogiri'
require 'open-uri'
require 'timeout'
|
3
|
+
|
4
|
+
module Grabepg

  # Shared plumbing for the EPG grabbers: weekday formatting, proxy-list
  # scraping/benchmarking, and proxy-aware page fetching.
  # To change this template use File | Settings | File Templates.
  class GrabBase

    # Chinese numerals for the weekdays. Accepts both Time#wday style
    # (0 = Sunday — what the callers actually pass) and ISO style
    # (7 = Sunday). The old case/when had no branch for 0 and mapped 7
    # to the non-existent "星期七"; Sunday is 星期日.
    WEEKDAY_NAMES = {
      0 => "日", 1 => "一", 2 => "二", 3 => "三",
      4 => "四", 5 => "五", 6 => "六", 7 => "日"
    }.freeze

    # Convert a numeric weekday to its Chinese name, e.g. 3 -> "星期三".
    #
    # whatday - anything responding to #to_i (Integer or numeric String).
    #
    # Returns "星期X"; values outside 0..7 return just "星期" (preserving
    # the old fall-through behaviour).
    def conversion_what_day(whatday)
      "星期" + WEEKDAY_NAMES.fetch(whatday.to_i, "")
    end

    # Benchmark every known proxy by loading a test page through it and
    # keep only the fast responders.
    #
    # use_time - per-request timeout in seconds.
    #
    # Returns an Array of "ip:port" strings that answered in under 8s.
    def self.get_topfast_list(use_time)
      fast_list = []
      ips_ports = get_proxy_list()
      ips_ports = get_proxylist_dianxin if ips_ports.size == 0
      ips_ports.each do |ip_port|
        time_start = Time.now.to_i
        time_use = 0
        begin
          # Object#timeout was removed from modern Rubies; call the
          # module method explicitly.
          Timeout.timeout(use_time) do
            Nokogiri::HTML(open("http://www.baidu.com", :proxy => "http://#{ip_port}"))
          end
          time_use = Time.now.to_i - time_start
          p "http://#{ip_port} use_time:#{time_use}"
        rescue StandardError => e
          # StandardError (not Exception) so SignalException/SystemExit
          # still propagate; Timeout::Error and the Errno errors are all
          # StandardError descendants.
          case e
          when Errno::ETIMEDOUT, Timeout::Error
            p "Use http://#{ip_port} timeout"
          when Errno::ECONNREFUSED
            p "Use http://#{ip_port} Error connection"
          else
            p "Use http://#{ip_port} Error:#{e.to_s}"
          end
          time_use = -1 # mark this proxy as unusable
        end
        if (time_use > 0 && time_use < 8)
          fast_list << ip_port
        end
      end
      fast_list
    end

    # Scrape the "30 fastest" proxy listing (falling back to the generic
    # HTTP listing) and return the proxies as "ip:port" strings.
    def self.get_proxy_list()
      list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
      if list.count == 0
        list = gg('http://www.proxycn.cn/html_proxy/http-1.html')
      end
      extract_ips_ports(list)
    end

    # Scrape the China-Telecom (电信) proxy listing; same fallback and
    # parsing as get_proxy_list.
    def self.get_proxylist_dianxin()
      list = gg("http://www.proxycn.cn/countryDX.php")
      if list.count == 0
        list = gg('http://www.proxycn.cn/html_proxy/http-1.html')
      end
      extract_ips_ports(list)
    end

    # Pull "ip:port" pairs out of the scraped '<TD class="list">' rows.
    # Shared by get_proxy_list and get_proxylist_dianxin (they were
    # previously two identical copies of this loop).
    # Rows without an IP, and proxies on port 3128, are skipped.
    def self.extract_ips_ports(list)
      regex_port = /(?<=<TD class="list">)[0-9]*?(?=<\/TD>)/
      regex_ip = /(?<=a href\=whois.php\?whois\=)[0-9,.]*/
      ips_ports = []
      list.each do |proxy_txt|
        port = proxy_txt[regex_port]
        ip = proxy_txt[regex_ip]
        # ip is nil when the row does not match; to_s guards against
        # emitting a bogus ":port" entry.
        if !ip.to_s.empty? && !port.to_s.eql?('3128')
          ips_ports << "#{ip}:#{port}"
        end
      end
      p "Count: #{ips_ports.count}"
      ips_ports
    end

    # Download +url+ (via open-uri) and return every
    # '<TD class="list">…</TD>' fragment found in the raw HTML.
    def self.gg(url)
      regex_list = /<TD class="list">.*<\/TD>/
      contxt = ""
      URI.parse(url).open { |f|
        f.each_line { |line| contxt = contxt + line + "\n" }
      }
      contxt.scan(regex_list)
    end

    # Fetch +url+ through a rotating proxy and parse it with Nokogiri.
    # On failure the next proxy is tried (up to 3 retries); with no
    # proxies available the page is fetched directly.
    #
    # proxylist - Array of "ip:port" strings; may be nil or empty.
    # url       - the page to fetch.
    #
    # Returns a Nokogiri::HTML document, or raises RuntimeError once the
    # retry budget is exhausted (or the direct fetch fails).
    def get_doc_with_proxy(proxylist, url)
      if proxylist.nil? || proxylist.empty?
        begin
          return Nokogiri::HTML(open(url))
        rescue => err
          raise RuntimeError, "Error: #{err.to_s} Method:get_doc_with_proxy"
        end
      end

      @proxyindex ||= 0
      @proxyindex %= proxylist.size
      proxy = proxylist[@proxyindex] || proxylist[@proxyindex + 1]
      begin
        doc = Nokogiri::HTML(open(url, :proxy => "http://#{proxy}")) unless proxy.nil? || proxy.empty?
        @retry_count = 0
      rescue => err
        # Drop unusable (nil/empty) entries so they are never retried.
        # Fixed: nil-check before empty? (the old order raised
        # NoMethodError on nil), and delete_at(...) is a method call,
        # not an index ("delete_at[...]").
        proxylist.delete_at(@proxyindex) if proxy.nil? || proxy.empty?

        @retry_count ||= 0
        @retry_count += 1
        p "*************************Proxy:#{proxy}, url:#{url}"
        #proxylist.delete(proxy) #删除出错的代理 但如果是此网页错误则会引起BUG待修复
        @proxyindex += 1
        # Fixed: the recursive retry's result was previously discarded,
        # so a successful retry still returned nil.
        return get_doc_with_proxy(proxylist, url) if @retry_count < 4
        raise RuntimeError, "Error: #{err.to_s}"
      end
      @proxyindex += 1
      unless doc
        p "*************************Proxy:#{proxy}, url:#{url}"
      end
      doc
    end

  end
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
#encoding:utf-8
|
2
|
+
|
3
|
+
require File.expand_path("../grab_base.rb", __FILE__)
|
4
|
+
module Grabepg
  # To change this template use File | Settings | File Templates.

  # Scraper for m.tvsou.com (mobile TVSou EPG): discovers channels from
  # the home page, expands per-day schedule URLs, and parses schedule
  # and programme-detail pages.
  class GrabTvsou
    include Grabepg

    # Entry URL chosen from the grab type (nil for unsupported types).
    attr_reader :home_page

    # "ip:port" proxies forwarded to GrabBase#get_doc_with_proxy.
    attr_reader :proxy_list

    # GrabBase helper instance used for all page fetching.
    attr_reader :grabbase

    # Discovered channels: {channel_name => {url:, type:}}.
    attr_reader :channels

    # Schedule storage. NOTE(review): never assigned in this class.
    attr_reader :schedules

    # Minimum interval (minutes) between two consecutive programmes.
    # NOTE(review): never assigned in this class; the 5-minute merge
    # threshold in dispose_schedule_page is hard-coded.
    attr_reader :default_min_interval

    # CSS class of a home-page <li> => channel category
    # (央视 CCTV / 卫视 satellite / 海外 overseas / 地方 regional).
    ChannelTypeMap = {"yg_ys_li"=>"央视","yg_ws_li"=>"卫视","yg_hw_li"=>"海外","yg_df_li"=>"地方"}

    # grabtype   - source selector; only "mobile" is currently supported
    #              (anything else leaves home_page nil).
    # proxy_list - Array of "ip:port" proxies; may be empty.
    def initialize(grabtype, proxy_list)
      @home_page = get_url(grabtype)
      @proxy_list = proxy_list
      @grabbase = GrabBase.new
      @channels = {}
      @site = "http://m.tvsou.com"
    end

    # Resolve a grab type to its TVSou entry URL.
    # type: "mobile" -> mobile home page; any other value -> nil.
    def get_url(type)
      return "http://m.tvsou.com/index.asp" if type.eql?("mobile")
    end

    # Format +time+ into the two representations the scraper needs:
    #   {time: "YYYY-M-D", date: "星期X(M-D)"}
    def get_data_year_month_day(time)
      return {time:"#{time.year}-#{time.month}-#{time.day}",date:"#{@grabbase.conversion_what_day(time.wday)}(#{time.month}-#{time.day})"}
    end

    # Build the list of dates to grab.
    #
    # start_time - offset in days from now for the first date.
    # use_time   - how many consecutive days to include.
    #
    # Returns an Array of {time:, date:} hashes.
    def get_data(start_time, use_time)
      first_day = Time.now + start_time * 24 * 60 * 60
      use_time.times.map do |offset|
        get_data_year_month_day(first_day + offset * 24 * 60 * 60)
      end
    end

    # Parse the home page and collect the CCTV (央视) and satellite
    # (卫视) channels into @channels ({name => {url:, type:}}).
    # Overseas (海外) and regional (地方) lists are recognised but not
    # collected yet.
    def dispose_home_page
      # Collect every unique <a> inside an <li> as {name => {url:, type:}}.
      get_channellist = lambda { |li, type|
        channellist = {}
        li.css('a').each do |a|
          channellist.merge!({a.content=>{url:a.get_attribute("href"),type:type}}) unless channellist.has_key?(a.content)
        end
        channellist
      }

      doc = @grabbase.get_doc_with_proxy(@proxy_list, @home_page)
      doc.css("li").each do |li|
        case ChannelTypeMap[li.get_attribute("class")]
        when "央视"
          @channels.merge!(get_channellist.call(li, "CCTV"))
        when "卫视"
          @channels.merge!(get_channellist.call(li, "WTV"))
        when "海外"
          # TODO: overseas channels are not collected yet.
        when "地方"
          # TODO: regional channels need dispose_channel_page.
        end
      end
      return @channels
    end

    # Fetch the channel list for a category landing page.
    # Needed for the regional (地方) channels. TODO: not implemented.
    def dispose_channel_page(url, channel_type)

    end

    # Expand one schedule href into one URL per requested day by
    # substituting each target date into its "&programDT=<date>"
    # query parameter (all other parameters are preserved).
    #
    # Returns [{url:, time:, date:}, ...].
    def dispose_href_schedule_data(href, start_time, use_time)
      head, tail = href.split("&programDT=")
      tail_parts = tail.split("&")
      get_data(start_time, use_time).map do |day|
        rebuilt = head + "&programDT=" + day[:time]
        1.upto(tail_parts.length - 1).each do |i|
          rebuilt += "&" + tail_parts[i]
        end
        {url: rebuilt, time: day[:time], date: day[:date]}
      end
    end

    # Grab a channel's schedule across several days.
    #
    # url        - channel schedule path relative to the site root.
    # start_time - offset in days for the first day.
    # use_time   - number of days to grab.
    #
    # Returns {"星期X(M-D)" => [{time:, schedule_name:, url:}, ...]}.
    def dispose_schedule_page(url, start_time, use_time)
      url = @site + "/" + url
      urls = url.split("?")
      doc = @grabbase.get_doc_with_proxy(@proxy_list, url)
      _url = doc.css("div[class='week']")[0].css('a')[0].get_attribute("href")
      _url = urls[0] + _url
      day_urls = dispose_href_schedule_data(_url, start_time, use_time)
      ret = {}
      last_time = -5
      last_schedule = {}
      # Renamed block param from "url" — it shadowed the method local.
      day_urls.each do |day|
        p "Grab url: #{day}"
        if day
          doc = @grabbase.get_doc_with_proxy(@proxy_list, day[:url])
          schedules = []
          doc.css('div[class="time"]')[0].css("li[class='gray']").each do |schedule|
            begin
              _dispose = schedule.content
              _dispose_show = schedule.css("span")[0].text
              time = _dispose.gsub(_dispose_show, "")
              # Fixed: compute the link fresh each iteration; the old
              # conditional assignment let a programme without a link
              # inherit the previous programme's stale URL.
              link = schedule.css('a')[0]
              show_url = link ? @site + link.get_attribute("href") : nil
              # Fixed: was _dispose_show.delete(" 剧情") — String#delete
              # removes every occurrence of the characters ' ', 剧 and 情,
              # not the substring; gsub matches the sibling line below.
              schedules << {time: time, schedule_name: _dispose_show.gsub(" 剧情", ""), url: show_url}
              now = time.gsub(":", "").to_i
              # Merge programmes starting within 5 minutes of the previous.
              if (now - last_time) < 5
                schedules.delete(last_schedule)
              end
              last_schedule = {time: time, schedule_name: _dispose_show.gsub(" 剧情", ""), url: show_url}
              last_time = now
            rescue => err
              p "Schedule: #{schedule}"
            end
          end
          ret.merge!({day[:date] => schedules})
        end
      end
      return ret
    end

    # Parse a programme-detail page.
    # Returns {show_name:, img_url:, show_info:}.
    def dispose_show_info(url)
      doc = @grabbase.get_doc_with_proxy(@proxy_list, url)
      show_name = doc.css('div[class="tv_info_top"]')[0].content
      _doc = doc.css("div[class='tv_info']")
      img_url = _doc.css("img")[0].get_attribute("src").gsub(" ", "")
      show_info = _doc.css("p")[0].content.gsub("[全文]", "")
      {show_name: show_name, img_url: img_url, show_info: show_info}
    end

  end

end
|