grab_epg 0.1.6 → 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,173 @@
1
#encoding:utf-8
require 'nokogiri'
require 'open-uri'
require 'timeout'

module Grabepg

  # Shared scraping helpers: Chinese weekday formatting, public-proxy
  # discovery/speed-testing (proxycn.cn) and proxied HTML fetching
  # through Nokogiri.
  class GrabBase

    # Weekday digit -> Chinese suffix. Accepts both Time#wday (0..6,
    # 0 == Sunday) and a 1..7 convention (7 == Sunday).
    WEEKDAY_NAMES = {
      0 => "日", 1 => "一", 2 => "二", 3 => "三",
      4 => "四", 5 => "五", 6 => "六", 7 => "日"
    }.freeze

    # Convert a numeric weekday into "星期X".
    # Fix: the original returned bare "星期" for 0 (which is what callers
    # pass, since Time#wday yields 0 for Sunday) and the non-existent
    # word "星期七" for 7; both now yield "星期日".
    def conversion_what_day(whatday)
      "星期" + WEEKDAY_NAMES.fetch(whatday.to_i, "")
    end

    # Return the proxies that answer a test request to baidu.com within
    # `use_time` seconds (and under the hard 8-second cap below).
    # use_time:: slowest acceptable response time, in seconds (Integer).
    def self.get_topfast_list(use_time)
      fast_list = []
      ips_ports = get_proxy_list()
      ips_ports = get_proxylist_dianxin if ips_ports.size == 0
      ips_ports.each do |ip_port|
        time_start = Time.now.to_i
        time_use = -1 # stays -1 when the probe fails
        begin
          Timeout.timeout(use_time) do
            Nokogiri::HTML(open("http://www.baidu.com", :proxy => "http://#{ip_port}"))
          end
          time_use = Time.now.to_i - time_start
          p "http://#{ip_port} use_time:#{time_use}"
        rescue Errno::ETIMEDOUT, Timeout::Error
          p "Use http://#{ip_port} timeout"
        rescue Errno::ECONNREFUSED
          p "Use http://#{ip_port} Error connection"
        rescue StandardError => e
          # Narrowed from `rescue Exception`, which also swallowed
          # SignalException/SystemExit.
          p "Use http://#{ip_port} Error:#{e.to_s}"
        end
        fast_list << ip_port if time_use > 0 && time_use < 8
      end
      fast_list
    end

    # Scrape the "30 fastest" proxy listing; falls back to the generic
    # HTTP listing when it yields nothing. Returns ["ip:port", ...].
    def self.get_proxy_list()
      extract_ip_ports('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
    end

    # Same as get_proxy_list but seeded from the China-Telecom listing.
    def self.get_proxylist_dianxin()
      extract_ip_ports("http://www.proxycn.cn/countryDX.php")
    end

    # Shared implementation of the two scrapers above (they were
    # copy-paste duplicates). Parses ip/port out of each table row.
    def self.extract_ip_ports(url)
      list = gg(url)
      list = gg('http://www.proxycn.cn/html_proxy/http-1.html') if list.count == 0
      ips_ports = []
      regex_port = /(?<=<TD class="list">)[0-9]*?(?=<\/TD>)/
      regex_ip = /(?<=a href\=whois.php\?whois\=)[0-9,.]*/
      list.each do |proxy_txt|
        port = proxy_txt[regex_port]
        ip = proxy_txt[regex_ip]
        # Fix: guard against a nil ip (regex miss) as well as "" —
        # `ip != ""` let nil through and produced ":port" entries.
        # Port 3128 (default squid) entries are deliberately skipped.
        if !ip.to_s.empty? && !port.to_s.eql?('3128')
          ips_ports << "#{ip}:#{port}"
        end
      end
      p "Count: #{ips_ports.count}"
      ips_ports
    end

    # Download `url` (via open-uri) and return every
    # '<TD class="list">...</TD>' row found in the body.
    def self.gg(url)
      regex_list = /<TD class="list">.*<\/TD>/
      contxt = ""
      URI.parse(url).open { |f|
        f.each_line { |line| contxt << line << "\n" }
      }
      contxt.scan(regex_list)
    end

    # Fetch `url` as a Nokogiri doc through one proxy of `proxylist`
    # (round-robin via @proxyindex), or directly when the list is
    # nil/empty. On failure it advances to the next proxy and retries,
    # giving up with a RuntimeError after 4 consecutive failures.
    def get_doc_with_proxy(proxylist, url)
      if proxylist.nil? || proxylist.empty?
        begin
          return Nokogiri::HTML(open(url))
        rescue => err
          raise RuntimeError, "Error: #{err.to_s} Method:get_doc_with_proxy"
        end
      end

      @proxyindex ||= 0
      @proxyindex %= proxylist.size
      proxy = proxylist[@proxyindex] || proxylist[@proxyindex + 1]
      begin
        doc = Nokogiri::HTML(open(url, :proxy => "http://#{proxy}")) unless proxy.nil? || proxy.empty?
        @no_firest = 0 # reset the consecutive-failure counter on success
      rescue => err
        # Fix: was `proxylist.delete_at[@proxyindex]` (brackets instead
        # of parens — raised ArgumentError), with the nil test AFTER
        # `.empty?` so a nil proxy crashed before being detected.
        proxylist.delete_at(@proxyindex) if proxy.nil? || proxy.empty?
        @no_firest ||= 0
        @no_firest += 1
        p "*************************Proxy:#{proxy}, url:#{url}"
        #proxylist.delete(proxy) # deleting the failing proxy is unsafe
        # while a bad *page* can also raise here — left disabled.
        @proxyindex += 1
        # Fix: return the retry's result — the original discarded it and
        # returned nil even when a later proxy succeeded.
        return get_doc_with_proxy(proxylist, url) if @no_firest < 4
        raise RuntimeError, "Error: #{err.to_s}"
      end
      @proxyindex += 1
      if doc.nil?
        p "*************************Proxy:#{proxy}, url:#{url}"
      end
      doc
    end

  end
end
@@ -0,0 +1,170 @@
1
#encoding:utf-8

require File.expand_path("../grab_base.rb", __FILE__)
module Grabepg

  # Scraper for the tvsou.com mobile site: collects channel links from
  # the home page, per-day programme schedules, and programme detail
  # pages (name / poster / synopsis).
  class GrabTvsou
    include Grabepg

    # Entry page URL chosen by the grab type ("mobile").
    attr_reader :home_page

    # Proxy list handed through to GrabBase#get_doc_with_proxy.
    attr_reader :proxy_list

    attr_reader :grabbase

    # Channel storage: {channel_name => {url:, type:}}.
    attr_reader :channels

    # Schedule storage.
    attr_reader :schedules

    # Minimum interval (in HHMM units) between two distinct programmes;
    # closer entries are treated as duplicates and collapsed.
    attr_reader :default_min_interval

    # Home-page <li> CSS class -> channel category label.
    ChannelTypeMap = {"yg_ys_li"=>"央视","yg_ws_li"=>"卫视","yg_hw_li"=>"海外","yg_df_li"=>"地方"}

    # grabtype:: "mobile" or "webpage" — which tvsou frontend to scrape.
    # proxy_list:: array of "ip:port" strings (may be nil/empty).
    def initialize(grabtype, proxy_list)
      @home_page = get_url(grabtype)
      @proxy_list = proxy_list
      @grabbase = GrabBase.new
      @channels = {}
      # Fix: both readers below were declared but never assigned, so
      # they always returned nil; the interval 5 was hard-coded inline.
      @schedules = {}
      @default_min_interval = 5
      @site = "http://m.tvsou.com"
    end

    # URL of the entry page for the given grab type.
    # Returns nil for unsupported types (only "mobile" is implemented).
    def get_url(type)
      return "http://m.tvsou.com/index.asp" if type.eql?("mobile")
    end

    # Format `time` into the query value ("YYYY-M-D") and the display
    # label ("星期X(M-D)") used by the schedule pages.
    def get_data_year_month_day(time)
      {time: "#{time.year}-#{time.month}-#{time.day}",
       date: "#{@grabbase.conversion_what_day(time.wday)}(#{time.month}-#{time.day})"}
    end

    # Dates to grab: `use_time` consecutive days, the first being
    # `start_time` days from now. Returns [{time:, date:}, ...].
    def get_data(start_time, use_time)
      first_day = Time.now + start_time * 24 * 60 * 60
      use_time.times.map do |offset|
        get_data_year_month_day(first_day + offset * 24 * 60 * 60)
      end
    end

    # Parse the home page and collect CCTV ("央视") and satellite
    # ("卫视") channel links into @channels. The "海外"/"地方"
    # categories are recognised but not grabbed yet.
    def dispose_home_page
      # Collect {name => {url:, type:}} for every unique <a> in an <li>.
      get_channellist = lambda { |li, type|
        channellist = {}
        li.css('a').each do |a|
          channellist.merge!({a.content => {url: a.get_attribute("href"), type: type}}) unless channellist.has_key?(a.content)
        end
        channellist
      }

      doc = @grabbase.get_doc_with_proxy(@proxy_list, @home_page)
      doc.css("li").each do |li|
        case ChannelTypeMap[li.get_attribute("class")]
        when "央视"
          @channels.merge!(get_channellist.call(li, "CCTV"))
        when "卫视"
          @channels.merge!(get_channellist.call(li, "WTV"))
        when "海外"
          # TODO: overseas channels not handled yet
        when "地方"
          # TODO: local channels need dispose_channel_page
        end
      end
      return @channels
    end

    # Channel-list page handler (required for the "地方"/local
    # category). Not implemented yet.
    def dispose_channel_page(url, channel_type)

    end

    # Build one schedule URL per requested day by substituting the
    # `programDT` query parameter in the template `href`, keeping every
    # other parameter intact. Returns [{url:, time:, date:}, ...].
    def dispose_href_schedule_data(href, start_time, use_time)
      base, tail = href.split("&programDT=")
      tail_params = tail.split("&") # tail_params[0] is the old date value
      get_data(start_time, use_time).map do |time|
        url = base + "&programDT=" + time[:time]
        1.upto(tail_params.length - 1).each do |i|
          url += "&" + tail_params[i]
        end
        {url: url, time: time[:time], date: time[:date]}
      end
    end

    # Grab `use_time` days of programme listings (starting `start_time`
    # days from now) for the channel page at `url`.
    # Returns {date_label => [{time:, schedule_name:, url:}, ...]}.
    # A programme starting fewer than @default_min_interval units after
    # the previous one replaces it (the page lists sub-items that way).
    def dispose_schedule_page(url, start_time, use_time)
      url = @site + "/" + url
      urls = url.split("?")
      doc = @grabbase.get_doc_with_proxy(@proxy_list, url)
      _url = doc.css("div[class='week']")[0].css('a')[0].get_attribute("href")
      _url = urls[0] + _url
      day_urls = dispose_href_schedule_data(_url, start_time, use_time)
      ret = {}
      last_time = -5
      last_schedule = {}
      day_urls.each do |day_url| # renamed: the block param shadowed `url`
        p "Grab url: #{day_url}"
        next unless day_url
        doc = @grabbase.get_doc_with_proxy(@proxy_list, day_url[:url])
        schedules = []
        doc.css('div[class="time"]')[0].css("li[class='gray']").each do |schedule|
          begin
            _dispose = schedule.content
            _dispose_show = schedule.css("span")[0].text
            time = _dispose.gsub(_dispose_show, "")
            _url = @site + schedule.css('a')[0].get_attribute("href") if schedule.css('a')[0]
            # Fix: the pushed entry used `_dispose_show.delete(" 剧情")`
            # — String#delete is character-set based, stripping every
            # space/剧/情 anywhere — while `last_schedule` used gsub, so
            # `schedules.delete(last_schedule)` could never match the
            # entry it was meant to remove. One gsub'd entry is now used
            # for both.
            entry = {time: time, schedule_name: _dispose_show.gsub(" 剧情", ""), url: _url}
            schedules << entry
            now = time.gsub(":", "").to_i
            if (now - last_time) < @default_min_interval
              schedules.delete(last_schedule)
            end
            last_schedule = entry
            last_time = now
          rescue => err
            p "Schedule: #{schedule}"
          end
        end
        ret.merge!({day_url[:date] => schedules})
      end
      return ret
    end

    # Parse a programme detail page: name, poster image URL and
    # synopsis (with the trailing "[全文]" marker stripped).
    def dispose_show_info(url)
      doc = @grabbase.get_doc_with_proxy(@proxy_list, url)
      show_name = doc.css('div[class="tv_info_top"]')[0].content
      _doc = doc.css("div[class='tv_info']")
      img_url = _doc.css("img")[0].get_attribute("src").gsub(" ", "")
      show_info = _doc.css("p")[0].content.gsub("[全文]", "")
      {show_name: show_name, img_url: img_url, show_info: show_info}
    end

  end

end