grab_epg 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,173 @@
1
+ #encoding:utf-8
2
require 'nokogiri'
require 'open-uri'
require 'timeout'
3
+
4
+ module Grabepg
5
+
6
+ class GrabBase
7
+ # To change this template use File | Settings | File Templates.
8
+
9
+
10
+ def conversion_what_day(whatday)
11
+ ret = "星期"
12
+ case whatday.to_i
13
+ when 1
14
+ ret += "一"
15
+ when 2
16
+ ret += "二"
17
+ when 3
18
+ ret += "三"
19
+ when 4
20
+ ret += "四"
21
+ when 5
22
+ ret += "五"
23
+ when 6
24
+ ret += "六"
25
+ when 7
26
+ ret += "七"
27
+ end
28
+ ret
29
+ end
30
+
31
+
32
+ #获取指定访问速度的代理服务器
33
+ #time为最慢速度的时间 int型 代表秒
34
+ def self.get_topfast_list(use_time)
35
+ fast_list = []
36
+ time_use = 0
37
+ ips_ports = get_proxy_list()
38
+ ips_ports = get_proxylist_dianxin if ips_ports.size==0
39
+ ips_ports.each do |ip_port|
40
+ time_start = Time.now.to_i
41
+ begin
42
+ timeout(use_time) do
43
+ doc = Nokogiri::HTML(open("http://www.baidu.com",:proxy=> "http://#{ip_port}"))
44
+ end
45
+ time_end = Time.now.to_i
46
+ time_use = time_end - time_start
47
+ p "http://#{ip_port} use_time:#{time_use}"
48
+ rescue Exception =>e
49
+ case e
50
+ when Errno::ETIMEDOUT
51
+ p "Use http://#{ip_port} timeout"
52
+ when Timeout::Error
53
+ p "Use http://#{ip_port} timeout"
54
+ when Errno::ECONNREFUSED
55
+ p "Use http://#{ip_port} Error connection"
56
+ else
57
+ p "Use http://#{ip_port} Error:#{e.to_s}"
58
+ end
59
+ time_use = -1
60
+ end
61
+ if(time_use > 0 &&time_use < 8)
62
+ fast_list << ip_port
63
+ end
64
+ end
65
+ fast_list
66
+ end
67
+
68
+ #获取代理列表
69
+ def self.get_proxy_list()
70
+ list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
71
+ if list.count ==0
72
+ list = gg('http://www.proxycn.cn/html_proxy/http-1.html')
73
+ end
74
+ ips_ports = []
75
+ regex_port = /(?<=<TD class="list">)[0-9]*?(?=<\/TD>)/
76
+ regex_ip = /(?<=a href\=whois.php\?whois\=)[0-9,.]*/
77
+ list.each do |proxy_txt|
78
+ port = proxy_txt[regex_port]
79
+ ip = proxy_txt[regex_ip]
80
+ if(ip != ""&& !port.to_s.eql?('3128'))
81
+ port_ip = ip.to_s + ":" + port.to_s
82
+ ips_ports << port_ip
83
+ end
84
+ end
85
+ p "Count: #{ips_ports.count}"
86
+ ips_ports
87
+ end
88
+
89
+ def self.gg(url)
90
+ regex_list = /<TD class="list">.*<\/TD>/
91
+ href =URI.parse(url)
92
+ contxt = ""
93
+ href.open{ |f|
94
+ f.each_line {|line| contxt =contxt + line + "\n"}
95
+ }
96
+ list = contxt.scan(regex_list)
97
+ end
98
+
99
+
100
+
101
+ def self.get_proxylist_dianxin()
102
+ list = gg("http://www.proxycn.cn/countryDX.php")
103
+ if list.count ==0
104
+ list = gg('http://www.proxycn.cn/html_proxy/http-1.html')
105
+ end
106
+ ips_ports = []
107
+ regex_port = /(?<=<TD class="list">)[0-9]*?(?=<\/TD>)/
108
+ regex_ip = /(?<=a href\=whois.php\?whois\=)[0-9,.]*/
109
+ list.each do |proxy_txt|
110
+ port = proxy_txt[regex_port]
111
+ ip = proxy_txt[regex_ip]
112
+ if(ip != ""&& !port.to_s.eql?('3128'))
113
+ port_ip = ip.to_s + ":" + port.to_s
114
+ ips_ports << port_ip
115
+ end
116
+ end
117
+ p "Count: #{ips_ports.count}"
118
+ ips_ports
119
+ end
120
+
121
+
122
+
123
+
124
+
125
+
126
+ #使用代理获取url的html的doc值
127
+ def get_doc_with_proxy(proxylist,url)
128
+ unless proxylist.nil?||proxylist.empty?
129
+ unless @proxyindex
130
+ @proxyindex = 0
131
+ end
132
+ @proxyindex=@proxyindex%proxylist.size
133
+ if(proxylist[@proxyindex])
134
+ proxy = proxylist[@proxyindex]
135
+ else
136
+ proxy = proxylist[@proxyindex+1]
137
+ end
138
+ begin
139
+ doc = Nokogiri::HTML(open(url,:proxy=>"http://#{proxy}")) unless proxy.nil?||proxy.empty?
140
+ @no_firest = 0
141
+ rescue => err
142
+ if proxy.empty?||proxy.nil?
143
+ proxylist.delete_at[@proxyindex]
144
+ end
145
+
146
+
147
+ unless @no_firest
148
+ @no_firest = 0
149
+ end
150
+
151
+ @no_firest += 1
152
+ p "*************************Proxy:#{proxy}, url:#{url}"
153
+ #proxylist.delete(proxy) #删除出错的代理 但如果是此网页错误则会引起BUG待修复
154
+ @proxyindex += 1
155
+ get_doc_with_proxy(proxylist,url) if @no_firest<4
156
+ raise RuntimeError,"Error: #{err.to_s}" unless @no_firest<4
157
+ end
158
+ @proxyindex += 1
159
+ unless doc
160
+ p "*************************Proxy:#{proxy}, url:#{url}"
161
+ end
162
+ else
163
+ begin
164
+ doc = Nokogiri::HTML(open(url)) if proxy.nil?||proxy.empty?
165
+ rescue => err
166
+ raise RuntimeError,"Error: #{err.to_s} Method:get_doc_with_proxy"
167
+ end
168
+ end
169
+ doc
170
+ end
171
+
172
+ end
173
+ end
@@ -0,0 +1,170 @@
1
+ #encoding:utf-8
2
+
3
+ require File.expand_path("../grab_base.rb", __FILE__)
4
+ module Grabepg
5
+ # To change this template use File | Settings | File Templates.
6
+
7
+
8
+ class GrabTvsou
9
+ include Grabepg
10
+ #首页
11
+ attr_reader :home_page
12
+
13
+ #代理列表
14
+ attr_reader :proxy_list
15
+
16
+ attr_reader :grabbase
17
+
18
+ #频道存储
19
+ attr_reader :channels
20
+
21
+ #时间表存储
22
+ attr_reader :schedules
23
+
24
+ #俩个节目间的最小间隔时间
25
+ attr_reader :default_min_interval
26
+
27
+
28
+ ChannelTypeMap = {"yg_ys_li"=>"央视","yg_ws_li"=>"卫视","yg_hw_li"=>"海外","yg_df_li"=>"地方"}
29
+
30
+ #type 从mobie还是网站接口抓取数据
31
+ def initialize(grabtype,proxy_list)
32
+ @home_page = get_url(grabtype)
33
+ @proxy_list = proxy_list
34
+ @grabbase = GrabBase.new
35
+ @channels = {}
36
+ @site="http://m.tvsou.com"
37
+ end
38
+
39
+ #获取从tvsou的什么网站上获取
40
+ #type: mobile,webpage
41
+ def get_url(type)
42
+ return "http://m.tvsou.com/index.asp" if type.eql?("mobile")
43
+ end
44
+
45
+ def get_data_year_month_day(time)
46
+
47
+ return {time:"#{time.year}-#{time.month}-#{time.day}",date:"#{@grabbase.conversion_what_day(time.wday)}(#{time.month}-#{time.day})"}
48
+ end
49
+
50
+ #获取时间
51
+ #start_time 时间起始点
52
+ #use_time 天数
53
+ def get_data(start_time,use_time)
54
+ time = Time.now+start_time*24*60*60
55
+ ret = []
56
+ use_time.times.each do |i|
57
+ _time = time + i*24*60*60
58
+ ret << get_data_year_month_day(_time)
59
+ end
60
+ ret
61
+ end
62
+
63
+
64
+ #对首页进行处理获取部分频道的URL和嘻嘻
65
+ def dispose_home_page
66
+
67
+ get_channellist = lambda { |li,type|
68
+ channellist = {}
69
+ li.css('a').each do |a|
70
+ channellist.merge!({a.content=>{url:a.get_attribute("href"),type:type}}) unless channellist.has_key?(a.content)
71
+ end
72
+ channellist
73
+ }
74
+
75
+
76
+ doc = @grabbase.get_doc_with_proxy(@proxy_list,@home_page)
77
+ doc.css("li").each do |li|
78
+ case ChannelTypeMap[li.get_attribute("class")]
79
+ when "央视"
80
+ @channels.merge!(get_channellist.call(li,"CCTV"))
81
+ when "卫视"
82
+ @channels.merge!(get_channellist.call(li,"WTV"))
83
+ when "海外"
84
+
85
+ when "地方"
86
+
87
+ end
88
+ end
89
+ return @channels
90
+ end
91
+
92
+
93
+ #获取频道列表
94
+ #url是获取频道列表的首页
95
+ #地方需要调用此函数
96
+ def dispose_channel_page(url,channel_type)
97
+
98
+ end
99
+
100
+
101
+
102
+ #获取频道时间表URL
103
+ def dispose_href_schedule_data(href,start_time,use_time)
104
+ hrefs=href.split("&programDT=")
105
+ _hrefs=hrefs[1].split("&")
106
+ ret = []
107
+ get_data(start_time,use_time).each do |time|
108
+ _hrefs[0]=time[:time]
109
+ url = hrefs[0]+"&programDT=" + time[:time]
110
+ 1.upto(_hrefs.length-1).each do |i|
111
+ url += "&"+_hrefs[i]
112
+ end
113
+ ret<<{url:url,time:time[:time],date:time[:date]}
114
+ end
115
+ ret
116
+ end
117
+
118
+ #根据URL解析时间表页面
119
+ def dispose_schedule_page(url,start_time,use_time)
120
+ url = @site +"/"+url
121
+ urls = url.split("?")
122
+ doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
123
+ _url = doc.css("div[class='week']")[0].css('a')[0].get_attribute("href")
124
+ _url = urls[0]+_url
125
+ urls = dispose_href_schedule_data(_url,start_time,use_time)
126
+ ret = {}
127
+ last_time = -5
128
+ last_schedule = {}
129
+ urls.each do |url|
130
+ p "Grab url: #{url}"
131
+ if url
132
+ doc = @grabbase.get_doc_with_proxy(@proxy_list,url[:url])
133
+ schedules = []
134
+ doc.css('div[class="time"]')[0].css("li[class='gray']").each do |schedule|
135
+ begin
136
+ _dispose = schedule.content
137
+ _dispose_show =schedule.css("span")[0].text
138
+ time = _dispose.gsub(_dispose_show,"")
139
+ _url = @site + schedule.css('a')[0].get_attribute("href") if schedule.css('a')[0]
140
+ schedules << {time:time,schedule_name:_dispose_show.delete(" 剧情"),url:_url}
141
+ now = time.gsub(":","").to_i
142
+ if((now-last_time)<5)
143
+ schedules.delete(last_schedule)
144
+ end
145
+ last_schedule = {time:time,schedule_name:_dispose_show.gsub(" 剧情",""),url:_url}
146
+ last_time = now
147
+ rescue => err
148
+ p "Schedule: #{schedule}"
149
+ end
150
+ end
151
+ ret.merge!({url[:date]=>schedules})
152
+ end
153
+ end
154
+ return ret
155
+ end
156
+
157
+ #解析节目详情页面
158
+ def dispose_show_info(url)
159
+ doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
160
+ show_name = doc.css('div[class="tv_info_top"]')[0].content
161
+ _doc=doc.css("div[class='tv_info']")
162
+ img_url = _doc.css("img")[0].get_attribute("src").gsub(" ","")
163
+ show_info = _doc.css("p")[0].content.gsub("[全文]","")
164
+ {show_name:show_name,img_url:img_url,show_info:show_info}
165
+ end
166
+
167
+
168
+ end
169
+
170
+ end