grab_epg 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (5) hide show
  1. checksums.yaml +8 -8
  2. data/.grabepg.gemspec +1 -1
  3. data/lib/debug.rb +28 -1
  4. data/lib/grabepg.rb +200 -13
  5. metadata +2 -2
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- Mjg3MTAwMjIxM2FlNWQwMzc5MjkzZWEzNzQ4MGMzZjkzZWNiZDgwOA==
4
+ Njk2ZmY1MjVlMjE5MjQwMDFiOTVmODliNDg0Zjc5OWE5MDVlMzExNA==
5
5
  data.tar.gz: !binary |-
6
- YmFlYTA3ZWNjNTRlN2FmNmM4NjI5MWFlZTlhNTI3YjRiYWQ4ZGQ1Mg==
6
+ NTVjMzA3MGQxMDE4ODhmOGZjNmJkM2I4ZTI4ODVhYTBiYzgwZWEwZA==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- ZGExZWJiN2Q4NjUzNTdiODcwMzI5ZjQ3ODAzYzM5YzU0MDI1OGI2ZGI2Yjk3
10
- NjQ3YTNjNTg5YzBkYWM0ZjQzNThkODM4Njk1MDI5YWJhZjQwODkxYjFlZmQw
11
- NDAyN2VlM2NmODI1M2Y4OGYxMThiMmM5MzI5NGI2Y2UzYzFlZDA=
9
+ ZmRmOTEwNmM2M2FhOTU5YWUwNzUyNGVjMWVmZjRjMjU3NDAyZTY4YWY1ZmQz
10
+ N2I5OGE4MzJkMzZkMDg3Mjk1NDM5YmIwZWZmNzRkNTA3NTE4YTA5ZjFiZmM4
11
+ Mjc5MmZmZDI2NTYwN2M5NDFkN2Y1NGZkNzU1NWI1OTI1ODA5NDY=
12
12
  data.tar.gz: !binary |-
13
- OGY5ODVkMzk0MjY4NDc1YjgzMTVkZTY3OThkZjZmZmFkZTZkNDI1NTIxZTcw
14
- NmQxZjYxODg0YTkzMTE0YzNiNzFiNmE4ZmZiMGMwY2M3OGY0ZDZlYWYwZGMz
15
- MzRlMzgzZGFkYTZjYTcyYWIyNGU1MTQ4ZTczZDY5NzBiZDkzMmQ=
13
+ MDY2YWE0YTM0OTlmNGYxZjMxOGY2YTliZWFhMmExZjhmZTY4OGEyZWZhM2Jl
14
+ ZWIzYWE4MThlOTY1NzIyMWE0MGU5NGE4NTA1ZTAzZjZlMWIxMjBjNjkyNjRi
15
+ YTI3MTA3Y2NhMGJlMjcxYjg5NjBmYmI5NmVjNzE3MTdhMTUyYTI=
data/.grabepg.gemspec CHANGED
@@ -10,5 +10,5 @@ Gem::Specification.new do |gem|
10
10
  gem.files = `git ls-files`.split($\)
11
11
  gem.name = "grab_epg"
12
12
  gem.require_paths = ["lib"]
13
- gem.version = "0.0.1"
13
+ gem.version = "0.0.2"
14
14
  end
data/lib/debug.rb CHANGED
@@ -5,5 +5,32 @@ require 'open-uri'
5
5
  require File.expand_path("../grabepg.rb", __FILE__)
6
6
  class Debug
7
7
  # To change this template use File | Settings | File Templates.
8
- p Grabepg.start
8
+ proxylist = ["123.125.116.243:6256", "123.125.116.243:28832", "123.125.116.243:29952", "123.125.116.243:9386", "219.234.82.73:7806", "123.125.116.243:38205", "123.125.116.243:11229", "123.125.116.243:12978", "219.234.82.89:8090", "120.197.85.173:20368", "123.125.116.243:8089", "123.125.116.243:8160", "219.234.82.78:31565", "123.125.116.243:21457", "123.125.116.241:17421", "123.125.116.243:14191", "219.234.82.88:29037", "123.125.116.242:13669", "123.125.116.243:19009", "123.125.116.243:6193", "123.125.116.242:15692", "123.125.116.241:20307", "123.125.116.242:18725", "219.234.82.82:29082", "123.125.116.243:5195", "123.125.116.242:21725", "123.125.116.241:32793", "219.234.82.60:8000", "123.125.116.242:17403", "123.125.116.243:6938", "123.125.116.242:16348", "219.234.82.54:8726", "120.197.85.173:20371", "123.125.116.241:9286", "219.234.82.88:19279", "219.234.82.89:13374", "123.125.116.242:5976"]
9
+
10
+
11
+ def self.test_get_doc_with_proxy(proxylist)
12
+ herf = "http://www.tvmao.com/drama/HS5oLCs="
13
+ Grabepg.get_doc_with_proxy(proxylist,herf)
14
+ end
15
+
16
+ def self.test_get_show_infomation(proxylist)
17
+ herf = "http://www.tvmao.com/tvcolumn/cVhPLQ=="
18
+ Grabepg.get_show_infomation(proxylist,herf)
19
+ end
20
+
21
+ def self.test_getschedule(proxylist)
22
+ channel = "HUNANTV"
23
+ herf = "/program/HUNANTV-HUNANTV-w1.html"
24
+ Grabepg.getschedule(channel,herf,proxylist,site="http://www.tvmao.com")
25
+ end
26
+
27
+ def self.test_get_show_schedule(proxylist)
28
+ herf = "http://www.tvmao.com/tvcolumn/cVhPLQ=="
29
+ Grabepg.get_show_schedule(proxylist,herf)
30
+ end
31
+
32
+ Grabepg.start
33
+ #p test_get_show_schedule(proxylist)
34
+ #p test_getschedule(proxylist)
35
+ # p test_get_show_infomation(proxylist)
9
36
  end
data/lib/grabepg.rb CHANGED
@@ -9,21 +9,28 @@ module Grabepg
9
9
 
10
10
  attr_reader :channel #频道列表
11
11
  attr_reader :site #网站地址
12
-
12
+ attr_reader :proxyindex #代理的索引
13
+ attr_reader :show_schedule #根据节目的时间表
13
14
 
14
15
  DEFAULT_GrabtvType=["cctv","satellite","digital",]
15
16
  DEFAULT_SITE = "http://www.tvmao.com"
16
17
 
18
+
19
# Example entry point showing how the module is meant to be driven:
# reset the module state, collect the channel list, pick the responsive
# proxies, then grab and print the schedule of every channel.
#
# Returns the channel => url collection produced by getchannels.
def self.start
  @channel = []
  @site = DEFAULT_SITE
  urls_by_channel = getchannels
  fast_proxies = get_topfast_list
  urls_by_channel.each_pair do |name, link|
    grabbed = getschedule(name, link, fast_proxies)
    p "****************************************GetSchedule : #{grabbed}"
  end
end
23
29
 
24
30
 
25
31
  #获取网站的频道表
26
32
  def self.getchannels
33
+ @proxyindex = 0
27
34
  channel_urls = {}
28
35
 
29
36
  get_url =lambda { |type|
@@ -55,12 +62,45 @@ module Grabepg
55
62
  channel_urls
56
63
  end
57
64
 
58
- def self.getSchudle(channel,url)
59
- _img_url = "http://static.haotv.me/channel/logo/"
65
# Fetch +url+ through an HTTP proxy taken from +proxylist+ and parse the
# response with Nokogiri.
#
# Proxies are used round-robin: @proxyindex holds the position of the next
# proxy to try and is kept between calls. Entries on the skip list are never
# used. When a request fails, the next proxy is tried; every entry of the
# list is visited at most once per call, so the retry is bounded.
#
# proxylist - Array of "host:port" Strings (without a scheme).
# url       - absolute URL of the page to download.
#
# Returns the parsed Nokogiri document.
# Raises RuntimeError if the list is empty or no proxy could fetch the page.
def self.get_doc_with_proxy(proxylist,url)
  if proxylist.nil? || proxylist.empty?
    raise RuntimeError, "Error: proxy list is empty"
  end

  # Proxies that must not be used; compared in the same "host:port" form the
  # list entries have.
  skipped = ["123.125.116.243:6256", "123.125.116.243:28832"]

  @proxyindex ||= 0
  last_error = nil
  proxylist.size.times do
    @proxyindex %= proxylist.size
    proxy = proxylist[@proxyindex]
    # Advance before the request so a failure never retries the same proxy.
    @proxyindex += 1
    next if skipped.include?(proxy)
    begin
      return Nokogiri::HTML(open(url, :proxy => "http://#{proxy}"))
    rescue => err
      p "*************************Proxy:#{proxy}, url:#{url}"
      last_error = err
    end
  end

  reason = last_error ? last_error.to_s : "no usable proxy in list"
  raise RuntimeError, "Error: #{reason}"
end
91
+
92
+
60
93
 
94
+ #获取节目表
95
+ def self.getschedule(channel,herf,proxylist,site="http://www.tvmao.com")
96
+ if(@site)
97
+ site=@site
98
+ end
99
+ _img_url = "http://static.haotv.me/channel/logo/"
100
+ @show_schedule = {}
61
101
 
62
102
  get_week_url = lambda {|url|
63
- _url = @site
103
+ _url = site
64
104
  urls = []
65
105
  _urls = url.split("-")
66
106
  0.upto(1).each do |i|
@@ -71,26 +111,173 @@ module Grabepg
71
111
  end
72
112
  urls
73
113
  }
74
-
114
+ channel_schedule = {}
75
115
  get_week_url.call(herf).each do |url|
76
- p url
77
- doc = Nokogiri::HTML(open(url))
116
+ p "Grab: #{url}"
117
+ #if(proxylist[proxyidex]!="219.234.82.89:33948")
118
+ # proxy = proxylist[@proxyidex]
119
+ #else
120
+ # proxy = proxylist[@proxyidex+1]
121
+ #end
122
+ #p "Proxy: http://#{proxy}"
123
+ #doc = Nokogiri::HTML(open(url,:proxy=>"http://#{proxy}"))
124
+ #@proxyidex += 1
125
+ doc = get_doc_with_proxy(proxylist,url)
126
+ show_type = []
78
127
  img_url = _img_url + channel+".jpg"
79
128
  data=doc.css('div[class="mt10 clear"]')[0].content.split(" ")
80
129
  date = data[0]
81
130
  week = data[1]
82
131
  p "Channel: #{channel} Date: #{date} Week: #{week}"
83
- doc.css('ul[id="pgrow"]')[0].css("li").each do |schudel|
84
- if schudel.content.split(" ").size>1
85
- time = schudel.content.split(" ")[0]
86
- schudel = schudel.content.split(" ")[1]
87
- p "Time: #{time} Schudel: #{schudel}"
132
+ schedule_list = []
133
+ doc.css('ul[id="pgrow"]')[0].css("li").each do |schedule|
134
+ _herf= schedule.xpath('a[@href]')[0]
135
+ schedule_herf=_herf.get_attribute("href") if _herf
136
+ unless _herf
137
+ drama =schedule.css('a[class="drama"]')[0]
138
+ if drama
139
+ _herfs=drama.get_attribute("href").gsub("/episode/section","#%#")
140
+ schedule_herf = _herfs.split("#%#")[0]
141
+ end
142
+ end
143
+ if schedule.content.split(" ").size>1
144
+ time = schedule.content.split(" ")[0]
145
+ schedule = schedule.content.split(" ")[1]
146
+ show_name = ""
147
+ unless schedule_herf.nil?||schedule_herf.empty?
148
+ show_infomation=get_show_infomation(proxylist,schedule_herf)
149
+ show_type=show_infomation["type"]
150
+ show_name = show_infomation["name"]
151
+ end
152
+ p "Time: #{time} schedule: #{schedule} show_infomation_herf: #{schedule_herf} type: #{show_type} name: #{show_name}"
153
+ schedule_list << {"time"=>time,"schedule"=>schedule,"show_infomation_herf"=>schedule_herf,"type"=>show_type,"name"=>show_name}
88
154
  end
89
155
  end
156
+ channel_schedule.merge!({"#{week}(#{date})"=>schedule_list})
157
+ end
158
+ {"channel_schedule"=>channel_schedule,"show_schedule"=>@show_schedule}
159
+ end
160
+
161
+
162
# Collect the name and genres of one show and cache its broadcast times.
#
# The show page and its "/detail" sub-page are both downloaded; genres are
# gathered from the span and anchor elements tagged itemprop="genre" on each
# page and de-duplicated. The first time a show name is seen, its playing
# times are fetched with get_show_schedule and stored in @show_schedule.
#
# proxy_list    - Array of "host:port" proxy Strings passed to
#                 get_doc_with_proxy.
# schedule_herf - site-relative path of the show page (the site root in
#                 @site is prepended here).
#
# Returns a Hash {"type" => Array of genre Strings, "name" => String}.
# "name" is "" when the page carries no itemprop="name" span.
def self.get_show_infomation(proxy_list,schedule_herf)
  @site ||= "http://www.tvmao.com"
  # May still be unset when this method is called without getschedule.
  @show_schedule ||= {}
  schedule_herf = @site + schedule_herf

  # Genre texts of one page: span elements first, then anchors.
  genres_of = lambda { |page|
    nodes = page.css('span[itemprop="genre"]').to_a + page.css('a[itemprop="genre"]').to_a
    nodes.map { |node| node.content }
  }

  doc = get_doc_with_proxy(proxy_list,schedule_herf)
  name_node = doc.css('span[itemprop="name"]')[0]
  name = name_node ? name_node.content : ""
  type = genres_of.call(doc)

  detail_doc = get_doc_with_proxy(proxy_list,"#{schedule_herf}/detail")
  type.concat(genres_of.call(detail_doc))
  type.uniq!

  unless name.empty? || @show_schedule.has_key?(name)
    @show_schedule[name] = get_show_schedule(proxy_list,schedule_herf)
  end
  {"type"=>type,"name"=>name}
end
192
+
193
# Fetch the broadcast times of a show from its "/playingtime" page.
#
# proxylist - Array of "host:port" proxy Strings passed to
#             get_doc_with_proxy.
# herf      - absolute URL of the show page ("/playingtime" is appended).
#
# Returns an Array of Hashes with the keys "week", "date", "time",
# "channel_name" and "show_name"; the first three come from splitting the
# time cell on whitespace. Returns [] when the page has no div#epg. Rows
# that lack one of the three cells are skipped.
def self.get_show_schedule(proxylist,herf)
  doc = get_doc_with_proxy(proxylist, herf + "/playingtime")
  epg = doc.css('div[id="epg"]')[0]
  return [] unless epg

  # Text of the first node matching +selector+ inside +row+, or nil.
  text_of = lambda { |row, selector|
    node = row.css(selector)[0]
    node && node.content
  }

  rows = epg.css("div[class='c1 col']").to_a
  # The first row is always skipped (presumably the table header - confirm
  # against the page markup).
  rows.drop(1).each_with_object([]) do |row, schedule|
    time = text_of.call(row, 'div[class="f1 fld"]')
    channel_name = text_of.call(row, 'div[class="f2 fld"]')
    show_name = text_of.call(row, 'div[class="f3 fld"]')
    next unless time && channel_name && show_name

    week, date, clock = time.split(" ")
    schedule << {"week"=>week,"date"=>date,"time"=>clock,"channel_name"=>channel_name,"show_name"=>show_name}
  end
end
92
214
 
93
215
 
94
216
 
95
217
 
218
# Probe every proxy returned by get_proxy_list and keep the responsive ones.
#
# Each proxy is used to download http://www.tvmao.com/program. A proxy is
# kept when the download succeeds and takes less than +max_seconds+. The
# elapsed time is measured with sub-second precision so that proxies
# answering in under one second are kept rather than discarded.
#
# max_seconds   - upper bound (exclusive) on the accepted response time.
# probe_timeout - seconds after which a single probe is aborted.
#
# Returns an Array of "host:port" Strings, in the order they were probed.
def self.get_topfast_list(max_seconds = 8, probe_timeout = 5)
  get_proxy_list().select do |ip_port|
    started = Time.now
    begin
      Timeout.timeout(probe_timeout) do
        Nokogiri::HTML(open("http://www.tvmao.com/program",:proxy=> "http://#{ip_port}"))
      end
      elapsed = Time.now - started
      p "http://#{ip_port} use_time:#{elapsed}"
      elapsed < max_seconds
    rescue StandardError, Timeout::Error => e
      # Only ordinary errors are handled; Interrupt/SystemExit propagate.
      case e
      when Errno::ETIMEDOUT, Timeout::Error
        p "Use http://#{ip_port} timeout"
      when Errno::ECONNREFUSED
        p "Use http://#{ip_port} Error connection"
      else
        p "Use http://#{ip_port} Error:#{e.to_s}"
      end
      false
    end
  end
end
251
+
252
# Scrape a list of HTTP proxies from proxycn.cn.
#
# The "30 fastest" page is tried first; if it yields no entries, the general
# HTTP proxy page is used instead. Each scraped fragment must contain both
# an IP (taken from its whois link) and a port; fragments missing either are
# ignored. Entries on port 3128 are excluded (the reason is not visible in
# this file).
#
# Returns an Array of "ip:port" Strings.
def self.get_proxy_list()
  list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
  list = gg('http://www.proxycn.cn/html_proxy/http-1.html') if list.empty?

  regex_port = /(?<=<TD class="list">)[0-9]*?(?=<\/TD>)/
  regex_ip = /(?<=a href\=whois.php\?whois\=)[0-9,.]*/

  ips_ports = []
  list.each do |proxy_txt|
    # String#[] returns nil when the pattern does not match; normalise to "".
    port = proxy_txt[regex_port].to_s
    ip = proxy_txt[regex_ip].to_s
    next if ip.empty? || port.empty? || port == '3128'
    ips_ports << "#{ip}:#{port}"
  end
  p "Count: #{ips_ports.count}"
  ips_ports
end
272
+
273
# Download +url+ and extract every run of <TD class="list">...</TD> cells.
#
# The pattern's ".*" does not cross line breaks, so each match lies within a
# single line of the page and runs from that line's first matching opening
# tag to its last closing </TD>.
#
# url - absolute URL of the proxy listing page.
#
# Returns an Array of matched HTML fragments (Strings), possibly empty.
# Download errors raised by open-uri are not rescued here.
def self.gg(url)
  regex_list = /<TD class="list">.*<\/TD>/
  # Read the body in one go instead of re-allocating a growing string per line.
  page = URI.parse(url).open { |f| f.read }
  page.scan(regex_list)
end
+
96
283
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: grab_epg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - hahazql
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-04-25 00:00:00.000000000 Z
11
+ date: 2013-04-26 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: ! '"用于从TVMAO抓取EPG信息"'
14
14
  email: