grab_epg 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5) hide show
  1. checksums.yaml +8 -8
  2. data/.grabepg.gemspec +1 -1
  3. data/lib/debug.rb +28 -1
  4. data/lib/grabepg.rb +200 -13
  5. metadata +2 -2
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- Mjg3MTAwMjIxM2FlNWQwMzc5MjkzZWEzNzQ4MGMzZjkzZWNiZDgwOA==
4
+ Njk2ZmY1MjVlMjE5MjQwMDFiOTVmODliNDg0Zjc5OWE5MDVlMzExNA==
5
5
  data.tar.gz: !binary |-
6
- YmFlYTA3ZWNjNTRlN2FmNmM4NjI5MWFlZTlhNTI3YjRiYWQ4ZGQ1Mg==
6
+ NTVjMzA3MGQxMDE4ODhmOGZjNmJkM2I4ZTI4ODVhYTBiYzgwZWEwZA==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- ZGExZWJiN2Q4NjUzNTdiODcwMzI5ZjQ3ODAzYzM5YzU0MDI1OGI2ZGI2Yjk3
10
- NjQ3YTNjNTg5YzBkYWM0ZjQzNThkODM4Njk1MDI5YWJhZjQwODkxYjFlZmQw
11
- NDAyN2VlM2NmODI1M2Y4OGYxMThiMmM5MzI5NGI2Y2UzYzFlZDA=
9
+ ZmRmOTEwNmM2M2FhOTU5YWUwNzUyNGVjMWVmZjRjMjU3NDAyZTY4YWY1ZmQz
10
+ N2I5OGE4MzJkMzZkMDg3Mjk1NDM5YmIwZWZmNzRkNTA3NTE4YTA5ZjFiZmM4
11
+ Mjc5MmZmZDI2NTYwN2M5NDFkN2Y1NGZkNzU1NWI1OTI1ODA5NDY=
12
12
  data.tar.gz: !binary |-
13
- OGY5ODVkMzk0MjY4NDc1YjgzMTVkZTY3OThkZjZmZmFkZTZkNDI1NTIxZTcw
14
- NmQxZjYxODg0YTkzMTE0YzNiNzFiNmE4ZmZiMGMwY2M3OGY0ZDZlYWYwZGMz
15
- MzRlMzgzZGFkYTZjYTcyYWIyNGU1MTQ4ZTczZDY5NzBiZDkzMmQ=
13
+ MDY2YWE0YTM0OTlmNGYxZjMxOGY2YTliZWFhMmExZjhmZTY4OGEyZWZhM2Jl
14
+ ZWIzYWE4MThlOTY1NzIyMWE0MGU5NGE4NTA1ZTAzZjZlMWIxMjBjNjkyNjRi
15
+ YTI3MTA3Y2NhMGJlMjcxYjg5NjBmYmI5NmVjNzE3MTdhMTUyYTI=
data/.grabepg.gemspec CHANGED
@@ -10,5 +10,5 @@ Gem::Specification.new do |gem|
10
10
  gem.files = `git ls-files`.split($\)
11
11
  gem.name = "grab_epg"
12
12
  gem.require_paths = ["lib"]
13
- gem.version = "0.0.1"
13
+ gem.version = "0.0.2"
14
14
  end
data/lib/debug.rb CHANGED
@@ -5,5 +5,32 @@ require 'open-uri'
5
5
  require File.expand_path("../grabepg.rb", __FILE__)
6
6
  class Debug
7
7
  # To change this template use File | Settings | File Templates.
8
- p Grabepg.start
8
+ proxylist = ["123.125.116.243:6256", "123.125.116.243:28832", "123.125.116.243:29952", "123.125.116.243:9386", "219.234.82.73:7806", "123.125.116.243:38205", "123.125.116.243:11229", "123.125.116.243:12978", "219.234.82.89:8090", "120.197.85.173:20368", "123.125.116.243:8089", "123.125.116.243:8160", "219.234.82.78:31565", "123.125.116.243:21457", "123.125.116.241:17421", "123.125.116.243:14191", "219.234.82.88:29037", "123.125.116.242:13669", "123.125.116.243:19009", "123.125.116.243:6193", "123.125.116.242:15692", "123.125.116.241:20307", "123.125.116.242:18725", "219.234.82.82:29082", "123.125.116.243:5195", "123.125.116.242:21725", "123.125.116.241:32793", "219.234.82.60:8000", "123.125.116.242:17403", "123.125.116.243:6938", "123.125.116.242:16348", "219.234.82.54:8726", "120.197.85.173:20371", "123.125.116.241:9286", "219.234.82.88:19279", "219.234.82.89:13374", "123.125.116.242:5976"]
9
+
10
+
11
+ def self.test_get_doc_with_proxy(proxylist)
12
+ herf = "http://www.tvmao.com/drama/HS5oLCs="
13
+ Grabepg.get_doc_with_proxy(proxylist,herf)
14
+ end
15
+
16
+ def self.test_get_show_infomation(proxylist)
17
+ herf = "http://www.tvmao.com/tvcolumn/cVhPLQ=="
18
+ Grabepg.get_show_infomation(proxylist,herf)
19
+ end
20
+
21
+ def self.test_getschedule(proxylist)
22
+ channel = "HUNANTV"
23
+ herf = "/program/HUNANTV-HUNANTV-w1.html"
24
+ Grabepg.getschedule(channel,herf,proxylist,site="http://www.tvmao.com")
25
+ end
26
+
27
+ def self.test_get_show_schedule(proxylist)
28
+ herf = "http://www.tvmao.com/tvcolumn/cVhPLQ=="
29
+ Grabepg.get_show_schedule(proxylist,herf)
30
+ end
31
+
32
+ Grabepg.start
33
+ #p test_get_show_schedule(proxylist)
34
+ #p test_getschedule(proxylist)
35
+ # p test_get_show_infomation(proxylist)
9
36
  end
data/lib/grabepg.rb CHANGED
@@ -9,21 +9,28 @@ module Grabepg
9
9
 
10
10
  attr_reader :channel #频道列表
11
11
  attr_reader :site #网站地址
12
-
12
+ attr_reader :proxyindex #代理的索引
13
+ attr_reader :show_schedule #根据节目的时间表
13
14
 
14
15
  DEFAULT_GrabtvType=["cctv","satellite","digital",]
15
16
  DEFAULT_SITE = "http://www.tvmao.com"
16
17
 
18
+
19
+ #调用此方法的例子
17
20
  def self.start
18
21
  @channel = []
19
22
  @site = DEFAULT_SITE
20
23
  channel_urls = self.getchannels
21
- getSchudle(channel_urls)
24
+ proxy_list=get_topfast_list
25
+ channel_urls.each do |channel,url|
26
+ p "****************************************GetSchedule : #{getschedule(channel,url,proxy_list)}"
27
+ end
22
28
  end
23
29
 
24
30
 
25
31
  #获取网站的频道表
26
32
  def self.getchannels
33
+ @proxyindex = 0
27
34
  channel_urls = {}
28
35
 
29
36
  get_url =lambda { |type|
@@ -55,12 +62,45 @@ module Grabepg
55
62
  channel_urls
56
63
  end
57
64
 
58
- def self.getSchudle(channel,url)
59
- _img_url = "http://static.haotv.me/channel/logo/"
65
+ #使用代理获取url的html的doc值
66
+ def self.get_doc_with_proxy(proxylist,url)
67
+ unless @proxyindex
68
+ @proxyindex = 0
69
+ end
70
+ @proxyindex=@proxyindex%proxylist.size
71
+ if(proxylist[@proxyindex]!="123.125.116.243:6256"||proxylist[@proxyindex]!="http://123.125.116.243:28832")
72
+ proxy = proxylist[@proxyindex]
73
+ else
74
+ proxy = proxylist[@proxyindex+1]
75
+ end
76
+ begin
77
+ doc = Nokogiri::HTML(open(url,:proxy=>"http://#{proxy}"))
78
+ @no_firest = false
79
+ rescue => err
80
+ @no_firest = true
81
+ p "*************************Proxy:#{proxy}, url:#{url}"
82
+ get_doc_with_proxy(proxylist,url) unless @no_firest
83
+ raise RuntimeError,"Error: #{err.to_s}" if @no_firest
84
+ end
85
+ @proxyindex += 1
86
+ unless doc
87
+ p "*************************Proxy:#{proxy}, url:#{url}"
88
+ end
89
+ doc
90
+ end
91
+
92
+
60
93
 
94
+ #获取节目表
95
+ def self.getschedule(channel,herf,proxylist,site="http://www.tvmao.com")
96
+ if(@site)
97
+ site=@site
98
+ end
99
+ _img_url = "http://static.haotv.me/channel/logo/"
100
+ @show_schedule = {}
61
101
 
62
102
  get_week_url = lambda {|url|
63
- _url = @site
103
+ _url = site
64
104
  urls = []
65
105
  _urls = url.split("-")
66
106
  0.upto(1).each do |i|
@@ -71,26 +111,173 @@ module Grabepg
71
111
  end
72
112
  urls
73
113
  }
74
-
114
+ channel_schedule = {}
75
115
  get_week_url.call(herf).each do |url|
76
- p url
77
- doc = Nokogiri::HTML(open(url))
116
+ p "Grab: #{url}"
117
+ #if(proxylist[proxyidex]!="219.234.82.89:33948")
118
+ # proxy = proxylist[@proxyidex]
119
+ #else
120
+ # proxy = proxylist[@proxyidex+1]
121
+ #end
122
+ #p "Proxy: http://#{proxy}"
123
+ #doc = Nokogiri::HTML(open(url,:proxy=>"http://#{proxy}"))
124
+ #@proxyidex += 1
125
+ doc = get_doc_with_proxy(proxylist,url)
126
+ show_type = []
78
127
  img_url = _img_url + channel+".jpg"
79
128
  data=doc.css('div[class="mt10 clear"]')[0].content.split(" ")
80
129
  date = data[0]
81
130
  week = data[1]
82
131
  p "Channel: #{channel} Date: #{date} Week: #{week}"
83
- doc.css('ul[id="pgrow"]')[0].css("li").each do |schudel|
84
- if schudel.content.split(" ").size>1
85
- time = schudel.content.split(" ")[0]
86
- schudel = schudel.content.split(" ")[1]
87
- p "Time: #{time} Schudel: #{schudel}"
132
+ schedule_list = []
133
+ doc.css('ul[id="pgrow"]')[0].css("li").each do |schedule|
134
+ _herf= schedule.xpath('a[@href]')[0]
135
+ schedule_herf=_herf.get_attribute("href") if _herf
136
+ unless _herf
137
+ drama =schedule.css('a[class="drama"]')[0]
138
+ if drama
139
+ _herfs=drama.get_attribute("href").gsub("/episode/section","#%#")
140
+ schedule_herf = _herfs.split("#%#")[0]
141
+ end
142
+ end
143
+ if schedule.content.split(" ").size>1
144
+ time = schedule.content.split(" ")[0]
145
+ schedule = schedule.content.split(" ")[1]
146
+ show_name = ""
147
+ unless schedule_herf.nil?||schedule_herf.empty?
148
+ show_infomation=get_show_infomation(proxylist,schedule_herf)
149
+ show_type=show_infomation["type"]
150
+ show_name = show_infomation["name"]
151
+ end
152
+ p "Time: #{time} schedule: #{schedule} show_infomation_herf: #{schedule_herf} type: #{show_type} name: #{show_name}"
153
+ schedule_list << {"time"=>time,"schedule"=>schedule,"show_infomation_herf"=>schedule_herf,"type"=>show_type,"name"=>show_name}
88
154
  end
89
155
  end
156
+ channel_schedule.merge!({"#{week}(#{date})"=>schedule_list})
157
+ end
158
+ {"channel_schedule"=>channel_schedule,"show_schedule"=>@show_schedule}
159
+ end
160
+
161
+
162
+ #获取节目详细信息
163
+ def self.get_show_infomation(proxy_list,schedule_herf)
164
+ @proxyindex = 0
165
+ unless @site
166
+ @site = "http://www.tvmao.com"
167
+ end
168
+ schedule_herf = @site + schedule_herf
169
+ doc=get_doc_with_proxy(proxy_list,schedule_herf)
170
+ #title = doc.css("a[herf='#{schedule_herf}+/detail']")[0]['title']
171
+ # p "title: %s" % title
172
+ type = []
173
+ name = doc.css('span[itemprop="name"]')[0].content
174
+ doc.css('span[itemprop="genre"]').each do |_type|
175
+ type << _type.content
176
+ end
177
+ doc.css('a[itemprop="genre"]').each do |_type|
178
+ type<<_type.content
179
+ end
180
+ url = "#{schedule_herf}/detail"
181
+ doc = get_doc_with_proxy(proxy_list,url)
182
+ doc.css('span[itemprop="genre"]').each do |_type|
183
+ type << _type.content
184
+ end
185
+ doc.css('a[itemprop="genre"]').each do |_type|
186
+ type<<_type.content
187
+ end
188
+ type.uniq!
189
+ @show_schedule.merge!(name=>get_show_schedule(proxy_list,schedule_herf)) unless @show_schedule.has_key?(name)
190
+ {"type"=>type,"name"=>name}
191
+ end
192
+
193
+ #获取节目的时间表
194
+ def self.get_show_schedule(proxylist,herf)
195
+ url = herf + "/playingtime"
196
+ doc = get_doc_with_proxy(proxylist,url)
197
+ i = 0
198
+ schedule = []
199
+ doc.css('div[id="epg"]')[0].css("div[class='c1 col']").each do |epg|
200
+ unless(i==0)
201
+ time = epg.css('div[class="f1 fld"]')[0].content
202
+ channel_name = epg.css('div[class="f2 fld"]')[0].content
203
+ show_name = epg.css('div[class="f3 fld"]')[0].content
204
+ times = time.split(" ")
205
+ week = times[0]
206
+ date = times[1]
207
+ _time = times[2]
208
+ schedule << {"week"=>week,"date"=>date,"time"=>_time,"channel_name"=>channel_name,"show_name"=>show_name}
209
+ end
210
+ i += 1
90
211
  end
212
+ schedule
91
213
  end
92
214
 
93
215
 
94
216
 
95
217
 
218
+ #获取指定访问速度的代理服务器
219
+ def self.get_topfast_list()
220
+ fast_list = []
221
+ time_use = 0
222
+ ips_ports = get_proxy_list()
223
+ ips_ports.each do |ip_port|
224
+ time_start = Time.now.to_i
225
+ begin
226
+ timeout(5) do
227
+ doc = Nokogiri::HTML(open("http://www.tvmao.com/program",:proxy=> "http://#{ip_port}"))
228
+ end
229
+ time_end = Time.now.to_i
230
+ time_use = time_end - time_start
231
+ p "http://#{ip_port} use_time:#{time_use}"
232
+ rescue Exception =>e
233
+ case e
234
+ when Errno::ETIMEDOUT
235
+ p "Use http://#{ip_port} timeout"
236
+ when Timeout::Error
237
+ p "Use http://#{ip_port} timeout"
238
+ when Errno::ECONNREFUSED
239
+ p "Use http://#{ip_port} Error connection"
240
+ else
241
+ p "Use http://#{ip_port} Error:#{e.to_s}"
242
+ end
243
+ time_use = -1
244
+ end
245
+ if(time_use > 0 &&time_use < 8)
246
+ fast_list << ip_port
247
+ end
248
+ end
249
+ fast_list
250
+ end
251
+
252
+ #获取代理列表
253
+ def self.get_proxy_list()
254
+ list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
255
+ if list.count ==0
256
+ list = gg('http://www.proxycn.cn/html_proxy/http-1.html')
257
+ end
258
+ ips_ports = []
259
+ regex_port = /(?<=<TD class="list">)[0-9]*?(?=<\/TD>)/
260
+ regex_ip = /(?<=a href\=whois.php\?whois\=)[0-9,.]*/
261
+ list.each do |proxy_txt|
262
+ port = proxy_txt[regex_port]
263
+ ip = proxy_txt[regex_ip]
264
+ if(ip != ""&& !port.to_s.eql?('3128'))
265
+ port_ip = ip.to_s + ":" + port.to_s
266
+ ips_ports << port_ip
267
+ end
268
+ end
269
+ p "Count: #{ips_ports.count}"
270
+ ips_ports
271
+ end
272
+
273
+ def self.gg(url)
274
+ regex_list = /<TD class="list">.*<\/TD>/
275
+ href =URI.parse(url)
276
+ contxt = ""
277
+ href.open{ |f|
278
+ f.each_line {|line| contxt =contxt + line + "\n"}
279
+ }
280
+ list = contxt.scan(regex_list)
281
+ end
282
+
96
283
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: grab_epg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - hahazql
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-04-25 00:00:00.000000000 Z
11
+ date: 2013-04-26 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: ! '"用于从TVMAO抓取EPG信息"'
14
14
  email: