grab_epg 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.grabepg.gemspec +1 -1
- data/lib/debug.rb +28 -1
- data/lib/grabepg.rb +200 -13
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
Njk2ZmY1MjVlMjE5MjQwMDFiOTVmODliNDg0Zjc5OWE5MDVlMzExNA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NTVjMzA3MGQxMDE4ODhmOGZjNmJkM2I4ZTI4ODVhYTBiYzgwZWEwZA==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ZmRmOTEwNmM2M2FhOTU5YWUwNzUyNGVjMWVmZjRjMjU3NDAyZTY4YWY1ZmQz
|
10
|
+
N2I5OGE4MzJkMzZkMDg3Mjk1NDM5YmIwZWZmNzRkNTA3NTE4YTA5ZjFiZmM4
|
11
|
+
Mjc5MmZmZDI2NTYwN2M5NDFkN2Y1NGZkNzU1NWI1OTI1ODA5NDY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MDY2YWE0YTM0OTlmNGYxZjMxOGY2YTliZWFhMmExZjhmZTY4OGEyZWZhM2Jl
|
14
|
+
ZWIzYWE4MThlOTY1NzIyMWE0MGU5NGE4NTA1ZTAzZjZlMWIxMjBjNjkyNjRi
|
15
|
+
YTI3MTA3Y2NhMGJlMjcxYjg5NjBmYmI5NmVjNzE3MTdhMTUyYTI=
|
data/.grabepg.gemspec
CHANGED
data/lib/debug.rb
CHANGED
@@ -5,5 +5,32 @@ require 'open-uri'
|
|
5
5
|
require File.expand_path("../grabepg.rb", __FILE__)
|
6
6
|
# Ad-hoc debugging harness for Grabepg.
#
# Loading this file runs a complete grab: Grabepg.start is the only live
# statement in the class body. The helper methods below exercise one Grabepg
# entry point each against a fixed tvmao.com address; they are only reachable
# through the commented-out calls at the bottom of the class.
class Debug
  # Sample proxies in "ip:port" form (no scheme). Being a class-body local, the
  # list is visible only to the statements at the bottom of the class body.
  proxylist = [
    "123.125.116.243:6256", "123.125.116.243:28832", "123.125.116.243:29952", "123.125.116.243:9386",
    "219.234.82.73:7806", "123.125.116.243:38205", "123.125.116.243:11229", "123.125.116.243:12978",
    "219.234.82.89:8090", "120.197.85.173:20368", "123.125.116.243:8089", "123.125.116.243:8160",
    "219.234.82.78:31565", "123.125.116.243:21457", "123.125.116.241:17421", "123.125.116.243:14191",
    "219.234.82.88:29037", "123.125.116.242:13669", "123.125.116.243:19009", "123.125.116.243:6193",
    "123.125.116.242:15692", "123.125.116.241:20307", "123.125.116.242:18725", "219.234.82.82:29082",
    "123.125.116.243:5195", "123.125.116.242:21725", "123.125.116.241:32793", "219.234.82.60:8000",
    "123.125.116.242:17403", "123.125.116.243:6938", "123.125.116.242:16348", "219.234.82.54:8726",
    "120.197.85.173:20371", "123.125.116.241:9286", "219.234.82.88:19279", "219.234.82.89:13374",
    "123.125.116.242:5976"
  ]

  # Fetches a single drama page through the given proxies.
  def self.test_get_doc_with_proxy(proxylist)
    Grabepg.get_doc_with_proxy(proxylist, "http://www.tvmao.com/drama/HS5oLCs=")
  end

  # Looks up the name/genre information of one TV column page.
  def self.test_get_show_infomation(proxylist)
    Grabepg.get_show_infomation(proxylist, "http://www.tvmao.com/tvcolumn/cVhPLQ==")
  end

  # Grabs the schedule of the HUNANTV channel from the default site.
  def self.test_getschedule(proxylist)
    Grabepg.getschedule("HUNANTV", "/program/HUNANTV-HUNANTV-w1.html", proxylist, "http://www.tvmao.com")
  end

  # Grabs the broadcast times of one TV column.
  def self.test_get_show_schedule(proxylist)
    Grabepg.get_show_schedule(proxylist, "http://www.tvmao.com/tvcolumn/cVhPLQ==")
  end

  # Runs the full grab as soon as the class is loaded.
  Grabepg.start
  #p test_get_show_schedule(proxylist)
  #p test_getschedule(proxylist)
  # p test_get_show_infomation(proxylist)
end
|
data/lib/grabepg.rb
CHANGED
@@ -9,21 +9,28 @@ module Grabepg
|
|
9
9
|
|
10
10
|
attr_reader :channel #频道列表
|
11
11
|
attr_reader :site #网站地址
|
12
|
-
|
12
|
+
attr_reader :proxyindex #代理的索引
|
13
|
+
attr_reader :show_schedule #根据节目的时间表
|
13
14
|
|
14
15
|
DEFAULT_GrabtvType=["cctv","satellite","digital",]
|
15
16
|
DEFAULT_SITE = "http://www.tvmao.com"
|
16
17
|
|
18
|
+
|
19
|
+
#调用此方法的例子
|
17
20
|
# Example entry point: grabs and prints the schedule of every channel.
#
# Resets @channel and @site, loads the channel => url table with getchannels,
# selects proxies with get_topfast_list and then prints the getschedule result
# for each channel in turn.
#
# Returns the channel => url table that was iterated.
def self.start
  @channel = []
  @site = DEFAULT_SITE
  urls_by_channel = getchannels
  proxies = get_topfast_list
  urls_by_channel.each_pair do |name, link|
    grabbed = getschedule(name, link, proxies)
    p "****************************************GetSchedule : #{grabbed}"
  end
end
|
23
29
|
|
24
30
|
|
25
31
|
#获取网站的频道表
|
26
32
|
def self.getchannels
|
33
|
+
@proxyindex = 0
|
27
34
|
channel_urls = {}
|
28
35
|
|
29
36
|
get_url =lambda { |type|
|
@@ -55,12 +62,45 @@ module Grabepg
|
|
55
62
|
channel_urls
|
56
63
|
end
|
57
64
|
|
58
|
-
|
59
|
-
|
65
|
+
#使用代理获取url的html的doc值
|
66
|
+
# Fetches +url+ through one of the proxies in +proxylist+ and returns the
# parsed Nokogiri HTML document.
#
# Proxies are tried in rotation starting at @proxyindex; every proxy in the
# list is tried at most once per call. A proxy that fails is reported with
# +p+ and the next one is tried, so a single dead proxy no longer aborts the
# whole grab. @proxyindex always ends up pointing at the proxy after the last
# one that was tried.
#
# proxylist - Array of "ip:port" strings (no scheme).
# url       - absolute URL to fetch.
#
# Raises RuntimeError ("Error: ...") when the list is empty, when every proxy
# is on the skip list, or when every attempted proxy failed.
#
# NOTE(review): relies on Kernel#open accepting URLs via open-uri; newer Ruby
# versions require URI.open for that - confirm the supported Ruby version.
def self.get_doc_with_proxy(proxylist,url)
  raise RuntimeError, "Error: proxy list is empty" if proxylist.nil? || proxylist.empty?

  # Proxies the original code meant to avoid. Its check joined the two
  # comparisons with ||, which is always true, so they were never skipped.
  skipped = ["123.125.116.243:6256", "123.125.116.243:28832"]

  @proxyindex = (@proxyindex || 0) % proxylist.size
  last_error = nil

  proxylist.size.times do
    proxy = proxylist[@proxyindex]
    # Advance before the attempt so a failing proxy is not retried next call.
    @proxyindex = (@proxyindex + 1) % proxylist.size
    next if skipped.include?(proxy.to_s.sub(/\Ahttp:\/\//, ""))

    begin
      return Nokogiri::HTML(open(url, :proxy => "http://#{proxy}"))
    rescue => err
      last_error = err
      p "*************************Proxy:#{proxy}, url:#{url}"
    end
  end

  raise RuntimeError, "Error: #{last_error ? last_error.to_s : 'no usable proxy in list'}"
end
|
91
|
+
|
92
|
+
|
60
93
|
|
94
|
+
#获取节目表
|
95
|
+
def self.getschedule(channel,herf,proxylist,site="http://www.tvmao.com")
|
96
|
+
if(@site)
|
97
|
+
site=@site
|
98
|
+
end
|
99
|
+
_img_url = "http://static.haotv.me/channel/logo/"
|
100
|
+
@show_schedule = {}
|
61
101
|
|
62
102
|
get_week_url = lambda {|url|
|
63
|
-
_url =
|
103
|
+
_url = site
|
64
104
|
urls = []
|
65
105
|
_urls = url.split("-")
|
66
106
|
0.upto(1).each do |i|
|
@@ -71,26 +111,173 @@ module Grabepg
|
|
71
111
|
end
|
72
112
|
urls
|
73
113
|
}
|
74
|
-
|
114
|
+
channel_schedule = {}
|
75
115
|
get_week_url.call(herf).each do |url|
|
76
|
-
p url
|
77
|
-
|
116
|
+
p "Grab: #{url}"
|
117
|
+
#if(proxylist[proxyidex]!="219.234.82.89:33948")
|
118
|
+
# proxy = proxylist[@proxyidex]
|
119
|
+
#else
|
120
|
+
# proxy = proxylist[@proxyidex+1]
|
121
|
+
#end
|
122
|
+
#p "Proxy: http://#{proxy}"
|
123
|
+
#doc = Nokogiri::HTML(open(url,:proxy=>"http://#{proxy}"))
|
124
|
+
#@proxyidex += 1
|
125
|
+
doc = get_doc_with_proxy(proxylist,url)
|
126
|
+
show_type = []
|
78
127
|
img_url = _img_url + channel+".jpg"
|
79
128
|
data=doc.css('div[class="mt10 clear"]')[0].content.split(" ")
|
80
129
|
date = data[0]
|
81
130
|
week = data[1]
|
82
131
|
p "Channel: #{channel} Date: #{date} Week: #{week}"
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
132
|
+
schedule_list = []
|
133
|
+
doc.css('ul[id="pgrow"]')[0].css("li").each do |schedule|
|
134
|
+
_herf= schedule.xpath('a[@href]')[0]
|
135
|
+
schedule_herf=_herf.get_attribute("href") if _herf
|
136
|
+
unless _herf
|
137
|
+
drama =schedule.css('a[class="drama"]')[0]
|
138
|
+
if drama
|
139
|
+
_herfs=drama.get_attribute("href").gsub("/episode/section","#%#")
|
140
|
+
schedule_herf = _herfs.split("#%#")[0]
|
141
|
+
end
|
142
|
+
end
|
143
|
+
if schedule.content.split(" ").size>1
|
144
|
+
time = schedule.content.split(" ")[0]
|
145
|
+
schedule = schedule.content.split(" ")[1]
|
146
|
+
show_name = ""
|
147
|
+
unless schedule_herf.nil?||schedule_herf.empty?
|
148
|
+
show_infomation=get_show_infomation(proxylist,schedule_herf)
|
149
|
+
show_type=show_infomation["type"]
|
150
|
+
show_name = show_infomation["name"]
|
151
|
+
end
|
152
|
+
p "Time: #{time} schedule: #{schedule} show_infomation_herf: #{schedule_herf} type: #{show_type} name: #{show_name}"
|
153
|
+
schedule_list << {"time"=>time,"schedule"=>schedule,"show_infomation_herf"=>schedule_herf,"type"=>show_type,"name"=>show_name}
|
88
154
|
end
|
89
155
|
end
|
156
|
+
channel_schedule.merge!({"#{week}(#{date})"=>schedule_list})
|
157
|
+
end
|
158
|
+
{"channel_schedule"=>channel_schedule,"show_schedule"=>@show_schedule}
|
159
|
+
end
|
160
|
+
|
161
|
+
|
162
|
+
#获取节目详细信息
|
163
|
+
# Fetches the name and genres of a show and records its broadcast times.
#
# The show page and its "/detail" page are both fetched through
# get_doc_with_proxy; genres are collected from the itemprop="genre" <span>
# and <a> elements of each page (spans first) and de-duplicated. The first
# time a show name is seen, its get_show_schedule result is stored in
# @show_schedule under that name.
#
# proxy_list    - Array of "ip:port" proxies passed to get_doc_with_proxy.
# schedule_herf - show address; either site-relative (as taken from schedule
#                 pages) or already absolute (http:// or https://). Only
#                 relative addresses are prefixed with @site.
#
# Returns {"type" => Array of genre strings, "name" => show name}. The name
# is "" when the page has no itemprop="name" span; nothing is recorded in
# @show_schedule in that case.
def self.get_show_infomation(proxy_list,schedule_herf)
  # Kept from the original: every lookup restarts the proxy rotation.
  # NOTE(review): this defeats rotation across calls - confirm it is intended.
  @proxyindex = 0
  @site ||= "http://www.tvmao.com"
  # May be unset when this method is called without getschedule having run.
  @show_schedule ||= {}

  schedule_herf = @site + schedule_herf unless schedule_herf =~ /\Ahttps?:\/\//

  genre_selectors = ['span[itemprop="genre"]', 'a[itemprop="genre"]']
  genres_of = lambda do |page|
    genre_selectors.map { |selector| page.css(selector).map { |node| node.content } }.flatten
  end

  doc = get_doc_with_proxy(proxy_list, schedule_herf)
  name_node = doc.css('span[itemprop="name"]')[0]
  name = name_node ? name_node.content : ""
  type = genres_of.call(doc)

  detail_doc = get_doc_with_proxy(proxy_list, "#{schedule_herf}/detail")
  type.concat(genres_of.call(detail_doc))
  type.uniq!

  unless name.empty? || @show_schedule.has_key?(name)
    @show_schedule[name] = get_show_schedule(proxy_list, schedule_herf)
  end
  {"type"=>type,"name"=>name}
end
|
192
|
+
|
193
|
+
#获取节目的时间表
|
194
|
+
# Fetches the broadcast times of a show from its "/playingtime" page.
#
# proxylist - Array of "ip:port" proxies passed to get_doc_with_proxy.
# herf      - absolute address of the show page (without "/playingtime").
#
# Every "c1 col" element inside div#epg except the first one becomes one
# entry (the first is presumably a header row - confirm against the page).
# The text of the "f1 fld" cell is split on whitespace into week, date and
# time.
#
# Returns an Array of {"week", "date", "time", "channel_name", "show_name"}
# hashes; an empty Array when the page has no div#epg container.
def self.get_show_schedule(proxylist,herf)
  doc = get_doc_with_proxy(proxylist, herf + "/playingtime")
  epg_root = doc.css('div[id="epg"]')[0]
  # The original dereferenced the missing container and crashed on nil.
  return [] unless epg_root

  epg_root.css("div[class='c1 col']").to_a.drop(1).map do |epg|
    week, date, time = epg.css('div[class="f1 fld"]')[0].content.split(" ")
    channel_name = epg.css('div[class="f2 fld"]')[0].content
    show_name = epg.css('div[class="f3 fld"]')[0].content
    {"week"=>week,"date"=>date,"time"=>time,"channel_name"=>channel_name,"show_name"=>show_name}
  end
end
|
92
214
|
|
93
215
|
|
94
216
|
|
95
217
|
|
218
|
+
#获取指定访问速度的代理服务器
|
219
|
+
# Returns the proxies from get_proxy_list that can load the tvmao.com program
# page quickly enough.
#
# Each proxy gets one attempt, limited to 5 seconds by Timeout.timeout. A
# proxy is kept when the attempt succeeds in under 8 seconds. Elapsed time is
# measured with sub-second resolution: the original used whole seconds, so a
# proxy answering in under a second measured 0 and was rejected by its
# "time_use > 0" check.
#
# Failures are reported with +p+ and the proxy is dropped. Only StandardError
# (and Timeout::Error) is rescued, so Interrupt and SystemExit still stop the
# probe loop.
#
# Returns an Array of "ip:port" strings, in get_proxy_list order.
def self.get_topfast_list()
  fast_list = []
  get_proxy_list.each do |ip_port|
    time_start = Time.now
    begin
      Timeout.timeout(5) do
        Nokogiri::HTML(open("http://www.tvmao.com/program",:proxy=> "http://#{ip_port}"))
      end
      time_use = Time.now - time_start
      p "http://#{ip_port} use_time:#{time_use}"
      fast_list << ip_port if time_use < 8
    rescue Errno::ETIMEDOUT, Timeout::Error
      p "Use http://#{ip_port} timeout"
    rescue Errno::ECONNREFUSED
      p "Use http://#{ip_port} Error connection"
    rescue StandardError => e
      p "Use http://#{ip_port} Error:#{e.to_s}"
    end
  end
  fast_list
end
|
251
|
+
|
252
|
+
#获取代理列表
|
253
|
+
# Scrapes candidate proxies from proxycn.cn.
#
# The "30 fast proxies" page is read with gg first; when it yields no rows the
# general HTTP proxy page is used instead. From every row the ip is taken from
# the whois link and the port from the first all-digit <TD class="list"> cell.
#
# Rows without an ip or without a port are skipped (the original compared the
# ip only against "" and therefore emitted entries such as ":" or "1.2.3.4:"
# when a pattern did not match). Rows with port 3128 are skipped as before.
#
# Prints the number of proxies found and returns them as an Array of
# "ip:port" strings.
def self.get_proxy_list()
  list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
  list = gg('http://www.proxycn.cn/html_proxy/http-1.html') if list.empty?

  regex_port = /(?<=<TD class="list">)[0-9]*?(?=<\/TD>)/
  regex_ip = /(?<=a href\=whois.php\?whois\=)[0-9,.]*/

  ips_ports = []
  list.each do |proxy_txt|
    # String#[] returns nil when the pattern does not match; to_s folds that
    # into the same "empty" case as a zero-length match.
    port = proxy_txt[regex_port].to_s
    ip = proxy_txt[regex_ip].to_s
    next if ip.empty? || port.empty? || port == '3128'

    ips_ports << "#{ip}:#{port}"
  end
  p "Count: #{ips_ports.count}"
  ips_ports
end
|
272
|
+
|
273
|
+
# Downloads +url+ and returns every proxy table row fragment found in it.
#
# url - address of a proxy listing page, opened through open-uri.
#
# A fragment is the text from the first <TD class="list"> to the last </TD>
# on one line; "." does not match newlines, so matches never span lines.
# The page is read in one call instead of being rebuilt line by line with
# repeated string concatenation, which copied the whole buffer for each line.
#
# Returns an Array of matched strings (empty when nothing matches).
def self.gg(url)
  regex_list = /<TD class="list">.*<\/TD>/
  page = URI.parse(url).open { |f| f.read }
  page.scan(regex_list)
end
|
282
|
+
|
96
283
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grab_epg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hahazql
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-04-
|
11
|
+
date: 2013-04-26 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: ! '"用于从TVMAO抓取EPG信息"'
|
14
14
|
email:
|