grab_epg 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.grabepg.gemspec +2 -2
- data/lib/debug.rb +4 -43
- data/lib/grab_tvmao.rb +595 -0
- data/lib/grabepg/grab_base.rb +173 -0
- data/lib/grabepg/grab_tvsou.rb +170 -0
- data/lib/grabepg.rb +4 -595
- data/lib/test/test_grab_tvsou.rb +52 -0
- metadata +7 -3
data/lib/grabepg.rb
CHANGED
@@ -1,595 +1,4 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require
|
4
|
-
require
|
5
|
-
|
6
|
-
module Grabepg
|
7
|
-
# To change this template use File | Settings | File Templates.
|
8
|
-
|
9
|
-
|
10
|
-
#图片的获取: Net::HTTP.get(url)
|
11
|
-
#图片的文件类型获取:
|
12
|
-
|
13
|
-
attr_reader :channel #频道列表
|
14
|
-
attr_reader :site #网站地址
|
15
|
-
attr_reader :proxyindex #代理的索引
|
16
|
-
attr_reader :show_schedule #根据节目的时间表
|
17
|
-
attr_reader :img_down_path #图片下载路径存放
|
18
|
-
|
19
|
-
DEFAULT_GrabtvType=["cctv","satellite","digital",]
|
20
|
-
DEFAULT_SITE = "http://www.tvmao.com"
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
# Converts a weekday number into its Chinese name ("星期" + numeral).
#
# whatday - weekday number or numeric string; it is converted with #to_i.
#           1..7 map to "一".."七". 0 is what Time#wday returns for Sunday,
#           so it is treated like 7 instead of producing a bare "星期"
#           (this also applies to any value whose #to_i is 0).
#
# Returns the weekday name as a String; numbers outside 0..7 give "星期".
#
# NOTE(review): Sunday keeps the "七" label that 7 has always produced here;
# confirm whether consumers of this string expect "日" instead.
def self.conversion_what_day(whatday)
  numerals = { 1 => "一", 2 => "二", 3 => "三", 4 => "四", 5 => "五", 6 => "六", 7 => "七" }
  day = whatday.to_i
  day = 7 if day.zero?
  "星期" + numerals.fetch(day, "")
end
|
48
|
-
|
49
|
-
# Pads a one-character value with a leading "0" so that single-digit months,
# days, etc. are rendered with two digits.
#
# num - any object; its #to_s form is used.
#
# Returns the String form, prefixed with one "0" when it is shorter than
# two characters, otherwise unchanged.
def self.dispose_time(num)
  text = num.to_s
  text.length < 2 ? "0" + text : text
end
|
57
|
-
|
58
|
-
# Formats a time as "<weekday name>(<MM>-<DD>)".
#
# time - object responding to #month, #day and #wday (e.g. a Time).
#
# The weekday name comes from conversion_what_day and the two-digit month
# and day from dispose_time.
#
# Returns the formatted String.
def self.get_week_date_time(time)
  weekday = conversion_what_day(time.wday)
  month_day = dispose_time(time.month) + "-" + dispose_time(time.day)
  weekday + "(" + month_day + ")"
end
|
66
|
-
|
67
|
-
# Converts a number of days into the equivalent number of seconds, i.e. the
# amount to subtract from a Time to go that many days back.
#
# day_num - number of days.
#
# Returns day_num multiplied by 60 * 60 * 24.
def self.del_day_num(day_num)
  day_num * 60 * 60 * 24
end
|
72
|
-
|
73
|
-
# Builds the formatted date label for the day that lies +num+ days before
# the current time.
#
# num - number of days to go back from Time.now.
#
# Returns whatever get_week_date_time produces for that moment.
def self.get_time_day_prior(num)
  get_week_date_time(Time.now - del_day_num(num))
end
|
79
|
-
|
80
|
-
# Lists the date labels of the past week that are due for deletion.
#
# Only on Mondays (Time#wday == 1) does this return anything: the labels for
# today and each of the 7 preceding days (offsets 0 through 7), produced by
# get_time_day_prior. On any other weekday the list is empty.
#
# Returns an Array of label Strings.
def self.del_time_list
  return [] unless Time.now.wday == 1

  (0..7).map { |offset| self.get_time_day_prior(offset) }
end
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
# Example driver showing how the grabber is meant to be called.
#
# Fetches the channel list (which writes the logo index file under +path+),
# collects the proxies accepted by get_topfast_list(5), then grabs one day of
# schedule, starting today, for every channel from "CCTV16" onwards in the
# order returned by getchannels.
#
# The similarity lambda and the per-channel locals that the previous version
# built but never used have been removed; they had no effect on the result.
#
# path - directory that receives the image index files. Defaults to the
#        location that used to be hard-coded, so existing calls are unchanged.
#
# Returns the channel-id => url Hash that was iterated.
def self.start(path = "/home/zql/workspace/New/smart_remote/img_path")
  channel_list = Grabepg.getchannels(path)
  channel_urls = channel_list['channel_urls']
  p "Channel img save file,path='#{Grabepg.img_down_path}'"
  # The argument is the slowest acceptable proxy response time, in seconds.
  proxy_list = Grabepg.get_topfast_list(5)

  p "************************************"
  p "proxy_list:#{proxy_list}"
  p "************************************"

  start_time = 0 # offset from today in days (positive = future, negative = past)
  use_num = 1    # number of days to grab
  reached_start = false

  channel_urls.each do |channel, url|
    reached_start = true if channel == "CCTV16"
    next unless reached_start

    Grabepg.getScheduleAssignDate(channel, url, proxy_list, start_time, use_num)
  end
end
|
203
|
-
|
204
|
-
# Returns the location used for downloaded-image index files.
#
# @img_down_path is preferred when something has set it; otherwise this falls
# back to @img_down_dir_path, the directory recorded by getchannels. Nothing
# in the visible code assigns @img_down_path, so without the fallback this
# always returned nil.
#
# NOTE(review): confirm the directory (rather than the index file inside it)
# is what callers want to display.
def self.img_down_path
  @img_down_path || @img_down_dir_path
end
|
207
|
-
|
208
|
-
|
209
|
-
# Fetches the channel tables from the site for every type listed in
# DEFAULT_GrabtvType.
#
# Side effects: resets @channel, @site, @proxyindex and @img_down_dir_path,
# and writes one "<channel_id>:<logo url>" line per channel to the file
# "channel_img_down_path" inside +img_dir_path+. The file is now closed even
# when fetching or parsing raises.
#
# Table cells without a usable link are skipped, because no channel id can be
# derived from them (previously they were stored under a nil key or crashed).
#
# img_dir_path - directory in which the image index file is created.
#
# Returns {"channel_info" => {id => details}, "channel_urls" => {id => href}}.
def self.getchannels(img_dir_path)
  @channel = []
  @site = DEFAULT_SITE
  @proxyindex = 0
  @img_down_dir_path = img_dir_path

  channel_urls = {}
  channel_info = {}
  @img_down_file = File.new(File.join(img_dir_path, "channel_img_down_path"), 'w+')
  begin
    DEFAULT_GrabtvType.each do |type|
      next if type.nil? || type.empty?

      url = @site + "/program/duration/#{type}/w1.html"
      p url
      doc = Nokogiri::HTML(open(url))
      p doc.content
      p "*************************************************************"
      doc.css('td[class="tdchn"]').each do |td|
        channel_name = td.content
        # As before, the last <a> in the cell wins.
        link = td.css('a').to_a.last
        herf = link ? link['href'] : ""
        # The id is the second "-"-separated part of the third "/" segment
        # (presumably hrefs look like "/program/<group>-<id>-w1.html").
        segment = herf.to_s.split("/")[2]
        channel_id = segment && segment.split("-")[1]
        next if channel_id.nil?

        # Address of the channel logo.
        img_path = "http://static.haotv.me/channel/logo/#{channel_id}.jpg"
        @img_down_file.puts("#{channel_id}:#{img_path}")
        @channel << ({ channel_id => { name: channel_name, herf: herf, type: type } })
        channel_info[channel_id] = {
          "channel_name" => channel_name,
          "channel_type" => type,
          "channel_id" => channel_id,
          "img_path" => img_path
        }
        channel_urls[channel_id] = herf
      end
    end
  ensure
    @img_down_file.close
  end
  p "Channel: #{@channel}"
  { "channel_info" => channel_info, "channel_urls" => channel_urls }
end
|
254
|
-
|
255
|
-
# Fetches +url+ and parses it with Nokogiri, going through one of the given
# proxies when available.
#
# The proxy is picked round-robin via @proxyindex. When the request fails the
# proxy is removed from +proxylist+ (the caller's array is mutated) and the
# fetch is retried; after 4 consecutive failures a RuntimeError is raised.
#
# Fixes over the previous version:
# * the document obtained by a retry is now returned (it used to be
#   discarded, so a successful retry still yielded nil);
# * an empty proxy list no longer raises ZeroDivisionError - the request is
#   then made directly, as was already done for a blank proxy entry.
#
# proxylist - Array of "ip:port" Strings; may be empty.
# url       - address to fetch.
#
# Returns the parsed document.
# Raises RuntimeError once the consecutive-failure limit is reached.
def self.get_doc_with_proxy(proxylist, url)
  @proxyindex ||= 0
  @proxyindex = proxylist.empty? ? 0 : @proxyindex % proxylist.size
  proxy = proxylist[@proxyindex] || proxylist[@proxyindex + 1]

  begin
    doc = if proxy.nil? || proxy.empty?
            Nokogiri::HTML(open(url))
          else
            Nokogiri::HTML(open(url, :proxy => "http://#{proxy}"))
          end
    @no_firest = 0 # success resets the consecutive-failure counter
  rescue => err
    @no_firest ||= 0
    @no_firest += 1
    p "*************************Proxy:#{proxy}, url:#{url}"
    proxylist.delete(proxy)
    raise RuntimeError, "Error: #{err.to_s}" unless @no_firest < 4

    doc = get_doc_with_proxy(proxylist, url)
  end

  @proxyindex += 1
  p "*************************Proxy:#{proxy}, url:#{url}" unless doc
  doc
end
|
288
|
-
|
289
|
-
# Grabs the programme list shown on one day page of a channel.
#
# Side effect: sets @date to "<week>(<date>)" taken from the page header;
# callers use it as the key for the returned list.
#
# Changes over the previous version:
# * type, name and image are reset for every row, so a programme without a
#   detail link no longer inherits the type of the programme before it;
# * show details are only read when get_show_infomation returned a Hash;
# * a page without the date header or the programme list returns an empty
#   list instead of raising NoMethodError (when the header is missing @date
#   is left untouched).
#
# channel   - channel id; used for the default logo address and log output.
# url       - address of the day page.
# proxylist - Array of "ip:port" proxies handed to get_doc_with_proxy.
#
# Returns an Array of Hashes with the keys "schedule_name", "schedule_logo",
# "schedule_start", "show_infomation_herf", "type" and "name".
def self.get_schedulelist_atday(channel, url, proxylist)
  p "Grab: #{url}"
  doc = get_doc_with_proxy(proxylist, url)

  header = doc && doc.css('div[class="mt10 clear"]')[0]
  unless header
    p "No schedule header found, url:#{url}"
    return []
  end
  date, week = header.content.split(" ")
  p "Channel: #{channel} Date: #{date} Week: #{week}"
  @date = "#{week}(#{date})"

  # Logo address: default derived from the channel id, replaced by the image
  # inside the page's <h1> when there is one. Only used for the log line.
  img_url = "http://static.haotv.me/channel/logo/" + channel + ".jpg"
  logo = doc.css("h1[style='float:left']").xpath('img[@src]')[0]
  img_url = logo.get_attribute("src") if logo
  p "**************IMG: #{img_url}"

  schedule_list = []
  rows = doc.css('ul[id="pgrow"]')[0]
  return schedule_list unless rows

  rows.css("li").each do |item|
    schedule_herf = nil
    link = item.xpath('a[@href]')[0]
    if link
      schedule_herf = link.get_attribute("href")
    else
      # Drama rows link to an episode section; keep only the part before it.
      drama = item.css('a[class="drama"]')[0]
      if drama
        schedule_herf = drama.get_attribute("href").gsub("/episode/section", "#%#").split("#%#")[0]
      end
    end

    fields = item.content.split(" ")
    next unless fields.size > 1

    time = fields[0]
    title = fields[1]
    show_type = []
    show_name = ""
    show_img = nil
    unless schedule_herf.nil? || schedule_herf.empty?
      show_infomation = get_show_infomation(proxylist, schedule_herf)
      if show_infomation.is_a?(Hash)
        show_type = show_infomation["type"]
        show_name = show_infomation["name"]
        show_img = show_infomation["img"]
      end
    end
    p "Time: #{time} schedule: #{title} show_infomation_herf: #{schedule_herf} type: #{show_type} name: #{show_name} img:#{show_img}"
    schedule_list << {
      "schedule_name" => title,
      "schedule_logo" => show_img,
      "schedule_start" => time,
      "show_infomation_herf" => schedule_herf,
      "type" => show_type,
      "name" => show_name
    }
  end
  schedule_list
end
|
339
|
-
|
340
|
-
# Builds the day-page addresses for a channel over a range of days.
#
# Pages are numbered "w<N>.html" with N counted from 1; today's page number is
# taken from the weekday. Time#wday reports Sunday as 0, which used to produce
# an invalid "w0.html" page, so Sunday is now mapped to 7 and the first page
# is clamped to 1 (the old clamp only caught negative numbers).
#
# url        - channel href; its first two "-"-separated parts are kept and
#              the page suffix is appended to them.
# start_time - Integer offset of the first day relative to today (positive =
#              after today, negative = before today).
# day_num    - Integer number of days to cover, counted from the first day.
# now        - Time treated as "today"; defaults to Time.now.
#
# The last page is capped at today's page number + 7.
#
# Returns an Array of address Strings (empty when the range is empty).
def self.get_assign_date_url(url, start_time, day_num, now = Time.now)
  site = @site || "http://www.tvmao.com"

  today = now.wday.zero? ? 7 : now.wday
  first_day = today + start_time
  first_day = 1 if first_day < 1
  last_day = [first_day + day_num - 1, today + 7].min

  parts = url.split("-")
  prefix = "#{site}#{parts[0]}-#{parts[1]}-"
  (first_day..last_day).map { |day| "#{prefix}w#{day}.html" }
end
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
# Grabs the schedule of one channel for a given range of days.
#
# Side effects: resets @show_schedule and writes show image addresses to the
# file "schedule_img_down_path" inside +img_dir_down_path+ (through
# get_show_infomation). The file is now closed even when grabbing raises.
#
# channel           - channel id.
# herf              - channel href as returned by getchannels.
# proxylist         - Array of "ip:port" proxies.
# start_num         - Integer offset of the first day relative to today.
# day_num           - number of days to grab; anything that is not a number
#                     >= 1 is treated as 1.
# img_dir_down_path - directory for the image index file. When no directory is
#                     known, the directory containing this source file is used
#                     (previously the source file itself was used as if it
#                     were a directory, which cannot work).
#
# Returns {"channel_schedule" => {date label => programme list},
#          "show_schedule" => @show_schedule}.
def self.getScheduleAssignDate(channel, herf, proxylist, start_num, day_num = 0, img_dir_down_path = @img_down_dir_path)
  day_num = 1 unless day_num.is_a?(Numeric) && day_num >= 1
  img_dir_down_path ||= File.dirname(__FILE__)

  @show_schedule = {}
  channel_schedule = {}
  @img_down_file = File.new(File.join(img_dir_down_path, "schedule_img_down_path"), "w+")
  begin
    get_assign_date_url(herf, start_num, day_num).each do |url|
      # get_schedulelist_atday fills @date; a blank value means "skip this day".
      @date = ""
      schedule_list = get_schedulelist_atday(channel, url, proxylist)
      channel_schedule[@date] = schedule_list unless @date.empty?
    end
  ensure
    @img_down_file.close
  end
  { "channel_schedule" => channel_schedule, "show_schedule" => @show_schedule }
end
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
# Grabs the schedule of one channel for the pages w1..w<day_num>.
# Kept because existing callers still use it.
#
# Side effects: resets @show_schedule and writes show image addresses to the
# file "schedule_img_down_path" inside +img_dir_down_path+ (through
# get_show_infomation). The file is now closed even when grabbing raises.
#
# channel           - channel id.
# herf              - channel href; its first two "-"-separated parts are kept
#                     and the page suffix is appended to them.
# proxylist         - Array of "ip:port" proxies.
# day_num           - number of day pages to grab (default 7); anything that
#                     is not a number >= 1 is treated as 1.
# img_dir_down_path - directory for the image index file. When no directory is
#                     known, the directory containing this source file is used
#                     (previously the source file itself was used as if it
#                     were a directory, which cannot work).
#
# Returns {"channel_schedule" => {date label => programme list},
#          "show_schedule" => @show_schedule}.
def self.getschedule(channel, herf, proxylist, day_num = 7, img_dir_down_path = @img_down_dir_path)
  p "Day Num is #{day_num}"
  day_num = 1 unless day_num.is_a?(Numeric) && day_num >= 1
  img_dir_down_path ||= File.dirname(__FILE__)
  site = @site || "http://www.tvmao.com"

  parts = herf.split("-")
  prefix = "#{site}#{parts[0]}-#{parts[1]}-"
  urls = 1.upto(day_num).map { |day| "#{prefix}w#{day}.html" }

  @show_schedule = {}
  channel_schedule = {}
  @img_down_file = File.new(File.join(img_dir_down_path, "schedule_img_down_path"), "w+")
  begin
    urls.each do |url|
      # get_schedulelist_atday fills @date; a blank value means "skip this day".
      @date = ""
      schedule_list = get_schedulelist_atday(channel, url, proxylist)
      channel_schedule[@date] = schedule_list unless @date.empty?
    end
  ensure
    @img_down_file.close
  end
  { "channel_schedule" => channel_schedule, "show_schedule" => @show_schedule }
end
|
454
|
-
|
455
|
-
|
456
|
-
# Fetches the details of one show: its name, genres and poster image.
#
# Genres are collected from the show page and from its "/detail" page and
# de-duplicated. Side effects: resets @proxyindex, defaults @site, appends
# "<name>:<image>" to @img_down_file when that file is open, and stores the
# show's playing times in @show_schedule the first time a name is seen.
#
# proxy_list    - Array of "ip:port" proxies.
# schedule_herf - site-relative href of the show page.
#
# Returns {"type" => [genres], "name" => name, "img" => image address or nil},
# or nil when anything goes wrong. Failures are logged rather than raised, as
# before, but the method used to return the log String, which callers then
# indexed as if it were the Hash.
def self.get_show_infomation(proxy_list, schedule_herf)
  @proxyindex = 0
  @site ||= "http://www.tvmao.com"
  @show_schedule ||= {}
  schedule_herf = @site + schedule_herf

  type = []
  collect_genres = lambda do |page|
    page.css('span[itemprop="genre"]').each { |node| type << node.content }
    page.css('a[itemprop="genre"]').each { |node| type << node.content }
  end

  doc = get_doc_with_proxy(proxy_list, schedule_herf)
  name = doc.css('span[itemprop="name"]')[0].content

  # Poster image of the show, when the page has one.
  poster = doc.css('img[class="tvc"]')[0]
  schedule_img_down_path = poster.get_attribute('src') if poster

  collect_genres.call(doc)
  collect_genres.call(get_doc_with_proxy(proxy_list, "#{schedule_herf}/detail"))
  type.uniq!

  if @img_down_file && !@img_down_file.closed?
    @img_down_file.puts("#{name}:#{schedule_img_down_path}")
  end
  unless @show_schedule.has_key?(name)
    @show_schedule.merge!(name => get_show_schedule(proxy_list, schedule_herf))
  end
  { "type" => type, "name" => name, "img" => schedule_img_down_path }
rescue => e
  p "Error In get_show_infomation msg : #{e.to_s}"
  nil
end
|
499
|
-
|
500
|
-
# Fetches the playing-time table of a show from "<herf>/playingtime".
#
# The first row of the table is skipped (presumably the header row). A page
# without the "epg" container now gives an empty list; it used to raise
# NoMethodError, which made get_show_infomation discard the whole show.
#
# proxylist - Array of "ip:port" proxies.
# herf      - absolute address of the show page.
#
# Returns an Array of Hashes with the keys "week", "date", "time",
# "channel_name" and "show_name".
def self.get_show_schedule(proxylist, herf)
  doc = get_doc_with_proxy(proxylist, herf + "/playingtime")
  listing = doc && doc.css('div[id="epg"]')[0]
  return [] unless listing

  schedule = []
  listing.css("div[class='c1 col']").each_with_index do |row, index|
    next if index.zero?

    # The first column holds "<week> <date> <time>" separated by spaces.
    week, date, clock = row.css('div[class="f1 fld"]')[0].content.split(" ")
    schedule << {
      "week" => week,
      "date" => date,
      "time" => clock,
      "channel_name" => row.css('div[class="f2 fld"]')[0].content,
      "show_name" => row.css('div[class="f3 fld"]')[0].content
    }
  end
  schedule
end
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
# Selects the proxies from get_proxy_list that can load the site's programme
# page within +use_time+ seconds.
#
# Changes over the previous version:
# * Timeout.timeout replaces the bare Kernel#timeout call;
# * StandardError is rescued instead of Exception, so interrupts and exits
#   are no longer swallowed;
# * a proxy is accepted when the request succeeds within the time limit.
#   Before, whole-second timing combined with "time_use > 0" rejected every
#   proxy that answered in under a second, and a hard-coded 8-second cap
#   applied regardless of +use_time+.
#
# use_time - slowest acceptable response time, in seconds.
#
# Returns an Array of "ip:port" Strings.
def self.get_topfast_list(use_time)
  get_proxy_list.select do |ip_port|
    started = Time.now
    begin
      Timeout.timeout(use_time) do
        Nokogiri::HTML(open("http://www.tvmao.com/program", :proxy => "http://#{ip_port}"))
      end
      p "http://#{ip_port} use_time:#{(Time.now - started).round(2)}"
      true
    rescue Timeout::Error, Errno::ETIMEDOUT
      p "Use http://#{ip_port} timeout"
      false
    rescue Errno::ECONNREFUSED
      p "Use http://#{ip_port} Error connection"
      false
    rescue StandardError => e
      p "Use http://#{ip_port} Error:#{e.to_s}"
      false
    end
  end
end
|
559
|
-
|
560
|
-
# Scrapes candidate proxies from proxycn.cn.
#
# The "30 fastest" page is tried first and the general HTTP proxy page is the
# fallback when it yields nothing. From each scraped row the ip and the port
# are extracted with regular expressions; proxies on port 3128 are left out.
#
# Rows from which no ip or no port could be extracted are now skipped. An
# unmatched ip is nil, which passed the old `ip != ""` check and produced
# unusable ":<port>" entries.
#
# Returns an Array of "ip:port" Strings.
def self.get_proxy_list
  list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
  list = gg('http://www.proxycn.cn/html_proxy/http-1.html') if list.count == 0

  regex_port = /(?<=<TD class="list">)[0-9]*?(?=<\/TD>)/
  regex_ip = /(?<=a href\=whois.php\?whois\=)[0-9,.]*/

  ips_ports = []
  list.each do |proxy_txt|
    port = proxy_txt[regex_port].to_s
    ip = proxy_txt[regex_ip].to_s
    next if ip.empty? || port.empty? || port == '3128'

    ips_ports << "#{ip}:#{port}"
  end
  p "Count: #{ips_ports.count}"
  ips_ports
end
|
580
|
-
|
581
|
-
# Downloads +url+ and extracts every fragment that starts with
# `<TD class="list">` and runs to the last `</TD>` on the same line.
#
# The page is read line by line; an extra "\n" is appended after each line
# before the text is scanned.
#
# url - address of the page, given as a String.
#
# Returns an Array with the matched fragments.
def self.gg(url)
  page = ""
  URI.parse(url).open do |io|
    io.each_line do |line|
      page = page + line + "\n"
    end
  end
  page.scan(/<TD class="list">.*<\/TD>/)
end
|
590
|
-
|
591
|
-
# Placeholder for saving images; it has no body and returns nil.
def save_img
end
|
594
|
-
|
595
|
-
end
|
1
|
+
require 'grabepg'
|
2
|
+
require 'grab_tvmao'
|
3
|
+
require File.expand_path("../grabepg/grab_tvsou", __FILE__)
|
4
|
+
require File.expand_path("../grabepg/grab_base", __FILE__)
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require File.expand_path("../../grabepg/grab_tvsou", __FILE__)
|
2
|
+
|
3
|
+
|
4
|
+
# Manual smoke-test harness around GrabTvsou.
#
# Every instance method forwards to the matching GrabTvsou method, so the
# grabber's individual steps can be tried one at a time.
#
# NOTE(review): GrabTvsou is defined in grabepg/grab_tvsou.rb, which is not
# visible here; what its two constructor arguments ("mobile" and []) mean
# should be confirmed there.
class TestGrabTvsou
  include Grabepg

  def initialize
    @grabtvsou = GrabTvsou.new("mobile", [])
  end

  # Forwards to GrabTvsou#get_data. The arguments are now passed through;
  # they used to be ignored in favour of the fixed values 0 and 5.
  def get_data(start_time, use_time)
    @grabtvsou.get_data(start_time, use_time)
  end

  # Forwards to GrabTvsou#dispose_href_schedule_data with the same arguments.
  def dispose_href_schedule_data(href, start_time, use_time)
    @grabtvsou.dispose_href_schedule_data(href, start_time, use_time)
  end

  # Runs GrabTvsou#dispose_schedule_page against a fixed sample channel page,
  # with 0 and 1 as the remaining arguments.
  def dispose_schedule_page
    href = "http://m.tvsou.com/epg.asp?TVid=1&Channelid=1&pro=ys"
    @grabtvsou.dispose_schedule_page(href, 0, 1)
  end

  # Runs GrabTvsou#dispose_show_info against two fixed sample show pages.
  #
  # Returns an Array with one result per page.
  def dispose_show_info
    hrefs = ["http://m.tvsou.com/jq3.asp?id=81300&tid=3", "http://m.tvsou.com/intro.asp?id=145"]
    hrefs.map { |href| @grabtvsou.dispose_show_info(href) }
  end

  # Forwards to GrabTvsou#dispose_home_page.
  def dispose_home_page
    @grabtvsou.dispose_home_page
  end

  # Grabs the schedule of the first two channels listed on the home page.
  #
  # Returns {channel name => {"schedule" => ..., "channel_type" => ...}}.
  # When the home page lists fewer than two channels the partial result is
  # now returned; before, the channels Hash itself leaked out in that case.
  def self.start
    grabber = GrabTvsou.new("mobile", [])
    ret = {}
    grabbed = 0
    grabber.dispose_home_page.each do |channel_type, value|
      value.each do |channel_name, channel_msg|
        return ret if grabbed == 2

        ret[channel_name] = {
          "schedule" => grabber.dispose_schedule_page(channel_msg[:url], 0, 1),
          "channel_type" => channel_type
        }
        grabbed += 1
      end
    end
    ret
  end
end
|
metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grab_epg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hahazql
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-05-
|
11
|
+
date: 2013-05-27 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description: ! '"
|
13
|
+
description: ! '"用于抓取EPG信息"'
|
14
14
|
email:
|
15
15
|
- hahazhouqunli@gmail.com
|
16
16
|
executables: []
|
@@ -22,8 +22,12 @@ files:
|
|
22
22
|
- Gemfile
|
23
23
|
- README.md
|
24
24
|
- lib/debug.rb
|
25
|
+
- lib/grab_tvmao.rb
|
25
26
|
- lib/grabepg.rb
|
26
27
|
- lib/grabepg.rb~
|
28
|
+
- lib/grabepg/grab_base.rb
|
29
|
+
- lib/grabepg/grab_tvsou.rb
|
30
|
+
- lib/test/test_grab_tvsou.rb
|
27
31
|
- projectFilesBackup/.idea/grabepg.iml
|
28
32
|
homepage: https://github.com/hahazql/grab_epg
|
29
33
|
licenses: []
|