grab_epg 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.grabepg.gemspec +1 -1
- data/lib/debug.rb +3 -1
- data/lib/grab_tvmao.rb +215 -67
- data/lib/grabepg/grab_base.rb +43 -4
- data/lib/grabepg/grab_tvsou.rb +46 -7
- data/lib/test/test_grab_tvsou.rb +8 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
ZWMyZTQzOThkZWI3YTUxYjIwZGU5ZGRkZmQ2ZTQzNTBjYjAxYWE5ZA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
YjM0MDViYzQzZGQ3OWNiYjk3ZmVjMDA0Mzk2OTA0M2UzNjdlNTdkNg==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YjE5ZTgxMzg5OTIwYTNjYTNlMDkzYWVlMWMwODczZDk3ZTZkNzIzYmQzZTI0
|
10
|
+
YWUwYTNhOWVjNjQyNTVhNjAwODYxY2RhOTc4YmM4YWI2ZmMzNzI3ZjRhMmZj
|
11
|
+
MTg2MjJmOWIzNmRhODU1N2MwNDAwZmJhYmQwZTk2ZDU3MTU1YzU=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MGM5MmEzZWU0MjcwZTgyNzliOTRkYjhkOGQ2Njk4OTZlOTI1ZjFkMjI5MGJi
|
14
|
+
MzE4Mjg0MmJjNmE2ZmJkY2YwNTQzM2QzZDcwNDQxNzM4MWE4NTI4ZjU2ZmMz
|
15
|
+
YzMwZTcyMTA1NjJiMTlhY2NhZGE0MTU4NzJiMzBkZTVjNWVkZGM=
|
data/.grabepg.gemspec
CHANGED
data/lib/debug.rb
CHANGED
@@ -3,10 +3,12 @@
|
|
3
3
|
require 'nokogiri'
|
4
4
|
require 'open-uri'
|
5
5
|
require File.expand_path("../test/test_grab_tvsou.rb", __FILE__)
|
6
|
+
require File.expand_path("../test/test_grab_tvmao.rb", __FILE__)
|
6
7
|
#require 'test/test_grab_tvsou'
|
7
8
|
class Debug
|
8
9
|
# To change this template use File | Settings | File Templates.
|
9
10
|
#proxylist = ["219.234.82.84:24809", "219.234.82.84:17130", "219.234.82.84:23684", "219.234.82.84:18253", "219.234.82.84:33987", "219.234.82.84:17183", "219.234.82.84:13243", "219.234.82.84:16158", "219.234.82.84:14826", "219.234.82.84:8489", "219.234.82.84:22222", "219.234.82.84:6370", "219.234.82.84:7571", "219.234.82.84:33944", "219.234.82.84:9743", "219.234.82.84:8089", "219.234.82.84:20991", "219.234.82.84:34032", "219.234.82.84:9415", "219.234.82.84:26149", "219.234.82.84:11095", "219.234.82.84:21724", "219.234.82.84:9177", "219.234.82.84:34034", "219.234.82.84:17945", "219.234.82.85:32229", "219.234.82.85:28341", "219.234.82.85:36314", "219.234.82.85:30605", "219.234.82.85:23684", "219.234.82.85:34015", "219.234.82.85:33919", "219.234.82.85:30639", "219.234.82.85:33965", "219.234.82.85:37299", "219.234.82.85:20747", "219.234.82.86:6666", "219.234.82.86:34106", "219.234.82.86:25301", "219.234.82.86:32896", "219.234.82.86:23034", "219.234.82.86:22685", "219.234.82.86:13078", "219.234.82.86:38770", "219.234.82.86:28402", "219.234.82.86:18887", "219.234.82.86:6588", "219.234.82.86:7292", "219.234.82.86:24268", "219.234.82.86:16472", "219.234.82.86:32597", "219.234.82.86:31122", "219.234.82.88:8817", "219.234.82.88:8160", "219.234.82.88:9239", "219.234.82.88:6133", "114.141.162.53:8080", "123.125.116.243:17656", "123.125.116.241:29156", "123.125.116.243:6938", "219.234.82.88:29484", "219.234.82.88:8084", "219.234.82.88:32229", "219.234.82.88:22758", "219.234.82.88:5616", "124.225.52.14:8080", "219.234.82.88:30028", "219.234.82.88:23685", "219.234.82.88:29037", "219.234.82.88:8755"]
|
10
11
|
|
11
|
-
p TestGrabTvsou.
|
12
|
+
# p TestGrabTvsou.new.get_channel_logo
|
13
|
+
p TestGrabTvmao.new.test_get_show_type_by_batch
|
12
14
|
end
|
data/lib/grab_tvmao.rb
CHANGED
@@ -3,7 +3,11 @@
|
|
3
3
|
require 'nokogiri'
|
4
4
|
require 'open-uri'
|
5
5
|
|
6
|
-
|
6
|
+
require File.expand_path("../grabepg/grab_base.rb", __FILE__)
|
7
|
+
require File.expand_path("../grabepg/grab_tvsou.rb", __FILE__)
|
8
|
+
|
9
|
+
module Grabepg
|
10
|
+
class GrabTvmao
|
7
11
|
# To change this template use File | Settings | File Templates.
|
8
12
|
|
9
13
|
|
@@ -22,10 +26,115 @@ module GrabTvmao
|
|
22
26
|
|
23
27
|
|
24
28
|
|
29
|
+
def initialize
|
30
|
+
@grabbase = GrabBase.new
|
31
|
+
end
|
32
|
+
|
33
|
+
|
34
|
+
|
35
|
+
#批量从tvmao获取节目类型
|
36
|
+
#channel 节目表属于的屏道
|
37
|
+
#url 节目表获取的网络地址
|
38
|
+
#date 日期
|
39
|
+
#schedule 需要批量修改的时间表
|
40
|
+
#proxylist 代理列表
|
41
|
+
def get_show_type_by_batch(channel,url,date,schedule,proxylist)
|
42
|
+
_schedule = {}
|
43
|
+
schedule.each do |s|
|
44
|
+
time = s["schedule_start"].gsub(":","").to_i
|
45
|
+
_schedule.merge!(time=>s)
|
46
|
+
end
|
47
|
+
url = get_show_type_url(url,date)
|
48
|
+
schedules = get_schedulelist_atday(channel,url,proxylist)
|
49
|
+
type = nil
|
50
|
+
schedules.each do |schedule|
|
51
|
+
schedule_time_num = schedule["schedule_start"].gsub(":","").to_i
|
52
|
+
if _schedule.has_key?(schedule_time_num)
|
53
|
+
_schedule[schedule_time_num]["type"]=_schedule[schedule_time_num]["type"]|schedule["type"]
|
54
|
+
p "*****************************************************************************************"
|
55
|
+
p "Schedule: #{_schedule[schedule_time_num]}"
|
56
|
+
p "schedule_logo_1: #{_schedule[schedule_time_num]["schedule_logo"]}"
|
57
|
+
p "schedule_logo_2: #{_schedule[schedule_time_num][:schedule_logo]}"
|
58
|
+
if _schedule[schedule_time_num]["schedule_logo"]==""
|
59
|
+
unless schedule["img"]==""
|
60
|
+
_schedule[schedule_time_num]["schedule_logo"]=schedule["img"]
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
ret = []
|
66
|
+
_schedule.each do |key,value|
|
67
|
+
ret << value
|
68
|
+
end
|
69
|
+
|
70
|
+
ret
|
71
|
+
end
|
72
|
+
|
73
|
+
#批量从tvmao获取节目类型
|
74
|
+
#channel 节目表属于的屏道
|
75
|
+
#url 节目表获取的网络地址
|
76
|
+
#date 日期
|
77
|
+
#time 节目开始时间
|
78
|
+
#proxylist 代理列表
|
79
|
+
def get_show_type(channel,url,date,time,proxylist)
|
80
|
+
url = get_show_type_url(url,date)
|
81
|
+
schedules = get_schedulelist_atday(channel,url,proxylist)
|
82
|
+
_time_num = time.gsub(":","").to_i
|
83
|
+
type = nil
|
84
|
+
schedules.each do |schedule|
|
85
|
+
schedule_time_num = schedule["schedule_start"].gsub(":","").to_i
|
86
|
+
if _time_num==schedule_time_num
|
87
|
+
type = schedule["type"]
|
88
|
+
end
|
89
|
+
end
|
90
|
+
if type
|
91
|
+
return type
|
92
|
+
else
|
93
|
+
return []
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
def get_show_type_url(url,date)
|
98
|
+
whatday = 0
|
99
|
+
_date = date.split("(")[0]
|
100
|
+
case _date
|
101
|
+
when "星期一"
|
102
|
+
whatday=1
|
103
|
+
when "星期二"
|
104
|
+
whatday=2
|
105
|
+
when "星期三"
|
106
|
+
whatday=3
|
107
|
+
when "星期四"
|
108
|
+
whatday=4
|
109
|
+
when "星期五"
|
110
|
+
whatday=5
|
111
|
+
when "星期六"
|
112
|
+
whatday=6
|
113
|
+
when "星期日"
|
114
|
+
whatday=7
|
115
|
+
end
|
116
|
+
|
117
|
+
get_week_url = lambda {|url,whatday|
|
118
|
+
_url = "http://www.tvmao.com"
|
119
|
+
urls = []
|
120
|
+
_urls = url.split("-")
|
121
|
+
0.upto(1).each do |i|
|
122
|
+
_url = _url+"#{_urls[i]}"+"-"
|
123
|
+
end
|
124
|
+
url = _url+"w#{whatday}.html"
|
125
|
+
return url
|
126
|
+
}
|
127
|
+
return get_week_url.call(url,whatday)
|
128
|
+
end
|
129
|
+
|
130
|
+
|
131
|
+
|
132
|
+
|
133
|
+
|
25
134
|
|
26
135
|
#将星期的wday获取值转化为中文名
|
27
136
|
#conversion wady to chinese
|
28
|
-
def
|
137
|
+
def conversion_what_day(whatday)
|
29
138
|
ret = "星期"
|
30
139
|
case whatday.to_i
|
31
140
|
when 1
|
@@ -47,7 +156,7 @@ module GrabTvmao
|
|
47
156
|
end
|
48
157
|
|
49
158
|
#如果时间为1~9的一位则为其在数字前加0补齐二位
|
50
|
-
def
|
159
|
+
def dispose_time(num)
|
51
160
|
num = num.to_s
|
52
161
|
if num.length < 2
|
53
162
|
num = "0"+num
|
@@ -56,7 +165,7 @@ module GrabTvmao
|
|
56
165
|
end
|
57
166
|
|
58
167
|
#转化当前时间的格式
|
59
|
-
def
|
168
|
+
def get_week_date_time(time)
|
60
169
|
month = time.month
|
61
170
|
day = time.day
|
62
171
|
whatday = time.wday
|
@@ -65,26 +174,26 @@ module GrabTvmao
|
|
65
174
|
end
|
66
175
|
|
67
176
|
#前几天需要减去的num
|
68
|
-
def
|
177
|
+
def del_day_num(day_num)
|
69
178
|
ret = day_num*60*60*24
|
70
179
|
ret
|
71
180
|
end
|
72
181
|
|
73
182
|
#获取距离当前多少天的之前的日期
|
74
|
-
def
|
183
|
+
def get_time_day_prior(num)
|
75
184
|
time = Time.now - del_day_num(num)
|
76
185
|
ret = get_week_date_time(time)
|
77
186
|
ret
|
78
187
|
end
|
79
188
|
|
80
189
|
#前面一周要删除的日期的列表
|
81
|
-
def
|
190
|
+
def del_time_list
|
82
191
|
ret = []
|
83
192
|
time = Time.now
|
84
193
|
wday = time.wday
|
85
194
|
if(wday==1)
|
86
195
|
for i in 0..7
|
87
|
-
ret<<
|
196
|
+
ret<<get_time_day_prior(i)
|
88
197
|
end
|
89
198
|
end
|
90
199
|
ret
|
@@ -94,7 +203,7 @@ module GrabTvmao
|
|
94
203
|
|
95
204
|
|
96
205
|
#调用此方法的例子
|
97
|
-
def
|
206
|
+
def start
|
98
207
|
#作用是获取俩个字符串的相似度
|
99
208
|
#get str1 and str2 similarity
|
100
209
|
get_similarity_string = lambda { |str1,str2|
|
@@ -201,14 +310,14 @@ module GrabTvmao
|
|
201
310
|
end
|
202
311
|
end
|
203
312
|
|
204
|
-
def
|
313
|
+
def img_down_path
|
205
314
|
@img_down_path
|
206
315
|
end
|
207
316
|
|
208
317
|
|
209
318
|
#获取网站的频道表
|
210
319
|
#img_path 图片存放路径
|
211
|
-
def
|
320
|
+
def getchannels(img_dir_path)
|
212
321
|
@channel = []
|
213
322
|
@site=DEFAULT_SITE
|
214
323
|
@proxyindex = 0
|
@@ -252,39 +361,76 @@ module GrabTvmao
|
|
252
361
|
{"channel_info"=>channel_info,"channel_urls"=>channel_urls}
|
253
362
|
end
|
254
363
|
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
if(proxylist[@proxyindex])
|
262
|
-
proxy = proxylist[@proxyindex]
|
263
|
-
else
|
264
|
-
proxy = proxylist[@proxyindex+1]
|
265
|
-
end
|
266
|
-
begin
|
267
|
-
doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}")) unless proxy.nil?||proxy.empty?
|
268
|
-
doc = Nokogiri::HTML(open(url)) if proxy.nil?||proxy.empty?
|
269
|
-
@no_firest = 0
|
270
|
-
rescue => err
|
364
|
+
|
365
|
+
def err_doc_proxy(proxy,proxylist,url="",err="")
|
366
|
+
if proxy.empty?||proxy.nil?
|
367
|
+
proxylist.delete_at[@proxyindex]
|
368
|
+
end
|
369
|
+
|
271
370
|
|
272
371
|
unless @no_firest
|
273
372
|
@no_firest = 0
|
274
373
|
end
|
275
374
|
|
276
375
|
@no_firest += 1
|
277
|
-
p "*************************Proxy:#{proxy}, url:#{url} Error:#{err
|
376
|
+
p "*************************Proxy:#{proxy}, url:#{url} Error:#{err}"
|
278
377
|
#proxylist.delete(proxy) #删除出错的代理 但如果是此网页错误则会引起BUG待修复
|
279
|
-
|
280
|
-
|
378
|
+
@proxyindex += 1
|
379
|
+
@proxyindex=@proxyindex%@size
|
380
|
+
doc=get_doc_with_proxy(proxylist,url) if @no_firest<4
|
381
|
+
unless @no_firest<4
|
382
|
+
@no_firest=0
|
383
|
+
raise RuntimeError,"Error: #{err}"
|
384
|
+
end
|
385
|
+
doc
|
386
|
+
end
|
387
|
+
|
388
|
+
|
389
|
+
#使用代理获取url的html的doc值
|
390
|
+
def get_doc_with_proxy(proxylist,url)
|
391
|
+
unless proxylist.nil?||proxylist.empty?
|
392
|
+
unless @proxyindex
|
393
|
+
@proxyindex = 0
|
394
|
+
end
|
395
|
+
@size = proxylist.size
|
396
|
+
@proxyindex=@proxyindex%proxylist.size
|
397
|
+
if(proxylist[@proxyindex])
|
398
|
+
proxy = proxylist[@proxyindex]
|
399
|
+
else
|
400
|
+
proxy = proxylist[@proxyindex+1]
|
401
|
+
end
|
402
|
+
begin
|
403
|
+
doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}").read) unless proxy.nil?||proxy.empty?
|
404
|
+
if doc.nil?
|
405
|
+
p "DOC is nil"
|
406
|
+
doc=err_doc_proxy(proxy,proxylist,url,"doc nil")
|
407
|
+
@no_firest=0
|
408
|
+
end
|
409
|
+
@no_firest = 0
|
410
|
+
rescue => err
|
411
|
+
p "IN Rescue"
|
412
|
+
doc=err_doc_proxy(proxy,proxylist,url,err.to_s)
|
413
|
+
@no_firest=0
|
414
|
+
p "Get DOC"
|
415
|
+
@proxyindex += 1
|
416
|
+
@proxyindex=@proxyindex%@size
|
417
|
+
return doc
|
418
|
+
end
|
419
|
+
@proxyindex += 1
|
420
|
+
@proxyindex=@proxyindex%@size
|
421
|
+
else
|
422
|
+
begin
|
423
|
+
doc = Nokogiri::HTML(open(url).read) if proxy.nil?||proxy.empty?
|
424
|
+
rescue => err
|
425
|
+
p "Error : Proxy:#{proxy}, url:#{url}"
|
426
|
+
raise RuntimeError,"Error: #{err.to_s} Method:get_doc_with_proxy"
|
427
|
+
end
|
428
|
+
end
|
429
|
+
doc
|
281
430
|
end
|
282
|
-
@proxyindex += 1
|
283
|
-
doc
|
284
|
-
end
|
285
431
|
|
286
432
|
#获取某天的节目表
|
287
|
-
def
|
433
|
+
def get_schedulelist_atday(channel,url,proxylist)
|
288
434
|
p "Grab: #{url}"
|
289
435
|
doc = get_doc_with_proxy(proxylist,url)
|
290
436
|
show_type = []
|
@@ -322,6 +468,7 @@ module GrabTvmao
|
|
322
468
|
schedule = schedule.content.split(" ")[1]
|
323
469
|
show_name = ""
|
324
470
|
unless schedule_herf.nil?||schedule_herf.empty?
|
471
|
+
p "Show_infomation:#{schedule_herf} Time:#{time}"
|
325
472
|
show_infomation=get_show_infomation(proxylist,schedule_herf)
|
326
473
|
show_type=show_infomation["type"]
|
327
474
|
show_name = show_infomation["name"]
|
@@ -337,7 +484,7 @@ module GrabTvmao
|
|
337
484
|
#获取制定时间和长度url
|
338
485
|
#start_time 为int型 开始时间和今天的差值 正数代表之后的第几天 负数代表之前的第几天
|
339
486
|
#day_num 为int型 代表抓取的时间从开始时间计算的多少天
|
340
|
-
def
|
487
|
+
def get_assign_date_url(url,start_time,day_num)
|
341
488
|
site="http://www.tvmao.com"
|
342
489
|
if(@site)
|
343
490
|
site=@site
|
@@ -373,7 +520,7 @@ module GrabTvmao
|
|
373
520
|
|
374
521
|
|
375
522
|
#获取指定时间段的节目表
|
376
|
-
def
|
523
|
+
def getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path)
|
377
524
|
begin
|
378
525
|
day_num = 1 if day_num<1
|
379
526
|
rescue
|
@@ -394,7 +541,7 @@ module GrabTvmao
|
|
394
541
|
channel_schedule = {}
|
395
542
|
get_assign_date_url(herf,start_num,day_num).each do |url|
|
396
543
|
@date = ""
|
397
|
-
schedule_list =
|
544
|
+
schedule_list = get_schedulelist_atday(channel,url,proxylist)
|
398
545
|
channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
|
399
546
|
end
|
400
547
|
@img_down_file.close
|
@@ -407,7 +554,7 @@ module GrabTvmao
|
|
407
554
|
|
408
555
|
#因原已调用所以保留
|
409
556
|
#获取一周节目表
|
410
|
-
def
|
557
|
+
def getschedule(channel,herf,proxylist,day_num=7,img_dir_down_path=@img_down_dir_path)
|
411
558
|
p "Day Num is #{day_num}"
|
412
559
|
begin
|
413
560
|
day_num = 1 if day_num<1
|
@@ -442,7 +589,7 @@ module GrabTvmao
|
|
442
589
|
channel_schedule = {}
|
443
590
|
get_week_url.call(herf,day_num).each do |url|
|
444
591
|
@date = ""
|
445
|
-
schedule_list =
|
592
|
+
schedule_list = get_schedulelist_atday(channel,url,proxylist)
|
446
593
|
channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
|
447
594
|
end
|
448
595
|
@img_down_file.close
|
@@ -451,16 +598,14 @@ module GrabTvmao
|
|
451
598
|
|
452
599
|
|
453
600
|
#获取节目详细信息
|
454
|
-
def
|
601
|
+
def get_show_infomation(proxy_list,schedule_herf)
|
455
602
|
begin
|
456
603
|
@proxyindex = 0
|
457
604
|
unless @site
|
458
605
|
@site = "http://www.tvmao.com"
|
459
606
|
end
|
460
607
|
schedule_herf = @site + schedule_herf
|
461
|
-
doc=get_doc_with_proxy(proxy_list,schedule_herf)
|
462
|
-
#title = doc.css("a[herf='#{schedule_herf}+/detail']")[0]['title']
|
463
|
-
# p "title: %s" % title
|
608
|
+
doc = get_doc_with_proxy(proxy_list,schedule_herf)
|
464
609
|
type = []
|
465
610
|
name = doc.css('span[itemprop="name"]')[0].content
|
466
611
|
|
@@ -479,39 +624,42 @@ module GrabTvmao
|
|
479
624
|
end
|
480
625
|
url = "#{schedule_herf}/detail"
|
481
626
|
doc = get_doc_with_proxy(proxy_list,url)
|
482
|
-
doc
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
type<<_type.content
|
627
|
+
if doc
|
628
|
+
doc.css('span[itemprop="genre"]').each do |_type|
|
629
|
+
type << _type.content
|
630
|
+
end
|
487
631
|
end
|
488
632
|
type.uniq!
|
489
|
-
@
|
633
|
+
unless @show_schedule
|
634
|
+
@show_schedule={}
|
635
|
+
end
|
490
636
|
@show_schedule.merge!(name=>get_show_schedule(proxy_list,schedule_herf)) unless @show_schedule.has_key?(name)
|
491
637
|
{"type"=>type,"name"=>name,"img"=>schedule_img_down_path}
|
492
|
-
rescue => e
|
493
|
-
|
638
|
+
#rescue => e
|
639
|
+
# p "Error In get_show_infomation msg : #{e.to_s}"
|
494
640
|
end
|
495
641
|
end
|
496
642
|
|
497
643
|
#获取节目的时间表
|
498
|
-
def
|
644
|
+
def get_show_schedule(proxylist,herf)
|
499
645
|
url = herf + "/playingtime"
|
500
646
|
doc = get_doc_with_proxy(proxylist,url)
|
501
647
|
i = 0
|
502
648
|
schedule = []
|
503
|
-
doc.css('div[id="epg"]')[0]
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
649
|
+
if doc.css('div[id="epg"]')[0]
|
650
|
+
doc.css('div[id="epg"]')[0].css("div[class='c1 col']").each do |epg|
|
651
|
+
unless(i==0)
|
652
|
+
time = epg.css('div[class="f1 fld"]')[0].content
|
653
|
+
channel_name = epg.css('div[class="f2 fld"]')[0].content
|
654
|
+
show_name = epg.css('div[class="f3 fld"]')[0].content
|
655
|
+
times = time.split(" ")
|
656
|
+
week = times[0]
|
657
|
+
date = times[1]
|
658
|
+
_time = times[2]
|
659
|
+
schedule << {"week"=>week,"date"=>date,"time"=>_time,"channel_name"=>channel_name,"show_name"=>show_name}
|
660
|
+
end
|
661
|
+
i += 1
|
513
662
|
end
|
514
|
-
i += 1
|
515
663
|
end
|
516
664
|
schedule
|
517
665
|
end
|
@@ -521,7 +669,7 @@ module GrabTvmao
|
|
521
669
|
|
522
670
|
#获取指定访问速度的代理服务器
|
523
671
|
#time为最慢速度的时间 int型 代表秒
|
524
|
-
def
|
672
|
+
def get_topfast_list(use_time)
|
525
673
|
fast_list = []
|
526
674
|
time_use = 0
|
527
675
|
ips_ports = get_proxy_list()
|
@@ -555,7 +703,7 @@ module GrabTvmao
|
|
555
703
|
end
|
556
704
|
|
557
705
|
#获取代理列表
|
558
|
-
def
|
706
|
+
def get_proxy_list()
|
559
707
|
list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
|
560
708
|
if list.count ==0
|
561
709
|
list = gg('http://www.proxycn.cn/html_proxy/http-1.html')
|
@@ -575,7 +723,7 @@ module GrabTvmao
|
|
575
723
|
ips_ports
|
576
724
|
end
|
577
725
|
|
578
|
-
def
|
726
|
+
def gg(url)
|
579
727
|
regex_list = /<TD class="list">.*<\/TD>/
|
580
728
|
href =URI.parse(url)
|
581
729
|
contxt = ""
|
@@ -588,5 +736,5 @@ module GrabTvmao
|
|
588
736
|
def save_img
|
589
737
|
|
590
738
|
end
|
591
|
-
|
739
|
+
end
|
592
740
|
end
|
data/lib/grabepg/grab_base.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
#encoding:utf-8
|
2
2
|
require 'nokogiri'
|
3
|
+
require 'iconv'
|
3
4
|
|
4
5
|
module Grabepg
|
5
6
|
|
@@ -22,13 +23,48 @@ module Grabepg
|
|
22
23
|
ret += "五"
|
23
24
|
when 6
|
24
25
|
ret += "六"
|
25
|
-
when
|
26
|
-
ret += "
|
26
|
+
when 0
|
27
|
+
ret += "日"
|
27
28
|
end
|
28
29
|
ret
|
29
30
|
end
|
30
31
|
|
31
32
|
|
33
|
+
def self.proxy_list(path)
|
34
|
+
proxy_list = []
|
35
|
+
crt_date = DateTime.now.strftime('%F')
|
36
|
+
proxy_path = "%s/proxy/%s.txt" % [File.dirname(path),crt_date]
|
37
|
+
p "Proxy_Path: #{proxy_path}"
|
38
|
+
if File.exist?(proxy_path)
|
39
|
+
file_proxy = File.open(proxy_path,"r")
|
40
|
+
file_proxy.each_line {|line|
|
41
|
+
proxy_list << line.chomp.to_s
|
42
|
+
}
|
43
|
+
p "Get Proxy_list:#{proxy_list}"
|
44
|
+
file_proxy.flush
|
45
|
+
file_proxy.close
|
46
|
+
else
|
47
|
+
proxy_list=GetProxyList.get_list(ENV["proxy_limit"].to_i,ENV["proxy_page"].to_i)
|
48
|
+
dirpath = "#{File.dirname(path)}/proxy/"
|
49
|
+
Dir.open(dirpath) {|fna|
|
50
|
+
fna.each do |fn|
|
51
|
+
if(fn.to_s != ".." && fn.to_s != ".")
|
52
|
+
File.delete("#{dirpath + fn.to_s}")
|
53
|
+
end
|
54
|
+
end
|
55
|
+
}
|
56
|
+
file_proxy = File.new(proxy_path,"a")
|
57
|
+
proxy_list.each do |proxy|
|
58
|
+
p "Proxy:#{proxy}"
|
59
|
+
file_proxy.puts proxy
|
60
|
+
end
|
61
|
+
file_proxy.flush
|
62
|
+
file_proxy.close
|
63
|
+
end
|
64
|
+
return proxy_list
|
65
|
+
end
|
66
|
+
|
67
|
+
|
32
68
|
#获取指定访问速度的代理服务器
|
33
69
|
#time为最慢速度的时间 int型 代表秒
|
34
70
|
def self.get_topfast_list(use_time)
|
@@ -157,8 +193,10 @@ module Grabepg
|
|
157
193
|
proxy = proxylist[@proxyindex+1]
|
158
194
|
end
|
159
195
|
begin
|
160
|
-
|
196
|
+
ic = Iconv.new("UTF-8//IGNORE","GB2312")
|
197
|
+
doc = Nokogiri::HTML(ic.iconv(open(url,:proxy=>"#{proxy}").read)) unless proxy.nil?||proxy.empty?
|
161
198
|
if doc.nil?
|
199
|
+
p "DOC is nil"
|
162
200
|
doc=err_doc_proxy(proxy,proxylist,url,"doc nil")
|
163
201
|
@no_firest=0
|
164
202
|
end
|
@@ -176,7 +214,8 @@ module Grabepg
|
|
176
214
|
@proxyindex=@proxyindex%@size
|
177
215
|
else
|
178
216
|
begin
|
179
|
-
|
217
|
+
ic = Iconv.new("GB2312//IGNORE","GB2312")
|
218
|
+
doc = Nokogiri::HTML(ic.iconv(open(url).read)) if proxy.nil?||proxy.empty?
|
180
219
|
rescue => err
|
181
220
|
p "Error : Proxy:#{proxy}, url:#{url}"
|
182
221
|
raise RuntimeError,"Error: #{err.to_s} Method:get_doc_with_proxy"
|
data/lib/grabepg/grab_tvsou.rb
CHANGED
@@ -36,6 +36,10 @@ module Grabepg
|
|
36
36
|
@site="http://m.tvsou.com"
|
37
37
|
end
|
38
38
|
|
39
|
+
def get_proxy_list
|
40
|
+
@proxy_list
|
41
|
+
end
|
42
|
+
|
39
43
|
#获取从tvsou的什么网站上获取
|
40
44
|
#type: mobile,webpage
|
41
45
|
def get_url(type)
|
@@ -44,11 +48,15 @@ module Grabepg
|
|
44
48
|
|
45
49
|
def get_data_year_month_day(time)
|
46
50
|
|
47
|
-
month
|
51
|
+
month=time.month.to_s
|
48
52
|
if month.length<2
|
49
|
-
month
|
53
|
+
month="0"+month
|
50
54
|
end
|
51
|
-
|
55
|
+
day = time.day.to_s
|
56
|
+
if day.length<2
|
57
|
+
day = "0"+day
|
58
|
+
end
|
59
|
+
return {time:"#{time.year}-#{time.month}-#{day}",date:"#{@grabbase.conversion_what_day(time.wday)}(#{month}-#{day})"}
|
52
60
|
end
|
53
61
|
|
54
62
|
#获取时间
|
@@ -111,6 +119,28 @@ module Grabepg
|
|
111
119
|
|
112
120
|
end
|
113
121
|
|
122
|
+
#获取频道图标地址
|
123
|
+
# url 手机表的URL值
|
124
|
+
# channel_type 频道类型
|
125
|
+
# no_dis 直接使用URL 不处理
|
126
|
+
def get_channel_logo(_url,channel_type,no_dis=false)
|
127
|
+
if no_dis
|
128
|
+
url = _url
|
129
|
+
else
|
130
|
+
tvs = _url.split("TVid=")
|
131
|
+
tvid = tvs[1].split("&")[0]
|
132
|
+
channelids = _url.split("Channelid=")
|
133
|
+
channelid = channelids[1].split("&")[0]
|
134
|
+
if channel_type=="CCTV"
|
135
|
+
url = "http://epg.tvsou.com/programys/TV_#{tvid}/Channel_#{channelid}/W1.htm"
|
136
|
+
elsif channel_type=="WTV"
|
137
|
+
url = "http://epg.tvsou.com/programws/TV_#{tvid}/Channel_#{channelid}/W1.htm"
|
138
|
+
end
|
139
|
+
end
|
140
|
+
doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
|
141
|
+
logo_network_path=doc.css("div[id='epg_m1']").css("img")[0].get_attribute("src")
|
142
|
+
return logo_network_path
|
143
|
+
end
|
114
144
|
|
115
145
|
|
116
146
|
#获取频道时间表URL
|
@@ -131,11 +161,12 @@ module Grabepg
|
|
131
161
|
|
132
162
|
#根据URL解析时间表页面
|
133
163
|
def dispose_schedule_page(url,start_time,use_time)
|
134
|
-
url =
|
164
|
+
url = url
|
135
165
|
urls = url.split("?")
|
136
166
|
begin
|
137
167
|
doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
|
138
168
|
@error_num = 0
|
169
|
+
_url = doc.css("div[class='week']")[0].css('a')[0].get_attribute("href")
|
139
170
|
rescue => err
|
140
171
|
unless @error_num
|
141
172
|
@error_num = 0
|
@@ -144,7 +175,6 @@ module Grabepg
|
|
144
175
|
raise err.to_s if @error_num==5
|
145
176
|
dispose_schedule_page(url,start_time,use_time)
|
146
177
|
end
|
147
|
-
_url = doc.css("div[class='week']")[0].css('a')[0].get_attribute("href")
|
148
178
|
_url = urls[0]+_url
|
149
179
|
urls = dispose_href_schedule_data(_url,start_time,use_time)
|
150
180
|
ret = {}
|
@@ -162,7 +192,8 @@ module Grabepg
|
|
162
192
|
_dispose = schedule.content
|
163
193
|
_dispose_show =schedule.css("span")[0].text
|
164
194
|
time = _dispose.gsub(_dispose_show,"")
|
165
|
-
|
195
|
+
href =schedule.css('a')[schedule.css('a').count-1].get_attribute("href")
|
196
|
+
_url = @site+"/" + href if schedule.css('a')[0]
|
166
197
|
schedules << {time:time,schedule_name:_dispose_show.delete(" 剧情"),url:_url}
|
167
198
|
now = time.gsub(":","").to_i
|
168
199
|
if((now-last_time)<5)
|
@@ -190,6 +221,14 @@ module Grabepg
|
|
190
221
|
#解析节目详情页面
|
191
222
|
def dispose_show_info(url)
|
192
223
|
doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
|
224
|
+
if doc.nil?
|
225
|
+
unless @error_num
|
226
|
+
@error_num = 0
|
227
|
+
end
|
228
|
+
@error_num+=1
|
229
|
+
raise err.to_s if @error_num==5
|
230
|
+
dispose_show_info(url)
|
231
|
+
end
|
193
232
|
begin
|
194
233
|
show_name = doc.css('div[class="tv_info_top"]')[0].content
|
195
234
|
_doc=doc.css("div[class='tv_info']")
|
@@ -210,4 +249,4 @@ module Grabepg
|
|
210
249
|
|
211
250
|
end
|
212
251
|
|
213
|
-
end
|
252
|
+
end
|
data/lib/test/test_grab_tvsou.rb
CHANGED
@@ -11,20 +11,19 @@ class TestGrabTvsou
|
|
11
11
|
end
|
12
12
|
|
13
13
|
def get_data(start_time,use_time)
|
14
|
-
@grabtvsou.get_data(
|
14
|
+
@grabtvsou.get_data(start_time,use_time)
|
15
15
|
end
|
16
16
|
|
17
17
|
def dispose_href_schedule_data(href,start_time,use_time)
|
18
18
|
@grabtvsou.dispose_href_schedule_data(href,start_time,use_time)
|
19
19
|
end
|
20
20
|
|
21
|
-
def dispose_schedule_page()
|
22
|
-
href = "http://m.tvsou.com/epg.asp?TVid=1&Channelid=1&pro=ys"
|
21
|
+
def dispose_schedule_page(href="http://m.tvsou.com/epg.asp?TVid=1&Channelid=1&pro=ys")
|
23
22
|
@grabtvsou.dispose_schedule_page(href,0,1)
|
24
23
|
end
|
25
24
|
|
26
25
|
def dispose_show_info
|
27
|
-
hrefs = ["http://
|
26
|
+
hrefs = ["http://msou.com//jq3.asp?id=75928&tid=3","http://m.tvsou.com//jq3.asp?id=89450&tid=3"]
|
28
27
|
ret = []
|
29
28
|
hrefs.each do |href|
|
30
29
|
ret<<@grabtvsou.dispose_show_info(href)
|
@@ -36,6 +35,11 @@ class TestGrabTvsou
|
|
36
35
|
@grabtvsou.dispose_home_page
|
37
36
|
end
|
38
37
|
|
38
|
+
def get_channel_logo(url="epg.asp?TVid=1&Channelid=1&pro=ys")
|
39
|
+
@grabtvsou.get_channel_logo(url)
|
40
|
+
end
|
41
|
+
|
42
|
+
|
39
43
|
def self.start
|
40
44
|
_grabtvsou = GrabTvsou.new("mobile",[])
|
41
45
|
p channels = _grabtvsou.dispose_home_page
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grab_epg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hahazql
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-06-
|
11
|
+
date: 2013-06-07 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: ! '"用于抓取EPG信息"'
|
14
14
|
email:
|