grab_epg 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.grabepg.gemspec +1 -1
- data/lib/debug.rb +3 -1
- data/lib/grab_tvmao.rb +215 -67
- data/lib/grabepg/grab_base.rb +43 -4
- data/lib/grabepg/grab_tvsou.rb +46 -7
- data/lib/test/test_grab_tvsou.rb +8 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
ZWMyZTQzOThkZWI3YTUxYjIwZGU5ZGRkZmQ2ZTQzNTBjYjAxYWE5ZA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
YjM0MDViYzQzZGQ3OWNiYjk3ZmVjMDA0Mzk2OTA0M2UzNjdlNTdkNg==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YjE5ZTgxMzg5OTIwYTNjYTNlMDkzYWVlMWMwODczZDk3ZTZkNzIzYmQzZTI0
|
10
|
+
YWUwYTNhOWVjNjQyNTVhNjAwODYxY2RhOTc4YmM4YWI2ZmMzNzI3ZjRhMmZj
|
11
|
+
MTg2MjJmOWIzNmRhODU1N2MwNDAwZmJhYmQwZTk2ZDU3MTU1YzU=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MGM5MmEzZWU0MjcwZTgyNzliOTRkYjhkOGQ2Njk4OTZlOTI1ZjFkMjI5MGJi
|
14
|
+
MzE4Mjg0MmJjNmE2ZmJkY2YwNTQzM2QzZDcwNDQxNzM4MWE4NTI4ZjU2ZmMz
|
15
|
+
YzMwZTcyMTA1NjJiMTlhY2NhZGE0MTU4NzJiMzBkZTVjNWVkZGM=
|
data/.grabepg.gemspec
CHANGED
data/lib/debug.rb
CHANGED
@@ -3,10 +3,12 @@
|
|
3
3
|
require 'nokogiri'
|
4
4
|
require 'open-uri'
|
5
5
|
require File.expand_path("../test/test_grab_tvsou.rb", __FILE__)
|
6
|
+
require File.expand_path("../test/test_grab_tvmao.rb", __FILE__)
|
6
7
|
#require 'test/test_grab_tvsou'
|
7
8
|
class Debug
|
8
9
|
# To change this template use File | Settings | File Templates.
|
9
10
|
#proxylist = ["219.234.82.84:24809", "219.234.82.84:17130", "219.234.82.84:23684", "219.234.82.84:18253", "219.234.82.84:33987", "219.234.82.84:17183", "219.234.82.84:13243", "219.234.82.84:16158", "219.234.82.84:14826", "219.234.82.84:8489", "219.234.82.84:22222", "219.234.82.84:6370", "219.234.82.84:7571", "219.234.82.84:33944", "219.234.82.84:9743", "219.234.82.84:8089", "219.234.82.84:20991", "219.234.82.84:34032", "219.234.82.84:9415", "219.234.82.84:26149", "219.234.82.84:11095", "219.234.82.84:21724", "219.234.82.84:9177", "219.234.82.84:34034", "219.234.82.84:17945", "219.234.82.85:32229", "219.234.82.85:28341", "219.234.82.85:36314", "219.234.82.85:30605", "219.234.82.85:23684", "219.234.82.85:34015", "219.234.82.85:33919", "219.234.82.85:30639", "219.234.82.85:33965", "219.234.82.85:37299", "219.234.82.85:20747", "219.234.82.86:6666", "219.234.82.86:34106", "219.234.82.86:25301", "219.234.82.86:32896", "219.234.82.86:23034", "219.234.82.86:22685", "219.234.82.86:13078", "219.234.82.86:38770", "219.234.82.86:28402", "219.234.82.86:18887", "219.234.82.86:6588", "219.234.82.86:7292", "219.234.82.86:24268", "219.234.82.86:16472", "219.234.82.86:32597", "219.234.82.86:31122", "219.234.82.88:8817", "219.234.82.88:8160", "219.234.82.88:9239", "219.234.82.88:6133", "114.141.162.53:8080", "123.125.116.243:17656", "123.125.116.241:29156", "123.125.116.243:6938", "219.234.82.88:29484", "219.234.82.88:8084", "219.234.82.88:32229", "219.234.82.88:22758", "219.234.82.88:5616", "124.225.52.14:8080", "219.234.82.88:30028", "219.234.82.88:23685", "219.234.82.88:29037", "219.234.82.88:8755"]
|
10
11
|
|
11
|
-
p TestGrabTvsou.
|
12
|
+
# p TestGrabTvsou.new.get_channel_logo
|
13
|
+
p TestGrabTvmao.new.test_get_show_type_by_batch
|
12
14
|
end
|
data/lib/grab_tvmao.rb
CHANGED
@@ -3,7 +3,11 @@
|
|
3
3
|
require 'nokogiri'
|
4
4
|
require 'open-uri'
|
5
5
|
|
6
|
-
|
6
|
+
require File.expand_path("../grabepg/grab_base.rb", __FILE__)
|
7
|
+
require File.expand_path("../grabepg/grab_tvsou.rb", __FILE__)
|
8
|
+
|
9
|
+
module Grabepg
|
10
|
+
class GrabTvmao
|
7
11
|
# To change this template use File | Settings | File Templates.
|
8
12
|
|
9
13
|
|
@@ -22,10 +26,115 @@ module GrabTvmao
|
|
22
26
|
|
23
27
|
|
24
28
|
|
29
|
+
def initialize
|
30
|
+
@grabbase = GrabBase.new
|
31
|
+
end
|
32
|
+
|
33
|
+
|
34
|
+
|
35
|
+
#批量从tvmao获取节目类型
|
36
|
+
#channel 节目表属于的屏道
|
37
|
+
#url 节目表获取的网络地址
|
38
|
+
#date 日期
|
39
|
+
#schedule 需要批量修改的时间表
|
40
|
+
#proxylist 代理列表
|
41
|
+
def get_show_type_by_batch(channel,url,date,schedule,proxylist)
|
42
|
+
_schedule = {}
|
43
|
+
schedule.each do |s|
|
44
|
+
time = s["schedule_start"].gsub(":","").to_i
|
45
|
+
_schedule.merge!(time=>s)
|
46
|
+
end
|
47
|
+
url = get_show_type_url(url,date)
|
48
|
+
schedules = get_schedulelist_atday(channel,url,proxylist)
|
49
|
+
type = nil
|
50
|
+
schedules.each do |schedule|
|
51
|
+
schedule_time_num = schedule["schedule_start"].gsub(":","").to_i
|
52
|
+
if _schedule.has_key?(schedule_time_num)
|
53
|
+
_schedule[schedule_time_num]["type"]=_schedule[schedule_time_num]["type"]|schedule["type"]
|
54
|
+
p "*****************************************************************************************"
|
55
|
+
p "Schedule: #{_schedule[schedule_time_num]}"
|
56
|
+
p "schedule_logo_1: #{_schedule[schedule_time_num]["schedule_logo"]}"
|
57
|
+
p "schedule_logo_2: #{_schedule[schedule_time_num][:schedule_logo]}"
|
58
|
+
if _schedule[schedule_time_num]["schedule_logo"]==""
|
59
|
+
unless schedule["img"]==""
|
60
|
+
_schedule[schedule_time_num]["schedule_logo"]=schedule["img"]
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
ret = []
|
66
|
+
_schedule.each do |key,value|
|
67
|
+
ret << value
|
68
|
+
end
|
69
|
+
|
70
|
+
ret
|
71
|
+
end
|
72
|
+
|
73
|
+
#批量从tvmao获取节目类型
|
74
|
+
#channel 节目表属于的屏道
|
75
|
+
#url 节目表获取的网络地址
|
76
|
+
#date 日期
|
77
|
+
#time 节目开始时间
|
78
|
+
#proxylist 代理列表
|
79
|
+
def get_show_type(channel,url,date,time,proxylist)
|
80
|
+
url = get_show_type_url(url,date)
|
81
|
+
schedules = get_schedulelist_atday(channel,url,proxylist)
|
82
|
+
_time_num = time.gsub(":","").to_i
|
83
|
+
type = nil
|
84
|
+
schedules.each do |schedule|
|
85
|
+
schedule_time_num = schedule["schedule_start"].gsub(":","").to_i
|
86
|
+
if _time_num==schedule_time_num
|
87
|
+
type = schedule["type"]
|
88
|
+
end
|
89
|
+
end
|
90
|
+
if type
|
91
|
+
return type
|
92
|
+
else
|
93
|
+
return []
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
def get_show_type_url(url,date)
|
98
|
+
whatday = 0
|
99
|
+
_date = date.split("(")[0]
|
100
|
+
case _date
|
101
|
+
when "星期一"
|
102
|
+
whatday=1
|
103
|
+
when "星期二"
|
104
|
+
whatday=2
|
105
|
+
when "星期三"
|
106
|
+
whatday=3
|
107
|
+
when "星期四"
|
108
|
+
whatday=4
|
109
|
+
when "星期五"
|
110
|
+
whatday=5
|
111
|
+
when "星期六"
|
112
|
+
whatday=6
|
113
|
+
when "星期日"
|
114
|
+
whatday=7
|
115
|
+
end
|
116
|
+
|
117
|
+
get_week_url = lambda {|url,whatday|
|
118
|
+
_url = "http://www.tvmao.com"
|
119
|
+
urls = []
|
120
|
+
_urls = url.split("-")
|
121
|
+
0.upto(1).each do |i|
|
122
|
+
_url = _url+"#{_urls[i]}"+"-"
|
123
|
+
end
|
124
|
+
url = _url+"w#{whatday}.html"
|
125
|
+
return url
|
126
|
+
}
|
127
|
+
return get_week_url.call(url,whatday)
|
128
|
+
end
|
129
|
+
|
130
|
+
|
131
|
+
|
132
|
+
|
133
|
+
|
25
134
|
|
26
135
|
#将星期的wday获取值转化为中文名
|
27
136
|
#conversion wady to chinese
|
28
|
-
def
|
137
|
+
def conversion_what_day(whatday)
|
29
138
|
ret = "星期"
|
30
139
|
case whatday.to_i
|
31
140
|
when 1
|
@@ -47,7 +156,7 @@ module GrabTvmao
|
|
47
156
|
end
|
48
157
|
|
49
158
|
#如果时间为1~9的一位则为其在数字前加0补齐二位
|
50
|
-
def
|
159
|
+
def dispose_time(num)
|
51
160
|
num = num.to_s
|
52
161
|
if num.length < 2
|
53
162
|
num = "0"+num
|
@@ -56,7 +165,7 @@ module GrabTvmao
|
|
56
165
|
end
|
57
166
|
|
58
167
|
#转化当前时间的格式
|
59
|
-
def
|
168
|
+
def get_week_date_time(time)
|
60
169
|
month = time.month
|
61
170
|
day = time.day
|
62
171
|
whatday = time.wday
|
@@ -65,26 +174,26 @@ module GrabTvmao
|
|
65
174
|
end
|
66
175
|
|
67
176
|
#前几天需要减去的num
|
68
|
-
def
|
177
|
+
def del_day_num(day_num)
|
69
178
|
ret = day_num*60*60*24
|
70
179
|
ret
|
71
180
|
end
|
72
181
|
|
73
182
|
#获取距离当前多少天的之前的日期
|
74
|
-
def
|
183
|
+
def get_time_day_prior(num)
|
75
184
|
time = Time.now - del_day_num(num)
|
76
185
|
ret = get_week_date_time(time)
|
77
186
|
ret
|
78
187
|
end
|
79
188
|
|
80
189
|
#前面一周要删除的日期的列表
|
81
|
-
def
|
190
|
+
def del_time_list
|
82
191
|
ret = []
|
83
192
|
time = Time.now
|
84
193
|
wday = time.wday
|
85
194
|
if(wday==1)
|
86
195
|
for i in 0..7
|
87
|
-
ret<<
|
196
|
+
ret<<get_time_day_prior(i)
|
88
197
|
end
|
89
198
|
end
|
90
199
|
ret
|
@@ -94,7 +203,7 @@ module GrabTvmao
|
|
94
203
|
|
95
204
|
|
96
205
|
#调用此方法的例子
|
97
|
-
def
|
206
|
+
def start
|
98
207
|
#作用是获取俩个字符串的相似度
|
99
208
|
#get str1 and str2 similarity
|
100
209
|
get_similarity_string = lambda { |str1,str2|
|
@@ -201,14 +310,14 @@ module GrabTvmao
|
|
201
310
|
end
|
202
311
|
end
|
203
312
|
|
204
|
-
def
|
313
|
+
def img_down_path
|
205
314
|
@img_down_path
|
206
315
|
end
|
207
316
|
|
208
317
|
|
209
318
|
#获取网站的频道表
|
210
319
|
#img_path 图片存放路径
|
211
|
-
def
|
320
|
+
def getchannels(img_dir_path)
|
212
321
|
@channel = []
|
213
322
|
@site=DEFAULT_SITE
|
214
323
|
@proxyindex = 0
|
@@ -252,39 +361,76 @@ module GrabTvmao
|
|
252
361
|
{"channel_info"=>channel_info,"channel_urls"=>channel_urls}
|
253
362
|
end
|
254
363
|
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
if(proxylist[@proxyindex])
|
262
|
-
proxy = proxylist[@proxyindex]
|
263
|
-
else
|
264
|
-
proxy = proxylist[@proxyindex+1]
|
265
|
-
end
|
266
|
-
begin
|
267
|
-
doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}")) unless proxy.nil?||proxy.empty?
|
268
|
-
doc = Nokogiri::HTML(open(url)) if proxy.nil?||proxy.empty?
|
269
|
-
@no_firest = 0
|
270
|
-
rescue => err
|
364
|
+
|
365
|
+
def err_doc_proxy(proxy,proxylist,url="",err="")
|
366
|
+
if proxy.empty?||proxy.nil?
|
367
|
+
proxylist.delete_at[@proxyindex]
|
368
|
+
end
|
369
|
+
|
271
370
|
|
272
371
|
unless @no_firest
|
273
372
|
@no_firest = 0
|
274
373
|
end
|
275
374
|
|
276
375
|
@no_firest += 1
|
277
|
-
p "*************************Proxy:#{proxy}, url:#{url} Error:#{err
|
376
|
+
p "*************************Proxy:#{proxy}, url:#{url} Error:#{err}"
|
278
377
|
#proxylist.delete(proxy) #删除出错的代理 但如果是此网页错误则会引起BUG待修复
|
279
|
-
|
280
|
-
|
378
|
+
@proxyindex += 1
|
379
|
+
@proxyindex=@proxyindex%@size
|
380
|
+
doc=get_doc_with_proxy(proxylist,url) if @no_firest<4
|
381
|
+
unless @no_firest<4
|
382
|
+
@no_firest=0
|
383
|
+
raise RuntimeError,"Error: #{err}"
|
384
|
+
end
|
385
|
+
doc
|
386
|
+
end
|
387
|
+
|
388
|
+
|
389
|
+
#使用代理获取url的html的doc值
|
390
|
+
def get_doc_with_proxy(proxylist,url)
|
391
|
+
unless proxylist.nil?||proxylist.empty?
|
392
|
+
unless @proxyindex
|
393
|
+
@proxyindex = 0
|
394
|
+
end
|
395
|
+
@size = proxylist.size
|
396
|
+
@proxyindex=@proxyindex%proxylist.size
|
397
|
+
if(proxylist[@proxyindex])
|
398
|
+
proxy = proxylist[@proxyindex]
|
399
|
+
else
|
400
|
+
proxy = proxylist[@proxyindex+1]
|
401
|
+
end
|
402
|
+
begin
|
403
|
+
doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}").read) unless proxy.nil?||proxy.empty?
|
404
|
+
if doc.nil?
|
405
|
+
p "DOC is nil"
|
406
|
+
doc=err_doc_proxy(proxy,proxylist,url,"doc nil")
|
407
|
+
@no_firest=0
|
408
|
+
end
|
409
|
+
@no_firest = 0
|
410
|
+
rescue => err
|
411
|
+
p "IN Rescue"
|
412
|
+
doc=err_doc_proxy(proxy,proxylist,url,err.to_s)
|
413
|
+
@no_firest=0
|
414
|
+
p "Get DOC"
|
415
|
+
@proxyindex += 1
|
416
|
+
@proxyindex=@proxyindex%@size
|
417
|
+
return doc
|
418
|
+
end
|
419
|
+
@proxyindex += 1
|
420
|
+
@proxyindex=@proxyindex%@size
|
421
|
+
else
|
422
|
+
begin
|
423
|
+
doc = Nokogiri::HTML(open(url).read) if proxy.nil?||proxy.empty?
|
424
|
+
rescue => err
|
425
|
+
p "Error : Proxy:#{proxy}, url:#{url}"
|
426
|
+
raise RuntimeError,"Error: #{err.to_s} Method:get_doc_with_proxy"
|
427
|
+
end
|
428
|
+
end
|
429
|
+
doc
|
281
430
|
end
|
282
|
-
@proxyindex += 1
|
283
|
-
doc
|
284
|
-
end
|
285
431
|
|
286
432
|
#获取某天的节目表
|
287
|
-
def
|
433
|
+
def get_schedulelist_atday(channel,url,proxylist)
|
288
434
|
p "Grab: #{url}"
|
289
435
|
doc = get_doc_with_proxy(proxylist,url)
|
290
436
|
show_type = []
|
@@ -322,6 +468,7 @@ module GrabTvmao
|
|
322
468
|
schedule = schedule.content.split(" ")[1]
|
323
469
|
show_name = ""
|
324
470
|
unless schedule_herf.nil?||schedule_herf.empty?
|
471
|
+
p "Show_infomation:#{schedule_herf} Time:#{time}"
|
325
472
|
show_infomation=get_show_infomation(proxylist,schedule_herf)
|
326
473
|
show_type=show_infomation["type"]
|
327
474
|
show_name = show_infomation["name"]
|
@@ -337,7 +484,7 @@ module GrabTvmao
|
|
337
484
|
#获取制定时间和长度url
|
338
485
|
#start_time 为int型 开始时间和今天的差值 正数代表之后的第几天 负数代表之前的第几天
|
339
486
|
#day_num 为int型 代表抓取的时间从开始时间计算的多少天
|
340
|
-
def
|
487
|
+
def get_assign_date_url(url,start_time,day_num)
|
341
488
|
site="http://www.tvmao.com"
|
342
489
|
if(@site)
|
343
490
|
site=@site
|
@@ -373,7 +520,7 @@ module GrabTvmao
|
|
373
520
|
|
374
521
|
|
375
522
|
#获取指定时间段的节目表
|
376
|
-
def
|
523
|
+
def getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path)
|
377
524
|
begin
|
378
525
|
day_num = 1 if day_num<1
|
379
526
|
rescue
|
@@ -394,7 +541,7 @@ module GrabTvmao
|
|
394
541
|
channel_schedule = {}
|
395
542
|
get_assign_date_url(herf,start_num,day_num).each do |url|
|
396
543
|
@date = ""
|
397
|
-
schedule_list =
|
544
|
+
schedule_list = get_schedulelist_atday(channel,url,proxylist)
|
398
545
|
channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
|
399
546
|
end
|
400
547
|
@img_down_file.close
|
@@ -407,7 +554,7 @@ module GrabTvmao
|
|
407
554
|
|
408
555
|
#因原已调用所以保留
|
409
556
|
#获取一周节目表
|
410
|
-
def
|
557
|
+
def getschedule(channel,herf,proxylist,day_num=7,img_dir_down_path=@img_down_dir_path)
|
411
558
|
p "Day Num is #{day_num}"
|
412
559
|
begin
|
413
560
|
day_num = 1 if day_num<1
|
@@ -442,7 +589,7 @@ module GrabTvmao
|
|
442
589
|
channel_schedule = {}
|
443
590
|
get_week_url.call(herf,day_num).each do |url|
|
444
591
|
@date = ""
|
445
|
-
schedule_list =
|
592
|
+
schedule_list = get_schedulelist_atday(channel,url,proxylist)
|
446
593
|
channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
|
447
594
|
end
|
448
595
|
@img_down_file.close
|
@@ -451,16 +598,14 @@ module GrabTvmao
|
|
451
598
|
|
452
599
|
|
453
600
|
#获取节目详细信息
|
454
|
-
def
|
601
|
+
def get_show_infomation(proxy_list,schedule_herf)
|
455
602
|
begin
|
456
603
|
@proxyindex = 0
|
457
604
|
unless @site
|
458
605
|
@site = "http://www.tvmao.com"
|
459
606
|
end
|
460
607
|
schedule_herf = @site + schedule_herf
|
461
|
-
doc=get_doc_with_proxy(proxy_list,schedule_herf)
|
462
|
-
#title = doc.css("a[herf='#{schedule_herf}+/detail']")[0]['title']
|
463
|
-
# p "title: %s" % title
|
608
|
+
doc = get_doc_with_proxy(proxy_list,schedule_herf)
|
464
609
|
type = []
|
465
610
|
name = doc.css('span[itemprop="name"]')[0].content
|
466
611
|
|
@@ -479,39 +624,42 @@ module GrabTvmao
|
|
479
624
|
end
|
480
625
|
url = "#{schedule_herf}/detail"
|
481
626
|
doc = get_doc_with_proxy(proxy_list,url)
|
482
|
-
doc
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
type<<_type.content
|
627
|
+
if doc
|
628
|
+
doc.css('span[itemprop="genre"]').each do |_type|
|
629
|
+
type << _type.content
|
630
|
+
end
|
487
631
|
end
|
488
632
|
type.uniq!
|
489
|
-
@
|
633
|
+
unless @show_schedule
|
634
|
+
@show_schedule={}
|
635
|
+
end
|
490
636
|
@show_schedule.merge!(name=>get_show_schedule(proxy_list,schedule_herf)) unless @show_schedule.has_key?(name)
|
491
637
|
{"type"=>type,"name"=>name,"img"=>schedule_img_down_path}
|
492
|
-
rescue => e
|
493
|
-
|
638
|
+
#rescue => e
|
639
|
+
# p "Error In get_show_infomation msg : #{e.to_s}"
|
494
640
|
end
|
495
641
|
end
|
496
642
|
|
497
643
|
#获取节目的时间表
|
498
|
-
def
|
644
|
+
def get_show_schedule(proxylist,herf)
|
499
645
|
url = herf + "/playingtime"
|
500
646
|
doc = get_doc_with_proxy(proxylist,url)
|
501
647
|
i = 0
|
502
648
|
schedule = []
|
503
|
-
doc.css('div[id="epg"]')[0]
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
649
|
+
if doc.css('div[id="epg"]')[0]
|
650
|
+
doc.css('div[id="epg"]')[0].css("div[class='c1 col']").each do |epg|
|
651
|
+
unless(i==0)
|
652
|
+
time = epg.css('div[class="f1 fld"]')[0].content
|
653
|
+
channel_name = epg.css('div[class="f2 fld"]')[0].content
|
654
|
+
show_name = epg.css('div[class="f3 fld"]')[0].content
|
655
|
+
times = time.split(" ")
|
656
|
+
week = times[0]
|
657
|
+
date = times[1]
|
658
|
+
_time = times[2]
|
659
|
+
schedule << {"week"=>week,"date"=>date,"time"=>_time,"channel_name"=>channel_name,"show_name"=>show_name}
|
660
|
+
end
|
661
|
+
i += 1
|
513
662
|
end
|
514
|
-
i += 1
|
515
663
|
end
|
516
664
|
schedule
|
517
665
|
end
|
@@ -521,7 +669,7 @@ module GrabTvmao
|
|
521
669
|
|
522
670
|
#获取指定访问速度的代理服务器
|
523
671
|
#time为最慢速度的时间 int型 代表秒
|
524
|
-
def
|
672
|
+
def get_topfast_list(use_time)
|
525
673
|
fast_list = []
|
526
674
|
time_use = 0
|
527
675
|
ips_ports = get_proxy_list()
|
@@ -555,7 +703,7 @@ module GrabTvmao
|
|
555
703
|
end
|
556
704
|
|
557
705
|
#获取代理列表
|
558
|
-
def
|
706
|
+
def get_proxy_list()
|
559
707
|
list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
|
560
708
|
if list.count ==0
|
561
709
|
list = gg('http://www.proxycn.cn/html_proxy/http-1.html')
|
@@ -575,7 +723,7 @@ module GrabTvmao
|
|
575
723
|
ips_ports
|
576
724
|
end
|
577
725
|
|
578
|
-
def
|
726
|
+
def gg(url)
|
579
727
|
regex_list = /<TD class="list">.*<\/TD>/
|
580
728
|
href =URI.parse(url)
|
581
729
|
contxt = ""
|
@@ -588,5 +736,5 @@ module GrabTvmao
|
|
588
736
|
def save_img
|
589
737
|
|
590
738
|
end
|
591
|
-
|
739
|
+
end
|
592
740
|
end
|
data/lib/grabepg/grab_base.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
#encoding:utf-8
|
2
2
|
require 'nokogiri'
|
3
|
+
require 'iconv'
|
3
4
|
|
4
5
|
module Grabepg
|
5
6
|
|
@@ -22,13 +23,48 @@ module Grabepg
|
|
22
23
|
ret += "五"
|
23
24
|
when 6
|
24
25
|
ret += "六"
|
25
|
-
when
|
26
|
-
ret += "
|
26
|
+
when 0
|
27
|
+
ret += "日"
|
27
28
|
end
|
28
29
|
ret
|
29
30
|
end
|
30
31
|
|
31
32
|
|
33
|
+
def self.proxy_list(path)
|
34
|
+
proxy_list = []
|
35
|
+
crt_date = DateTime.now.strftime('%F')
|
36
|
+
proxy_path = "%s/proxy/%s.txt" % [File.dirname(path),crt_date]
|
37
|
+
p "Proxy_Path: #{proxy_path}"
|
38
|
+
if File.exist?(proxy_path)
|
39
|
+
file_proxy = File.open(proxy_path,"r")
|
40
|
+
file_proxy.each_line {|line|
|
41
|
+
proxy_list << line.chomp.to_s
|
42
|
+
}
|
43
|
+
p "Get Proxy_list:#{proxy_list}"
|
44
|
+
file_proxy.flush
|
45
|
+
file_proxy.close
|
46
|
+
else
|
47
|
+
proxy_list=GetProxyList.get_list(ENV["proxy_limit"].to_i,ENV["proxy_page"].to_i)
|
48
|
+
dirpath = "#{File.dirname(path)}/proxy/"
|
49
|
+
Dir.open(dirpath) {|fna|
|
50
|
+
fna.each do |fn|
|
51
|
+
if(fn.to_s != ".." && fn.to_s != ".")
|
52
|
+
File.delete("#{dirpath + fn.to_s}")
|
53
|
+
end
|
54
|
+
end
|
55
|
+
}
|
56
|
+
file_proxy = File.new(proxy_path,"a")
|
57
|
+
proxy_list.each do |proxy|
|
58
|
+
p "Proxy:#{proxy}"
|
59
|
+
file_proxy.puts proxy
|
60
|
+
end
|
61
|
+
file_proxy.flush
|
62
|
+
file_proxy.close
|
63
|
+
end
|
64
|
+
return proxy_list
|
65
|
+
end
|
66
|
+
|
67
|
+
|
32
68
|
#获取指定访问速度的代理服务器
|
33
69
|
#time为最慢速度的时间 int型 代表秒
|
34
70
|
def self.get_topfast_list(use_time)
|
@@ -157,8 +193,10 @@ module Grabepg
|
|
157
193
|
proxy = proxylist[@proxyindex+1]
|
158
194
|
end
|
159
195
|
begin
|
160
|
-
|
196
|
+
ic = Iconv.new("UTF-8//IGNORE","GB2312")
|
197
|
+
doc = Nokogiri::HTML(ic.iconv(open(url,:proxy=>"#{proxy}").read)) unless proxy.nil?||proxy.empty?
|
161
198
|
if doc.nil?
|
199
|
+
p "DOC is nil"
|
162
200
|
doc=err_doc_proxy(proxy,proxylist,url,"doc nil")
|
163
201
|
@no_firest=0
|
164
202
|
end
|
@@ -176,7 +214,8 @@ module Grabepg
|
|
176
214
|
@proxyindex=@proxyindex%@size
|
177
215
|
else
|
178
216
|
begin
|
179
|
-
|
217
|
+
ic = Iconv.new("GB2312//IGNORE","GB2312")
|
218
|
+
doc = Nokogiri::HTML(ic.iconv(open(url).read)) if proxy.nil?||proxy.empty?
|
180
219
|
rescue => err
|
181
220
|
p "Error : Proxy:#{proxy}, url:#{url}"
|
182
221
|
raise RuntimeError,"Error: #{err.to_s} Method:get_doc_with_proxy"
|
data/lib/grabepg/grab_tvsou.rb
CHANGED
@@ -36,6 +36,10 @@ module Grabepg
|
|
36
36
|
@site="http://m.tvsou.com"
|
37
37
|
end
|
38
38
|
|
39
|
+
def get_proxy_list
|
40
|
+
@proxy_list
|
41
|
+
end
|
42
|
+
|
39
43
|
#获取从tvsou的什么网站上获取
|
40
44
|
#type: mobile,webpage
|
41
45
|
def get_url(type)
|
@@ -44,11 +48,15 @@ module Grabepg
|
|
44
48
|
|
45
49
|
def get_data_year_month_day(time)
|
46
50
|
|
47
|
-
month
|
51
|
+
month=time.month.to_s
|
48
52
|
if month.length<2
|
49
|
-
month
|
53
|
+
month="0"+month
|
50
54
|
end
|
51
|
-
|
55
|
+
day = time.day.to_s
|
56
|
+
if day.length<2
|
57
|
+
day = "0"+day
|
58
|
+
end
|
59
|
+
return {time:"#{time.year}-#{time.month}-#{day}",date:"#{@grabbase.conversion_what_day(time.wday)}(#{month}-#{day})"}
|
52
60
|
end
|
53
61
|
|
54
62
|
#获取时间
|
@@ -111,6 +119,28 @@ module Grabepg
|
|
111
119
|
|
112
120
|
end
|
113
121
|
|
122
|
+
#获取频道图标地址
|
123
|
+
# url 手机表的URL值
|
124
|
+
# channel_type 频道类型
|
125
|
+
# no_dis 直接使用URL 不处理
|
126
|
+
def get_channel_logo(_url,channel_type,no_dis=false)
|
127
|
+
if no_dis
|
128
|
+
url = _url
|
129
|
+
else
|
130
|
+
tvs = _url.split("TVid=")
|
131
|
+
tvid = tvs[1].split("&")[0]
|
132
|
+
channelids = _url.split("Channelid=")
|
133
|
+
channelid = channelids[1].split("&")[0]
|
134
|
+
if channel_type=="CCTV"
|
135
|
+
url = "http://epg.tvsou.com/programys/TV_#{tvid}/Channel_#{channelid}/W1.htm"
|
136
|
+
elsif channel_type=="WTV"
|
137
|
+
url = "http://epg.tvsou.com/programws/TV_#{tvid}/Channel_#{channelid}/W1.htm"
|
138
|
+
end
|
139
|
+
end
|
140
|
+
doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
|
141
|
+
logo_network_path=doc.css("div[id='epg_m1']").css("img")[0].get_attribute("src")
|
142
|
+
return logo_network_path
|
143
|
+
end
|
114
144
|
|
115
145
|
|
116
146
|
#获取频道时间表URL
|
@@ -131,11 +161,12 @@ module Grabepg
|
|
131
161
|
|
132
162
|
#根据URL解析时间表页面
|
133
163
|
def dispose_schedule_page(url,start_time,use_time)
|
134
|
-
url =
|
164
|
+
url = url
|
135
165
|
urls = url.split("?")
|
136
166
|
begin
|
137
167
|
doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
|
138
168
|
@error_num = 0
|
169
|
+
_url = doc.css("div[class='week']")[0].css('a')[0].get_attribute("href")
|
139
170
|
rescue => err
|
140
171
|
unless @error_num
|
141
172
|
@error_num = 0
|
@@ -144,7 +175,6 @@ module Grabepg
|
|
144
175
|
raise err.to_s if @error_num==5
|
145
176
|
dispose_schedule_page(url,start_time,use_time)
|
146
177
|
end
|
147
|
-
_url = doc.css("div[class='week']")[0].css('a')[0].get_attribute("href")
|
148
178
|
_url = urls[0]+_url
|
149
179
|
urls = dispose_href_schedule_data(_url,start_time,use_time)
|
150
180
|
ret = {}
|
@@ -162,7 +192,8 @@ module Grabepg
|
|
162
192
|
_dispose = schedule.content
|
163
193
|
_dispose_show =schedule.css("span")[0].text
|
164
194
|
time = _dispose.gsub(_dispose_show,"")
|
165
|
-
|
195
|
+
href =schedule.css('a')[schedule.css('a').count-1].get_attribute("href")
|
196
|
+
_url = @site+"/" + href if schedule.css('a')[0]
|
166
197
|
schedules << {time:time,schedule_name:_dispose_show.delete(" 剧情"),url:_url}
|
167
198
|
now = time.gsub(":","").to_i
|
168
199
|
if((now-last_time)<5)
|
@@ -190,6 +221,14 @@ module Grabepg
|
|
190
221
|
#解析节目详情页面
|
191
222
|
def dispose_show_info(url)
|
192
223
|
doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
|
224
|
+
if doc.nil?
|
225
|
+
unless @error_num
|
226
|
+
@error_num = 0
|
227
|
+
end
|
228
|
+
@error_num+=1
|
229
|
+
raise err.to_s if @error_num==5
|
230
|
+
dispose_show_info(url)
|
231
|
+
end
|
193
232
|
begin
|
194
233
|
show_name = doc.css('div[class="tv_info_top"]')[0].content
|
195
234
|
_doc=doc.css("div[class='tv_info']")
|
@@ -210,4 +249,4 @@ module Grabepg
|
|
210
249
|
|
211
250
|
end
|
212
251
|
|
213
|
-
end
|
252
|
+
end
|
data/lib/test/test_grab_tvsou.rb
CHANGED
@@ -11,20 +11,19 @@ class TestGrabTvsou
|
|
11
11
|
end
|
12
12
|
|
13
13
|
def get_data(start_time,use_time)
|
14
|
-
@grabtvsou.get_data(
|
14
|
+
@grabtvsou.get_data(start_time,use_time)
|
15
15
|
end
|
16
16
|
|
17
17
|
def dispose_href_schedule_data(href,start_time,use_time)
|
18
18
|
@grabtvsou.dispose_href_schedule_data(href,start_time,use_time)
|
19
19
|
end
|
20
20
|
|
21
|
-
def dispose_schedule_page()
|
22
|
-
href = "http://m.tvsou.com/epg.asp?TVid=1&Channelid=1&pro=ys"
|
21
|
+
def dispose_schedule_page(href="http://m.tvsou.com/epg.asp?TVid=1&Channelid=1&pro=ys")
|
23
22
|
@grabtvsou.dispose_schedule_page(href,0,1)
|
24
23
|
end
|
25
24
|
|
26
25
|
def dispose_show_info
|
27
|
-
hrefs = ["http://
|
26
|
+
hrefs = ["http://msou.com//jq3.asp?id=75928&tid=3","http://m.tvsou.com//jq3.asp?id=89450&tid=3"]
|
28
27
|
ret = []
|
29
28
|
hrefs.each do |href|
|
30
29
|
ret<<@grabtvsou.dispose_show_info(href)
|
@@ -36,6 +35,11 @@ class TestGrabTvsou
|
|
36
35
|
@grabtvsou.dispose_home_page
|
37
36
|
end
|
38
37
|
|
38
|
+
def get_channel_logo(url="epg.asp?TVid=1&Channelid=1&pro=ys")
|
39
|
+
@grabtvsou.get_channel_logo(url)
|
40
|
+
end
|
41
|
+
|
42
|
+
|
39
43
|
def self.start
|
40
44
|
_grabtvsou = GrabTvsou.new("mobile",[])
|
41
45
|
p channels = _grabtvsou.dispose_home_page
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grab_epg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hahazql
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-06-
|
11
|
+
date: 2013-06-07 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: ! '"用于抓取EPG信息"'
|
14
14
|
email:
|