grab_epg 0.2.3 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MTI5MTMwMTY1NTRmZjk5NGIwZGM4MTk3NTljNGFjMzAwOWY4NTdiNA==
4
+ ZWMyZTQzOThkZWI3YTUxYjIwZGU5ZGRkZmQ2ZTQzNTBjYjAxYWE5ZA==
5
5
  data.tar.gz: !binary |-
6
- NzNjZTc3YzY1OWZkYTZjOGUzNTVjNzVmZjgzYjg3NjQyZWUzNGFmMg==
6
+ YjM0MDViYzQzZGQ3OWNiYjk3ZmVjMDA0Mzk2OTA0M2UzNjdlNTdkNg==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- ZGMwNThmMzZlY2FmZmI1ZmQzNjY5ODdkYTI4MTk5MWI2NWZiODBlZjQ1YzNk
10
- MzllOGE1YmNkZjRiNjc3MDlhY2FjZjMyNjFiNTcxYjFlZTlmYzgwNmVlMmQx
11
- NzUxODIwMGE1MjgxZWM0NWY3ZDlmNWE0YmYyN2U0NTY1YjU3NmQ=
9
+ YjE5ZTgxMzg5OTIwYTNjYTNlMDkzYWVlMWMwODczZDk3ZTZkNzIzYmQzZTI0
10
+ YWUwYTNhOWVjNjQyNTVhNjAwODYxY2RhOTc4YmM4YWI2ZmMzNzI3ZjRhMmZj
11
+ MTg2MjJmOWIzNmRhODU1N2MwNDAwZmJhYmQwZTk2ZDU3MTU1YzU=
12
12
  data.tar.gz: !binary |-
13
- MTA3MmE5MGZkMzU5YzliYjljNTQ1NzljNWViYTQ5YWU5ZmNlZGQ5OWJmZTUz
14
- YmZiN2QyNjUzOTk0OGQwMzM0NmZjOTEwOTI2MzJkZjAxMDg5YzdlNzUxNjM3
15
- Y2VkNmQzMGUyMjQ0Nzc5MTZkMGE5NjY4Y2IwZTY2ZGI5Y2MyOTA=
13
+ MGM5MmEzZWU0MjcwZTgyNzliOTRkYjhkOGQ2Njk4OTZlOTI1ZjFkMjI5MGJi
14
+ MzE4Mjg0MmJjNmE2ZmJkY2YwNTQzM2QzZDcwNDQxNzM4MWE4NTI4ZjU2ZmMz
15
+ YzMwZTcyMTA1NjJiMTlhY2NhZGE0MTU4NzJiMzBkZTVjNWVkZGM=
data/.grabepg.gemspec CHANGED
@@ -10,6 +10,6 @@ Gem::Specification.new do |gem|
10
10
  gem.files = `git ls-files`.split($\)
11
11
  gem.name = "grab_epg"
12
12
  gem.require_paths = ["lib"]
13
- gem.version = "0.2.3"
13
+ gem.version = "0.2.4"
14
14
  gem.homepage = "https://github.com/hahazql/grab_epg"
15
15
  end
data/lib/debug.rb CHANGED
@@ -3,10 +3,12 @@
3
3
  require 'nokogiri'
4
4
  require 'open-uri'
5
5
  require File.expand_path("../test/test_grab_tvsou.rb", __FILE__)
6
+ require File.expand_path("../test/test_grab_tvmao.rb", __FILE__)
6
7
  #require 'test/test_grab_tvsou'
7
8
  class Debug
8
9
  # To change this template use File | Settings | File Templates.
9
10
  #proxylist = ["219.234.82.84:24809", "219.234.82.84:17130", "219.234.82.84:23684", "219.234.82.84:18253", "219.234.82.84:33987", "219.234.82.84:17183", "219.234.82.84:13243", "219.234.82.84:16158", "219.234.82.84:14826", "219.234.82.84:8489", "219.234.82.84:22222", "219.234.82.84:6370", "219.234.82.84:7571", "219.234.82.84:33944", "219.234.82.84:9743", "219.234.82.84:8089", "219.234.82.84:20991", "219.234.82.84:34032", "219.234.82.84:9415", "219.234.82.84:26149", "219.234.82.84:11095", "219.234.82.84:21724", "219.234.82.84:9177", "219.234.82.84:34034", "219.234.82.84:17945", "219.234.82.85:32229", "219.234.82.85:28341", "219.234.82.85:36314", "219.234.82.85:30605", "219.234.82.85:23684", "219.234.82.85:34015", "219.234.82.85:33919", "219.234.82.85:30639", "219.234.82.85:33965", "219.234.82.85:37299", "219.234.82.85:20747", "219.234.82.86:6666", "219.234.82.86:34106", "219.234.82.86:25301", "219.234.82.86:32896", "219.234.82.86:23034", "219.234.82.86:22685", "219.234.82.86:13078", "219.234.82.86:38770", "219.234.82.86:28402", "219.234.82.86:18887", "219.234.82.86:6588", "219.234.82.86:7292", "219.234.82.86:24268", "219.234.82.86:16472", "219.234.82.86:32597", "219.234.82.86:31122", "219.234.82.88:8817", "219.234.82.88:8160", "219.234.82.88:9239", "219.234.82.88:6133", "114.141.162.53:8080", "123.125.116.243:17656", "123.125.116.241:29156", "123.125.116.243:6938", "219.234.82.88:29484", "219.234.82.88:8084", "219.234.82.88:32229", "219.234.82.88:22758", "219.234.82.88:5616", "124.225.52.14:8080", "219.234.82.88:30028", "219.234.82.88:23685", "219.234.82.88:29037", "219.234.82.88:8755"]
10
11
 
11
- p TestGrabTvsou.start
12
+ # p TestGrabTvsou.new.get_channel_logo
13
+ p TestGrabTvmao.new.test_get_show_type_by_batch
12
14
  end
data/lib/grab_tvmao.rb CHANGED
@@ -3,7 +3,11 @@
3
3
  require 'nokogiri'
4
4
  require 'open-uri'
5
5
 
6
- module GrabTvmao
6
+ require File.expand_path("../grabepg/grab_base.rb", __FILE__)
7
+ require File.expand_path("../grabepg/grab_tvsou.rb", __FILE__)
8
+
9
+ module Grabepg
10
+ class GrabTvmao
7
11
  # To change this template use File | Settings | File Templates.
8
12
 
9
13
 
@@ -22,10 +26,115 @@ module GrabTvmao
22
26
 
23
27
 
24
28
 
29
+ def initialize
30
+ @grabbase = GrabBase.new
31
+ end
32
+
33
+
34
+
35
+ #批量从tvmao获取节目类型
36
+ #channel 节目表属于的屏道
37
+ #url 节目表获取的网络地址
38
+ #date 日期
39
+ #schedule 需要批量修改的时间表
40
+ #proxylist 代理列表
41
+ def get_show_type_by_batch(channel,url,date,schedule,proxylist)
42
+ _schedule = {}
43
+ schedule.each do |s|
44
+ time = s["schedule_start"].gsub(":","").to_i
45
+ _schedule.merge!(time=>s)
46
+ end
47
+ url = get_show_type_url(url,date)
48
+ schedules = get_schedulelist_atday(channel,url,proxylist)
49
+ type = nil
50
+ schedules.each do |schedule|
51
+ schedule_time_num = schedule["schedule_start"].gsub(":","").to_i
52
+ if _schedule.has_key?(schedule_time_num)
53
+ _schedule[schedule_time_num]["type"]=_schedule[schedule_time_num]["type"]|schedule["type"]
54
+ p "*****************************************************************************************"
55
+ p "Schedule: #{_schedule[schedule_time_num]}"
56
+ p "schedule_logo_1: #{_schedule[schedule_time_num]["schedule_logo"]}"
57
+ p "schedule_logo_2: #{_schedule[schedule_time_num][:schedule_logo]}"
58
+ if _schedule[schedule_time_num]["schedule_logo"]==""
59
+ unless schedule["img"]==""
60
+ _schedule[schedule_time_num]["schedule_logo"]=schedule["img"]
61
+ end
62
+ end
63
+ end
64
+ end
65
+ ret = []
66
+ _schedule.each do |key,value|
67
+ ret << value
68
+ end
69
+
70
+ ret
71
+ end
72
+
73
+ #批量从tvmao获取节目类型
74
+ #channel 节目表属于的屏道
75
+ #url 节目表获取的网络地址
76
+ #date 日期
77
+ #time 节目开始时间
78
+ #proxylist 代理列表
79
+ def get_show_type(channel,url,date,time,proxylist)
80
+ url = get_show_type_url(url,date)
81
+ schedules = get_schedulelist_atday(channel,url,proxylist)
82
+ _time_num = time.gsub(":","").to_i
83
+ type = nil
84
+ schedules.each do |schedule|
85
+ schedule_time_num = schedule["schedule_start"].gsub(":","").to_i
86
+ if _time_num==schedule_time_num
87
+ type = schedule["type"]
88
+ end
89
+ end
90
+ if type
91
+ return type
92
+ else
93
+ return []
94
+ end
95
+ end
96
+
97
+ def get_show_type_url(url,date)
98
+ whatday = 0
99
+ _date = date.split("(")[0]
100
+ case _date
101
+ when "星期一"
102
+ whatday=1
103
+ when "星期二"
104
+ whatday=2
105
+ when "星期三"
106
+ whatday=3
107
+ when "星期四"
108
+ whatday=4
109
+ when "星期五"
110
+ whatday=5
111
+ when "星期六"
112
+ whatday=6
113
+ when "星期日"
114
+ whatday=7
115
+ end
116
+
117
+ get_week_url = lambda {|url,whatday|
118
+ _url = "http://www.tvmao.com"
119
+ urls = []
120
+ _urls = url.split("-")
121
+ 0.upto(1).each do |i|
122
+ _url = _url+"#{_urls[i]}"+"-"
123
+ end
124
+ url = _url+"w#{whatday}.html"
125
+ return url
126
+ }
127
+ return get_week_url.call(url,whatday)
128
+ end
129
+
130
+
131
+
132
+
133
+
25
134
 
26
135
  #将星期的wday获取值转化为中文名
27
136
  #conversion wady to chinese
28
- def self.conversion_what_day(whatday)
137
+ def conversion_what_day(whatday)
29
138
  ret = "星期"
30
139
  case whatday.to_i
31
140
  when 1
@@ -47,7 +156,7 @@ module GrabTvmao
47
156
  end
48
157
 
49
158
  #如果时间为1~9的一位则为其在数字前加0补齐二位
50
- def self.dispose_time(num)
159
+ def dispose_time(num)
51
160
  num = num.to_s
52
161
  if num.length < 2
53
162
  num = "0"+num
@@ -56,7 +165,7 @@ module GrabTvmao
56
165
  end
57
166
 
58
167
  #转化当前时间的格式
59
- def self.get_week_date_time(time)
168
+ def get_week_date_time(time)
60
169
  month = time.month
61
170
  day = time.day
62
171
  whatday = time.wday
@@ -65,26 +174,26 @@ module GrabTvmao
65
174
  end
66
175
 
67
176
  #前几天需要减去的num
68
- def self.del_day_num(day_num)
177
+ def del_day_num(day_num)
69
178
  ret = day_num*60*60*24
70
179
  ret
71
180
  end
72
181
 
73
182
  #获取距离当前多少天的之前的日期
74
- def self.get_time_day_prior(num)
183
+ def get_time_day_prior(num)
75
184
  time = Time.now - del_day_num(num)
76
185
  ret = get_week_date_time(time)
77
186
  ret
78
187
  end
79
188
 
80
189
  #前面一周要删除的日期的列表
81
- def self.del_time_list
190
+ def del_time_list
82
191
  ret = []
83
192
  time = Time.now
84
193
  wday = time.wday
85
194
  if(wday==1)
86
195
  for i in 0..7
87
- ret<<self.get_time_day_prior(i)
196
+ ret<<get_time_day_prior(i)
88
197
  end
89
198
  end
90
199
  ret
@@ -94,7 +203,7 @@ module GrabTvmao
94
203
 
95
204
 
96
205
  #调用此方法的例子
97
- def self.start
206
+ def start
98
207
  #作用是获取俩个字符串的相似度
99
208
  #get str1 and str2 similarity
100
209
  get_similarity_string = lambda { |str1,str2|
@@ -201,14 +310,14 @@ module GrabTvmao
201
310
  end
202
311
  end
203
312
 
204
- def self.img_down_path
313
+ def img_down_path
205
314
  @img_down_path
206
315
  end
207
316
 
208
317
 
209
318
  #获取网站的频道表
210
319
  #img_path 图片存放路径
211
- def self.getchannels(img_dir_path)
320
+ def getchannels(img_dir_path)
212
321
  @channel = []
213
322
  @site=DEFAULT_SITE
214
323
  @proxyindex = 0
@@ -252,39 +361,76 @@ module GrabTvmao
252
361
  {"channel_info"=>channel_info,"channel_urls"=>channel_urls}
253
362
  end
254
363
 
255
- #使用代理获取url的html的doc值
256
- def self.get_doc_with_proxy(proxylist,url)
257
- unless @proxyindex
258
- @proxyindex = 0
259
- end
260
- @proxyindex=@proxyindex%proxylist.size
261
- if(proxylist[@proxyindex])
262
- proxy = proxylist[@proxyindex]
263
- else
264
- proxy = proxylist[@proxyindex+1]
265
- end
266
- begin
267
- doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}")) unless proxy.nil?||proxy.empty?
268
- doc = Nokogiri::HTML(open(url)) if proxy.nil?||proxy.empty?
269
- @no_firest = 0
270
- rescue => err
364
+
365
+ def err_doc_proxy(proxy,proxylist,url="",err="")
366
+ if proxy.empty?||proxy.nil?
367
+ proxylist.delete_at[@proxyindex]
368
+ end
369
+
271
370
 
272
371
  unless @no_firest
273
372
  @no_firest = 0
274
373
  end
275
374
 
276
375
  @no_firest += 1
277
- p "*************************Proxy:#{proxy}, url:#{url} Error:#{err.to_s}"
376
+ p "*************************Proxy:#{proxy}, url:#{url} Error:#{err}"
278
377
  #proxylist.delete(proxy) #删除出错的代理 但如果是此网页错误则会引起BUG待修复
279
- get_doc_with_proxy(proxylist,url) if @no_firest<4
280
- raise RuntimeError,"Error: #{err.to_s}" unless @no_firest<4
378
+ @proxyindex += 1
379
+ @proxyindex=@proxyindex%@size
380
+ doc=get_doc_with_proxy(proxylist,url) if @no_firest<4
381
+ unless @no_firest<4
382
+ @no_firest=0
383
+ raise RuntimeError,"Error: #{err}"
384
+ end
385
+ doc
386
+ end
387
+
388
+
389
+ #使用代理获取url的html的doc值
390
+ def get_doc_with_proxy(proxylist,url)
391
+ unless proxylist.nil?||proxylist.empty?
392
+ unless @proxyindex
393
+ @proxyindex = 0
394
+ end
395
+ @size = proxylist.size
396
+ @proxyindex=@proxyindex%proxylist.size
397
+ if(proxylist[@proxyindex])
398
+ proxy = proxylist[@proxyindex]
399
+ else
400
+ proxy = proxylist[@proxyindex+1]
401
+ end
402
+ begin
403
+ doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}").read) unless proxy.nil?||proxy.empty?
404
+ if doc.nil?
405
+ p "DOC is nil"
406
+ doc=err_doc_proxy(proxy,proxylist,url,"doc nil")
407
+ @no_firest=0
408
+ end
409
+ @no_firest = 0
410
+ rescue => err
411
+ p "IN Rescue"
412
+ doc=err_doc_proxy(proxy,proxylist,url,err.to_s)
413
+ @no_firest=0
414
+ p "Get DOC"
415
+ @proxyindex += 1
416
+ @proxyindex=@proxyindex%@size
417
+ return doc
418
+ end
419
+ @proxyindex += 1
420
+ @proxyindex=@proxyindex%@size
421
+ else
422
+ begin
423
+ doc = Nokogiri::HTML(open(url).read) if proxy.nil?||proxy.empty?
424
+ rescue => err
425
+ p "Error : Proxy:#{proxy}, url:#{url}"
426
+ raise RuntimeError,"Error: #{err.to_s} Method:get_doc_with_proxy"
427
+ end
428
+ end
429
+ doc
281
430
  end
282
- @proxyindex += 1
283
- doc
284
- end
285
431
 
286
432
  #获取某天的节目表
287
- def self.get_schedulelist_atday(channel,url,proxylist)
433
+ def get_schedulelist_atday(channel,url,proxylist)
288
434
  p "Grab: #{url}"
289
435
  doc = get_doc_with_proxy(proxylist,url)
290
436
  show_type = []
@@ -322,6 +468,7 @@ module GrabTvmao
322
468
  schedule = schedule.content.split(" ")[1]
323
469
  show_name = ""
324
470
  unless schedule_herf.nil?||schedule_herf.empty?
471
+ p "Show_infomation:#{schedule_herf} Time:#{time}"
325
472
  show_infomation=get_show_infomation(proxylist,schedule_herf)
326
473
  show_type=show_infomation["type"]
327
474
  show_name = show_infomation["name"]
@@ -337,7 +484,7 @@ module GrabTvmao
337
484
  #获取制定时间和长度url
338
485
  #start_time 为int型 开始时间和今天的差值 正数代表之后的第几天 负数代表之前的第几天
339
486
  #day_num 为int型 代表抓取的时间从开始时间计算的多少天
340
- def self.get_assign_date_url(url,start_time,day_num)
487
+ def get_assign_date_url(url,start_time,day_num)
341
488
  site="http://www.tvmao.com"
342
489
  if(@site)
343
490
  site=@site
@@ -373,7 +520,7 @@ module GrabTvmao
373
520
 
374
521
 
375
522
  #获取指定时间段的节目表
376
- def self.getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path)
523
+ def getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path)
377
524
  begin
378
525
  day_num = 1 if day_num<1
379
526
  rescue
@@ -394,7 +541,7 @@ module GrabTvmao
394
541
  channel_schedule = {}
395
542
  get_assign_date_url(herf,start_num,day_num).each do |url|
396
543
  @date = ""
397
- schedule_list = self.get_schedulelist_atday(channel,url,proxylist)
544
+ schedule_list = get_schedulelist_atday(channel,url,proxylist)
398
545
  channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
399
546
  end
400
547
  @img_down_file.close
@@ -407,7 +554,7 @@ module GrabTvmao
407
554
 
408
555
  #因原已调用所以保留
409
556
  #获取一周节目表
410
- def self.getschedule(channel,herf,proxylist,day_num=7,img_dir_down_path=@img_down_dir_path)
557
+ def getschedule(channel,herf,proxylist,day_num=7,img_dir_down_path=@img_down_dir_path)
411
558
  p "Day Num is #{day_num}"
412
559
  begin
413
560
  day_num = 1 if day_num<1
@@ -442,7 +589,7 @@ module GrabTvmao
442
589
  channel_schedule = {}
443
590
  get_week_url.call(herf,day_num).each do |url|
444
591
  @date = ""
445
- schedule_list = self.get_schedulelist_atday(channel,url,proxylist)
592
+ schedule_list = get_schedulelist_atday(channel,url,proxylist)
446
593
  channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
447
594
  end
448
595
  @img_down_file.close
@@ -451,16 +598,14 @@ module GrabTvmao
451
598
 
452
599
 
453
600
  #获取节目详细信息
454
- def self.get_show_infomation(proxy_list,schedule_herf)
601
+ def get_show_infomation(proxy_list,schedule_herf)
455
602
  begin
456
603
  @proxyindex = 0
457
604
  unless @site
458
605
  @site = "http://www.tvmao.com"
459
606
  end
460
607
  schedule_herf = @site + schedule_herf
461
- doc=get_doc_with_proxy(proxy_list,schedule_herf)
462
- #title = doc.css("a[herf='#{schedule_herf}+/detail']")[0]['title']
463
- # p "title: %s" % title
608
+ doc = get_doc_with_proxy(proxy_list,schedule_herf)
464
609
  type = []
465
610
  name = doc.css('span[itemprop="name"]')[0].content
466
611
 
@@ -479,39 +624,42 @@ module GrabTvmao
479
624
  end
480
625
  url = "#{schedule_herf}/detail"
481
626
  doc = get_doc_with_proxy(proxy_list,url)
482
- doc.css('span[itemprop="genre"]').each do |_type|
483
- type << _type.content
484
- end
485
- doc.css('a[itemprop="genre"]').each do |_type|
486
- type<<_type.content
627
+ if doc
628
+ doc.css('span[itemprop="genre"]').each do |_type|
629
+ type << _type.content
630
+ end
487
631
  end
488
632
  type.uniq!
489
- @img_down_file.puts("#{name}:#{schedule_img_down_path}")
633
+ unless @show_schedule
634
+ @show_schedule={}
635
+ end
490
636
  @show_schedule.merge!(name=>get_show_schedule(proxy_list,schedule_herf)) unless @show_schedule.has_key?(name)
491
637
  {"type"=>type,"name"=>name,"img"=>schedule_img_down_path}
492
- rescue => e
493
- p "Error In get_show_infomation msg : #{e.to_s}"
638
+ #rescue => e
639
+ # p "Error In get_show_infomation msg : #{e.to_s}"
494
640
  end
495
641
  end
496
642
 
497
643
  #获取节目的时间表
498
- def self.get_show_schedule(proxylist,herf)
644
+ def get_show_schedule(proxylist,herf)
499
645
  url = herf + "/playingtime"
500
646
  doc = get_doc_with_proxy(proxylist,url)
501
647
  i = 0
502
648
  schedule = []
503
- doc.css('div[id="epg"]')[0].css("div[class='c1 col']").each do |epg|
504
- unless(i==0)
505
- time = epg.css('div[class="f1 fld"]')[0].content
506
- channel_name = epg.css('div[class="f2 fld"]')[0].content
507
- show_name = epg.css('div[class="f3 fld"]')[0].content
508
- times = time.split(" ")
509
- week = times[0]
510
- date = times[1]
511
- _time = times[2]
512
- schedule << {"week"=>week,"date"=>date,"time"=>_time,"channel_name"=>channel_name,"show_name"=>show_name}
649
+ if doc.css('div[id="epg"]')[0]
650
+ doc.css('div[id="epg"]')[0].css("div[class='c1 col']").each do |epg|
651
+ unless(i==0)
652
+ time = epg.css('div[class="f1 fld"]')[0].content
653
+ channel_name = epg.css('div[class="f2 fld"]')[0].content
654
+ show_name = epg.css('div[class="f3 fld"]')[0].content
655
+ times = time.split(" ")
656
+ week = times[0]
657
+ date = times[1]
658
+ _time = times[2]
659
+ schedule << {"week"=>week,"date"=>date,"time"=>_time,"channel_name"=>channel_name,"show_name"=>show_name}
660
+ end
661
+ i += 1
513
662
  end
514
- i += 1
515
663
  end
516
664
  schedule
517
665
  end
@@ -521,7 +669,7 @@ module GrabTvmao
521
669
 
522
670
  #获取指定访问速度的代理服务器
523
671
  #time为最慢速度的时间 int型 代表秒
524
- def self.get_topfast_list(use_time)
672
+ def get_topfast_list(use_time)
525
673
  fast_list = []
526
674
  time_use = 0
527
675
  ips_ports = get_proxy_list()
@@ -555,7 +703,7 @@ module GrabTvmao
555
703
  end
556
704
 
557
705
  #获取代理列表
558
- def self.get_proxy_list()
706
+ def get_proxy_list()
559
707
  list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
560
708
  if list.count ==0
561
709
  list = gg('http://www.proxycn.cn/html_proxy/http-1.html')
@@ -575,7 +723,7 @@ module GrabTvmao
575
723
  ips_ports
576
724
  end
577
725
 
578
- def self.gg(url)
726
+ def gg(url)
579
727
  regex_list = /<TD class="list">.*<\/TD>/
580
728
  href =URI.parse(url)
581
729
  contxt = ""
@@ -588,5 +736,5 @@ module GrabTvmao
588
736
  def save_img
589
737
 
590
738
  end
591
-
739
+ end
592
740
  end
@@ -1,5 +1,6 @@
1
1
  #encoding:utf-8
2
2
  require 'nokogiri'
3
+ require 'iconv'
3
4
 
4
5
  module Grabepg
5
6
 
@@ -22,13 +23,48 @@ module Grabepg
22
23
  ret += "五"
23
24
  when 6
24
25
  ret += "六"
25
- when 7
26
- ret += ""
26
+ when 0
27
+ ret += ""
27
28
  end
28
29
  ret
29
30
  end
30
31
 
31
32
 
33
+ def self.proxy_list(path)
34
+ proxy_list = []
35
+ crt_date = DateTime.now.strftime('%F')
36
+ proxy_path = "%s/proxy/%s.txt" % [File.dirname(path),crt_date]
37
+ p "Proxy_Path: #{proxy_path}"
38
+ if File.exist?(proxy_path)
39
+ file_proxy = File.open(proxy_path,"r")
40
+ file_proxy.each_line {|line|
41
+ proxy_list << line.chomp.to_s
42
+ }
43
+ p "Get Proxy_list:#{proxy_list}"
44
+ file_proxy.flush
45
+ file_proxy.close
46
+ else
47
+ proxy_list=GetProxyList.get_list(ENV["proxy_limit"].to_i,ENV["proxy_page"].to_i)
48
+ dirpath = "#{File.dirname(path)}/proxy/"
49
+ Dir.open(dirpath) {|fna|
50
+ fna.each do |fn|
51
+ if(fn.to_s != ".." && fn.to_s != ".")
52
+ File.delete("#{dirpath + fn.to_s}")
53
+ end
54
+ end
55
+ }
56
+ file_proxy = File.new(proxy_path,"a")
57
+ proxy_list.each do |proxy|
58
+ p "Proxy:#{proxy}"
59
+ file_proxy.puts proxy
60
+ end
61
+ file_proxy.flush
62
+ file_proxy.close
63
+ end
64
+ return proxy_list
65
+ end
66
+
67
+
32
68
  #获取指定访问速度的代理服务器
33
69
  #time为最慢速度的时间 int型 代表秒
34
70
  def self.get_topfast_list(use_time)
@@ -157,8 +193,10 @@ module Grabepg
157
193
  proxy = proxylist[@proxyindex+1]
158
194
  end
159
195
  begin
160
- doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}")) unless proxy.nil?||proxy.empty?
196
+ ic = Iconv.new("UTF-8//IGNORE","GB2312")
197
+ doc = Nokogiri::HTML(ic.iconv(open(url,:proxy=>"#{proxy}").read)) unless proxy.nil?||proxy.empty?
161
198
  if doc.nil?
199
+ p "DOC is nil"
162
200
  doc=err_doc_proxy(proxy,proxylist,url,"doc nil")
163
201
  @no_firest=0
164
202
  end
@@ -176,7 +214,8 @@ module Grabepg
176
214
  @proxyindex=@proxyindex%@size
177
215
  else
178
216
  begin
179
- doc = Nokogiri::HTML(open(url)) if proxy.nil?||proxy.empty?
217
+ ic = Iconv.new("GB2312//IGNORE","GB2312")
218
+ doc = Nokogiri::HTML(ic.iconv(open(url).read)) if proxy.nil?||proxy.empty?
180
219
  rescue => err
181
220
  p "Error : Proxy:#{proxy}, url:#{url}"
182
221
  raise RuntimeError,"Error: #{err.to_s} Method:get_doc_with_proxy"
@@ -36,6 +36,10 @@ module Grabepg
36
36
  @site="http://m.tvsou.com"
37
37
  end
38
38
 
39
+ def get_proxy_list
40
+ @proxy_list
41
+ end
42
+
39
43
  #获取从tvsou的什么网站上获取
40
44
  #type: mobile,webpage
41
45
  def get_url(type)
@@ -44,11 +48,15 @@ module Grabepg
44
48
 
45
49
  def get_data_year_month_day(time)
46
50
 
47
- month = time.month.to_s
51
+ month=time.month.to_s
48
52
  if month.length<2
49
- month = "0"+month
53
+ month="0"+month
50
54
  end
51
- return {time:"#{time.year}-#{time.month}-#{time.day}",date:"#{@grabbase.conversion_what_day(time.wday)}(#{month}-#{time.day})"}
55
+ day = time.day.to_s
56
+ if day.length<2
57
+ day = "0"+day
58
+ end
59
+ return {time:"#{time.year}-#{time.month}-#{day}",date:"#{@grabbase.conversion_what_day(time.wday)}(#{month}-#{day})"}
52
60
  end
53
61
 
54
62
  #获取时间
@@ -111,6 +119,28 @@ module Grabepg
111
119
 
112
120
  end
113
121
 
122
+ #获取频道图标地址
123
+ # url 手机表的URL值
124
+ # channel_type 频道类型
125
+ # no_dis 直接使用URL 不处理
126
+ def get_channel_logo(_url,channel_type,no_dis=false)
127
+ if no_dis
128
+ url = _url
129
+ else
130
+ tvs = _url.split("TVid=")
131
+ tvid = tvs[1].split("&")[0]
132
+ channelids = _url.split("Channelid=")
133
+ channelid = channelids[1].split("&")[0]
134
+ if channel_type=="CCTV"
135
+ url = "http://epg.tvsou.com/programys/TV_#{tvid}/Channel_#{channelid}/W1.htm"
136
+ elsif channel_type=="WTV"
137
+ url = "http://epg.tvsou.com/programws/TV_#{tvid}/Channel_#{channelid}/W1.htm"
138
+ end
139
+ end
140
+ doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
141
+ logo_network_path=doc.css("div[id='epg_m1']").css("img")[0].get_attribute("src")
142
+ return logo_network_path
143
+ end
114
144
 
115
145
 
116
146
  #获取频道时间表URL
@@ -131,11 +161,12 @@ module Grabepg
131
161
 
132
162
  #根据URL解析时间表页面
133
163
  def dispose_schedule_page(url,start_time,use_time)
134
- url = @site +"/"+url
164
+ url = url
135
165
  urls = url.split("?")
136
166
  begin
137
167
  doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
138
168
  @error_num = 0
169
+ _url = doc.css("div[class='week']")[0].css('a')[0].get_attribute("href")
139
170
  rescue => err
140
171
  unless @error_num
141
172
  @error_num = 0
@@ -144,7 +175,6 @@ module Grabepg
144
175
  raise err.to_s if @error_num==5
145
176
  dispose_schedule_page(url,start_time,use_time)
146
177
  end
147
- _url = doc.css("div[class='week']")[0].css('a')[0].get_attribute("href")
148
178
  _url = urls[0]+_url
149
179
  urls = dispose_href_schedule_data(_url,start_time,use_time)
150
180
  ret = {}
@@ -162,7 +192,8 @@ module Grabepg
162
192
  _dispose = schedule.content
163
193
  _dispose_show =schedule.css("span")[0].text
164
194
  time = _dispose.gsub(_dispose_show,"")
165
- _url = @site+"/" + schedule.css('a')[0].get_attribute("href") if schedule.css('a')[0]
195
+ href =schedule.css('a')[schedule.css('a').count-1].get_attribute("href")
196
+ _url = @site+"/" + href if schedule.css('a')[0]
166
197
  schedules << {time:time,schedule_name:_dispose_show.delete(" 剧情"),url:_url}
167
198
  now = time.gsub(":","").to_i
168
199
  if((now-last_time)<5)
@@ -190,6 +221,14 @@ module Grabepg
190
221
  #解析节目详情页面
191
222
  def dispose_show_info(url)
192
223
  doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
224
+ if doc.nil?
225
+ unless @error_num
226
+ @error_num = 0
227
+ end
228
+ @error_num+=1
229
+ raise err.to_s if @error_num==5
230
+ dispose_show_info(url)
231
+ end
193
232
  begin
194
233
  show_name = doc.css('div[class="tv_info_top"]')[0].content
195
234
  _doc=doc.css("div[class='tv_info']")
@@ -210,4 +249,4 @@ module Grabepg
210
249
 
211
250
  end
212
251
 
213
- end
252
+ end
@@ -11,20 +11,19 @@ class TestGrabTvsou
11
11
  end
12
12
 
13
13
  def get_data(start_time,use_time)
14
- @grabtvsou.get_data(0,5)
14
+ @grabtvsou.get_data(start_time,use_time)
15
15
  end
16
16
 
17
17
  def dispose_href_schedule_data(href,start_time,use_time)
18
18
  @grabtvsou.dispose_href_schedule_data(href,start_time,use_time)
19
19
  end
20
20
 
21
- def dispose_schedule_page()
22
- href = "http://m.tvsou.com/epg.asp?TVid=1&Channelid=1&pro=ys"
21
+ def dispose_schedule_page(href="http://m.tvsou.com/epg.asp?TVid=1&Channelid=1&pro=ys")
23
22
  @grabtvsou.dispose_schedule_page(href,0,1)
24
23
  end
25
24
 
26
25
  def dispose_show_info
27
- hrefs = ["http://m.tvsou.com/jq3.asp?id=81300&tid=3","http://m.tvsou.com/intro.asp?id=145"]
26
+ hrefs = ["http://msou.com//jq3.asp?id=75928&tid=3","http://m.tvsou.com//jq3.asp?id=89450&tid=3"]
28
27
  ret = []
29
28
  hrefs.each do |href|
30
29
  ret<<@grabtvsou.dispose_show_info(href)
@@ -36,6 +35,11 @@ class TestGrabTvsou
36
35
  @grabtvsou.dispose_home_page
37
36
  end
38
37
 
38
+ def get_channel_logo(url="epg.asp?TVid=1&Channelid=1&pro=ys")
39
+ @grabtvsou.get_channel_logo(url)
40
+ end
41
+
42
+
39
43
  def self.start
40
44
  _grabtvsou = GrabTvsou.new("mobile",[])
41
45
  p channels = _grabtvsou.dispose_home_page
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: grab_epg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - hahazql
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-06-04 00:00:00.000000000 Z
11
+ date: 2013-06-07 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: ! '"用于抓取EPG信息"'
14
14
  email: