grab_epg 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MTI5MTMwMTY1NTRmZjk5NGIwZGM4MTk3NTljNGFjMzAwOWY4NTdiNA==
4
+ ZWMyZTQzOThkZWI3YTUxYjIwZGU5ZGRkZmQ2ZTQzNTBjYjAxYWE5ZA==
5
5
  data.tar.gz: !binary |-
6
- NzNjZTc3YzY1OWZkYTZjOGUzNTVjNzVmZjgzYjg3NjQyZWUzNGFmMg==
6
+ YjM0MDViYzQzZGQ3OWNiYjk3ZmVjMDA0Mzk2OTA0M2UzNjdlNTdkNg==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- ZGMwNThmMzZlY2FmZmI1ZmQzNjY5ODdkYTI4MTk5MWI2NWZiODBlZjQ1YzNk
10
- MzllOGE1YmNkZjRiNjc3MDlhY2FjZjMyNjFiNTcxYjFlZTlmYzgwNmVlMmQx
11
- NzUxODIwMGE1MjgxZWM0NWY3ZDlmNWE0YmYyN2U0NTY1YjU3NmQ=
9
+ YjE5ZTgxMzg5OTIwYTNjYTNlMDkzYWVlMWMwODczZDk3ZTZkNzIzYmQzZTI0
10
+ YWUwYTNhOWVjNjQyNTVhNjAwODYxY2RhOTc4YmM4YWI2ZmMzNzI3ZjRhMmZj
11
+ MTg2MjJmOWIzNmRhODU1N2MwNDAwZmJhYmQwZTk2ZDU3MTU1YzU=
12
12
  data.tar.gz: !binary |-
13
- MTA3MmE5MGZkMzU5YzliYjljNTQ1NzljNWViYTQ5YWU5ZmNlZGQ5OWJmZTUz
14
- YmZiN2QyNjUzOTk0OGQwMzM0NmZjOTEwOTI2MzJkZjAxMDg5YzdlNzUxNjM3
15
- Y2VkNmQzMGUyMjQ0Nzc5MTZkMGE5NjY4Y2IwZTY2ZGI5Y2MyOTA=
13
+ MGM5MmEzZWU0MjcwZTgyNzliOTRkYjhkOGQ2Njk4OTZlOTI1ZjFkMjI5MGJi
14
+ MzE4Mjg0MmJjNmE2ZmJkY2YwNTQzM2QzZDcwNDQxNzM4MWE4NTI4ZjU2ZmMz
15
+ YzMwZTcyMTA1NjJiMTlhY2NhZGE0MTU4NzJiMzBkZTVjNWVkZGM=
data/.grabepg.gemspec CHANGED
@@ -10,6 +10,6 @@ Gem::Specification.new do |gem|
10
10
  gem.files = `git ls-files`.split($\)
11
11
  gem.name = "grab_epg"
12
12
  gem.require_paths = ["lib"]
13
- gem.version = "0.2.3"
13
+ gem.version = "0.2.4"
14
14
  gem.homepage = "https://github.com/hahazql/grab_epg"
15
15
  end
data/lib/debug.rb CHANGED
@@ -3,10 +3,12 @@
3
3
  require 'nokogiri'
4
4
  require 'open-uri'
5
5
  require File.expand_path("../test/test_grab_tvsou.rb", __FILE__)
6
+ require File.expand_path("../test/test_grab_tvmao.rb", __FILE__)
6
7
  #require 'test/test_grab_tvsou'
7
8
  class Debug
8
9
  # To change this template use File | Settings | File Templates.
9
10
  #proxylist = ["219.234.82.84:24809", "219.234.82.84:17130", "219.234.82.84:23684", "219.234.82.84:18253", "219.234.82.84:33987", "219.234.82.84:17183", "219.234.82.84:13243", "219.234.82.84:16158", "219.234.82.84:14826", "219.234.82.84:8489", "219.234.82.84:22222", "219.234.82.84:6370", "219.234.82.84:7571", "219.234.82.84:33944", "219.234.82.84:9743", "219.234.82.84:8089", "219.234.82.84:20991", "219.234.82.84:34032", "219.234.82.84:9415", "219.234.82.84:26149", "219.234.82.84:11095", "219.234.82.84:21724", "219.234.82.84:9177", "219.234.82.84:34034", "219.234.82.84:17945", "219.234.82.85:32229", "219.234.82.85:28341", "219.234.82.85:36314", "219.234.82.85:30605", "219.234.82.85:23684", "219.234.82.85:34015", "219.234.82.85:33919", "219.234.82.85:30639", "219.234.82.85:33965", "219.234.82.85:37299", "219.234.82.85:20747", "219.234.82.86:6666", "219.234.82.86:34106", "219.234.82.86:25301", "219.234.82.86:32896", "219.234.82.86:23034", "219.234.82.86:22685", "219.234.82.86:13078", "219.234.82.86:38770", "219.234.82.86:28402", "219.234.82.86:18887", "219.234.82.86:6588", "219.234.82.86:7292", "219.234.82.86:24268", "219.234.82.86:16472", "219.234.82.86:32597", "219.234.82.86:31122", "219.234.82.88:8817", "219.234.82.88:8160", "219.234.82.88:9239", "219.234.82.88:6133", "114.141.162.53:8080", "123.125.116.243:17656", "123.125.116.241:29156", "123.125.116.243:6938", "219.234.82.88:29484", "219.234.82.88:8084", "219.234.82.88:32229", "219.234.82.88:22758", "219.234.82.88:5616", "124.225.52.14:8080", "219.234.82.88:30028", "219.234.82.88:23685", "219.234.82.88:29037", "219.234.82.88:8755"]
10
11
 
11
- p TestGrabTvsou.start
12
+ # p TestGrabTvsou.new.get_channel_logo
13
+ p TestGrabTvmao.new.test_get_show_type_by_batch
12
14
  end
data/lib/grab_tvmao.rb CHANGED
@@ -3,7 +3,11 @@
3
3
  require 'nokogiri'
4
4
  require 'open-uri'
5
5
 
6
- module GrabTvmao
6
+ require File.expand_path("../grabepg/grab_base.rb", __FILE__)
7
+ require File.expand_path("../grabepg/grab_tvsou.rb", __FILE__)
8
+
9
+ module Grabepg
10
+ class GrabTvmao
7
11
  # To change this template use File | Settings | File Templates.
8
12
 
9
13
 
@@ -22,10 +26,115 @@ module GrabTvmao
22
26
 
23
27
 
24
28
 
29
+ def initialize
30
+ @grabbase = GrabBase.new
31
+ end
32
+
33
+
34
+
35
+ #批量从tvmao获取节目类型
36
+ #channel 节目表属于的屏道
37
+ #url 节目表获取的网络地址
38
+ #date 日期
39
+ #schedule 需要批量修改的时间表
40
+ #proxylist 代理列表
41
+ def get_show_type_by_batch(channel,url,date,schedule,proxylist)
42
+ _schedule = {}
43
+ schedule.each do |s|
44
+ time = s["schedule_start"].gsub(":","").to_i
45
+ _schedule.merge!(time=>s)
46
+ end
47
+ url = get_show_type_url(url,date)
48
+ schedules = get_schedulelist_atday(channel,url,proxylist)
49
+ type = nil
50
+ schedules.each do |schedule|
51
+ schedule_time_num = schedule["schedule_start"].gsub(":","").to_i
52
+ if _schedule.has_key?(schedule_time_num)
53
+ _schedule[schedule_time_num]["type"]=_schedule[schedule_time_num]["type"]|schedule["type"]
54
+ p "*****************************************************************************************"
55
+ p "Schedule: #{_schedule[schedule_time_num]}"
56
+ p "schedule_logo_1: #{_schedule[schedule_time_num]["schedule_logo"]}"
57
+ p "schedule_logo_2: #{_schedule[schedule_time_num][:schedule_logo]}"
58
+ if _schedule[schedule_time_num]["schedule_logo"]==""
59
+ unless schedule["img"]==""
60
+ _schedule[schedule_time_num]["schedule_logo"]=schedule["img"]
61
+ end
62
+ end
63
+ end
64
+ end
65
+ ret = []
66
+ _schedule.each do |key,value|
67
+ ret << value
68
+ end
69
+
70
+ ret
71
+ end
72
+
73
+ #批量从tvmao获取节目类型
74
+ #channel 节目表属于的屏道
75
+ #url 节目表获取的网络地址
76
+ #date 日期
77
+ #time 节目开始时间
78
+ #proxylist 代理列表
79
+ def get_show_type(channel,url,date,time,proxylist)
80
+ url = get_show_type_url(url,date)
81
+ schedules = get_schedulelist_atday(channel,url,proxylist)
82
+ _time_num = time.gsub(":","").to_i
83
+ type = nil
84
+ schedules.each do |schedule|
85
+ schedule_time_num = schedule["schedule_start"].gsub(":","").to_i
86
+ if _time_num==schedule_time_num
87
+ type = schedule["type"]
88
+ end
89
+ end
90
+ if type
91
+ return type
92
+ else
93
+ return []
94
+ end
95
+ end
96
+
97
+ def get_show_type_url(url,date)
98
+ whatday = 0
99
+ _date = date.split("(")[0]
100
+ case _date
101
+ when "星期一"
102
+ whatday=1
103
+ when "星期二"
104
+ whatday=2
105
+ when "星期三"
106
+ whatday=3
107
+ when "星期四"
108
+ whatday=4
109
+ when "星期五"
110
+ whatday=5
111
+ when "星期六"
112
+ whatday=6
113
+ when "星期日"
114
+ whatday=7
115
+ end
116
+
117
+ get_week_url = lambda {|url,whatday|
118
+ _url = "http://www.tvmao.com"
119
+ urls = []
120
+ _urls = url.split("-")
121
+ 0.upto(1).each do |i|
122
+ _url = _url+"#{_urls[i]}"+"-"
123
+ end
124
+ url = _url+"w#{whatday}.html"
125
+ return url
126
+ }
127
+ return get_week_url.call(url,whatday)
128
+ end
129
+
130
+
131
+
132
+
133
+
25
134
 
26
135
  #将星期的wday获取值转化为中文名
27
136
  #conversion wady to chinese
28
- def self.conversion_what_day(whatday)
137
+ def conversion_what_day(whatday)
29
138
  ret = "星期"
30
139
  case whatday.to_i
31
140
  when 1
@@ -47,7 +156,7 @@ module GrabTvmao
47
156
  end
48
157
 
49
158
  #如果时间为1~9的一位则为其在数字前加0补齐二位
50
- def self.dispose_time(num)
159
+ def dispose_time(num)
51
160
  num = num.to_s
52
161
  if num.length < 2
53
162
  num = "0"+num
@@ -56,7 +165,7 @@ module GrabTvmao
56
165
  end
57
166
 
58
167
  #转化当前时间的格式
59
- def self.get_week_date_time(time)
168
+ def get_week_date_time(time)
60
169
  month = time.month
61
170
  day = time.day
62
171
  whatday = time.wday
@@ -65,26 +174,26 @@ module GrabTvmao
65
174
  end
66
175
 
67
176
  #前几天需要减去的num
68
- def self.del_day_num(day_num)
177
+ def del_day_num(day_num)
69
178
  ret = day_num*60*60*24
70
179
  ret
71
180
  end
72
181
 
73
182
  #获取距离当前多少天的之前的日期
74
- def self.get_time_day_prior(num)
183
+ def get_time_day_prior(num)
75
184
  time = Time.now - del_day_num(num)
76
185
  ret = get_week_date_time(time)
77
186
  ret
78
187
  end
79
188
 
80
189
  #前面一周要删除的日期的列表
81
- def self.del_time_list
190
+ def del_time_list
82
191
  ret = []
83
192
  time = Time.now
84
193
  wday = time.wday
85
194
  if(wday==1)
86
195
  for i in 0..7
87
- ret<<self.get_time_day_prior(i)
196
+ ret<<get_time_day_prior(i)
88
197
  end
89
198
  end
90
199
  ret
@@ -94,7 +203,7 @@ module GrabTvmao
94
203
 
95
204
 
96
205
  #调用此方法的例子
97
- def self.start
206
+ def start
98
207
  #作用是获取俩个字符串的相似度
99
208
  #get str1 and str2 similarity
100
209
  get_similarity_string = lambda { |str1,str2|
@@ -201,14 +310,14 @@ module GrabTvmao
201
310
  end
202
311
  end
203
312
 
204
- def self.img_down_path
313
+ def img_down_path
205
314
  @img_down_path
206
315
  end
207
316
 
208
317
 
209
318
  #获取网站的频道表
210
319
  #img_path 图片存放路径
211
- def self.getchannels(img_dir_path)
320
+ def getchannels(img_dir_path)
212
321
  @channel = []
213
322
  @site=DEFAULT_SITE
214
323
  @proxyindex = 0
@@ -252,39 +361,76 @@ module GrabTvmao
252
361
  {"channel_info"=>channel_info,"channel_urls"=>channel_urls}
253
362
  end
254
363
 
255
- #使用代理获取url的html的doc值
256
- def self.get_doc_with_proxy(proxylist,url)
257
- unless @proxyindex
258
- @proxyindex = 0
259
- end
260
- @proxyindex=@proxyindex%proxylist.size
261
- if(proxylist[@proxyindex])
262
- proxy = proxylist[@proxyindex]
263
- else
264
- proxy = proxylist[@proxyindex+1]
265
- end
266
- begin
267
- doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}")) unless proxy.nil?||proxy.empty?
268
- doc = Nokogiri::HTML(open(url)) if proxy.nil?||proxy.empty?
269
- @no_firest = 0
270
- rescue => err
364
+
365
+ def err_doc_proxy(proxy,proxylist,url="",err="")
366
+ if proxy.empty?||proxy.nil?
367
+ proxylist.delete_at[@proxyindex]
368
+ end
369
+
271
370
 
272
371
  unless @no_firest
273
372
  @no_firest = 0
274
373
  end
275
374
 
276
375
  @no_firest += 1
277
- p "*************************Proxy:#{proxy}, url:#{url} Error:#{err.to_s}"
376
+ p "*************************Proxy:#{proxy}, url:#{url} Error:#{err}"
278
377
  #proxylist.delete(proxy) #删除出错的代理 但如果是此网页错误则会引起BUG待修复
279
- get_doc_with_proxy(proxylist,url) if @no_firest<4
280
- raise RuntimeError,"Error: #{err.to_s}" unless @no_firest<4
378
+ @proxyindex += 1
379
+ @proxyindex=@proxyindex%@size
380
+ doc=get_doc_with_proxy(proxylist,url) if @no_firest<4
381
+ unless @no_firest<4
382
+ @no_firest=0
383
+ raise RuntimeError,"Error: #{err}"
384
+ end
385
+ doc
386
+ end
387
+
388
+
389
+ #使用代理获取url的html的doc值
390
+ def get_doc_with_proxy(proxylist,url)
391
+ unless proxylist.nil?||proxylist.empty?
392
+ unless @proxyindex
393
+ @proxyindex = 0
394
+ end
395
+ @size = proxylist.size
396
+ @proxyindex=@proxyindex%proxylist.size
397
+ if(proxylist[@proxyindex])
398
+ proxy = proxylist[@proxyindex]
399
+ else
400
+ proxy = proxylist[@proxyindex+1]
401
+ end
402
+ begin
403
+ doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}").read) unless proxy.nil?||proxy.empty?
404
+ if doc.nil?
405
+ p "DOC is nil"
406
+ doc=err_doc_proxy(proxy,proxylist,url,"doc nil")
407
+ @no_firest=0
408
+ end
409
+ @no_firest = 0
410
+ rescue => err
411
+ p "IN Rescue"
412
+ doc=err_doc_proxy(proxy,proxylist,url,err.to_s)
413
+ @no_firest=0
414
+ p "Get DOC"
415
+ @proxyindex += 1
416
+ @proxyindex=@proxyindex%@size
417
+ return doc
418
+ end
419
+ @proxyindex += 1
420
+ @proxyindex=@proxyindex%@size
421
+ else
422
+ begin
423
+ doc = Nokogiri::HTML(open(url).read) if proxy.nil?||proxy.empty?
424
+ rescue => err
425
+ p "Error : Proxy:#{proxy}, url:#{url}"
426
+ raise RuntimeError,"Error: #{err.to_s} Method:get_doc_with_proxy"
427
+ end
428
+ end
429
+ doc
281
430
  end
282
- @proxyindex += 1
283
- doc
284
- end
285
431
 
286
432
  #获取某天的节目表
287
- def self.get_schedulelist_atday(channel,url,proxylist)
433
+ def get_schedulelist_atday(channel,url,proxylist)
288
434
  p "Grab: #{url}"
289
435
  doc = get_doc_with_proxy(proxylist,url)
290
436
  show_type = []
@@ -322,6 +468,7 @@ module GrabTvmao
322
468
  schedule = schedule.content.split(" ")[1]
323
469
  show_name = ""
324
470
  unless schedule_herf.nil?||schedule_herf.empty?
471
+ p "Show_infomation:#{schedule_herf} Time:#{time}"
325
472
  show_infomation=get_show_infomation(proxylist,schedule_herf)
326
473
  show_type=show_infomation["type"]
327
474
  show_name = show_infomation["name"]
@@ -337,7 +484,7 @@ module GrabTvmao
337
484
  #获取制定时间和长度url
338
485
  #start_time 为int型 开始时间和今天的差值 正数代表之后的第几天 负数代表之前的第几天
339
486
  #day_num 为int型 代表抓取的时间从开始时间计算的多少天
340
- def self.get_assign_date_url(url,start_time,day_num)
487
+ def get_assign_date_url(url,start_time,day_num)
341
488
  site="http://www.tvmao.com"
342
489
  if(@site)
343
490
  site=@site
@@ -373,7 +520,7 @@ module GrabTvmao
373
520
 
374
521
 
375
522
  #获取指定时间段的节目表
376
- def self.getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path)
523
+ def getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path)
377
524
  begin
378
525
  day_num = 1 if day_num<1
379
526
  rescue
@@ -394,7 +541,7 @@ module GrabTvmao
394
541
  channel_schedule = {}
395
542
  get_assign_date_url(herf,start_num,day_num).each do |url|
396
543
  @date = ""
397
- schedule_list = self.get_schedulelist_atday(channel,url,proxylist)
544
+ schedule_list = get_schedulelist_atday(channel,url,proxylist)
398
545
  channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
399
546
  end
400
547
  @img_down_file.close
@@ -407,7 +554,7 @@ module GrabTvmao
407
554
 
408
555
  #因原已调用所以保留
409
556
  #获取一周节目表
410
- def self.getschedule(channel,herf,proxylist,day_num=7,img_dir_down_path=@img_down_dir_path)
557
+ def getschedule(channel,herf,proxylist,day_num=7,img_dir_down_path=@img_down_dir_path)
411
558
  p "Day Num is #{day_num}"
412
559
  begin
413
560
  day_num = 1 if day_num<1
@@ -442,7 +589,7 @@ module GrabTvmao
442
589
  channel_schedule = {}
443
590
  get_week_url.call(herf,day_num).each do |url|
444
591
  @date = ""
445
- schedule_list = self.get_schedulelist_atday(channel,url,proxylist)
592
+ schedule_list = get_schedulelist_atday(channel,url,proxylist)
446
593
  channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
447
594
  end
448
595
  @img_down_file.close
@@ -451,16 +598,14 @@ module GrabTvmao
451
598
 
452
599
 
453
600
  #获取节目详细信息
454
- def self.get_show_infomation(proxy_list,schedule_herf)
601
+ def get_show_infomation(proxy_list,schedule_herf)
455
602
  begin
456
603
  @proxyindex = 0
457
604
  unless @site
458
605
  @site = "http://www.tvmao.com"
459
606
  end
460
607
  schedule_herf = @site + schedule_herf
461
- doc=get_doc_with_proxy(proxy_list,schedule_herf)
462
- #title = doc.css("a[herf='#{schedule_herf}+/detail']")[0]['title']
463
- # p "title: %s" % title
608
+ doc = get_doc_with_proxy(proxy_list,schedule_herf)
464
609
  type = []
465
610
  name = doc.css('span[itemprop="name"]')[0].content
466
611
 
@@ -479,39 +624,42 @@ module GrabTvmao
479
624
  end
480
625
  url = "#{schedule_herf}/detail"
481
626
  doc = get_doc_with_proxy(proxy_list,url)
482
- doc.css('span[itemprop="genre"]').each do |_type|
483
- type << _type.content
484
- end
485
- doc.css('a[itemprop="genre"]').each do |_type|
486
- type<<_type.content
627
+ if doc
628
+ doc.css('span[itemprop="genre"]').each do |_type|
629
+ type << _type.content
630
+ end
487
631
  end
488
632
  type.uniq!
489
- @img_down_file.puts("#{name}:#{schedule_img_down_path}")
633
+ unless @show_schedule
634
+ @show_schedule={}
635
+ end
490
636
  @show_schedule.merge!(name=>get_show_schedule(proxy_list,schedule_herf)) unless @show_schedule.has_key?(name)
491
637
  {"type"=>type,"name"=>name,"img"=>schedule_img_down_path}
492
- rescue => e
493
- p "Error In get_show_infomation msg : #{e.to_s}"
638
+ #rescue => e
639
+ # p "Error In get_show_infomation msg : #{e.to_s}"
494
640
  end
495
641
  end
496
642
 
497
643
  #获取节目的时间表
498
- def self.get_show_schedule(proxylist,herf)
644
+ def get_show_schedule(proxylist,herf)
499
645
  url = herf + "/playingtime"
500
646
  doc = get_doc_with_proxy(proxylist,url)
501
647
  i = 0
502
648
  schedule = []
503
- doc.css('div[id="epg"]')[0].css("div[class='c1 col']").each do |epg|
504
- unless(i==0)
505
- time = epg.css('div[class="f1 fld"]')[0].content
506
- channel_name = epg.css('div[class="f2 fld"]')[0].content
507
- show_name = epg.css('div[class="f3 fld"]')[0].content
508
- times = time.split(" ")
509
- week = times[0]
510
- date = times[1]
511
- _time = times[2]
512
- schedule << {"week"=>week,"date"=>date,"time"=>_time,"channel_name"=>channel_name,"show_name"=>show_name}
649
+ if doc.css('div[id="epg"]')[0]
650
+ doc.css('div[id="epg"]')[0].css("div[class='c1 col']").each do |epg|
651
+ unless(i==0)
652
+ time = epg.css('div[class="f1 fld"]')[0].content
653
+ channel_name = epg.css('div[class="f2 fld"]')[0].content
654
+ show_name = epg.css('div[class="f3 fld"]')[0].content
655
+ times = time.split(" ")
656
+ week = times[0]
657
+ date = times[1]
658
+ _time = times[2]
659
+ schedule << {"week"=>week,"date"=>date,"time"=>_time,"channel_name"=>channel_name,"show_name"=>show_name}
660
+ end
661
+ i += 1
513
662
  end
514
- i += 1
515
663
  end
516
664
  schedule
517
665
  end
@@ -521,7 +669,7 @@ module GrabTvmao
521
669
 
522
670
  #获取指定访问速度的代理服务器
523
671
  #time为最慢速度的时间 int型 代表秒
524
- def self.get_topfast_list(use_time)
672
+ def get_topfast_list(use_time)
525
673
  fast_list = []
526
674
  time_use = 0
527
675
  ips_ports = get_proxy_list()
@@ -555,7 +703,7 @@ module GrabTvmao
555
703
  end
556
704
 
557
705
  #获取代理列表
558
- def self.get_proxy_list()
706
+ def get_proxy_list()
559
707
  list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
560
708
  if list.count ==0
561
709
  list = gg('http://www.proxycn.cn/html_proxy/http-1.html')
@@ -575,7 +723,7 @@ module GrabTvmao
575
723
  ips_ports
576
724
  end
577
725
 
578
- def self.gg(url)
726
+ def gg(url)
579
727
  regex_list = /<TD class="list">.*<\/TD>/
580
728
  href =URI.parse(url)
581
729
  contxt = ""
@@ -588,5 +736,5 @@ module GrabTvmao
588
736
  def save_img
589
737
 
590
738
  end
591
-
739
+ end
592
740
  end
@@ -1,5 +1,6 @@
1
1
  #encoding:utf-8
2
2
  require 'nokogiri'
3
+ require 'iconv'
3
4
 
4
5
  module Grabepg
5
6
 
@@ -22,13 +23,48 @@ module Grabepg
22
23
  ret += "五"
23
24
  when 6
24
25
  ret += "六"
25
- when 7
26
- ret += ""
26
+ when 0
27
+ ret += ""
27
28
  end
28
29
  ret
29
30
  end
30
31
 
31
32
 
33
+ def self.proxy_list(path)
34
+ proxy_list = []
35
+ crt_date = DateTime.now.strftime('%F')
36
+ proxy_path = "%s/proxy/%s.txt" % [File.dirname(path),crt_date]
37
+ p "Proxy_Path: #{proxy_path}"
38
+ if File.exist?(proxy_path)
39
+ file_proxy = File.open(proxy_path,"r")
40
+ file_proxy.each_line {|line|
41
+ proxy_list << line.chomp.to_s
42
+ }
43
+ p "Get Proxy_list:#{proxy_list}"
44
+ file_proxy.flush
45
+ file_proxy.close
46
+ else
47
+ proxy_list=GetProxyList.get_list(ENV["proxy_limit"].to_i,ENV["proxy_page"].to_i)
48
+ dirpath = "#{File.dirname(path)}/proxy/"
49
+ Dir.open(dirpath) {|fna|
50
+ fna.each do |fn|
51
+ if(fn.to_s != ".." && fn.to_s != ".")
52
+ File.delete("#{dirpath + fn.to_s}")
53
+ end
54
+ end
55
+ }
56
+ file_proxy = File.new(proxy_path,"a")
57
+ proxy_list.each do |proxy|
58
+ p "Proxy:#{proxy}"
59
+ file_proxy.puts proxy
60
+ end
61
+ file_proxy.flush
62
+ file_proxy.close
63
+ end
64
+ return proxy_list
65
+ end
66
+
67
+
32
68
  #获取指定访问速度的代理服务器
33
69
  #time为最慢速度的时间 int型 代表秒
34
70
  def self.get_topfast_list(use_time)
@@ -157,8 +193,10 @@ module Grabepg
157
193
  proxy = proxylist[@proxyindex+1]
158
194
  end
159
195
  begin
160
- doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}")) unless proxy.nil?||proxy.empty?
196
+ ic = Iconv.new("UTF-8//IGNORE","GB2312")
197
+ doc = Nokogiri::HTML(ic.iconv(open(url,:proxy=>"#{proxy}").read)) unless proxy.nil?||proxy.empty?
161
198
  if doc.nil?
199
+ p "DOC is nil"
162
200
  doc=err_doc_proxy(proxy,proxylist,url,"doc nil")
163
201
  @no_firest=0
164
202
  end
@@ -176,7 +214,8 @@ module Grabepg
176
214
  @proxyindex=@proxyindex%@size
177
215
  else
178
216
  begin
179
- doc = Nokogiri::HTML(open(url)) if proxy.nil?||proxy.empty?
217
+ ic = Iconv.new("GB2312//IGNORE","GB2312")
218
+ doc = Nokogiri::HTML(ic.iconv(open(url).read)) if proxy.nil?||proxy.empty?
180
219
  rescue => err
181
220
  p "Error : Proxy:#{proxy}, url:#{url}"
182
221
  raise RuntimeError,"Error: #{err.to_s} Method:get_doc_with_proxy"
@@ -36,6 +36,10 @@ module Grabepg
36
36
  @site="http://m.tvsou.com"
37
37
  end
38
38
 
39
+ def get_proxy_list
40
+ @proxy_list
41
+ end
42
+
39
43
  #获取从tvsou的什么网站上获取
40
44
  #type: mobile,webpage
41
45
  def get_url(type)
@@ -44,11 +48,15 @@ module Grabepg
44
48
 
45
49
  def get_data_year_month_day(time)
46
50
 
47
- month = time.month.to_s
51
+ month=time.month.to_s
48
52
  if month.length<2
49
- month = "0"+month
53
+ month="0"+month
50
54
  end
51
- return {time:"#{time.year}-#{time.month}-#{time.day}",date:"#{@grabbase.conversion_what_day(time.wday)}(#{month}-#{time.day})"}
55
+ day = time.day.to_s
56
+ if day.length<2
57
+ day = "0"+day
58
+ end
59
+ return {time:"#{time.year}-#{time.month}-#{day}",date:"#{@grabbase.conversion_what_day(time.wday)}(#{month}-#{day})"}
52
60
  end
53
61
 
54
62
  #获取时间
@@ -111,6 +119,28 @@ module Grabepg
111
119
 
112
120
  end
113
121
 
122
+ #获取频道图标地址
123
+ # url 手机表的URL值
124
+ # channel_type 频道类型
125
+ # no_dis 直接使用URL 不处理
126
+ def get_channel_logo(_url,channel_type,no_dis=false)
127
+ if no_dis
128
+ url = _url
129
+ else
130
+ tvs = _url.split("TVid=")
131
+ tvid = tvs[1].split("&")[0]
132
+ channelids = _url.split("Channelid=")
133
+ channelid = channelids[1].split("&")[0]
134
+ if channel_type=="CCTV"
135
+ url = "http://epg.tvsou.com/programys/TV_#{tvid}/Channel_#{channelid}/W1.htm"
136
+ elsif channel_type=="WTV"
137
+ url = "http://epg.tvsou.com/programws/TV_#{tvid}/Channel_#{channelid}/W1.htm"
138
+ end
139
+ end
140
+ doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
141
+ logo_network_path=doc.css("div[id='epg_m1']").css("img")[0].get_attribute("src")
142
+ return logo_network_path
143
+ end
114
144
 
115
145
 
116
146
  #获取频道时间表URL
@@ -131,11 +161,12 @@ module Grabepg
131
161
 
132
162
  #根据URL解析时间表页面
133
163
  def dispose_schedule_page(url,start_time,use_time)
134
- url = @site +"/"+url
164
+ url = url
135
165
  urls = url.split("?")
136
166
  begin
137
167
  doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
138
168
  @error_num = 0
169
+ _url = doc.css("div[class='week']")[0].css('a')[0].get_attribute("href")
139
170
  rescue => err
140
171
  unless @error_num
141
172
  @error_num = 0
@@ -144,7 +175,6 @@ module Grabepg
144
175
  raise err.to_s if @error_num==5
145
176
  dispose_schedule_page(url,start_time,use_time)
146
177
  end
147
- _url = doc.css("div[class='week']")[0].css('a')[0].get_attribute("href")
148
178
  _url = urls[0]+_url
149
179
  urls = dispose_href_schedule_data(_url,start_time,use_time)
150
180
  ret = {}
@@ -162,7 +192,8 @@ module Grabepg
162
192
  _dispose = schedule.content
163
193
  _dispose_show =schedule.css("span")[0].text
164
194
  time = _dispose.gsub(_dispose_show,"")
165
- _url = @site+"/" + schedule.css('a')[0].get_attribute("href") if schedule.css('a')[0]
195
+ href =schedule.css('a')[schedule.css('a').count-1].get_attribute("href")
196
+ _url = @site+"/" + href if schedule.css('a')[0]
166
197
  schedules << {time:time,schedule_name:_dispose_show.delete(" 剧情"),url:_url}
167
198
  now = time.gsub(":","").to_i
168
199
  if((now-last_time)<5)
@@ -190,6 +221,14 @@ module Grabepg
190
221
  #解析节目详情页面
191
222
  def dispose_show_info(url)
192
223
  doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
224
+ if doc.nil?
225
+ unless @error_num
226
+ @error_num = 0
227
+ end
228
+ @error_num+=1
229
+ raise err.to_s if @error_num==5
230
+ dispose_show_info(url)
231
+ end
193
232
  begin
194
233
  show_name = doc.css('div[class="tv_info_top"]')[0].content
195
234
  _doc=doc.css("div[class='tv_info']")
@@ -210,4 +249,4 @@ module Grabepg
210
249
 
211
250
  end
212
251
 
213
- end
252
+ end
@@ -11,20 +11,19 @@ class TestGrabTvsou
11
11
  end
12
12
 
13
13
  def get_data(start_time,use_time)
14
- @grabtvsou.get_data(0,5)
14
+ @grabtvsou.get_data(start_time,use_time)
15
15
  end
16
16
 
17
17
  def dispose_href_schedule_data(href,start_time,use_time)
18
18
  @grabtvsou.dispose_href_schedule_data(href,start_time,use_time)
19
19
  end
20
20
 
21
- def dispose_schedule_page()
22
- href = "http://m.tvsou.com/epg.asp?TVid=1&Channelid=1&pro=ys"
21
+ def dispose_schedule_page(href="http://m.tvsou.com/epg.asp?TVid=1&Channelid=1&pro=ys")
23
22
  @grabtvsou.dispose_schedule_page(href,0,1)
24
23
  end
25
24
 
26
25
  def dispose_show_info
27
- hrefs = ["http://m.tvsou.com/jq3.asp?id=81300&tid=3","http://m.tvsou.com/intro.asp?id=145"]
26
+ hrefs = ["http://msou.com//jq3.asp?id=75928&tid=3","http://m.tvsou.com//jq3.asp?id=89450&tid=3"]
28
27
  ret = []
29
28
  hrefs.each do |href|
30
29
  ret<<@grabtvsou.dispose_show_info(href)
@@ -36,6 +35,11 @@ class TestGrabTvsou
36
35
  @grabtvsou.dispose_home_page
37
36
  end
38
37
 
38
+ def get_channel_logo(url="epg.asp?TVid=1&Channelid=1&pro=ys")
39
+ @grabtvsou.get_channel_logo(url)
40
+ end
41
+
42
+
39
43
  def self.start
40
44
  _grabtvsou = GrabTvsou.new("mobile",[])
41
45
  p channels = _grabtvsou.dispose_home_page
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: grab_epg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - hahazql
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-06-04 00:00:00.000000000 Z
11
+ date: 2013-06-07 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: ! '"用于抓取EPG信息"'
14
14
  email: