grab_epg 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/grabepg.rb CHANGED
@@ -1,595 +1,4 @@
1
- #encoding:utf-8
2
-
3
- require 'nokogiri'
4
- require 'open-uri'
5
-
6
- module Grabepg
7
- # Scrapes TV channel lists and programme schedules (EPG data) from tvmao.com.
8
-
9
-
10
- #图片的获取: Net::HTTP.get(url)
11
- #图片的文件类型获取:
12
-
13
- attr_reader :channel #频道列表
14
- attr_reader :site #网站地址
15
- attr_reader :proxyindex #代理的索引
16
- attr_reader :show_schedule #根据节目的时间表
17
- attr_reader :img_down_path #图片下载路径存放
18
-
19
- DEFAULT_GrabtvType=["cctv","satellite","digital",]
20
- DEFAULT_SITE = "http://www.tvmao.com"
21
-
22
-
23
-
24
-
25
-
26
- #将星期的wday获取值转化为中文名
27
- #convert a wday value to its Chinese weekday name
28
- def self.conversion_what_day(whatday)
29
- ret = "星期"
30
- case whatday.to_i
31
- when 1
32
- ret += "一"
33
- when 2
34
- ret += "二"
35
- when 3
36
- ret += "三"
37
- when 4
38
- ret += "四"
39
- when 5
40
- ret += "五"
41
- when 6
42
- ret += "六"
43
- when 7
44
- ret += "七"
45
- end
46
- ret
47
- end
48
-
49
- #如果时间为1~9的一位则为其在数字前加0补齐二位
50
- def self.dispose_time(num)
51
- num = num.to_s
52
- if num.length < 2
53
- num = "0"+num
54
- end
55
- num
56
- end
57
-
58
- #转化当前时间的格式
59
- def self.get_week_date_time(time)
60
- month = time.month
61
- day = time.day
62
- whatday = time.wday
63
- ret = conversion_what_day(whatday) + "(" + dispose_time(month) + "-"+dispose_time(day)+")"
64
- ret
65
- end
66
-
67
- #前几天需要减去的num
68
- def self.del_day_num(day_num)
69
- ret = day_num*60*60*24
70
- ret
71
- end
72
-
73
- #获取距离当前多少天的之前的日期
74
- def self.get_time_day_prior(num)
75
- time = Time.now - del_day_num(num)
76
- ret = get_week_date_time(time)
77
- ret
78
- end
79
-
80
- #前面一周要删除的日期的列表
81
- def self.del_time_list
82
- ret = []
83
- time = Time.now
84
- wday = time.wday
85
- if(wday==1)
86
- for i in 0..7
87
- ret<<self.get_time_day_prior(i)
88
- end
89
- end
90
- ret
91
- end
92
-
93
-
94
-
95
-
96
- #调用此方法的例子
97
- def self.start
98
- #作用是获取俩个字符串的相似度
99
- #get the similarity between str1 and str2
100
- get_similarity_string = lambda { |str1,str2|
101
- _length = 0
102
- type = 0
103
- if str1.length>str2.length
104
- _length=str2.length
105
- type = 2
106
- else
107
- _length=str1.length
108
- type =1
109
- end
110
- _str_list = []
111
- _str = ""
112
- for i in 0.._length
113
- case type
114
- when 2
115
- n=i
116
- 0.upto(str1.length-1).each do |j|
117
- p "N: #{n}"
118
- if(str2[n]==str1[j])
119
- _str =_str+str2[n]
120
- n = n+1
121
- p "Str = #{_str}"
122
- else
123
- _str_list << _str
124
- _str = ""
125
- end
126
- end
127
- when 1
128
- n=i
129
- 0.upto(str2.length-1).each do |j|
130
- p "N: #{n}"
131
- if(str1[n]==str2[j])
132
- _str =_str+str1[n]
133
- n=n+1
134
- p "Str = #{_str}"
135
- else
136
- _str_list << _str
137
- _str = ""
138
- end
139
- end
140
- end
141
- end
142
- p _str_list
143
- _str = ""
144
- _str_list.each do |str|
145
- if _str.length<str.length
146
- _str=str
147
- end
148
- end
149
- _str
150
- }
151
-
152
-
153
- path = "/home/zql/workspace/New/smart_remote/img_path"
154
- channel_list = Grabepg.getchannels(path)
155
- channel_urls = channel_list['channel_urls']
156
- channel_infos = channel_list['channel_info']
157
- p "Channel img save file,path='#{Grabepg.img_down_path}'"
158
- proxy_list=Grabepg.get_topfast_list(5) #get_topfast_list 参数是代表最慢用时 单位秒
159
-
160
-
161
- #Debug output used for testing
162
-
163
- p "************************************"
164
- p "proxy_list:#{proxy_list}"
165
- p "************************************"
166
-
167
- bool_start = false
168
-
169
-
170
- channel_urls.each do |channel,url|
171
-
172
- if(channel=="CCTV16")
173
- bool_start = true
174
- end
175
-
176
- if bool_start
177
- previous_show_name = ""
178
- channel_info = channel_infos[channel]
179
- channel_name = channel_info["channel_name"]
180
- channel_type = channel_info["channel_type"]
181
- channel_id = channel_info["channel_id"]
182
- channel_img_path = channel_info["img_path"]
183
-
184
- #channel,herf,proxylist,day_num=7
185
-
186
-
187
- start_time=0
188
- use_num =1
189
-
190
- #getScheduleAssignDate参数:
191
- # channel 频道
192
- # herf 频道地址
193
- # proxylist 代理列表
194
- # start_num 开始时间 int 为开始时间与今天的差值 正数代表今天之后的第几天 负数代表今天之前的第几天
195
- # day_num 抓取的时间段天数
196
- # img_dir_down_path 图片网络地址保存路径 有默认值 可不设置
197
- schedule_list=Grabepg.getScheduleAssignDate(channel,url,proxy_list,start_time,use_num) #抓取的七天后的1天的数据
198
-
199
-
200
- end
201
- end
202
- end
203
-
204
- def self.img_down_path
205
- @img_down_path
206
- end
207
-
208
-
209
- #获取网站的频道表
210
- #img_dir_path: directory in which the channel image URL list file is created
211
- def self.getchannels(img_dir_path)
212
- @channel = []
213
- @site=DEFAULT_SITE
214
- @proxyindex = 0
215
- @img_down_dir_path = img_dir_path
216
- @img_down_file = File.new(File.join(img_dir_path,"channel_img_down_path"),'w+')
217
-
218
- channel_urls = {}
219
- channel_info = {}
220
- get_url =lambda { |type|
221
- @site + "/program/duration/#{type}/w1.html" unless (type.nil?||type.empty?)
222
- }
223
-
224
- get_channel_id = lambda {|url|
225
- channel_id = url.split("/")[2].split("-")[1] unless (url.nil?||url.empty?)
226
- }
227
-
228
- DEFAULT_GrabtvType.each do |type|
229
- url = get_url.call(type)
230
- p url
231
- doc = Nokogiri::HTML(open(url))
232
- p doc.content
233
- p "*************************************************************"
234
- doc.css('td[class="tdchn"]').each do |td|
235
- channel_name=td.content
236
- herf = ""
237
- td.css('a').each do |a|
238
- herf=a['href']
239
- end
240
- channel_id = get_channel_id.call(herf)
241
-
242
- #获取频道图片的地址
243
- img_path = "http://static.haotv.me/channel/logo/#{channel_id}.jpg"
244
- @img_down_file.puts("#{channel_id}:#{img_path}")
245
- @channel<<({channel_id=>{name:channel_name,herf:herf,type:type}})
246
- channel_info.merge!({channel_id=>{"channel_name"=>channel_name,"channel_type"=>type,"channel_id"=>channel_id,"img_path"=>img_path}})
247
- channel_urls.merge!({channel_id=>herf})
248
- end
249
- end
250
- @img_down_file.close
251
- p "Channel: #{@channel}"
252
- {"channel_info"=>channel_info,"channel_urls"=>channel_urls}
253
- end
254
-
255
- #使用代理获取url的html的doc值
256
- def self.get_doc_with_proxy(proxylist,url)
257
- unless @proxyindex
258
- @proxyindex = 0
259
- end
260
- @proxyindex=@proxyindex%proxylist.size
261
- if(proxylist[@proxyindex])
262
- proxy = proxylist[@proxyindex]
263
- else
264
- proxy = proxylist[@proxyindex+1]
265
- end
266
- begin
267
- doc = Nokogiri::HTML(open(url,:proxy=>"http://#{proxy}")) unless proxy.nil?||proxy.empty?
268
- doc = Nokogiri::HTML(open(url)) if proxy.nil?||proxy.empty?
269
- @no_firest = 0
270
- rescue => err
271
-
272
- unless @no_firest
273
- @no_firest = 0
274
- end
275
-
276
- @no_firest += 1
277
- p "*************************Proxy:#{proxy}, url:#{url}"
278
- proxylist.delete(proxy)
279
- get_doc_with_proxy(proxylist,url) if @no_firest<4
280
- raise RuntimeError,"Error: #{err.to_s}" unless @no_firest<4
281
- end
282
- @proxyindex += 1
283
- unless doc
284
- p "*************************Proxy:#{proxy}, url:#{url}"
285
- end
286
- doc
287
- end
288
-
289
- #获取某天的节目表
290
- def self.get_schedulelist_atday(channel,url,proxylist)
291
- p "Grab: #{url}"
292
- doc = get_doc_with_proxy(proxylist,url)
293
- show_type = []
294
-
295
-
296
- _img_url = "http://static.haotv.me/channel/logo/"
297
- img_url = _img_url + channel+".jpg"
298
-
299
-
300
- data=doc.css('div[class="mt10 clear"]')[0].content.split(" ")
301
- date = data[0]
302
- week = data[1]
303
- p "Channel: #{channel} Date: #{date} Week: #{week}"
304
- @date = "#{week}(#{date})"
305
- schedule_list = []
306
-
307
- _herf = doc.css("h1[style='float:left']").xpath('img[@src]')[0]
308
- img_url = _herf.get_attribute("src") if _herf
309
-
310
- p "**************IMG: #{img_url}"
311
-
312
-
313
- doc.css('ul[id="pgrow"]')[0].css("li").each do |schedule|
314
- _herf= schedule.xpath('a[@href]')[0]
315
- schedule_herf=_herf.get_attribute("href") if _herf
316
- unless _herf
317
- drama =schedule.css('a[class="drama"]')[0]
318
- if drama
319
- _herfs=drama.get_attribute("href").gsub("/episode/section","#%#")
320
- schedule_herf = _herfs.split("#%#")[0]
321
- end
322
- end
323
- if schedule.content.split(" ").size>1
324
- time = schedule.content.split(" ")[0]
325
- schedule = schedule.content.split(" ")[1]
326
- show_name = ""
327
- unless schedule_herf.nil?||schedule_herf.empty?
328
- show_infomation=get_show_infomation(proxylist,schedule_herf)
329
- show_type=show_infomation["type"]
330
- show_name = show_infomation["name"]
331
- show_img = show_infomation["img"]
332
- end
333
- p "Time: #{time} schedule: #{schedule} show_infomation_herf: #{schedule_herf} type: #{show_type} name: #{show_name} img:#{show_img}"
334
- schedule_list << {"schedule_name"=>schedule,"schedule_logo"=>show_img,"schedule_start"=>time,"show_infomation_herf"=>schedule_herf,"type"=>show_type,"name"=>show_name}
335
- end
336
- end
337
- schedule_list
338
- end
339
-
340
- #Build the schedule page URLs for a given start day and number of days
341
- #start_time 为int型 开始时间和今天的差值 正数代表之后的第几天 负数代表之前的第几天
342
- #day_num 为int型 代表抓取的时间从开始时间计算的多少天
343
- def self.get_assign_date_url(url,start_time,day_num)
344
- site="http://www.tvmao.com"
345
- if(@site)
346
- site=@site
347
- end
348
-
349
- _url = site
350
- urls = []
351
- _urls = url.split("-")
352
-
353
- time = Time.now
354
- _wday = time.wday
355
- wday = _wday + start_time
356
- if wday<0
357
- wday = 1
358
- end
359
-
360
- end_day = wday + day_num - 1
361
-
362
- if end_day>(_wday+7)
363
- end_day = _wday + 7
364
- end
365
-
366
- 0.upto(1).each do |i|
367
- _url = _url+"#{_urls[i]}"+"-"
368
- end
369
-
370
- wday.upto(end_day).each do |i|
371
- urls << _url+"w#{i}.html"
372
- end
373
- urls
374
- end
375
-
376
-
377
-
378
- #获取指定时间段的节目表
379
- def self.getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path)
380
- begin
381
- day_num = 1 if day_num<1
382
- rescue
383
- day_num = 1
384
- end
385
- site="http://www.tvmao.com"
386
- unless img_dir_down_path
387
- img_dir_down_path = __FILE__
388
- end
389
- @img_down_file = File.new(File.join(img_dir_down_path,"schedule_img_down_path"),"w+")
390
-
391
- if(@site)
392
- site=@site
393
- end
394
- _img_url = "http://static.haotv.me/channel/logo/"
395
- @show_schedule = {}
396
-
397
- channel_schedule = {}
398
- get_assign_date_url(herf,start_num,day_num).each do |url|
399
- @date = ""
400
- schedule_list = self.get_schedulelist_atday(channel,url,proxylist)
401
- channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
402
- end
403
- @img_down_file.close
404
- {"channel_schedule"=>channel_schedule,"show_schedule"=>@show_schedule}
405
- end
406
-
407
-
408
-
409
-
410
-
411
- #因原已调用所以保留
412
- #获取一周节目表
413
- def self.getschedule(channel,herf,proxylist,day_num=7,img_dir_down_path=@img_down_dir_path)
414
- p "Day Num is #{day_num}"
415
- begin
416
- day_num = 1 if day_num<1
417
- rescue
418
- day_num = 1
419
- end
420
- site="http://www.tvmao.com"
421
- unless img_dir_down_path
422
- img_dir_down_path = __FILE__
423
- end
424
- @img_down_file = File.new(File.join(img_dir_down_path,"schedule_img_down_path"),"w+")
425
-
426
- if(@site)
427
- site=@site
428
- end
429
- _img_url = "http://static.haotv.me/channel/logo/"
430
- @show_schedule = {}
431
-
432
- get_week_url = lambda {|url,day_num|
433
- _url = site
434
- urls = []
435
- _urls = url.split("-")
436
- 0.upto(1).each do |i|
437
- _url = _url+"#{_urls[i]}"+"-"
438
- end
439
- 1.upto(day_num).each do |i|
440
- urls << _url+"w#{i}.html"
441
- end
442
- urls
443
- }
444
-
445
- channel_schedule = {}
446
- get_week_url.call(herf,day_num).each do |url|
447
- @date = ""
448
- schedule_list = self.get_schedulelist_atday(channel,url,proxylist)
449
- channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
450
- end
451
- @img_down_file.close
452
- {"channel_schedule"=>channel_schedule,"show_schedule"=>@show_schedule}
453
- end
454
-
455
-
456
- #获取节目详细信息
457
- def self.get_show_infomation(proxy_list,schedule_herf)
458
- begin
459
- @proxyindex = 0
460
- unless @site
461
- @site = "http://www.tvmao.com"
462
- end
463
- schedule_herf = @site + schedule_herf
464
- doc=get_doc_with_proxy(proxy_list,schedule_herf)
465
- #title = doc.css("a[herf='#{schedule_herf}+/detail']")[0]['title']
466
- # p "title: %s" % title
467
- type = []
468
- name = doc.css('span[itemprop="name"]')[0].content
469
-
470
- #获取节目的图片
471
- if doc.css('img[class="tvc"]')
472
- schedule_img_down_path = doc.css('img[class="tvc"]')[0].get_attribute('src') if doc.css('img[class="tvc"]')[0]
473
- end
474
-
475
-
476
-
477
- doc.css('span[itemprop="genre"]').each do |_type|
478
- type << _type.content
479
- end
480
- doc.css('a[itemprop="genre"]').each do |_type|
481
- type<<_type.content
482
- end
483
- url = "#{schedule_herf}/detail"
484
- doc = get_doc_with_proxy(proxy_list,url)
485
- doc.css('span[itemprop="genre"]').each do |_type|
486
- type << _type.content
487
- end
488
- doc.css('a[itemprop="genre"]').each do |_type|
489
- type<<_type.content
490
- end
491
- type.uniq!
492
- @img_down_file.puts("#{name}:#{schedule_img_down_path}")
493
- @show_schedule.merge!(name=>get_show_schedule(proxy_list,schedule_herf)) unless @show_schedule.has_key?(name)
494
- {"type"=>type,"name"=>name,"img"=>schedule_img_down_path}
495
- rescue => e
496
- p "Error In get_show_infomation msg : #{e.to_s}"
497
- end
498
- end
499
-
500
- #获取节目的时间表
501
- def self.get_show_schedule(proxylist,herf)
502
- url = herf + "/playingtime"
503
- doc = get_doc_with_proxy(proxylist,url)
504
- i = 0
505
- schedule = []
506
- doc.css('div[id="epg"]')[0].css("div[class='c1 col']").each do |epg|
507
- unless(i==0)
508
- time = epg.css('div[class="f1 fld"]')[0].content
509
- channel_name = epg.css('div[class="f2 fld"]')[0].content
510
- show_name = epg.css('div[class="f3 fld"]')[0].content
511
- times = time.split(" ")
512
- week = times[0]
513
- date = times[1]
514
- _time = times[2]
515
- schedule << {"week"=>week,"date"=>date,"time"=>_time,"channel_name"=>channel_name,"show_name"=>show_name}
516
- end
517
- i += 1
518
- end
519
- schedule
520
- end
521
-
522
-
523
-
524
-
525
- #获取指定访问速度的代理服务器
526
- #use_time: timeout applied to each proxy probe, an int in seconds
527
- def self.get_topfast_list(use_time)
528
- fast_list = []
529
- time_use = 0
530
- ips_ports = get_proxy_list()
531
- ips_ports.each do |ip_port|
532
- time_start = Time.now.to_i
533
- begin
534
- timeout(use_time) do
535
- doc = Nokogiri::HTML(open("http://www.tvmao.com/program",:proxy=> "http://#{ip_port}"))
536
- end
537
- time_end = Time.now.to_i
538
- time_use = time_end - time_start
539
- p "http://#{ip_port} use_time:#{time_use}"
540
- rescue Exception =>e
541
- case e
542
- when Errno::ETIMEDOUT
543
- p "Use http://#{ip_port} timeout"
544
- when Timeout::Error
545
- p "Use http://#{ip_port} timeout"
546
- when Errno::ECONNREFUSED
547
- p "Use http://#{ip_port} Error connection"
548
- else
549
- p "Use http://#{ip_port} Error:#{e.to_s}"
550
- end
551
- time_use = -1
552
- end
553
- if(time_use > 0 &&time_use < 8)
554
- fast_list << ip_port
555
- end
556
- end
557
- fast_list
558
- end
559
-
560
- #获取代理列表
561
- def self.get_proxy_list()
562
- list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
563
- if list.count ==0
564
- list = gg('http://www.proxycn.cn/html_proxy/http-1.html')
565
- end
566
- ips_ports = []
567
- regex_port = /(?<=<TD class="list">)[0-9]*?(?=<\/TD>)/
568
- regex_ip = /(?<=a href\=whois.php\?whois\=)[0-9,.]*/
569
- list.each do |proxy_txt|
570
- port = proxy_txt[regex_port]
571
- ip = proxy_txt[regex_ip]
572
- if(ip != ""&& !port.to_s.eql?('3128'))
573
- port_ip = ip.to_s + ":" + port.to_s
574
- ips_ports << port_ip
575
- end
576
- end
577
- p "Count: #{ips_ports.count}"
578
- ips_ports
579
- end
580
-
581
- def self.gg(url)
582
- regex_list = /<TD class="list">.*<\/TD>/
583
- href =URI.parse(url)
584
- contxt = ""
585
- href.open{ |f|
586
- f.each_line {|line| contxt =contxt + line + "\n"}
587
- }
588
- list = contxt.scan(regex_list)
589
- end
590
-
591
- def save_img
592
-
593
- end
594
-
595
- end
1
+ require 'grabepg'
2
+ require 'grab_tvmao'
3
+ require File.expand_path("../grabepg/grab_tvsou", __FILE__)
4
+ require File.expand_path("../grabepg/grab_base", __FILE__)
@@ -0,0 +1,52 @@
1
+ require File.expand_path("../../grabepg/grab_tvsou", __FILE__)
2
+
3
+
4
+ class TestGrabTvsou
5
+ # Convenience wrapper for trying out GrabTvsou methods, mostly with fixed sample arguments.
6
+ include Grabepg
7
+
8
+
9
+ def initialize
10
+ @grabtvsou = GrabTvsou.new("mobile",[])
11
+ end
12
+
13
+ def get_data(start_time,use_time)
14
+ @grabtvsou.get_data(0,5)
15
+ end
16
+
17
+ def dispose_href_schedule_data(href,start_time,use_time)
18
+ @grabtvsou.dispose_href_schedule_data(href,start_time,use_time)
19
+ end
20
+
21
+ def dispose_schedule_page()
22
+ href = "http://m.tvsou.com/epg.asp?TVid=1&Channelid=1&pro=ys"
23
+ @grabtvsou.dispose_schedule_page(href,0,1)
24
+ end
25
+
26
+ def dispose_show_info
27
+ hrefs = ["http://m.tvsou.com/jq3.asp?id=81300&tid=3","http://m.tvsou.com/intro.asp?id=145"]
28
+ ret = []
29
+ hrefs.each do |href|
30
+ ret<<@grabtvsou.dispose_show_info(href)
31
+ end
32
+ ret
33
+ end
34
+
35
+ def dispose_home_page
36
+ @grabtvsou.dispose_home_page
37
+ end
38
+
39
+ def self.start
40
+ _grabtvsou = GrabTvsou.new("mobile",[])
41
+ channels = _grabtvsou.dispose_home_page
42
+ i = 0
43
+ ret = {}
44
+ channels.each do |channel_type,value|
45
+ value.each do |channel_name,channel_msg|
46
+ return ret if i==2
47
+ ret.merge!({channel_name=>{"schedule"=>_grabtvsou.dispose_schedule_page(channel_msg[:url],0,1),"channel_type"=>channel_type}})
48
+ i += 1
49
+ end
50
+ end
51
+ end
52
+ end
metadata CHANGED
@@ -1,16 +1,16 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: grab_epg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - hahazql
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-05-13 00:00:00.000000000 Z
11
+ date: 2013-05-27 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: ! '"用于从TVMAO抓取EPG信息"'
13
+ description: ! '"用于抓取EPG信息"'
14
14
  email:
15
15
  - hahazhouqunli@gmail.com
16
16
  executables: []
@@ -22,8 +22,12 @@ files:
22
22
  - Gemfile
23
23
  - README.md
24
24
  - lib/debug.rb
25
+ - lib/grab_tvmao.rb
25
26
  - lib/grabepg.rb
26
27
  - lib/grabepg.rb~
28
+ - lib/grabepg/grab_base.rb
29
+ - lib/grabepg/grab_tvsou.rb
30
+ - lib/test/test_grab_tvsou.rb
27
31
  - projectFilesBackup/.idea/grabepg.iml
28
32
  homepage: https://github.com/hahazql/grab_epg
29
33
  licenses: []