RubyGems - grab_epg - Versions diffs - 0.1.6 → 0.1.8 - Mend

grab_epg 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

data/lib/grabepg.rb CHANGED Viewed

@@ -1,595 +1,4 @@
-#encoding:utf-8
-require 'nokogiri'
-require 'open-uri'
-module Grabepg
-  # To change this template use File | Settings | File Templates.
-  #图片的获取： Net::HTTP.get(url)
-  #图片的文件类型获取：
-  attr_reader :channel  #频道列表
-  attr_reader :site #网站地址
-  attr_reader :proxyindex #代理的索引
-  attr_reader :show_schedule #根据节目的时间表
-  attr_reader :img_down_path #图片下载路径存放
-  DEFAULT_GrabtvType=["cctv","satellite","digital",]
-  DEFAULT_SITE = "http://www.tvmao.com"
-#将星期的wday获取值转化为中文名
-#conversion wady to chinese
-  def self.conversion_what_day(whatday)
-    ret = "星期"
-    case whatday.to_i
-      when 1
-        ret += "一"
-      when 2
-        ret += "二"
-      when 3
-        ret += "三"
-      when 4
-        ret += "四"
-      when 5
-        ret += "五"
-      when 6
-        ret += "六"
-      when 7
-        ret += "七"
-    end
-    ret
-  end
-  #如果时间为1~9的一位则为其在数字前加0补齐二位
-  def self.dispose_time(num)
-    num = num.to_s
-    if num.length < 2
-      num = "0"+num
-    end
-    num
-  end
-  #转化当前时间的格式
-  def self.get_week_date_time(time)
-    month = time.month
-    day = time.day
-    whatday = time.wday
-    ret = conversion_what_day(whatday) + "(" + dispose_time(month) + "-"+dispose_time(day)+")"
-    ret
-  end
-  #前几天需要减去的num
-  def self.del_day_num(day_num)
-    ret = day_num*60*60*24
-    ret
-  end
-  #获取距离当前多少天的之前的日期
-  def self.get_time_day_prior(num)
-    time = Time.now - del_day_num(num)
-    ret = get_week_date_time(time)
-    ret
-  end
-  #前面一周要删除的日期的列表
-  def self.del_time_list
-    ret = []
-    time = Time.now
-    wday = time.wday
-    if(wday==1)
-      for i in 0..7
-        ret<<self.get_time_day_prior(i)
-      end
-    end
-    ret
- end
-  #调用此方法的例子
-  def self.start
-    #作用是获取俩个字符串的相似度
-    #get str1 and str2 similarity
-    get_similarity_string = lambda { |str1,str2|
-      _length = 0
-      type = 0
-      if str1.length>str2.length
-        _length=str2.length
-        type = 2
-      else
-        _length=str1.length
-        type =1
-      end
-      _str_list = []
-      _str = ""
-      for i in 0.._length
-        case type
-          when 2
-            n=i
-            0.upto(str1.length-1).each do |j|
-              p "N: #{n}"
-              if(str2[n]==str1[j])
-                _str =_str+str2[n]
-                n = n+1
-                p "Str = #{_str}"
-              else
-                _str_list << _str
-                _str = ""
-              end
-            end
-          when 1
-            n=i
-            0.upto(str2.length-1).each do |j|
-              p "N: #{n}"
-              if(str1[n]==str2[j])
-                _str =_str+str1[n]
-                n=n+1
-                p "Str = #{_str}"
-              else
-                _str_list << _str
-                _str = ""
-              end
-            end
-        end
-      end
-      p _str_list
-      _str = ""
-      _str_list.each do |str|
-        if _str.length<str.length
-          _str=str
-        end
-      end
-      _str
-    }
-    path = "/home/zql/workspace/New/smart_remote/img_path"
-    channel_list = Grabepg.getchannels(path)
-    channel_urls = channel_list['channel_urls']
-    channel_infos = channel_list['channel_info']
-    p "Channel img save file,path='#{Grabepg.img_down_path}'"
-    proxy_list=Grabepg.get_topfast_list(5)  #get_topfast_list 参数是代表最慢用时 单位秒
-    #Use for Test
-    p "************************************"
-    p "proxy_list:#{proxy_list}"
-    p "************************************"
-    bool_start = false
-    channel_urls.each do |channel,url|
-      if(channel=="CCTV16")
-        bool_start = true
-      end
-      if bool_start
-        previous_show_name = ""
-        channel_info = channel_infos[channel]
-        channel_name = channel_info["channel_name"]
-        channel_type = channel_info["channel_type"]
-        channel_id = channel_info["channel_id"]
-        channel_img_path = channel_info["img_path"]
-        #channel,herf,proxylist,day_num=7
-        start_time=0
-        use_num =1
-        #getScheduleAssignDate参数：
-        # channel 频道
-        # herf 频道地址
-        # proxylist 代理列表
-        # start_num 开始时间 int 为开始时间与今天的差值 正数代表今天之后的第几天   负数代表今天之前的第几天
-        # day_num 抓取的时间段天数
-        # img_dir_down_path 图片网络地址保存路径 有默认值 可不设置
-        schedule_list=Grabepg.getScheduleAssignDate(channel,url,proxy_list,start_time,use_num)  #抓取的七天后的1天的数据
-        end
-      end
-    end
-  def self.img_down_path
-    @img_down_path
-  end
-  #获取网站的频道表
-  #img_path 图片存放路径
-  def self.getchannels(img_dir_path)
-    @channel = []
-    @site=DEFAULT_SITE
-    @proxyindex = 0
-    @img_down_dir_path = img_dir_path
-    @img_down_file = File.new(File.join(img_dir_path,"channel_img_down_path"),'w+')
-    channel_urls = {}
-    channel_info = {}
-    get_url =lambda { |type|
-      @site + "/program/duration/#{type}/w1.html" unless (type.nil?||type.empty?)
-        }
-    get_channel_id = lambda {|url|
-      channel_id = url.split("/")[2].split("-")[1] unless (url.nil?||url.empty?)
-    }
-    DEFAULT_GrabtvType.each do |type|
-      url = get_url.call(type)
-      p url
-      doc = Nokogiri::HTML(open(url))
-      p doc.content
-      p "*************************************************************"
-      doc.css('td[class="tdchn"]').each do |td|
-       channel_name=td.content
-       herf = ""
-       td.css('a').each do |a|
-        herf=a['href']
-       end
-        channel_id = get_channel_id.call(herf)
-       #获取频道图片的地址
-        img_path = "http://static.haotv.me/channel/logo/#{channel_id}.jpg"
-        @img_down_file.puts("#{channel_id}:#{img_path}")
-        @channel<<({channel_id=>{name:channel_name,herf:herf,type:type}})
-        channel_info.merge!({channel_id=>{"channel_name"=>channel_name,"channel_type"=>type,"channel_id"=>channel_id,"img_path"=>img_path}})
-        channel_urls.merge!({channel_id=>herf})
-      end
-    end
-    @img_down_file.close
-    p "Channel: #{@channel}"
-    {"channel_info"=>channel_info,"channel_urls"=>channel_urls}
-  end
-  #使用代理获取url的html的doc值
-  def self.get_doc_with_proxy(proxylist,url)
-    unless @proxyindex
-      @proxyindex = 0
-    end
-    @proxyindex=@proxyindex%proxylist.size
-    if(proxylist[@proxyindex])
-      proxy = proxylist[@proxyindex]
-    else
-      proxy = proxylist[@proxyindex+1]
-    end
-    begin
-      doc = Nokogiri::HTML(open(url,:proxy=>"http://#{proxy}")) unless proxy.nil?||proxy.empty?
-      doc = Nokogiri::HTML(open(url)) if proxy.nil?||proxy.empty?
-      @no_firest = 0
-    rescue => err
-      unless @no_firest
-        @no_firest = 0
-      end
-      @no_firest += 1
-      p "*************************Proxy:#{proxy}, url:#{url}"
-      proxylist.delete(proxy)
-      get_doc_with_proxy(proxylist,url) if @no_firest<4
-      raise RuntimeError,"Error: #{err.to_s}" unless @no_firest<4
-    end
-    @proxyindex += 1
-    unless doc
-      p "*************************Proxy:#{proxy}, url:#{url}"
-    end
-    doc
-  end
-  #获取某天的节目表
-  def self.get_schedulelist_atday(channel,url,proxylist)
-    p "Grab: #{url}"
-    doc = get_doc_with_proxy(proxylist,url)
-    show_type = []
-    _img_url = "http://static.haotv.me/channel/logo/"
-    img_url = _img_url + channel+".jpg"
-    data=doc.css('div[class="mt10 clear"]')[0].content.split(" ")
-    date = data[0]
-    week = data[1]
-    p "Channel: #{channel}  Date: #{date} Week: #{week}"
-    @date = "#{week}(#{date})"
-    schedule_list = []
-    _herf = doc.css("h1[style='float:left']").xpath('img[@src]')[0]
-    img_url = _herf.get_attribute("src") if _herf
-    p "**************IMG: #{img_url}"
-    doc.css('ul[id="pgrow"]')[0].css("li").each do |schedule|
-      _herf= schedule.xpath('a[@href]')[0]
-      schedule_herf=_herf.get_attribute("href") if _herf
-      unless _herf
-        drama =schedule.css('a[class="drama"]')[0]
-        if drama
-          _herfs=drama.get_attribute("href").gsub("/episode/section","#%#")
-          schedule_herf = _herfs.split("#%#")[0]
-        end
-      end
-      if schedule.content.split(" ").size>1
-        time = schedule.content.split(" ")[0]
-        schedule = schedule.content.split(" ")[1]
-        show_name = ""
-        unless schedule_herf.nil?||schedule_herf.empty?
-          show_infomation=get_show_infomation(proxylist,schedule_herf)
-          show_type=show_infomation["type"]
-          show_name = show_infomation["name"]
-          show_img = show_infomation["img"]
-        end
-        p "Time: #{time} schedule: #{schedule} show_infomation_herf: #{schedule_herf}  type: #{show_type} name: #{show_name} img:#{show_img}"
-        schedule_list << {"schedule_name"=>schedule,"schedule_logo"=>show_img,"schedule_start"=>time,"show_infomation_herf"=>schedule_herf,"type"=>show_type,"name"=>show_name}
-      end
-    end
-    schedule_list
-  end
-  #获取制定时间和长度url
-  #start_time 为int型 开始时间和今天的差值 正数代表之后的第几天 负数代表之前的第几天
-  #day_num 为int型 代表抓取的时间从开始时间计算的多少天
-  def self.get_assign_date_url(url,start_time,day_num)
-    site="http://www.tvmao.com"
-    if(@site)
-      site=@site
-    end
-    _url = site
-    urls = []
-    _urls = url.split("-")
-    time = Time.now
-    _wday = time.wday
-    wday = _wday + start_time
-    if wday<0
-      wday = 1
-    end
-    end_day = wday + day_num - 1
-    if end_day>(_wday+7)
-     end_day = _wday + 7
-    end
-    0.upto(1).each do |i|
-      _url = _url+"#{_urls[i]}"+"-"
-    end
-    wday.upto(end_day).each do |i|
-      urls << _url+"w#{i}.html"
-    end
-    urls
-  end
-  #获取指定时间段的节目表
-  def self.getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path)
-    begin
-      day_num = 1 if day_num<1
-    rescue
-      day_num = 1
-    end
-    site="http://www.tvmao.com"
-    unless img_dir_down_path
-      img_dir_down_path = __FILE__
-    end
-    @img_down_file = File.new(File.join(img_dir_down_path,"schedule_img_down_path"),"w+")
-    if(@site)
-      site=@site
-    end
-    _img_url = "http://static.haotv.me/channel/logo/"
-    @show_schedule = {}
-    channel_schedule = {}
-    get_assign_date_url(herf,start_num,day_num).each do |url|
-      @date = ""
-      schedule_list = self.get_schedulelist_atday(channel,url,proxylist)
-      channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
-    end
-    @img_down_file.close
-    {"channel_schedule"=>channel_schedule,"show_schedule"=>@show_schedule}
-  end
-  #因原已调用所以保留
-  #获取一周节目表
-  def self.getschedule(channel,herf,proxylist,day_num=7,img_dir_down_path=@img_down_dir_path)
-    p "Day Num is #{day_num}"
-    begin
-      day_num = 1 if day_num<1
-    rescue
-      day_num = 1
-    end
-    site="http://www.tvmao.com"
-    unless img_dir_down_path
-      img_dir_down_path = __FILE__
-    end
-    @img_down_file = File.new(File.join(img_dir_down_path,"schedule_img_down_path"),"w+")
-    if(@site)
-      site=@site
-    end
-    _img_url = "http://static.haotv.me/channel/logo/"
-    @show_schedule = {}
-    get_week_url = lambda {|url,day_num|
-       _url = site
-       urls = []
-       _urls = url.split("-")
-       0.upto(1).each do |i|
-        _url = _url+"#{_urls[i]}"+"-"
-       end
-      1.upto(day_num).each do |i|
-        urls << _url+"w#{i}.html"
-      end
-      urls
-    }
-    channel_schedule = {}
-    get_week_url.call(herf,day_num).each do |url|
-      @date = ""
-      schedule_list = self.get_schedulelist_atday(channel,url,proxylist)
-      channel_schedule.merge!({@date=>schedule_list}) unless @date.empty?
-    end
-    @img_down_file.close
-    {"channel_schedule"=>channel_schedule,"show_schedule"=>@show_schedule}
-  end
-  #获取节目详细信息
-  def self.get_show_infomation(proxy_list,schedule_herf)
-    begin
-    @proxyindex = 0
-    unless @site
-      @site = "http://www.tvmao.com"
-    end
-    schedule_herf = @site + schedule_herf
-    doc=get_doc_with_proxy(proxy_list,schedule_herf)
-    #title = doc.css("a[herf='#{schedule_herf}+/detail']")[0]['title']
-   # p "title: %s" % title
-    type = []
-    name = doc.css('span[itemprop="name"]')[0].content
-    #获取节目的图片
-    if doc.css('img[class="tvc"]')
-     schedule_img_down_path = doc.css('img[class="tvc"]')[0].get_attribute('src') if doc.css('img[class="tvc"]')[0]
-    end
-    doc.css('span[itemprop="genre"]').each do |_type|
-      type << _type.content
-    end
-    doc.css('a[itemprop="genre"]').each do |_type|
-      type<<_type.content
-    end
-    url = "#{schedule_herf}/detail"
-    doc = get_doc_with_proxy(proxy_list,url)
-    doc.css('span[itemprop="genre"]').each do |_type|
-      type << _type.content
-    end
-    doc.css('a[itemprop="genre"]').each do |_type|
-      type<<_type.content
-    end
-    type.uniq!
-    @img_down_file.puts("#{name}:#{schedule_img_down_path}")
-    @show_schedule.merge!(name=>get_show_schedule(proxy_list,schedule_herf)) unless @show_schedule.has_key?(name)
-    {"type"=>type,"name"=>name,"img"=>schedule_img_down_path}
-    rescue => e
-      p "Error In get_show_infomation msg : #{e.to_s}"
-    end
-  end
-  #获取节目的时间表
-  def self.get_show_schedule(proxylist,herf)
-    url = herf + "/playingtime"
-    doc = get_doc_with_proxy(proxylist,url)
-    i = 0
-    schedule = []
-    doc.css('div[id="epg"]')[0].css("div[class='c1 col']").each do |epg|
-      unless(i==0)
-        time = epg.css('div[class="f1 fld"]')[0].content
-        channel_name = epg.css('div[class="f2 fld"]')[0].content
-        show_name = epg.css('div[class="f3 fld"]')[0].content
-        times = time.split(" ")
-        week = times[0]
-        date = times[1]
-        _time = times[2]
-        schedule << {"week"=>week,"date"=>date,"time"=>_time,"channel_name"=>channel_name,"show_name"=>show_name}
-      end
-      i += 1
-    end
-    schedule
-  end
-  #获取指定访问速度的代理服务器
-  #time为最慢速度的时间 int型 代表秒
-  def self.get_topfast_list(use_time)
-    fast_list = []
-    time_use = 0
-    ips_ports = get_proxy_list()
-    ips_ports.each do |ip_port|
-      time_start = Time.now.to_i
-      begin
-        timeout(use_time) do
-          doc = Nokogiri::HTML(open("http://www.tvmao.com/program",:proxy=> "http://#{ip_port}"))
-        end
-        time_end = Time.now.to_i
-        time_use = time_end - time_start
-        p  "http://#{ip_port}   use_time:#{time_use}"
-      rescue Exception =>e
-        case e
-          when Errno::ETIMEDOUT
-            p "Use http://#{ip_port} timeout"
-          when Timeout::Error
-            p "Use http://#{ip_port} timeout"
-          when Errno::ECONNREFUSED
-            p "Use http://#{ip_port} Error connection"
-          else
-            p "Use http://#{ip_port} Error:#{e.to_s}"
-        end
-        time_use = -1
-      end
-      if(time_use > 0 &&time_use < 8)
-        fast_list << ip_port
-      end
-    end
-    fast_list
-  end
-  #获取代理列表
-  def self.get_proxy_list()
-    list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
-    if list.count ==0
-      list = gg('http://www.proxycn.cn/html_proxy/http-1.html')
-    end
-    ips_ports = []
-    regex_port = /(?<=<TD class="list">)[0-9]*?(?=<\/TD>)/
-    regex_ip   = /(?<=a href\=whois.php\?whois\=)[0-9,.]*/
-    list.each do |proxy_txt|
-      port = proxy_txt[regex_port]
-      ip = proxy_txt[regex_ip]
-      if(ip != ""&& !port.to_s.eql?('3128'))
-        port_ip = ip.to_s + ":" + port.to_s
-        ips_ports << port_ip
-      end
-    end
-    p "Count: #{ips_ports.count}"
-    ips_ports
-  end
-  def self.gg(url)
-    regex_list = /<TD class="list">.*<\/TD>/
-    href =URI.parse(url)
-    contxt = ""
-    href.open{ |f|
-      f.each_line {|line| contxt =contxt + line + "\n"}
-    }
-    list = contxt.scan(regex_list)
-  end
-  def save_img
-  end
-end
+require 'grabepg'
+require 'grab_tvmao'
+require File.expand_path("../grabepg/grab_tvsou", __FILE__)
+require File.expand_path("../grabepg/grab_base", __FILE__)

data/lib/test/test_grab_tvsou.rb ADDED Viewed

@@ -0,0 +1,52 @@
+require File.expand_path("../../grabepg/grab_tvsou", __FILE__)
+class TestGrabTvsou
+  # To change this template use File | Settings | File Templates.
+  include Grabepg
+  def initialize
+    @grabtvsou = GrabTvsou.new("mobile",[])
+  end
+  def get_data(start_time,use_time)
+    @grabtvsou.get_data(0,5)
+  end
+  def dispose_href_schedule_data(href,start_time,use_time)
+    @grabtvsou.dispose_href_schedule_data(href,start_time,use_time)
+  end
+  def dispose_schedule_page()
+    href = "http://m.tvsou.com/epg.asp?TVid=1&Channelid=1&pro=ys"
+   @grabtvsou.dispose_schedule_page(href,0,1)
+  end
+  def dispose_show_info
+    hrefs = ["http://m.tvsou.com/jq3.asp?id=81300&tid=3","http://m.tvsou.com/intro.asp?id=145"]
+    ret = []
+    hrefs.each do |href|
+      ret<<@grabtvsou.dispose_show_info(href)
+    end
+    ret
+  end
+  def dispose_home_page
+    @grabtvsou.dispose_home_page
+  end
+  def self.start
+    _grabtvsou = GrabTvsou.new("mobile",[])
+    channels = _grabtvsou.dispose_home_page
+    i = 0
+    ret = {}
+    channels.each do |channel_type,value|
+      value.each do |channel_name,channel_msg|
+        return ret if i==2
+        ret.merge!({channel_name=>{"schedule"=>_grabtvsou.dispose_schedule_page(channel_msg[:url],0,1),"channel_type"=>channel_type}})
+        i += 1
+      end
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,16 +1,16 @@
 --- !ruby/object:Gem::Specification
 name: grab_epg
 version: !ruby/object:Gem::Version
-  version: 0.1.6
+  version: 0.1.8
 platform: ruby
 authors:
 - hahazql
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-05-13 00:00:00.000000000 Z
+date: 2013-05-27 00:00:00.000000000 Z
 dependencies: []
-description: ! '"用于从TVMAO抓取EPG信息"'
+description: ! '"用于抓取EPG信息"'
 email:
 - hahazhouqunli@gmail.com
 executables: []
@@ -22,8 +22,12 @@ files:
 - Gemfile
 - README.md
 - lib/debug.rb
+- lib/grab_tvmao.rb
 - lib/grabepg.rb
 - lib/grabepg.rb~
+- lib/grabepg/grab_base.rb
+- lib/grabepg/grab_tvsou.rb
+- lib/test/test_grab_tvsou.rb
 - projectFilesBackup/.idea/grabepg.iml
 homepage: https://github.com/hahazql/grab_epg
 licenses: []