RubyGems - video_scraper - Versions diffs - 1.0.5 - Mend

video_scraper 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

data/ChangeLog +4 -0
data/README +71 -0
data/Rakefile +146 -0
data/lib/www/video_scraper.rb +88 -0
data/lib/www/video_scraper/adult_satellites.rb +27 -0
data/lib/www/video_scraper/age_sage.rb +28 -0
data/lib/www/video_scraper/ameba_vision.rb +22 -0
data/lib/www/video_scraper/base.rb +88 -0
data/lib/www/video_scraper/dailymotion.rb +30 -0
data/lib/www/video_scraper/eic_book.rb +34 -0
data/lib/www/video_scraper/moro_tube.rb +31 -0
data/lib/www/video_scraper/nico_video.rb +68 -0
data/lib/www/video_scraper/pornhub.rb +24 -0
data/lib/www/video_scraper/pornotube.rb +39 -0
data/lib/www/video_scraper/red_tube.rb +89 -0
data/lib/www/video_scraper/tube8.rb +31 -0
data/lib/www/video_scraper/veoh.rb +28 -0
data/lib/www/video_scraper/you_porn.rb +26 -0
data/lib/www/video_scraper/you_tube.rb +53 -0
data/lib/www/video_scraper/your_file_host.rb +54 -0
data/test/test_helper.rb +23 -0
data/test/www/test_video_scraper.rb +43 -0
data/test/www/video_scraper/test_adult_satellites.rb +13 -0
data/test/www/video_scraper/test_age_sage.rb +13 -0
data/test/www/video_scraper/test_ameba_vision.rb +12 -0
data/test/www/video_scraper/test_base.rb +14 -0
data/test/www/video_scraper/test_dailymotion.rb +14 -0
data/test/www/video_scraper/test_eic_book.rb +14 -0
data/test/www/video_scraper/test_moro_tube.rb +13 -0
data/test/www/video_scraper/test_nico_video.rb +23 -0
data/test/www/video_scraper/test_pornhub.rb +14 -0
data/test/www/video_scraper/test_pornotube.rb +21 -0
data/test/www/video_scraper/test_red_tube.rb +13 -0
data/test/www/video_scraper/test_tube8.rb +14 -0
data/test/www/video_scraper/test_veoh.rb +24 -0
data/test/www/video_scraper/test_you_porn.rb +13 -0
data/test/www/video_scraper/test_you_tube.rb +32 -0
data/test/www/video_scraper/test_your_file_host.rb +14 -0
metadata +133 -0

data/lib/www/video_scraper/dailymotion.rb ADDED Viewed

@@ -0,0 +1,30 @@
+# -*- mode:ruby; coding:utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/base')
+module WWW
+  module VideoScraper
+    class Dailymotion < Base
+      url_regex %r!\Ahttp://www\.dailymotion\.com/.*?/video/([\w/-]+)!
+      def scrape
+        uri = URI.parse(@page_url)
+        html = http_get(@page_url)
+        doc = Hpricot(html.toutf8)
+        doc.search('//script').each do |elem|
+          if m = elem.inner_html.match(/\.addVariable\("video",\s*"([^"]+)"/i)
+            path = CGI.unescape(m[1]).split(/\|\||@@/).first
+            @video_url = URI.join("#{uri.scheme}://#{uri.host}", path).to_s
+          end
+          if m = elem.inner_html.match(/\.addVariable\("preview",\s+"([^"]+)"/)
+            path = CGI.unescape(m[1]).split(/\|\||@@/).first
+            @thumb_url = URI.join("#{uri.scheme}://#{uri.host}", path).to_s
+          end
+        end
+        @title = doc.at('//h1[@class="nav"]').inner_html rescue nil
+        @embed_tag = CGI.unescapeHTML(doc.at('//textarea[@id="video_player_embed_code_text"]').inner_html) rescue nil
+      end
+    end
+  end
+end

data/lib/www/video_scraper/eic_book.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# -*- mode:ruby; coding:utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/base')
+module WWW
+  module VideoScraper
+    class EicBook < Base
+      attr_reader :capture_urls
+      url_regex %r!\Ahttp://www\.eic-book\.com/(detail_\d+\.html).*!
+      def scrape
+        uri = URI.parse(@page_url)
+        @page_url = "#{uri.scheme}://#{uri.host}#{uri.path}?flg=sm"
+        html = http_get(@page_url)
+        doc = Hpricot(html.toutf8)
+        raise FileNotFound unless flashvars = doc.at('//object //param[@name="FlashVars"]')
+        flashvars = CGI.parse(flashvars.attributes['value'])
+        @video_url = flashvars['flv'][0]
+        @title = CGI.unescapeHTML(doc.at('//h2[@class="detailTtl"]').inner_html).gsub('&nbsp;', ' ') rescue nil
+        html = http_get("#{uri.scheme}://#{uri.host}#{uri.path}?flg=h4")
+        doc = Hpricot(html.toutf8)
+        if img = doc.at('//div[@class="detailMN"]/img[@class="waku01"]')
+          @thumb_url = URI.join("#{uri.scheme}://#{uri.host}", img.attributes['src']).to_s
+        end
+        html = http_get("#{uri.scheme}://#{uri.host}#{uri.path}?flg=cp")
+        doc = Hpricot(html.toutf8)
+        @capture_urls = []
+        doc.search('//div[@class="detailMN"]/img[@class="waku01"]') do |img|
+          @capture_urls << URI.join("#{uri.scheme}://#{uri.host}", img.attributes['src']).to_s
+        end
+      end
+    end
+  end
+end

data/lib/www/video_scraper/moro_tube.rb ADDED Viewed

@@ -0,0 +1,31 @@
+# -*- mode:ruby; coding:utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/base')
+module WWW
+  module VideoScraper
+    class MoroTube < Base
+      url_regex %r!\Ahttp://www\.morotube\.com/watch\.php\?clip=([[:alnum:]]{8})!
+      attr_reader :author, :duration
+      def scrape
+        uri = URI.parse(@page_url)
+        uri.path = '/gen_xml.php'
+        uri.query = "type=o&id=#{url_regex_match[1]}"
+        xml = http_get(uri.to_s)
+        xdoc = Hpricot.XML(xml.toutf8)
+        @title = xdoc.search('/root/video/title').inner_html
+        @video_url = xdoc.search('/root/video/file').inner_html
+        @thumb_url = xdoc.search('/root/video/image').inner_html
+        @author = xdoc.search('/root/video/author').inner_html
+        @duration = xdoc.search('/root/video/duration').inner_html
+        html = http_get(@page_url)
+        doc = Hpricot(html)
+        doc.search('//input#inpVdoEmbed') do |elem|
+          @embed_tag = elem.attributes['value']
+        end
+      end
+    end
+  end
+end

data/lib/www/video_scraper/nico_video.rb ADDED Viewed

@@ -0,0 +1,68 @@
+# -*- mode:ruby; coding:utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/base')
+module WWW
+  module VideoScraper
+    # Scraper for nicovideo.jp watch pages. Requires credentials in
+    # @opt[:nico_video_mail] and @opt[:nico_video_password].
+    class NicoVideo < Base
+      url_regex %r!\Ahttp://www\.nicovideo\.jp/watch/([[:alnum:]]+)!
+      # Logs in, then collects the video URL, thumbnail/title and embed tag.
+      #
+      # Raises FileNotFound when the site answers 403/404 or returns no
+      # video URL, TryAgainLater on a timeout or any other HTTP error code,
+      # and RuntimeError when the login is rejected.
+      def scrape
+        login
+        id = url_regex_match[1]
+        get_flv(id)
+        get_thumb(id)
+        get_embed_tag(id)
+      rescue Timeout::Error => e
+        raise TryAgainLater, e.to_s
+      rescue WWW::Mechanize::ResponseCodeError => e
+        # 403/404 mean the video is not available; every other status
+        # (502 included) is reported as a retryable failure.
+        error_class = %w[403 404].include?(e.response_code) ? FileNotFound : TryAgainLater
+        raise error_class, e.to_s
+      end
+      private
+      # Posts the credentials and checks the x-niconico-authflag response header.
+      def login
+        page = agent.post('https://secure.nicovideo.jp/secure/login?site=niconico',
+                           'mail' => @opt[:nico_video_mail],
+                           'password' => @opt[:nico_video_password])
+        raise RuntimeError, 'login failure' unless page.header['x-niconico-authflag'] == '1'
+      end
+      # Stores the "url" field of the getflv API response in @video_url.
+      # Raises FileNotFound when the response carries no URL.
+      def get_flv(id)
+        page = agent.get("http://www.nicovideo.jp/api/getflv?v=#{id}")
+        # CGI.parse answers an empty array for a missing key, and an empty
+        # array is truthy, so the value itself has to be inspected.
+        video_url = CGI.parse(page.body)['url'].first
+        raise FileNotFound if video_url.nil? || video_url.empty?
+        @video_url = video_url
+      end
+      # Reads @thumb_url and @title from the getthumbinfo XML.
+      def get_thumb(id)
+        page = agent.get("http://www.nicovideo.jp/api/getthumbinfo/#{id}")
+        xdoc = Hpricot.XML(page.body.toutf8)
+        xdoc.search('//thumbnail_url') do |elem|
+          @thumb_url = elem.inner_html
+        end
+        xdoc.search('//thumb/title') do |elem|
+          @title = elem.inner_html
+        end
+      end
+      # Reads @embed_tag from the iframe form on the watch page. The id
+      # argument is accepted for symmetry with the other getters but unused.
+      def get_embed_tag(id)
+        doc = Hpricot(agent.get(@page_url).body)
+        doc.search('//form[@name="form_iframe"] //input[@name="input_iframe"]') do |elem|
+          @embed_tag = elem.attributes['value']
+        end
+      end
+    end
+  end
+end

data/lib/www/video_scraper/pornhub.rb ADDED Viewed

@@ -0,0 +1,24 @@
+# -*- mode:ruby; coding:utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/base')
+module WWW
+  module VideoScraper
+    class Pornhub < Base
+      url_regex %r|\Ahttp://www\.pornhub\.com/view_video\.php.*viewkey=[[:alnum:]]{20}|
+      def scrape
+        html = http_get(@page_url)
+        raise FileNotFound unless m = html.match(/\.addVariable\("options",\s*"([^"]+)"\);/i)
+        request_url = URI.decode m[1]
+        response_body = http_get(request_url)
+        @video_url = response_body.match(%r|<flv_url>([^<]+)</flv_url>|).to_a[1]
+        if m = @video_url.match(%r|videos/(\d{3}/\d{3}/\d{3})/\d+.flv|)
+          @thumb_url = "http://p1.pornhub.com/thumbs/#{m[1]}/small.jpg"
+        end
+        @embed_tag = html.match(%r|<textarea[^>]+class="share-flag-embed">(<object type="application/x-shockwave-flash".*?</object>)</textarea>|).to_a[1]
+        @title = html.match(%r|<title>(.*) - Pornhub\.com</title>|).to_a[1]
+      end
+    end
+  end
+end

data/lib/www/video_scraper/pornotube.rb ADDED Viewed

@@ -0,0 +1,39 @@
+# -*- mode:ruby; coding:utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/base')
+module WWW
+  module VideoScraper
+    class Pornotube < Base
+      url_regex %r!\Ahttp://(?:www\.)?pornotube\.com/(?:media|channels)\.php\?.*m=(\d+)!
+      def scrape
+        id = url_regex_match[1]
+        login
+        page = agent.get(@page_url)
+        raise FileNotFound unless embed = page.root.at('//object/embed')
+        src = embed.attributes['src']
+        hash = src.to_s.match(/\?v=(.*)$/)[1]
+        t = page.at('//div[@class="contentheader"]//span[@class="blue"]')
+        @title = t.inner_html.gsub(/<[^>]*>/, '').strip
+        page = agent.get("http://pornotube.com/player/player.php?#{hash}")
+        q = CGI::parse(page.body)
+        @video_url = "http://#{q['mediaDomain'][0]}.pornotube.com/#{q['userId'][0]}/#{q['mediaId'][0]}.flv"
+        @thumb_url = "http://photo.pornotube.com/thumbnails/video/#{q['userId'][0]}/#{q['mediaId'][0]}.jpg";
+        @image_url = "http://photo.pornotube.com/thumbnails/video/#{q['userId'][0]}/#{q['mediaId'][0]}_full.jpg";
+        @embed_tag = q['embedCode'][0]
+      end
+      private
+      def login
+        agent.post("http://pornotube.com/index.php",
+                   'verifyAge' => 'true',
+                   'bMonth' => '01',
+                   'bDay' => '01',
+                   'bYear' => '1970',
+                   'submit' => 'View All Content')
+      end
+    end
+  end
+end

data/lib/www/video_scraper/red_tube.rb ADDED Viewed

@@ -0,0 +1,89 @@
+# -*- mode:ruby; coding:utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/base')
+module WWW
+  module VideoScraper
+    class RedTube < Base
+      url_regex %r|\Ahttp://www\.redtube\.com/(\d+)|
+      def scrape
+        s = content_id || '0'
+        s = '1' if s.empty?
+        pathnr = s.to_i / 1000
+        s = "%07d" % s.to_i
+        logger.debug s
+        pathnr = "%07d" % pathnr
+        logger.debug pathnr
+        xc = %w!R 1 5 3 4 2 O 7 K 9 H B C D X F G A I J 8 L M Z 6 P Q 0 S T U V W E Y N!
+        qsum = 0
+        s.length.times do |i|
+          qsum += s[i,1].to_i * (i + 1)
+        end
+        s1 = qsum.to_s
+        qsum = 0
+        s1.length.times do |i|
+          qsum += s1[i,1].to_i
+        end
+        qstr = "%02d" % qsum
+        code = ''
+        code += xc[s[3] - 48 + qsum + 3]
+        code += qstr[1,1]
+        code += xc[s[0] - 48 + qsum + 2]
+        code += xc[s[2] - 48 + qsum + 1]
+        code += xc[s[5] - 48 + qsum + 6]
+        code += xc[s[1] - 48 + qsum + 5]
+        code += qstr[0,1]
+        code += xc[s[4] - 48 + qsum + 7]
+        code += xc[s[6] - 48 + qsum + 4]
+        content_video = pathnr + '/' + code + '.flv'
+        @pathnr = pathnr
+        @s = s
+        @video_url = "http://dl.redtube.com/_videos_t4vn23s9jc5498tgj49icfj4678/#{content_video}"
+      end
+      def thumb_url
+        return @thumb_url if @thumb_url
+        1.upto(10) do |i|
+          url = "http://thumbs.redtube.com/_thumbs/#{@pathnr}/#{@s}/#{@s}_#{'%03d' % i}.jpg"
+          logger.debug url
+          begin
+            uri = URI.parse(url)
+            Net::HTTP.start(uri.host, uri.port) do |http|
+              response = http.head(uri.request_uri,
+                                   {"User-Agent" => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)"})
+              logger.debug response.code
+              if 200 == response.code.to_i
+                @thumb_url = url
+                return @thumb_url
+              end
+            end
+          rescue TimeoutError, Timeout::Error, Errno::ETIMEDOUT
+          end
+        end
+        nil
+      end
+      def title
+        return @title if @title
+        html = http_get(@page_url)
+        doc = Hpricot(html.toutf8)
+        @title = doc.at("//table/tr[2]/td/table/tr[3]/td/table/tr/td").inner_html.gsub(/<[^>]*>/, '').strip
+      end
+      def embed_tag
+        return @embed_tag if @embed_tag
+        url = "http://www.redtube.com/embed/#{content_id}"
+        response_body = http_get(url)
+        doc = Hpricot(response_body)
+        doc.search('//textarea#cpf') do |elem|
+          @embed_tag = elem.inner_html
+        end
+        @embed_tag
+      end
+      private
+      def content_id; url_regex_match[1]; end
+    end
+  end
+end

data/lib/www/video_scraper/tube8.rb ADDED Viewed

@@ -0,0 +1,31 @@
+# -*- mode:ruby; coding:utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/base')
+module WWW
+  module VideoScraper
+    class Tube8 < Base
+      attr_reader :video_url_3gp
+      url_regex %r!\Ahttp://www\.tube8\.com/.*/(\d+)(?:/|$)!
+      def scrape
+        html = http_get(@page_url)
+        doc = Hpricot(html.toutf8)
+        raise FileNotFound unless flashvars = doc.at('//object //param[@name="FlashVars"]')
+        flashvars = CGI.parse(flashvars.attributes['value'])
+        @video_url = flashvars['videoUrl'][0]
+        uri = URI.parse(@page_url)
+        @thumb_url = URI.join("#{uri.scheme}://#{uri.host}", flashvars['imageUrl'][0]).to_s
+        @title = doc.at('//h1[@class="text"]').inner_html rescue nil
+        doc.search('//a').each do |elem|
+          if href = elem.attributes['href']
+            if href.match(/\.3gp$/)
+              @video_url_3gp = href
+              break
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/www/video_scraper/veoh.rb ADDED Viewed

@@ -0,0 +1,28 @@
+# -*- mode:ruby; coding:utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/base')
+module WWW
+  module VideoScraper
+    class Veoh < Base
+      url_regex [%r!\Ahttp://www\.veoh\.com/videos/(v\d+[[:alnum:]]+)!,
+                 %r!\Ahttp://www\.veoh\.com/collection/\w+/watch/.*#watch%3[Dd](v\d+[[:alnum:]]+)!,
+                 %r!\Ahttp://www\.veoh\.com/(?:browse|collection)/(?:[\w]+/)+watch/(v\d+[[:alnum:]]+)!]
+      def scrape
+        @id = url_regex_match[1]
+        @page_url = "http://www.veoh.com/videos/#{@id}"
+        request_url = "http://www.veoh.com/rest/video/#{@id}/details"
+        xml = http_get(request_url)
+        @video_url = xml.match(/fullPreviewHashPath="([^"]+)"/).to_a[1]
+        @title = xml.match(/title="([^"]+)"/).to_a[1]
+        @thumb_url = xml.match(/fullMedResImagePath="([^"]+)"/).to_a[1]
+        html = http_get(@page_url)
+        #logger.debug html
+        if embed_tag = html.match(/class="embedinput"\s[^>]*value="([^"]+)"/).to_a[1]
+          @embed_tag = CGI.unescapeHTML(embed_tag)
+        end
+      end
+    end
+  end
+end

data/lib/www/video_scraper/you_porn.rb ADDED Viewed

@@ -0,0 +1,26 @@
+# -*- mode:ruby; coding:utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/base')
+module WWW
+  module VideoScraper
+    class YouPorn < Base
+      url_regex %r!\Ahttp://(?:www\.)?youporn\.com/watch/(\d+)!
+      def scrape
+        id = url_regex_match[1]
+        request_url = @page_url.sub(/(\?.*)?$/, '?user_choice=Enter')
+        html = http_get(request_url, 'Cookie' => 'age_check=1')
+        doc = Hpricot(html)
+        doc.search('//div[@id="download"]//a').each do |elem|
+          href = elem.attributes['href']
+          (@video_url = href; break) if href =~ %r!^http://download\.youporn\.com/download/.*!
+        end
+        h1 = doc.at('//div[@id="videoArea"]/h1')
+        @title = h1.inner_html.gsub(/<[^>]*>/, '').strip
+        @thumb_url = h1.at('/img').attributes['src'].sub(/(\d+)_small\.jpg$/, '\1_large.jpg') if h1.at('/img') != nil
+      end
+    end
+  end
+end

data/lib/www/video_scraper/you_tube.rb ADDED Viewed

@@ -0,0 +1,53 @@
+# -*- mode:ruby; coding:utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/base')
+module WWW
+  module VideoScraper
+    class YouTube < Base
+      url_regex %r!\Ahttp://(?:www|jp)\.youtube\.com/watch.*[?&]v=([[:alnum:]]+)!
+      def scrape
+        page = pass_verify_age
+        @title = page.root.at('//head/title').inner_html.sub(/^YouTube[\s-]*/, '') rescue ''
+        @embed_tag = page.root.at('//input[@id="embed_code"]').attributes['value'] rescue nil
+        page.root.search('//script').each do |script|
+          if m = script.inner_html.match(/var\s+swfArgs\s*=\s*([^;]+);/)
+            swf_args = JSON::parse(m[1])
+            uri = URI.parse(@page_url)
+            uri.path = '/get_video'
+            uri.query = "video_id=#{swf_args['video_id']}&t=#{swf_args['t']}"
+            @video_url = uri.to_s
+            @thumb_url = "http://i.ytimg.com/vi/#{swf_args['video_id']}/default.jpg"
+          end
+        end
+        raise FileNotFound, 'file not found' if @video_url.nil?
+      end
+      private
+      def login
+        uri = URI.parse(@page_url)
+        page = agent.get("#{uri.scheme}://#{uri.host}/login")
+        #login_form = page.form('loginForm')
+        #login_form.username = @opt[:you_tube_username]
+        #login_form.password = @opt[:you_tube_password]
+        login_form = page.form('gaia_loginform')
+        login_form.email  = @opt[:you_tube_username]
+        login_form.passwd = @opt[:you_tube_password]
+        agent.submit(login_form)
+      end
+      def pass_verify_age
+        uri = URI.parse(@page_url)
+        page = agent.get(uri)
+        if page.uri.path =~ /verify_age/
+          login
+          page = agent.post(page.uri,
+                            'next_url' => "#{uri.path}?#{uri.query}",
+                            'action_confirm' => 'Confirm Birth Date')
+        end
+        page
+      end
+    end
+  end
+end