RubyGems - video_scraper - Versions diffs - 1.0.5 - Mend

video_scraper 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

data/ChangeLog +4 -0
data/README +71 -0
data/Rakefile +146 -0
data/lib/www/video_scraper.rb +88 -0
data/lib/www/video_scraper/adult_satellites.rb +27 -0
data/lib/www/video_scraper/age_sage.rb +28 -0
data/lib/www/video_scraper/ameba_vision.rb +22 -0
data/lib/www/video_scraper/base.rb +88 -0
data/lib/www/video_scraper/dailymotion.rb +30 -0
data/lib/www/video_scraper/eic_book.rb +34 -0
data/lib/www/video_scraper/moro_tube.rb +31 -0
data/lib/www/video_scraper/nico_video.rb +68 -0
data/lib/www/video_scraper/pornhub.rb +24 -0
data/lib/www/video_scraper/pornotube.rb +39 -0
data/lib/www/video_scraper/red_tube.rb +89 -0
data/lib/www/video_scraper/tube8.rb +31 -0
data/lib/www/video_scraper/veoh.rb +28 -0
data/lib/www/video_scraper/you_porn.rb +26 -0
data/lib/www/video_scraper/you_tube.rb +53 -0
data/lib/www/video_scraper/your_file_host.rb +54 -0
data/test/test_helper.rb +23 -0
data/test/www/test_video_scraper.rb +43 -0
data/test/www/video_scraper/test_adult_satellites.rb +13 -0
data/test/www/video_scraper/test_age_sage.rb +13 -0
data/test/www/video_scraper/test_ameba_vision.rb +12 -0
data/test/www/video_scraper/test_base.rb +14 -0
data/test/www/video_scraper/test_dailymotion.rb +14 -0
data/test/www/video_scraper/test_eic_book.rb +14 -0
data/test/www/video_scraper/test_moro_tube.rb +13 -0
data/test/www/video_scraper/test_nico_video.rb +23 -0
data/test/www/video_scraper/test_pornhub.rb +14 -0
data/test/www/video_scraper/test_pornotube.rb +21 -0
data/test/www/video_scraper/test_red_tube.rb +13 -0
data/test/www/video_scraper/test_tube8.rb +14 -0
data/test/www/video_scraper/test_veoh.rb +24 -0
data/test/www/video_scraper/test_you_porn.rb +13 -0
data/test/www/video_scraper/test_you_tube.rb +32 -0
data/test/www/video_scraper/test_your_file_host.rb +14 -0
metadata +133 -0

data/ChangeLog ADDED Viewed

@@ -0,0 +1,4 @@
+== 1.0.1 / 2009-01-17
+* initial release

data/README ADDED Viewed

@@ -0,0 +1,71 @@
+= WWW::VideoScraper
+* http://coderepos.org/share/browser/lang/ruby/video_scraper
+* http://github.com/valda/video_scraper/tree/master
+== DESCRIPTION:
+Web scraping library for video sharing sites.
+== FEATURES/PROBLEMS:
+Supported sites
+* AdultSatellites
+* AmebaVision
+* Dailymotion
+* MoroTube
+* NICO NICO DOUGA
+* Pornhub
+* Pornotube
+* RedTube
+* Tube8
+* Ura Agesage
+* Veoh
+* YouPorn
+* YouTube
+* YourFileHost
+== SYNOPSIS:
+   >> require 'www/video_scraper'
+   >> scraper = WWW::VideoScraper.scrape('http://www.youtube.com/watch?v=OFPnvARUOHI')
+   >> scraper.video_url
+   => "http://www.youtube.com/get_video?video_id=OFPnvARUOHI&t=OEgsToPDskIpQJU48rm4-sS1RtbItouY"
+   >> scraper.thumb_url
+   => "http://i.ytimg.com/vi/OFPnvARUOHI/default.jpg"
+== REQUIREMENTS:
+* WWW::Mechanize
+* Hpricot
+* CGIAlt (recommended)
+== INSTALL:
+* sudo gem install valda-video_scraper
+== LICENSE:
+(The MIT License)
+Copyright (c) 2009 YAMAGUCHI Seiji <valda at underscore.jp>
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/Rakefile ADDED Viewed

@@ -0,0 +1,146 @@
+require 'rubygems'
+require 'rake'
+require 'rake/clean'
+require 'rake/testtask'
+require 'rake/packagetask'
+require 'rake/gempackagetask'
+require 'rake/rdoctask'
+require 'rake/contrib/rubyforgepublisher'
+require 'rake/contrib/sshpublisher'
+require 'fileutils'
+require 'lib/www/video_scraper'
+include FileUtils
+NAME              = "video_scraper"
+AUTHOR            = "YAMAGUCHI Seiji"
+EMAIL             = "valda@underscore.jp"
+DESCRIPTION       = "Web scraping library for video sharing sites."
+RUBYFORGE_PROJECT = "video_scraper"
+HOMEPATH          = "http://github.com/valda/video_scraper"
+BIN_FILES         = %w(  )
+VERS              = WWW::VideoScraper::VERSION
+REV = File.read(".svn/entries")[/committed-rev="(d+)"/, 1] rescue nil
+CLEAN.include ['**/.*.sw?', '*.gem', '.config']
+RDOC_OPTS = [
+       '--title', "#{NAME} documentation",
+       "--charset", "utf-8",
+       "--opname", "index.html",
+       "--line-numbers",
+       "--main", "README",
+       "--inline-source",
+]
+task :default => [:test]
+task :package => [:clean]
+Rake::TestTask.new("test") do |t|
+       t.libs   << "test"
+       t.pattern = "test/**/*_test.rb"
+       t.verbose = true
+end
+spec = Gem::Specification.new do |s|
+       s.name              = NAME
+       s.version           = VERS
+       s.platform          = Gem::Platform::RUBY
+       s.has_rdoc          = true
+       s.extra_rdoc_files  = ["README", "ChangeLog"]
+       s.rdoc_options     += RDOC_OPTS + ['--exclude', '^(examples|extras)/']
+       s.summary           = DESCRIPTION
+       s.description       = DESCRIPTION
+       s.author            = AUTHOR
+       s.email             = EMAIL
+       s.homepage          = HOMEPATH
+       s.executables       = BIN_FILES
+       s.rubyforge_project = RUBYFORGE_PROJECT
+       s.bindir            = "bin"
+       s.require_path      = "lib"
+       #s.autorequire       = ""
+       s.test_files        = Dir["test/*_test.rb"]
+       s.add_dependency('mechanize', '>=0.8.4')
+       s.add_dependency('hpricot', '>=0.6.164')
+       s.add_dependency('json', '>=1.1.3')
+       #s.required_ruby_version = '>= 1.8.2'
+       s.files = %w(README ChangeLog Rakefile) +
+               Dir.glob("{bin,doc,test,lib,templates,generator,extras,website,script}/**/*") +
+               Dir.glob("ext/**/*.{h,c,rb}") +
+               Dir.glob("examples/**/*.rb") +
+               Dir.glob("tools/*.rb") +
+               Dir.glob("rails/*.rb")
+       s.extensions = FileList["ext/**/extconf.rb"].to_a
+end
+Rake::GemPackageTask.new(spec) do |p|
+       p.need_tar = true
+       p.gem_spec = spec
+end
+task :install do
+       name = "#{NAME}-#{VERS}.gem"
+       sh %{rake package}
+       sh %{sudo gem install pkg/#{name}}
+end
+task :uninstall => [:clean] do
+       sh %{sudo gem uninstall #{NAME}}
+end
+Rake::RDocTask.new do |rdoc|
+       rdoc.rdoc_dir = 'html'
+       rdoc.options += RDOC_OPTS
+       rdoc.template = "resh"
+       #rdoc.template = "#{ENV['template']}.rb" if ENV['template']
+       if ENV['DOC_FILES']
+               rdoc.rdoc_files.include(ENV['DOC_FILES'].split(/,\s*/))
+       else
+               rdoc.rdoc_files.include('README', 'ChangeLog')
+               rdoc.rdoc_files.include('lib/**/*.rb')
+               rdoc.rdoc_files.include('ext/**/*.c')
+       end
+end
+desc "Publish to RubyForge"
+task :rubyforge => [:rdoc, :package] do
+       require 'rubyforge'
+       Rake::RubyForgePublisher.new(RUBYFORGE_PROJECT, 'yamaguchi').upload
+end
+desc 'Package and upload the release to rubyforge.'
+task :release => [:clean, :package] do |t|
+       v = ENV["VERSION"] or abort "Must supply VERSION=x.y.z"
+       abort "Versions don't match #{v} vs #{VERS}" unless v == VERS
+       pkg = "pkg/#{NAME}-#{VERS}"
+       require 'rubyforge'
+       rf = RubyForge.new.configure
+       puts "Logging in"
+       rf.login
+       c = rf.userconfig
+#      c["release_notes"] = description if description
+#      c["release_changes"] = changes if changes
+       c["preformatted"] = true
+       files = [
+               "#{pkg}.tgz",
+               "#{pkg}.gem"
+       ].compact
+       puts "Releasing #{NAME} v. #{VERS}"
+       rf.add_release RUBYFORGE_PROJECT, NAME, VERS, *files
+end
+desc 'Show information about the gem.'
+task :debug_gem do
+       puts spec.to_ruby
+end
+desc 'Update gem spec'
+task :gemspec do
+  open("#{NAME}.gemspec", 'w').write spec.to_ruby
+end

data/lib/www/video_scraper.rb ADDED Viewed

@@ -0,0 +1,88 @@
+# -*- mode:ruby; coding:utf-8 -*-
+require 'rubygems'
+require 'open-uri'
+require 'hpricot'
+require 'mechanize'
+require 'kconv'
+require 'json'
+require 'uri'
+begin
+  require 'cgialt' unless defined? CGI
+rescue LoadError
+  require 'cgi'
+end
+module WWW
+  module VideoScraper
+    VERSION = '1.0.5'
+    MODULES_NAME = %w(adult_satellites age_sage ameba_vision dailymotion eic_book
+                      moro_tube nico_video pornhub pornotube red_tube tube8 veoh
+                      you_porn you_tube your_file_host)
+    @@modules = MODULES_NAME.map do |name|
+      require File.expand_path(File.join(File.dirname(__FILE__), 'video_scraper', name))
+      const_get( name.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase } )
+    end
+    @@options = {
+      :logger => nil,
+      :cache => nil,
+    }
+    class << self
+      def modules
+        @@nodules
+      end
+      def options
+        @@options
+      end
+      def options=(opts)
+        @@options = opts
+      end
+      def configure(&proc)
+        raise ArgumentError, "Block is required." unless block_given?
+        yield @@options
+      end
+      def find_module(url)
+        @@modules.find { |mod| mod.valid_url?(url) }
+      end
+      # 与えられた URL を処理できるモジュールを @@modules から検索して実行する
+      def scrape(url, opt = nil)
+        opt = @@options.merge(opt || {})
+        opt[:logger] ||= logger
+        raise StandardError, "url param is requred" unless url
+        logger.info "url: #{url}"
+        if mod = find_module(url)
+          logger.info "found module: #{mod.to_s}"
+          return mod.scrape(url, opt)
+        end
+        logger.info "unsupport url."
+        return nil
+      rescue TimeoutError, Timeout::Error, Errno::ETIMEDOUT => e
+        logger.warn "  Timeout : #{e.to_s}"
+        raise TryAgainLater, e.to_s
+      rescue OpenURI::HTTPError => e
+        raise TryAgainLater, e.to_s if e.to_s.match(/50\d/)
+        raise FileNotFound, e.to_s if e.to_s.match(/40\d/)
+        raise
+      rescue Exception => e
+        logger.error "#{e.class}: #{e.to_s}"
+        raise e
+      end
+      private
+      def logger
+        return @@options[:logger] if @@options[:logger]
+        @@options[:logger] = NullLogger.new
+      end
+    end
+  end
+end

data/lib/www/video_scraper/adult_satellites.rb ADDED Viewed

@@ -0,0 +1,27 @@
+# -*- mode:ruby; coding:utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/base')
+module WWW
+  module VideoScraper
+    class AdultSatellites < Base
+      url_regex %r!http://(?:www\.)?asa\.tv/movie_detail\.php.*!
+      def scrape
+        html = http_get(@page_url)
+        doc = Hpricot(html.toutf8)
+        raise FileNotFound unless flashvars = doc.at('//object //param[@name="FlashVars"]')
+        flashvars = CGI.parse(flashvars.attributes['value'])
+        @video_url = flashvars['videoName'][0]
+        uri = URI.parse(@page_url)
+        if m = @video_url.match(%r!/([[:alnum:]]+/[[:alnum:]]+)\.flv!)
+          @thumb_url = "#{uri.scheme}://#{uri.host}/captured/#{m[1]}_1.jpg"
+        end
+        @title = doc.at('//strong[@class="ptitle"]').inner_html rescue nil
+        if embed = doc.at('//input[@name="embed"]')
+          @embed_tag = CGI.unescapeHTML(embed.attributes['value'])
+        end
+      end
+    end
+  end
+end

data/lib/www/video_scraper/age_sage.rb ADDED Viewed

@@ -0,0 +1,28 @@
+# -*- mode:ruby; coding:utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/base')
+module WWW
+  module VideoScraper
+    class AgeSage < Base
+      url_regex %r!\Ahttp://adult\.agesage\.jp/contentsPage\.html\?mcd=[[:alnum:]]{16}!
+      def scrape
+        @request_url = @page_url.sub('.html', '.xml')
+        @response_body = http_get(@request_url)
+        raise FileNotFound if @response_body.nil? or @response_body.empty?
+        xdoc = Hpricot.XML(@response_body.toutf8)
+        if movie = xdoc.at('/movie')
+          @video_url = movie.at('/movieurl').inner_html
+          @thumb_url = movie.at('/thumbnail').inner_html
+          @title = movie.at('/title').inner_html
+          mcd = @page_url.match(%r|agesage\.jp/contentsPage\.html\?mcd=([[:alnum:]]{16})|)[1]
+          @embed_tag = <<-HTML
+<script type="text/javascript" src="http://adult.agesage.jp/js/past_uraui.js"></script>
+<script type="text/javascript">Purauifla("mcd=#{mcd}", 320, 275);</script>
+        HTML
+        end
+      end
+    end
+  end
+end

data/lib/www/video_scraper/ameba_vision.rb ADDED Viewed

@@ -0,0 +1,22 @@
+# -*- mode:ruby; coding:utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/base')
+module WWW
+  module VideoScraper
+    class AmebaVision < Base
+      url_regex %r!\Ahttp://vision\.ameba\.jp/watch\.do.*?\?movie=(\d+)!
+      def scrape
+        id = url_regex_match[1]
+        request_url = "http://vision.ameba.jp/api/get/detailMovie.do?movie=#{id}"
+        xml = http_get(request_url)
+        xdoc = Hpricot.XML(xml.toutf8)
+        @title = xdoc.at('//item/title').inner_html
+        @page_url = xdoc.at('//item/link').inner_html
+        @thumb_url = xdoc.at('//item/imageUrlLarge').inner_html
+        @video_url = @thumb_url.sub('//vi', '//vm').sub('/jpg/', '/flv/').sub('_4.jpg', '.flv')
+      end
+    end
+  end
+end

data/lib/www/video_scraper/base.rb ADDED Viewed

@@ -0,0 +1,88 @@
+# -*- mode:ruby; coding:utf-8 -*-
+module WWW
+  module VideoScraper
+    class TryAgainLater < RuntimeError; end
+    class FileNotFound < RuntimeError; end
+    class NullLogger
+      def method_missing(name, *args); return nil; end
+    end
+    class Base
+      attr_reader :page_url, :video_url, :thumb_url, :embed_tag, :title
+      ## class methods
+      class << self
+        def url_regex(regex)
+          @url_regex = regex
+        end
+        def valid_url?(url)
+          Array(@url_regex).any? { |r| r.match(url) }
+        end
+        def scrape(url, opt = nil)
+          instance = self.new(url, opt)
+          instance.scrape
+          instance
+        end
+      end
+      def initialize(url, opt = nil)
+        @page_url = url
+        @opt = (opt || {})
+        url_regex = self.class.instance_variable_get(:@url_regex)
+        Array(url_regex).any? do |r|
+          @url_regex_match = r.match(@page_url).freeze
+        end
+        raise StandardError, "url is not #{self.class.name} link: #{url}" if @url_regex_match.nil?
+      end
+      def scrape
+        raise StandardError, 'not implemented yet'
+      end
+      protected
+      def url_regex_match
+        @url_regex_match
+      end
+      def agent
+        @agent ||= WWW::Mechanize.new do |a|
+          a.user_agent_alias = 'Windows IE 6'
+        end
+      end
+      def logger
+        return @opt[:logger] if @opt[:logger]
+        @opt[:logger] = NullLogger.new
+      end
+      def http_get(url, opt = nil)
+        open_opt = {
+          "User-Agent" => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
+        }.merge( opt || {} )
+        if @opt[:cache]
+          unless @opt[:cache].respond_to?(:get) and @opt[:cache].respond_to?(:set)
+            raise RuntimeError, 'As for cache object what responds to :get and :set is required.'
+          end
+          @opt[:logger].debug 'use cache.'
+          cache_key = "#{url}|#{open_opt}"
+          unless content = @opt[:cache].get(cache_key)
+            content = open(url, open_opt) {|fh| fh.read }
+            @opt[:cache].set(cache_key, content)
+          end
+        else
+          content = open(url, open_opt) {|fh| fh.read }
+        end
+        content
+      rescue OpenURI::HTTPError => e
+        raise TryAgainLater, e.to_s if e.to_s.include?('503')
+        raise e
+      rescue TimeoutError, Timeout::Error, Errno::ETIMEDOUT => e
+        raise TryAgainLater, e.to_s
+      end
+    end
+  end
+end