video_scraper 1.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. data/ChangeLog +4 -0
  2. data/README +71 -0
  3. data/Rakefile +146 -0
  4. data/lib/www/video_scraper.rb +88 -0
  5. data/lib/www/video_scraper/adult_satellites.rb +27 -0
  6. data/lib/www/video_scraper/age_sage.rb +28 -0
  7. data/lib/www/video_scraper/ameba_vision.rb +22 -0
  8. data/lib/www/video_scraper/base.rb +88 -0
  9. data/lib/www/video_scraper/dailymotion.rb +30 -0
  10. data/lib/www/video_scraper/eic_book.rb +34 -0
  11. data/lib/www/video_scraper/moro_tube.rb +31 -0
  12. data/lib/www/video_scraper/nico_video.rb +68 -0
  13. data/lib/www/video_scraper/pornhub.rb +24 -0
  14. data/lib/www/video_scraper/pornotube.rb +39 -0
  15. data/lib/www/video_scraper/red_tube.rb +89 -0
  16. data/lib/www/video_scraper/tube8.rb +31 -0
  17. data/lib/www/video_scraper/veoh.rb +28 -0
  18. data/lib/www/video_scraper/you_porn.rb +26 -0
  19. data/lib/www/video_scraper/you_tube.rb +53 -0
  20. data/lib/www/video_scraper/your_file_host.rb +54 -0
  21. data/test/test_helper.rb +23 -0
  22. data/test/www/test_video_scraper.rb +43 -0
  23. data/test/www/video_scraper/test_adult_satellites.rb +13 -0
  24. data/test/www/video_scraper/test_age_sage.rb +13 -0
  25. data/test/www/video_scraper/test_ameba_vision.rb +12 -0
  26. data/test/www/video_scraper/test_base.rb +14 -0
  27. data/test/www/video_scraper/test_dailymotion.rb +14 -0
  28. data/test/www/video_scraper/test_eic_book.rb +14 -0
  29. data/test/www/video_scraper/test_moro_tube.rb +13 -0
  30. data/test/www/video_scraper/test_nico_video.rb +23 -0
  31. data/test/www/video_scraper/test_pornhub.rb +14 -0
  32. data/test/www/video_scraper/test_pornotube.rb +21 -0
  33. data/test/www/video_scraper/test_red_tube.rb +13 -0
  34. data/test/www/video_scraper/test_tube8.rb +14 -0
  35. data/test/www/video_scraper/test_veoh.rb +24 -0
  36. data/test/www/video_scraper/test_you_porn.rb +13 -0
  37. data/test/www/video_scraper/test_you_tube.rb +32 -0
  38. data/test/www/video_scraper/test_your_file_host.rb +14 -0
  39. metadata +133 -0
data/ChangeLog ADDED
@@ -0,0 +1,4 @@
1
+ == 1.0.1 / 2009-01-17
2
+
3
+ * initial release
4
+
data/README ADDED
@@ -0,0 +1,71 @@
1
+ = WWW::VideoScraper
2
+
3
+ * http://coderepos.org/share/browser/lang/ruby/video_scraper
4
+ * http://github.com/valda/video_scraper/tree/master
5
+
6
+ == DESCRIPTION:
7
+
8
+ Web scraping library for video sharing sites.
9
+
10
+ == FEATURES/PROBLEMS:
11
+
12
+ Supported sites
13
+
14
+ * AdultSatellites
15
+ * AmebaVision
16
+ * Dailymotion
17
+ * MoroTube
18
+ * NICO NICO DOUGA
19
+ * Pornhub
20
+ * Pornotube
21
+ * RedTube
22
+ * Tube8
23
+ * Ura Agesage
24
+ * Veoh
25
+ * YouPorn
26
+ * YouTube
27
+ * YourFileHost
28
+
29
+ == SYNOPSIS:
30
+
31
+ >> require 'www/video_scraper'
32
+ >> scraper = WWW::VideoScraper.scrape('http://www.youtube.com/watch?v=OFPnvARUOHI')
33
+ >> scraper.video_url
34
+ => "http://www.youtube.com/get_video?video_id=OFPnvARUOHI&t=OEgsToPDskIpQJU48rm4-sS1RtbItouY"
35
+ >> scraper.thumb_url
36
+ => "http://i.ytimg.com/vi/OFPnvARUOHI/default.jpg"
37
+
38
+ == REQUIREMENTS:
39
+
40
+ * WWW::Mechanize
41
+ * Hpricot
42
+ * CGIAlt (recommended)
43
+
44
+ == INSTALL:
45
+
46
+ * sudo gem install valda-video_scraper
47
+
48
+ == LICENSE:
49
+
50
+ (The MIT License)
51
+
52
+ Copyright (c) 2009 YAMAGUCHI Seiji <valda at underscore.jp>
53
+
54
+ Permission is hereby granted, free of charge, to any person obtaining
55
+ a copy of this software and associated documentation files (the
56
+ 'Software'), to deal in the Software without restriction, including
57
+ without limitation the rights to use, copy, modify, merge, publish,
58
+ distribute, sublicense, and/or sell copies of the Software, and to
59
+ permit persons to whom the Software is furnished to do so, subject to
60
+ the following conditions:
61
+
62
+ The above copyright notice and this permission notice shall be
63
+ included in all copies or substantial portions of the Software.
64
+
65
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
66
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
67
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
68
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
69
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
70
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
71
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,146 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'rake/clean'
4
+ require 'rake/testtask'
5
+ require 'rake/packagetask'
6
+ require 'rake/gempackagetask'
7
+ require 'rake/rdoctask'
8
+ require 'rake/contrib/rubyforgepublisher'
9
+ require 'rake/contrib/sshpublisher'
10
+ require 'fileutils'
11
+ require 'lib/www/video_scraper'
12
+ include FileUtils
13
+
14
# Gem identity and packaging metadata shared by the tasks in this Rakefile.
NAME = "video_scraper"
AUTHOR = "YAMAGUCHI Seiji"
EMAIL = "valda@underscore.jp"
DESCRIPTION = "Web scraping library for video sharing sites."
RUBYFORGE_PROJECT = "video_scraper"
HOMEPATH = "http://github.com/valda/video_scraper"
BIN_FILES = %w( )

# The gem version comes straight from the library so the two cannot drift.
VERS = WWW::VideoScraper::VERSION
# Last committed Subversion revision as a string, or nil when .svn/entries is
# missing or has no committed-rev attribute. The digits must be matched with
# \d+; a bare "d+" only matches literal letter d's and never a revision number.
REV = File.read(".svn/entries")[/committed-rev="(\d+)"/, 1] rescue nil
# Extra files removed by the :clean task.
CLEAN.include ['**/.*.sw?', '*.gem', '.config']
# Options handed to RDoc by both the gem specification and the rdoc task.
RDOC_OPTS = [
  '--title', "#{NAME} documentation",
  "--charset", "utf-8",
  "--opname", "index.html",
  "--line-numbers",
  "--main", "README",
  "--inline-source",
]
33
+
34
task :default => [:test]
task :package => [:clean]

# Glob selecting the unit tests. The test files are named test_*.rb and live
# under test/www, so the former "test/**/*_test.rb" pattern matched none of
# them and `rake test` ran nothing. test/test_helper.rb is deliberately not
# matched (presumably support code rather than a test case).
TEST_PATTERN = "test/www/**/test_*.rb"

# Run the unit tests with the test directory on the load path.
Rake::TestTask.new("test") do |t|
  t.libs << "test"
  t.pattern = TEST_PATTERN
  t.verbose = true
end
42
+
43
# Gem specification shared by the packaging, debug_gem and gemspec tasks.
spec = Gem::Specification.new do |s|
  s.name = NAME
  s.version = VERS
  s.platform = Gem::Platform::RUBY
  s.has_rdoc = true
  s.extra_rdoc_files = ["README", "ChangeLog"]
  s.rdoc_options += RDOC_OPTS + ['--exclude', '^(examples|extras)/']
  s.summary = DESCRIPTION
  s.description = DESCRIPTION
  s.author = AUTHOR
  s.email = EMAIL
  s.homepage = HOMEPATH
  s.executables = BIN_FILES
  s.rubyforge_project = RUBYFORGE_PROJECT
  s.bindir = "bin"
  s.require_path = "lib"
  #s.autorequire = ""
  # Test files are named test_*.rb and sit in nested directories, so the
  # former "test/*_test.rb" glob always produced an empty list.
  s.test_files = Dir["test/**/test_*.rb"]

  s.add_dependency('mechanize', '>=0.8.4')
  s.add_dependency('hpricot', '>=0.6.164')
  s.add_dependency('json', '>=1.1.3')
  #s.required_ruby_version = '>= 1.8.2'

  # Everything that ships in the gem; globs for absent directories simply
  # contribute nothing.
  s.files = %w(README ChangeLog Rakefile) +
    Dir.glob("{bin,doc,test,lib,templates,generator,extras,website,script}/**/*") +
    Dir.glob("ext/**/*.{h,c,rb}") +
    Dir.glob("examples/**/*.rb") +
    Dir.glob("tools/*.rb") +
    Dir.glob("rails/*.rb")

  s.extensions = FileList["ext/**/extconf.rb"].to_a
end
76
+
77
# Package the gem described by `spec`, also requesting a tar archive.
Rake::GemPackageTask.new(spec) do |pkg_task|
  pkg_task.need_tar = true
  pkg_task.gem_spec = spec
end

# Build the package, then install the resulting gem file with sudo.
task :install do
  gem_file = "#{NAME}-#{VERS}.gem"
  sh "rake package"
  sh "sudo gem install pkg/#{gem_file}"
end

# Clean the work tree, then remove the installed gem with sudo.
task :uninstall => [:clean] do
  sh "sudo gem uninstall #{NAME}"
end
91
+
92
+
93
# Generate HTML documentation into ./html. The documented files come from the
# comma-separated DOC_FILES environment variable when it is set, otherwise
# from the README, the ChangeLog and the library/extension sources.
Rake::RDocTask.new do |rd|
  rd.rdoc_dir = 'html'
  rd.options += RDOC_OPTS
  rd.template = "resh"
  #rd.template = "#{ENV['template']}.rb" if ENV['template']
  requested = ENV['DOC_FILES']
  if requested
    rd.rdoc_files.include(requested.split(/,\s*/))
  else
    rd.rdoc_files.include('README', 'ChangeLog')
    ['lib/**/*.rb', 'ext/**/*.c'].each do |glob|
      rd.rdoc_files.include(glob)
    end
  end
end
106
+
107
desc "Publish to RubyForge"
task :rubyforge => [:rdoc, :package] do
  require 'rubyforge'
  publisher = Rake::RubyForgePublisher.new(RUBYFORGE_PROJECT, 'yamaguchi')
  publisher.upload
end

desc 'Package and upload the release to rubyforge.'
task :release => [:clean, :package] do |_task|
  # The caller must confirm the version being released via VERSION=x.y.z.
  requested = ENV["VERSION"]
  abort "Must supply VERSION=x.y.z" unless requested
  abort "Versions don't match #{requested} vs #{VERS}" if requested != VERS

  require 'rubyforge'
  forge = RubyForge.new.configure
  puts "Logging in"
  forge.login

  settings = forge.userconfig
  # settings["release_notes"] = description if description
  # settings["release_changes"] = changes if changes
  settings["preformatted"] = true

  # Upload both artifacts named after the gem and its version.
  base = "pkg/#{NAME}-#{VERS}"
  artifacts = %w(tgz gem).map { |ext| "#{base}.#{ext}" }

  puts "Releasing #{NAME} v. #{VERS}"
  forge.add_release RUBYFORGE_PROJECT, NAME, VERS, *artifacts
end
137
+
138
desc 'Show information about the gem.'
task :debug_gem do
  puts spec.to_ruby
end

desc 'Update gem spec'
task :gemspec do
  # The block form closes (and so flushes) the file as soon as the spec is
  # written; the previous open(...).write left the handle open until it was
  # garbage collected.
  File.open("#{NAME}.gemspec", 'w') { |f| f.write spec.to_ruby }
end
@@ -0,0 +1,88 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require 'rubygems'
4
+ require 'open-uri'
5
+ require 'hpricot'
6
+ require 'mechanize'
7
+ require 'kconv'
8
+ require 'json'
9
+ require 'uri'
10
+ begin
11
+ require 'cgialt' unless defined? CGI
12
+ rescue LoadError
13
+ require 'cgi'
14
+ end
15
+
16
module WWW
  # Library entry point: holds the list of site-specific scraper classes and
  # hands a URL to the first one that accepts it.
  module VideoScraper
    VERSION = '1.0.5'

    MODULES_NAME = %w(adult_satellites age_sage ameba_vision dailymotion eic_book
                      moro_tube nico_video pornhub pornotube red_tube tube8 veoh
                      you_porn you_tube your_file_host)

    # Load each scraper file and resolve its class by camel-casing the file
    # name (for example "you_tube" becomes YouTube).
    @@modules = MODULES_NAME.map do |name|
      require File.expand_path(File.join(File.dirname(__FILE__), 'video_scraper', name))
      const_get( name.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase } )
    end

    # Library-wide defaults, merged into the options of every scrape call.
    @@options = {
      :logger => nil,
      :cache => nil,
    }

    class << self
      # Returns the loaded scraper classes.
      # (Previously referenced the misspelled @@nodules and raised NameError.)
      def modules
        @@modules
      end

      # Returns the library-wide default options hash.
      def options
        @@options
      end

      # Replaces the library-wide default options hash.
      def options=(opts)
        @@options = opts
      end

      # Yields the default options hash so the caller can modify it in place.
      # Raises ArgumentError when no block is given.
      def configure(&proc)
        raise ArgumentError, "Block is required." unless block_given?
        yield @@options
      end

      # Returns the first scraper class whose URL pattern accepts +url+,
      # or nil when none does.
      def find_module(url)
        @@modules.find { |mod| mod.valid_url?(url) }
      end

      # Finds the scraper in @@modules that can handle the given URL and runs
      # it. +opt+ is merged over the library defaults.
      #
      # Returns whatever the scraper class's scrape returns, or nil when no
      # scraper supports the URL.
      # Raises TryAgainLater on timeouts and HTTP 50x errors, FileNotFound on
      # HTTP 40x errors, StandardError when +url+ is nil; anything else is
      # logged and re-raised unchanged.
      def scrape(url, opt = nil)
        opt = @@options.merge(opt || {})
        opt[:logger] ||= logger
        raise StandardError, "url param is requred" unless url

        logger.info "url: #{url}"
        if mod = find_module(url)
          logger.info "found module: #{mod.to_s}"
          return mod.scrape(url, opt)
        end
        logger.info "unsupport url."
        return nil
      # Timeout::Error is the same class the legacy top-level TimeoutError
      # constant pointed to. That constant no longer exists on Ruby 3, where
      # naming it here raised NameError as soon as any exception arrived.
      rescue Timeout::Error, Errno::ETIMEDOUT => e
        logger.warn "  Timeout : #{e.to_s}"
        raise TryAgainLater, e.to_s
      rescue OpenURI::HTTPError => e
        raise TryAgainLater, e.to_s if e.to_s.match(/50\d/)
        raise FileNotFound, e.to_s if e.to_s.match(/40\d/)
        raise
      rescue Exception => e
        # Log only; the exception is always re-raised.
        logger.error "#{e.class}: #{e.to_s}"
        raise e
      end

      private
      # Returns the configured logger, installing a NullLogger on first use
      # when none was configured.
      def logger
        return @@options[:logger] if @@options[:logger]
        @@options[:logger] = NullLogger.new
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for asa.tv movie detail pages.
    class AdultSatellites < Base
      url_regex %r!http://(?:www\.)?asa\.tv/movie_detail\.php.*!

      # Fetches the page and fills in video_url, thumb_url, title and
      # embed_tag from its markup.
      #
      # Raises FileNotFound when the page has no FlashVars parameter or the
      # FlashVars carry no videoName entry.
      def scrape
        html = http_get(@page_url)
        doc = Hpricot(html.toutf8)
        raise FileNotFound unless flashvars = doc.at('//object //param[@name="FlashVars"]')
        flashvars = CGI.parse(flashvars.attributes['value'])
        @video_url = flashvars['videoName'][0]
        # Without a videoName there is no video to report; previously this
        # fell through to nil.match below and surfaced as NoMethodError.
        raise FileNotFound if @video_url.nil?
        uri = URI.parse(@page_url)
        # The thumbnail path is derived from the two trailing path segments
        # of the .flv URL.
        if m = @video_url.match(%r!/([[:alnum:]]+/[[:alnum:]]+)\.flv!)
          @thumb_url = "#{uri.scheme}://#{uri.host}/captured/#{m[1]}_1.jpg"
        end
        # Best effort: the title stays nil when the element is absent.
        @title = doc.at('//strong[@class="ptitle"]').inner_html rescue nil
        if embed = doc.at('//input[@name="embed"]')
          @embed_tag = CGI.unescapeHTML(embed.attributes['value'])
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for adult.agesage.jp content pages.
    class AgeSage < Base
      url_regex %r!\Ahttp://adult\.agesage\.jp/contentsPage\.html\?mcd=[[:alnum:]]{16}!

      # Requests the XML counterpart of the page URL and, when it contains a
      # movie element, fills in video_url, thumb_url, title and embed_tag.
      #
      # Raises FileNotFound when the response body is nil or empty.
      def scrape
        @request_url = @page_url.sub('.html', '.xml')
        @response_body = http_get(@request_url)
        raise FileNotFound if @response_body.nil? || @response_body.empty?

        movie = Hpricot.XML(@response_body.toutf8).at('/movie')
        return unless movie

        @video_url = movie.at('/movieurl').inner_html
        @thumb_url = movie.at('/thumbnail').inner_html
        @title = movie.at('/title').inner_html
        # The 16-character content id from the page URL parameterises the
        # embed script.
        id_match = @page_url.match(%r|agesage\.jp/contentsPage\.html\?mcd=([[:alnum:]]{16})|)
        mcd = id_match[1]
        @embed_tag = <<-HTML
<script type="text/javascript" src="http://adult.agesage.jp/js/past_uraui.js"></script>
<script type="text/javascript">Purauifla("mcd=#{mcd}", 320, 275);</script>
        HTML
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for vision.ameba.jp watch pages.
    class AmebaVision < Base
      url_regex %r!\Ahttp://vision\.ameba\.jp/watch\.do.*?\?movie=(\d+)!

      # Queries the detailMovie API with the movie id captured from the page
      # URL and fills in title, page_url, thumb_url and video_url.
      def scrape
        movie_id = url_regex_match[1]
        api_url = "http://vision.ameba.jp/api/get/detailMovie.do?movie=#{movie_id}"
        feed = Hpricot.XML(http_get(api_url).toutf8)
        text_of = lambda { |xpath| feed.at(xpath).inner_html }

        @title = text_of.call('//item/title')
        @page_url = text_of.call('//item/link')
        @thumb_url = text_of.call('//item/imageUrlLarge')
        # The video URL is obtained by rewriting parts of the thumbnail URL.
        flv_url = @thumb_url.sub('//vi', '//vm')
        flv_url = flv_url.sub('/jpg/', '/flv/')
        @video_url = flv_url.sub('_4.jpg', '.flv')
      end
    end
  end
end
@@ -0,0 +1,88 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
module WWW
  module VideoScraper
    # Raised when a request failed in a way that may succeed on retry
    # (a timeout or an HTTP 503 response).
    class TryAgainLater < RuntimeError; end
    # Raised by scrapers when the requested video cannot be found.
    class FileNotFound < RuntimeError; end

    # Stand-in logger used when no :logger option is supplied: it accepts any
    # method call and returns nil.
    class NullLogger
      def method_missing(name, *args); return nil; end
    end

    # Common behaviour of all site scrapers. A subclass declares the URLs it
    # handles with url_regex and implements #scrape to populate the readers.
    class Base
      attr_reader :page_url, :video_url, :thumb_url, :embed_tag, :title

      ## class methods
      class << self
        # Declares the pattern (or array of patterns) this scraper accepts.
        def url_regex(regex)
          @url_regex = regex
        end

        # True when +url+ matches one of the declared patterns.
        def valid_url?(url)
          Array(@url_regex).any? { |r| r.match(url) }
        end

        # Builds a scraper for +url+, runs it and returns the populated
        # instance.
        def scrape(url, opt = nil)
          instance = self.new(url, opt)
          instance.scrape
          instance
        end
      end

      # url:: page URL; must match one of the class's declared patterns.
      # opt:: options hash; :logger and :cache are read by this class.
      #
      # Raises StandardError when +url+ matches none of the patterns.
      def initialize(url, opt = nil)
        @page_url = url
        @opt = (opt || {})
        url_regex = self.class.instance_variable_get(:@url_regex)
        # any? stops at the first pattern that matches, leaving its MatchData
        # in @url_regex_match (nil when nothing matched).
        Array(url_regex).any? do |r|
          @url_regex_match = r.match(@page_url).freeze
        end
        raise StandardError, "url is not #{self.class.name} link: #{url}" if @url_regex_match.nil?
      end

      # Implemented by subclasses; the base implementation always raises.
      def scrape
        raise StandardError, 'not implemented yet'
      end

      protected
      # MatchData of the page URL against the accepted pattern.
      def url_regex_match
        @url_regex_match
      end

      # Lazily created Mechanize agent with an IE 6 user-agent alias.
      def agent
        @agent ||= WWW::Mechanize.new do |a|
          a.user_agent_alias = 'Windows IE 6'
        end
      end

      # The :logger option, or a NullLogger stored into the options on first
      # use.
      def logger
        return @opt[:logger] if @opt[:logger]
        @opt[:logger] = NullLogger.new
      end

      # Fetches +url+ and returns the response body.
      #
      # opt:: extra open-uri options/headers merged over the default
      #       User-Agent header.
      #
      # When the :cache option is set it must respond to :get and :set; the
      # body is looked up under "<url>|<options>" and stored after a miss.
      #
      # Raises TryAgainLater on timeouts and HTTP 503, RuntimeError for an
      # unusable cache object; other OpenURI::HTTPError are re-raised.
      def http_get(url, opt = nil)
        open_opt = {
          "User-Agent" => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
        }.merge( opt || {} )
        if @opt[:cache]
          unless @opt[:cache].respond_to?(:get) and @opt[:cache].respond_to?(:set)
            raise RuntimeError, 'As for cache object what responds to :get and :set is required.'
          end
          # Go through #logger, not @opt[:logger]: the option is nil when a
          # cache is supplied without a logger, which used to raise
          # NoMethodError here.
          logger.debug 'use cache.'
          cache_key = "#{url}|#{open_opt}"
          unless content = @opt[:cache].get(cache_key)
            content = read_url(url, open_opt)
            @opt[:cache].set(cache_key, content)
          end
        else
          content = read_url(url, open_opt)
        end
        content
      rescue OpenURI::HTTPError => e
        raise TryAgainLater, e.to_s if e.to_s.include?('503')
        raise e
      # Timeout::Error is what the legacy top-level TimeoutError constant
      # aliased; that constant is gone on Ruby 3, where naming it here would
      # raise NameError.
      rescue Timeout::Error, Errno::ETIMEDOUT => e
        raise TryAgainLater, e.to_s
      end

      private
      # Reads the body at +url+ through open-uri. URI.open is preferred when
      # open-uri provides it, because Kernel#open no longer opens URLs on
      # Ruby 3; older Rubies fall back to Kernel#open as before.
      def read_url(url, open_opt)
        if defined?(URI) && URI.respond_to?(:open)
          URI.open(url, open_opt) { |fh| fh.read }
        else
          open(url, open_opt) { |fh| fh.read }
        end
      end
    end
  end
end