video_scraper 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. data/ChangeLog +4 -0
  2. data/README +71 -0
  3. data/Rakefile +146 -0
  4. data/lib/www/video_scraper.rb +88 -0
  5. data/lib/www/video_scraper/adult_satellites.rb +27 -0
  6. data/lib/www/video_scraper/age_sage.rb +28 -0
  7. data/lib/www/video_scraper/ameba_vision.rb +22 -0
  8. data/lib/www/video_scraper/base.rb +88 -0
  9. data/lib/www/video_scraper/dailymotion.rb +30 -0
  10. data/lib/www/video_scraper/eic_book.rb +34 -0
  11. data/lib/www/video_scraper/moro_tube.rb +31 -0
  12. data/lib/www/video_scraper/nico_video.rb +68 -0
  13. data/lib/www/video_scraper/pornhub.rb +24 -0
  14. data/lib/www/video_scraper/pornotube.rb +39 -0
  15. data/lib/www/video_scraper/red_tube.rb +89 -0
  16. data/lib/www/video_scraper/tube8.rb +31 -0
  17. data/lib/www/video_scraper/veoh.rb +28 -0
  18. data/lib/www/video_scraper/you_porn.rb +26 -0
  19. data/lib/www/video_scraper/you_tube.rb +53 -0
  20. data/lib/www/video_scraper/your_file_host.rb +54 -0
  21. data/test/test_helper.rb +23 -0
  22. data/test/www/test_video_scraper.rb +43 -0
  23. data/test/www/video_scraper/test_adult_satellites.rb +13 -0
  24. data/test/www/video_scraper/test_age_sage.rb +13 -0
  25. data/test/www/video_scraper/test_ameba_vision.rb +12 -0
  26. data/test/www/video_scraper/test_base.rb +14 -0
  27. data/test/www/video_scraper/test_dailymotion.rb +14 -0
  28. data/test/www/video_scraper/test_eic_book.rb +14 -0
  29. data/test/www/video_scraper/test_moro_tube.rb +13 -0
  30. data/test/www/video_scraper/test_nico_video.rb +23 -0
  31. data/test/www/video_scraper/test_pornhub.rb +14 -0
  32. data/test/www/video_scraper/test_pornotube.rb +21 -0
  33. data/test/www/video_scraper/test_red_tube.rb +13 -0
  34. data/test/www/video_scraper/test_tube8.rb +14 -0
  35. data/test/www/video_scraper/test_veoh.rb +24 -0
  36. data/test/www/video_scraper/test_you_porn.rb +13 -0
  37. data/test/www/video_scraper/test_you_tube.rb +32 -0
  38. data/test/www/video_scraper/test_your_file_host.rb +14 -0
  39. metadata +133 -0
data/ChangeLog ADDED
@@ -0,0 +1,4 @@
1
+ == 1.0.1 / 2009-01-17
2
+
3
+ * initial release
4
+
data/README ADDED
@@ -0,0 +1,71 @@
1
+ = WWW::VideoScraper
2
+
3
+ * http://coderepos.org/share/browser/lang/ruby/video_scraper
4
+ * http://github.com/valda/video_scraper/tree/master
5
+
6
+ == DESCRIPTION:
7
+
8
+ Web scraping library for video sharing sites.
9
+
10
+ == FEATURES/PROBLEMS:
11
+
12
+ Supported sites
13
+
14
+ * AdultSatellites
15
+ * AmebaVision
16
+ * Dailymotion
17
+ * MoroTube
18
+ * NICO NICO DOUGA
19
+ * Pornhub
20
+ * Pornotube
21
+ * RedTube
22
+ * Tube8
23
+ * Ura Agesage
24
+ * Veoh
25
+ * YouPorn
26
+ * YouTube
27
+ * YourFileHost
28
+
29
+ == SYNOPSIS:
30
+
31
+ >> require 'www/video_scraper'
32
+ >> scraper = WWW::VideoScraper.scrape('http://www.youtube.com/watch?v=OFPnvARUOHI')
33
+ >> scraper.video_url
34
+ => "http://www.youtube.com/get_video?video_id=OFPnvARUOHI&t=OEgsToPDskIpQJU48rm4-sS1RtbItouY"
35
+ >> scraper.thumb_url
36
+ => "http://i.ytimg.com/vi/OFPnvARUOHI/default.jpg"
37
+
38
+ == REQUIREMENTS:
39
+
40
+ * WWW::Mechanize
41
+ * Hpricot
42
+ * CGIAlt (recommended)
43
+
44
+ == INSTALL:
45
+
46
+ * sudo gem install valda-video_scraper
47
+
48
+ == LICENSE:
49
+
50
+ (The MIT License)
51
+
52
+ Copyright (c) 2009 YAMAGUCHI Seiji <valda at underscore.jp>
53
+
54
+ Permission is hereby granted, free of charge, to any person obtaining
55
+ a copy of this software and associated documentation files (the
56
+ 'Software'), to deal in the Software without restriction, including
57
+ without limitation the rights to use, copy, modify, merge, publish,
58
+ distribute, sublicense, and/or sell copies of the Software, and to
59
+ permit persons to whom the Software is furnished to do so, subject to
60
+ the following conditions:
61
+
62
+ The above copyright notice and this permission notice shall be
63
+ included in all copies or substantial portions of the Software.
64
+
65
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
66
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
67
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
68
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
69
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
70
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
71
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,146 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'rake/clean'
4
+ require 'rake/testtask'
5
+ require 'rake/packagetask'
6
+ require 'rake/gempackagetask'
7
+ require 'rake/rdoctask'
8
+ require 'rake/contrib/rubyforgepublisher'
9
+ require 'rake/contrib/sshpublisher'
10
+ require 'fileutils'
11
+ require 'lib/www/video_scraper'
12
+ include FileUtils
13
+
14
# Project metadata shared by the gem specification and the tasks below.
NAME              = "video_scraper"
AUTHOR            = "YAMAGUCHI Seiji"
EMAIL             = "valda@underscore.jp"
DESCRIPTION       = "Web scraping library for video sharing sites."
RUBYFORGE_PROJECT = "video_scraper"
HOMEPATH          = "http://github.com/valda/video_scraper"
BIN_FILES         = %w( )

# The gem version is read from the library so the two cannot drift apart.
VERS = WWW::VideoScraper::VERSION

# Committed revision recorded in .svn/entries, or nil when that file cannot be
# read (best effort, hence the rescue modifier). The capture group must be
# \d+ to pick up the digits; a bare "d+" would only match literal "d"s.
REV = File.read(".svn/entries")[/committed-rev="(\d+)"/, 1] rescue nil

# Extra files removed by the :clean task.
CLEAN.include ['**/.*.sw?', '*.gem', '.config']

# Options handed to RDoc by both the gem specification and the rdoc task.
RDOC_OPTS = [
  '--title', "#{NAME} documentation",
  "--charset", "utf-8",
  "--opname", "index.html",
  "--line-numbers",
  "--main", "README",
  "--inline-source",
]
33
+
34
# Running plain `rake` runs the tests; packaging always starts from a clean tree.
task :default => [:test]
task :package => [:clean]

# Run the unit tests.
# The shipped test files are named test_<name>.rb and live under test/www, so
# the pattern has to use the "test_" prefix; the previous "*_test.rb" suffix
# pattern matched none of them. test/test_helper.rb is deliberately outside
# the pattern because it is a helper, not a test case.
Rake::TestTask.new("test") do |t|
  t.libs << "test"
  t.pattern = "test/www/**/test_*.rb"
  t.verbose = true
end
42
+
43
# Gem specification, built from the metadata constants defined above.
spec = Gem::Specification.new do |s|
  s.name              = NAME
  s.version           = VERS
  s.platform          = Gem::Platform::RUBY
  s.has_rdoc          = true
  s.extra_rdoc_files  = ["README", "ChangeLog"]
  s.rdoc_options     += RDOC_OPTS + ['--exclude', '^(examples|extras)/']
  s.summary           = DESCRIPTION
  s.description       = DESCRIPTION
  s.author            = AUTHOR
  s.email             = EMAIL
  s.homepage          = HOMEPATH
  s.executables       = BIN_FILES
  s.rubyforge_project = RUBYFORGE_PROJECT
  s.bindir            = "bin"
  s.require_path      = "lib"
  #s.autorequire = ""
  # Test files are named test_<name>.rb and live in sub-directories of test/;
  # the former non-recursive "test/*_test.rb" glob could not match any of them.
  s.test_files        = Dir["test/www/**/test_*.rb"]

  s.add_dependency('mechanize', '>=0.8.4')
  s.add_dependency('hpricot', '>=0.6.164')
  s.add_dependency('json', '>=1.1.3')
  #s.required_ruby_version = '>= 1.8.2'

  # Everything that goes into the package; globs for directories that do not
  # exist simply contribute nothing.
  s.files = %w(README ChangeLog Rakefile) +
    Dir.glob("{bin,doc,test,lib,templates,generator,extras,website,script}/**/*") +
    Dir.glob("ext/**/*.{h,c,rb}") +
    Dir.glob("examples/**/*.rb") +
    Dir.glob("tools/*.rb") +
    Dir.glob("rails/*.rb")

  s.extensions = FileList["ext/**/extconf.rb"].to_a
end
76
+
77
# Packaging tasks generated from the gem specification; a tar archive is
# requested in addition to the gem.
Rake::GemPackageTask.new(spec) do |pkg_task|
  pkg_task.gem_spec = spec
  pkg_task.need_tar = true
end

# Build the package through a nested rake run, then install the resulting gem.
task :install do
  gem_file = "#{NAME}-#{VERS}.gem"
  sh "rake package"
  sh "sudo gem install pkg/#{gem_file}"
end

# Remove the installed gem (after cleaning the working tree).
task :uninstall => [:clean] do
  sh "sudo gem uninstall #{NAME}"
end
91
+
92
+
93
# Generate HTML documentation into html/.
# Set DOC_FILES to a comma-separated list to document only those files;
# otherwise README, ChangeLog, the library sources and any C extension
# sources are included.
Rake::RDocTask.new do |rd|
  rd.rdoc_dir = 'html'
  rd.options += RDOC_OPTS
  rd.template = "resh"
  #rd.template = "#{ENV['template']}.rb" if ENV['template']
  requested = ENV['DOC_FILES']
  if requested
    rd.rdoc_files.include(requested.split(/,\s*/))
  else
    rd.rdoc_files.include('README', 'ChangeLog')
    ['lib/**/*.rb', 'ext/**/*.c'].each do |glob|
      rd.rdoc_files.include(glob)
    end
  end
end
106
+
107
desc "Publish to RubyForge"
task :rubyforge => [:rdoc, :package] do
  require 'rubyforge'
  publisher = Rake::RubyForgePublisher.new(RUBYFORGE_PROJECT, 'yamaguchi')
  publisher.upload
end

desc 'Package and upload the release to rubyforge.'
task :release => [:clean, :package] do |t|
  # The caller must confirm the version being released via VERSION=x.y.z.
  requested = ENV["VERSION"]
  abort "Must supply VERSION=x.y.z" unless requested
  abort "Versions don't match #{requested} vs #{VERS}" unless requested == VERS
  base_path = "pkg/#{NAME}-#{VERS}"

  require 'rubyforge'
  forge = RubyForge.new.configure
  puts "Logging in"
  forge.login

  settings = forge.userconfig
  # settings["release_notes"] = description if description
  # settings["release_changes"] = changes if changes
  settings["preformatted"] = true

  # Upload both artifacts produced by the package task.
  artifacts = %w(tgz gem).map { |ext| "#{base_path}.#{ext}" }

  puts "Releasing #{NAME} v. #{VERS}"
  forge.add_release RUBYFORGE_PROJECT, NAME, VERS, *artifacts
end
137
+
138
desc 'Show information about the gem.'
task :debug_gem do
  puts spec.to_ruby
end

desc 'Update gem spec'
task :gemspec do
  # Block form closes (and therefore flushes) the file as soon as the
  # specification has been written, instead of leaving the handle open.
  File.open("#{NAME}.gemspec", 'w') { |f| f.write spec.to_ruby }
end
@@ -0,0 +1,88 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require 'rubygems'
4
+ require 'open-uri'
5
+ require 'hpricot'
6
+ require 'mechanize'
7
+ require 'kconv'
8
+ require 'json'
9
+ require 'uri'
10
+ begin
11
+ require 'cgialt' unless defined? CGI
12
+ rescue LoadError
13
+ require 'cgi'
14
+ end
15
+
16
module WWW
  # Entry point of the library. Loads every site-specific scraper listed in
  # MODULES_NAME and dispatches a URL to the first one that accepts it.
  module VideoScraper
    VERSION = '1.0.5'

    # File names (relative to video_scraper/) of the site-specific scrapers.
    MODULES_NAME = %w(adult_satellites age_sage ameba_vision dailymotion eic_book
                      moro_tube nico_video pornhub pornotube red_tube tube8 veoh
                      you_porn you_tube your_file_host)

    # Require each scraper file and resolve the constant whose name is the
    # camel-cased file name (e.g. "you_tube" -> YouTube).
    @@modules = MODULES_NAME.map do |name|
      require File.expand_path(File.join(File.dirname(__FILE__), 'video_scraper', name))
      const_get( name.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase } )
    end

    # Library-wide defaults, merged into the options of every scrape call.
    @@options = {
      :logger => nil,
      :cache => nil,
    }

    class << self
      # Returns the list of loaded scraper classes.
      def modules
        @@modules
      end

      # Returns the library-wide default options hash.
      def options
        @@options
      end

      # Replaces the library-wide default options hash.
      def options=(opts)
        @@options = opts
      end

      # Yields the default options hash so the caller can modify it in place.
      # Raises ArgumentError when called without a block.
      def configure(&proc)
        raise ArgumentError, "Block is required." unless block_given?
        yield @@options
      end

      # Returns the first scraper class whose valid_url? accepts +url+,
      # or nil when none does.
      def find_module(url)
        @@modules.find { |mod| mod.valid_url?(url) }
      end

      # Looks up the scraper in @@modules that can handle the given URL and
      # runs it.
      #
      # url:: page URL to scrape (required).
      # opt:: per-call options, merged over the library-wide defaults.
      #
      # Returns whatever the scraper's own +scrape+ class method returns, or
      # nil when no scraper accepts the URL.
      # Raises TryAgainLater on timeouts and HTTP 50x responses, FileNotFound
      # on HTTP 40x responses, and StandardError when +url+ is missing.
      def scrape(url, opt = nil)
        opt = @@options.merge(opt || {})
        opt[:logger] ||= logger
        raise StandardError, "url param is required" unless url

        logger.info "url: #{url}"
        if mod = find_module(url)
          logger.info "found module: #{mod.to_s}"
          return mod.scrape(url, opt)
        end
        logger.info "unsupport url."
        return nil
      # Timeout::Error is the canonical class; the bare TimeoutError constant
      # is no longer defined on current Rubies and naming it here would turn
      # every non-HTTP failure into a NameError while the list is evaluated.
      rescue Timeout::Error, Errno::ETIMEDOUT => e
        logger.warn "  Timeout : #{e.to_s}"
        raise TryAgainLater, e.to_s
      rescue OpenURI::HTTPError => e
        raise TryAgainLater, e.to_s if e.to_s.match(/50\d/)
        raise FileNotFound, e.to_s if e.to_s.match(/40\d/)
        raise
      rescue Exception => e
        # Log only; the exception is always re-raised unchanged.
        logger.error "#{e.class}: #{e.to_s}"
        raise
      end

      private
      # Logger from the default options; a NullLogger is installed on first
      # use when none was configured.
      def logger
        return @@options[:logger] if @@options[:logger]
        @@options[:logger] = NullLogger.new
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for asa.tv movie detail pages.
    class AdultSatellites < Base
      url_regex %r!http://(?:www\.)?asa\.tv/movie_detail\.php.*!

      # Fetches the page and fills in video_url, thumb_url, title and
      # embed_tag from the player's FlashVars and the surrounding markup.
      #
      # Raises FileNotFound when the page has no FlashVars parameter or the
      # FlashVars carry no videoName entry.
      def scrape
        html = http_get(@page_url)
        doc = Hpricot(html.toutf8)
        raise FileNotFound unless flashvars = doc.at('//object //param[@name="FlashVars"]')
        flashvars = CGI.parse(flashvars.attributes['value'])
        @video_url = flashvars['videoName'][0]
        # Without this guard a missing videoName surfaced as a NoMethodError
        # from calling match on nil a few lines further down.
        raise FileNotFound if @video_url.nil?
        uri = URI.parse(@page_url)
        # The thumbnail path is derived from the last two path segments of the .flv URL.
        if m = @video_url.match(%r!/([[:alnum:]]+/[[:alnum:]]+)\.flv!)
          @thumb_url = "#{uri.scheme}://#{uri.host}/captured/#{m[1]}_1.jpg"
        end
        # Title is optional: stay nil when the element is absent.
        title_node = doc.at('//strong[@class="ptitle"]')
        @title = title_node ? title_node.inner_html : nil
        if embed = doc.at('//input[@name="embed"]')
          @embed_tag = CGI.unescapeHTML(embed.attributes['value'])
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for adult.agesage.jp content pages.
    class AgeSage < Base
      url_regex %r!\Ahttp://adult\.agesage\.jp/contentsPage\.html\?mcd=[[:alnum:]]{16}!

      # Requests the XML counterpart of the page URL (".html" replaced by
      # ".xml") and reads video URL, thumbnail and title from its <movie>
      # element; the embed tag is built from the mcd parameter of the page URL.
      #
      # Raises FileNotFound when the XML response is nil or empty. Nothing is
      # set when the document has no <movie> element.
      def scrape
        @request_url = @page_url.sub('.html', '.xml')
        @response_body = http_get(@request_url)
        raise FileNotFound if @response_body.nil? || @response_body.empty?

        movie = Hpricot.XML(@response_body.toutf8).at('/movie')
        return unless movie

        @video_url = movie.at('/movieurl').inner_html
        @thumb_url = movie.at('/thumbnail').inner_html
        @title = movie.at('/title').inner_html
        content_id = @page_url.match(%r|agesage\.jp/contentsPage\.html\?mcd=([[:alnum:]]{16})|)[1]
        # Heredoc content is kept flush-left so no leading whitespace is added to the tag.
        @embed_tag = <<-HTML
<script type="text/javascript" src="http://adult.agesage.jp/js/past_uraui.js"></script>
<script type="text/javascript">Purauifla("mcd=#{content_id}", 320, 275);</script>
        HTML
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for vision.ameba.jp watch pages.
    class AmebaVision < Base
      url_regex %r!\Ahttp://vision\.ameba\.jp/watch\.do.*?\?movie=(\d+)!

      # Queries the detailMovie API with the movie id captured from the page
      # URL and reads title, canonical page URL and thumbnail from the first
      # matching item elements; the video URL is derived from the thumbnail
      # URL by string substitution.
      def scrape
        movie_id = url_regex_match[1]
        api_url = "http://vision.ameba.jp/api/get/detailMovie.do?movie=#{movie_id}"
        xdoc = Hpricot.XML(http_get(api_url).toutf8)
        item_text = lambda { |path| xdoc.at(path).inner_html }

        @title = item_text.call('//item/title')
        @page_url = item_text.call('//item/link')
        @thumb_url = item_text.call('//item/imageUrlLarge')

        flv_url = @thumb_url.sub('//vi', '//vm')
        flv_url = flv_url.sub('/jpg/', '/flv/')
        @video_url = flv_url.sub('_4.jpg', '.flv')
      end
    end
  end
end
@@ -0,0 +1,88 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
module WWW
  module VideoScraper
    # Raised for failures that are worth retrying (timeouts, HTTP 503).
    class TryAgainLater < RuntimeError; end
    # Raised when the requested video or page cannot be found.
    class FileNotFound < RuntimeError; end

    # Logger stand-in: accepts any message and returns nil.
    class NullLogger
      def method_missing(name, *args); return nil; end
    end

    # Common behaviour of all site-specific scrapers. A subclass declares the
    # URLs it handles with +url_regex+ and implements +scrape+ to fill in the
    # instance variables behind the readers below.
    class Base
      attr_reader :page_url, :video_url, :thumb_url, :embed_tag, :title

      ## class methods
      class << self
        # Declares the pattern (or array of patterns) of page URLs handled
        # by this class.
        def url_regex(regex)
          @url_regex = regex
        end

        # True when any declared pattern matches +url+.
        def valid_url?(url)
          Array(@url_regex).any? { |r| r.match(url) }
        end

        # Builds an instance for +url+, runs its +scrape+ and returns the
        # instance (not the return value of +scrape+).
        def scrape(url, opt = nil)
          instance = self.new(url, opt)
          instance.scrape
          instance
        end
      end

      # url:: page URL; must match one of the class's declared patterns.
      # opt:: options hash; the keys read in this class are :logger and :cache.
      #
      # Raises StandardError when +url+ matches none of the patterns.
      def initialize(url, opt = nil)
        @page_url = url
        @opt = (opt || {})
        url_regex = self.class.instance_variable_get(:@url_regex)
        # any? stops at the first pattern that matches, leaving its MatchData
        # in @url_regex_match; it stays nil when nothing matched.
        Array(url_regex).any? do |r|
          @url_regex_match = r.match(@page_url).freeze
        end
        raise StandardError, "url is not #{self.class.name} link: #{url}" if @url_regex_match.nil?
      end

      # Subclasses override this to perform the actual scraping.
      def scrape
        raise StandardError, 'not implemented yet'
      end

      protected
      # MatchData of the page URL against the declared pattern.
      def url_regex_match
        @url_regex_match
      end

      # Lazily created Mechanize agent.
      def agent
        @agent ||= WWW::Mechanize.new do |a|
          a.user_agent_alias = 'Windows IE 6'
        end
      end

      # Logger from the options; a NullLogger is installed on first use when
      # none was given.
      def logger
        @opt[:logger] ||= NullLogger.new
      end

      # Fetches +url+ and returns the response body.
      #
      # url:: URL to fetch.
      # opt:: extra open-uri options, merged over the default User-Agent header.
      #
      # When a :cache option is set it must respond to :get and :set; the body
      # is looked up (and stored) under a key built from the URL and options.
      #
      # Raises TryAgainLater on HTTP 503 and on timeouts, RuntimeError when
      # the cache object lacks :get or :set; other OpenURI::HTTPError
      # exceptions are re-raised unchanged.
      def http_get(url, opt = nil)
        open_opt = {
          "User-Agent" => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
        }.merge( opt || {} )
        # OpenURI.open_uri is used instead of Kernel#open: Kernel#open treats
        # a string starting with "|" as a command to run, and the URL comes
        # from the caller.
        if cache = @opt[:cache]
          unless cache.respond_to?(:get) and cache.respond_to?(:set)
            raise RuntimeError, 'As for cache object what responds to :get and :set is required.'
          end
          # Go through the accessor: @opt[:logger] is nil when the scraper was
          # created without a logger, which used to crash here.
          logger.debug 'use cache.'
          cache_key = "#{url}|#{open_opt}"
          unless content = cache.get(cache_key)
            content = OpenURI.open_uri(url, open_opt) {|fh| fh.read }
            cache.set(cache_key, content)
          end
        else
          content = OpenURI.open_uri(url, open_opt) {|fh| fh.read }
        end
        content
      rescue OpenURI::HTTPError => e
        raise TryAgainLater, e.to_s if e.to_s.include?('503')
        raise e
      # Timeout::Error is the canonical class; the bare TimeoutError constant
      # is no longer defined on current Rubies.
      rescue Timeout::Error, Errno::ETIMEDOUT => e
        raise TryAgainLater, e.to_s
      end
    end
  end
end