video_scraper 1.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. data/ChangeLog +4 -0
  2. data/README +71 -0
  3. data/Rakefile +146 -0
  4. data/lib/www/video_scraper.rb +88 -0
  5. data/lib/www/video_scraper/adult_satellites.rb +27 -0
  6. data/lib/www/video_scraper/age_sage.rb +28 -0
  7. data/lib/www/video_scraper/ameba_vision.rb +22 -0
  8. data/lib/www/video_scraper/base.rb +88 -0
  9. data/lib/www/video_scraper/dailymotion.rb +30 -0
  10. data/lib/www/video_scraper/eic_book.rb +34 -0
  11. data/lib/www/video_scraper/moro_tube.rb +31 -0
  12. data/lib/www/video_scraper/nico_video.rb +68 -0
  13. data/lib/www/video_scraper/pornhub.rb +24 -0
  14. data/lib/www/video_scraper/pornotube.rb +39 -0
  15. data/lib/www/video_scraper/red_tube.rb +89 -0
  16. data/lib/www/video_scraper/tube8.rb +31 -0
  17. data/lib/www/video_scraper/veoh.rb +28 -0
  18. data/lib/www/video_scraper/you_porn.rb +26 -0
  19. data/lib/www/video_scraper/you_tube.rb +53 -0
  20. data/lib/www/video_scraper/your_file_host.rb +54 -0
  21. data/test/test_helper.rb +23 -0
  22. data/test/www/test_video_scraper.rb +43 -0
  23. data/test/www/video_scraper/test_adult_satellites.rb +13 -0
  24. data/test/www/video_scraper/test_age_sage.rb +13 -0
  25. data/test/www/video_scraper/test_ameba_vision.rb +12 -0
  26. data/test/www/video_scraper/test_base.rb +14 -0
  27. data/test/www/video_scraper/test_dailymotion.rb +14 -0
  28. data/test/www/video_scraper/test_eic_book.rb +14 -0
  29. data/test/www/video_scraper/test_moro_tube.rb +13 -0
  30. data/test/www/video_scraper/test_nico_video.rb +23 -0
  31. data/test/www/video_scraper/test_pornhub.rb +14 -0
  32. data/test/www/video_scraper/test_pornotube.rb +21 -0
  33. data/test/www/video_scraper/test_red_tube.rb +13 -0
  34. data/test/www/video_scraper/test_tube8.rb +14 -0
  35. data/test/www/video_scraper/test_veoh.rb +24 -0
  36. data/test/www/video_scraper/test_you_porn.rb +13 -0
  37. data/test/www/video_scraper/test_you_tube.rb +32 -0
  38. data/test/www/video_scraper/test_your_file_host.rb +14 -0
  39. metadata +133 -0
@@ -0,0 +1,30 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for Dailymotion video pages.
    class Dailymotion < Base
      url_regex %r!\Ahttp://www\.dailymotion\.com/.*?/video/([\w/-]+)!

      # Downloads the page at @page_url and fills in @video_url, @thumb_url,
      # @title and @embed_tag from its markup. @title and @embed_tag end up
      # nil when their elements cannot be read.
      def scrape
        page_uri = URI.parse(@page_url)
        site_root = "#{page_uri.scheme}://#{page_uri.host}"
        doc = Hpricot(http_get(@page_url).toutf8)

        # A player variable is URL-escaped and may hold several candidates
        # separated by "||" or "@@"; only the first one is used, resolved
        # against the page's own scheme and host.
        absolutize = lambda do |escaped|
          first_path = CGI.unescape(escaped).split(/\|\||@@/).first
          URI.join(site_root, first_path).to_s
        end

        doc.search('//script').each do |script|
          video = script.inner_html.match(/\.addVariable\("video",\s*"([^"]+)"/i)
          @video_url = absolutize.call(video[1]) if video

          preview = script.inner_html.match(/\.addVariable\("preview",\s+"([^"]+)"/)
          @thumb_url = absolutize.call(preview[1]) if preview
        end

        @title = begin
          doc.at('//h1[@class="nav"]').inner_html
        rescue StandardError
          nil
        end
        @embed_tag = begin
          CGI.unescapeHTML(doc.at('//textarea[@id="video_player_embed_code_text"]').inner_html)
        rescue StandardError
          nil
        end
      end
    end
  end
end
30
+
@@ -0,0 +1,34 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for eic-book.com detail pages. In addition to the usual
    # attributes it collects a list of capture image URLs.
    class EicBook < Base
      attr_reader :capture_urls
      url_regex %r!\Ahttp://www\.eic-book\.com/(detail_\d+\.html).*!

      # Requests the detail page three times with different "flg" query
      # values (sm, h4, cp) and reads the video URL and title, the thumbnail,
      # and the capture images from them respectively.
      #
      # Rewrites @page_url to the "?flg=sm" variant.
      # Raises FileNotFound when the page has no FlashVars <param>.
      def scrape
        uri = URI.parse(@page_url)
        origin = "#{uri.scheme}://#{uri.host}"
        detail = "#{origin}#{uri.path}"

        @page_url = "#{detail}?flg=sm"
        player_doc = Hpricot(http_get(@page_url).toutf8)
        param = player_doc.at('//object //param[@name="FlashVars"]')
        raise FileNotFound unless param
        flashvars = CGI.parse(param.attributes['value'])
        @video_url = flashvars['flv'][0]
        @title = begin
          CGI.unescapeHTML(player_doc.at('//h2[@class="detailTtl"]').inner_html).gsub('&nbsp;', ' ')
        rescue StandardError
          nil
        end

        thumb_doc = Hpricot(http_get("#{detail}?flg=h4").toutf8)
        thumb = thumb_doc.at('//div[@class="detailMN"]/img[@class="waku01"]')
        @thumb_url = URI.join(origin, thumb.attributes['src']).to_s if thumb

        capture_doc = Hpricot(http_get("#{detail}?flg=cp").toutf8)
        @capture_urls = []
        capture_doc.search('//div[@class="detailMN"]/img[@class="waku01"]').each do |capture|
          @capture_urls << URI.join(origin, capture.attributes['src']).to_s
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for morotube.com watch pages. Exposes the clip's author and
    # duration in addition to the usual attributes.
    class MoroTube < Base
      url_regex %r!\Ahttp://www\.morotube\.com/watch\.php\?clip=([[:alnum:]]{8})!
      attr_reader :author, :duration

      # Reads title, video URL, thumbnail, author and duration from the
      # site's /gen_xml.php endpoint, then takes the embed tag from the
      # watch page itself.
      def scrape
        api_uri = URI.parse(@page_url)
        api_uri.path = '/gen_xml.php'
        api_uri.query = "type=o&id=#{url_regex_match[1]}"
        xdoc = Hpricot.XML(http_get(api_uri.to_s).toutf8)

        # Inner markup of the nodes matching an XPath in the XML response.
        text_at = lambda { |xpath| xdoc.search(xpath).inner_html }
        @title = text_at.call('/root/video/title')
        @video_url = text_at.call('/root/video/file')
        @thumb_url = text_at.call('/root/video/image')
        @author = text_at.call('/root/video/author')
        @duration = text_at.call('/root/video/duration')

        watch_doc = Hpricot(http_get(@page_url))
        # If several inputs match, the last one's value is kept.
        watch_doc.search('//input#inpVdoEmbed').each do |input|
          @embed_tag = input.attributes['value']
        end
      end
    end
  end
end
@@ -0,0 +1,68 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for nicovideo.jp watch pages. Signs in with the credentials in
    # @opt[:nico_video_mail] / @opt[:nico_video_password] before querying.
    class NicoVideo < Base
      url_regex %r!\Ahttp://www\.nicovideo\.jp/watch/([[:alnum:]]+)!

      # Logs in, then collects the video URL, thumbnail/title and embed tag.
      #
      # Raises TryAgainLater on timeouts and on HTTP errors other than
      # 403/404, and FileNotFound on 403/404 or when the API returns no
      # video URL. A failed login raises RuntimeError.
      def scrape
        login
        id = url_regex_match[1]
        get_flv(id)
        get_thumb(id)
        get_embed_tag(id)
      rescue Timeout::Error => e
        raise TryAgainLater, e.to_s
      rescue WWW::Mechanize::ResponseCodeError => e
        case e.response_code
        when '404', '403'
          raise FileNotFound, e.to_s
        else
          # Every other status (502 included) is treated as temporary.
          raise TryAgainLater, e.to_s
        end
      end

      private

      # Posts the credentials to the login endpoint; success is signalled by
      # the x-niconico-authflag response header being '1'.
      def login
        page = agent.post('https://secure.nicovideo.jp/secure/login?site=niconico',
                          'mail' => @opt[:nico_video_mail],
                          'password' => @opt[:nico_video_password])
        raise RuntimeError, 'login failure' unless page.header['x-niconico-authflag'] == '1'
      end

      # Asks the getflv API for the stream location and stores it in
      # @video_url. Raises FileNotFound when the response has no "url".
      def get_flv(id)
        request_url = "http://www.nicovideo.jp/api/getflv?v=#{id}"
        page = agent.get(request_url)
        q = CGI.parse(page.body)
        # CGI.parse yields an empty array (never nil) for a missing key, so
        # the presence check has to look at the first value instead.
        video_url = q['url'].first
        raise FileNotFound if video_url.nil? || video_url.empty?
        @video_url = video_url
      end

      # Reads @thumb_url and @title from the getthumbinfo XML.
      def get_thumb(id)
        page = agent.get("http://www.nicovideo.jp/api/getthumbinfo/#{id}")
        xdoc = Hpricot.XML(page.body.toutf8)
        xdoc.search('//thumbnail_url') do |elem|
          @thumb_url = elem.inner_html
        end
        xdoc.search('//thumb/title') do |elem|
          @title = elem.inner_html
        end
      end

      # Takes the iframe embed snippet from the watch page's form input.
      # The id argument is accepted for symmetry with the other getters but
      # is not used; the page is fetched via @page_url.
      def get_embed_tag(id)
        page = agent.get(@page_url)
        doc = Hpricot(page.body)
        doc.search('//form[@name="form_iframe"] //input[@name="input_iframe"]') do |elem|
          @embed_tag = elem.attributes['value']
        end
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for pornhub.com view_video.php pages.
    class Pornhub < Base
      url_regex %r|\Ahttp://www\.pornhub\.com/view_video\.php.*viewkey=[[:alnum:]]{20}|

      # Follows the player's "options" URL to find the FLV location, then
      # derives the thumbnail from it and reads the embed tag and title from
      # the watch page.
      #
      # Raises FileNotFound when the page has no "options" player variable
      # or when the options response carries no <flv_url>.
      def scrape
        html = http_get(@page_url)
        options = html.match(/\.addVariable\("options",\s*"([^"]+)"\);/i)
        raise FileNotFound unless options

        # NOTE(review): URI.decode is unavailable on Ruby 3.0+; kept because
        # the replacements treat "+" differently -- confirm before porting.
        request_url = URI.decode options[1]
        response_body = http_get(request_url)

        @video_url = response_body.match(%r|<flv_url>([^<]+)</flv_url>|).to_a[1]
        # Without this guard a missing <flv_url> surfaced as NoMethodError
        # on the nil @video_url below.
        raise FileNotFound unless @video_url

        # The three 3-digit path segments of the video also locate its thumbnail.
        if m = @video_url.match(%r|videos/(\d{3}/\d{3}/\d{3})/\d+\.flv|)
          @thumb_url = "http://p1.pornhub.com/thumbs/#{m[1]}/small.jpg"
        end
        @embed_tag = html.match(%r|<textarea[^>]+class="share-flag-embed">(<object type="application/x-shockwave-flash".*?</object>)</textarea>|).to_a[1]
        @title = html.match(%r|<title>(.*) - Pornhub\.com</title>|).to_a[1]
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for pornotube.com media/channel pages.
    class Pornotube < Base
      url_regex %r!\Ahttp://(?:www\.)?pornotube\.com/(?:media|channels)\.php\?.*m=(\d+)!

      # Passes the age gate, reads the player hash from the page's <embed>,
      # then queries player.php for the media details used to build
      # @video_url, @thumb_url, @image_url and @embed_tag.
      #
      # Raises FileNotFound when the page has no <embed> inside an <object>
      # or when the embed's src carries no "?v=" hash.
      def scrape
        # NOTE(review): the captured id is not used below; the lookup is kept
        # in case callers rely on it failing early for a non-matching URL.
        id = url_regex_match[1]

        login
        page = agent.get(@page_url)
        raise FileNotFound unless embed = page.root.at('//object/embed')
        src = embed.attributes['src']
        hash_match = src.to_s.match(/\?v=(.*)$/)
        # Previously a src without "?v=" crashed with NoMethodError on nil.
        raise FileNotFound unless hash_match
        hash = hash_match[1]

        # Title is optional: leave it unset when the heading is absent.
        t = page.at('//div[@class="contentheader"]//span[@class="blue"]')
        @title = t.inner_html.gsub(/<[^>]*>/, '').strip if t

        page = agent.get("http://pornotube.com/player/player.php?#{hash}")
        q = CGI::parse(page.body)
        user_id = q['userId'][0]
        media_id = q['mediaId'][0]
        @video_url = "http://#{q['mediaDomain'][0]}.pornotube.com/#{user_id}/#{media_id}.flv"
        @thumb_url = "http://photo.pornotube.com/thumbnails/video/#{user_id}/#{media_id}.jpg"
        @image_url = "http://photo.pornotube.com/thumbnails/video/#{user_id}/#{media_id}_full.jpg"
        @embed_tag = q['embedCode'][0]
      end

      private

      # Submits the age-verification form with a fixed birth date.
      def login
        agent.post("http://pornotube.com/index.php",
                   'verifyAge' => 'true',
                   'bMonth' => '01',
                   'bDay' => '01',
                   'bYear' => '1970',
                   'submit' => 'View All Content')
      end
    end
  end
end
@@ -0,0 +1,89 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for redtube.com video pages. The video URL is computed from
    # the numeric content id alone; thumbnail, title and embed tag are
    # fetched lazily on first access.
    class RedTube < Base
      url_regex %r|\Ahttp://www\.redtube\.com/(\d+)|

      # Derives @video_url from the content id without any HTTP request.
      #
      # The id is zero-padded to seven digits; a check value (the digit sum
      # of the position-weighted digit sum) is mixed with individual digits
      # to pick characters from a fixed 36-character table.
      def scrape
        s = content_id || '0'
        s = '1' if s.empty?
        pathnr = s.to_i / 1000
        s = "%07d" % s.to_i
        logger.debug s
        pathnr = "%07d" % pathnr
        logger.debug pathnr
        xc = %w!R 1 5 3 4 2 O 7 K 9 H B C D X F G A I J 8 L M Z 6 P Q 0 S T U V W E Y N!

        # Position-weighted digit sum, then the digit sum of that result.
        weighted = 0
        s.length.times do |i|
          weighted += s[i, 1].to_i * (i + 1)
        end
        qsum = 0
        weighted.to_s.each_byte do |byte|
          qsum += byte - 48
        end
        qstr = "%02d" % qsum

        # s[i, 1].to_i gives the digit on every Ruby version. The former
        # `s[i] - 48` relied on String#[] returning a byte value, which is
        # only true on Ruby 1.8.
        digit = lambda { |i| s[i, 1].to_i }
        code = ''
        code << xc[digit.call(3) + qsum + 3]
        code << qstr[1, 1]
        code << xc[digit.call(0) + qsum + 2]
        code << xc[digit.call(2) + qsum + 1]
        code << xc[digit.call(5) + qsum + 6]
        code << xc[digit.call(1) + qsum + 5]
        code << qstr[0, 1]
        code << xc[digit.call(4) + qsum + 7]
        code << xc[digit.call(6) + qsum + 4]

        content_video = pathnr + '/' + code + '.flv'
        @pathnr = pathnr
        @s = s
        @video_url = "http://dl.redtube.com/_videos_t4vn23s9jc5498tgj49icfj4678/#{content_video}"
      end

      # Returns the first of up to ten candidate thumbnail URLs that answers
      # a HEAD request with status 200, or nil if none does. The result is
      # cached in @thumb_url. Timeouts on a candidate are skipped.
      def thumb_url
        return @thumb_url if @thumb_url
        1.upto(10) do |i|
          url = "http://thumbs.redtube.com/_thumbs/#{@pathnr}/#{@s}/#{@s}_#{'%03d' % i}.jpg"
          logger.debug url
          begin
            uri = URI.parse(url)
            Net::HTTP.start(uri.host, uri.port) do |http|
              response = http.head(uri.request_uri,
                                   {"User-Agent" => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)"})
              logger.debug response.code
              if 200 == response.code.to_i
                @thumb_url = url
                return @thumb_url
              end
            end
          # The top-level TimeoutError constant is gone on newer Rubies and
          # was only an alias of Timeout::Error, so it is no longer listed.
          rescue Timeout::Error, Errno::ETIMEDOUT
          end
        end
        nil
      end

      # Fetches the watch page once and returns the title text with markup
      # stripped; cached in @title.
      def title
        return @title if @title
        html = http_get(@page_url)
        doc = Hpricot(html.toutf8)
        @title = doc.at("//table/tr[2]/td/table/tr[3]/td/table/tr/td").inner_html.gsub(/<[^>]*>/, '').strip
      end

      # Fetches the embed page once and returns the contents of its "cpf"
      # textarea (nil when absent); cached in @embed_tag.
      def embed_tag
        return @embed_tag if @embed_tag
        url = "http://www.redtube.com/embed/#{content_id}"
        response_body = http_get(url)
        doc = Hpricot(response_body)
        doc.search('//textarea#cpf') do |elem|
          @embed_tag = elem.inner_html
        end
        @embed_tag
      end

      private

      # Numeric id captured from the page URL.
      def content_id; url_regex_match[1]; end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for tube8.com video pages. Also exposes a 3GP download link
    # when the page offers one.
    class Tube8 < Base
      attr_reader :video_url_3gp
      url_regex %r!\Ahttp://www\.tube8\.com/.*/(\d+)(?:/|$)!

      # Reads the video and thumbnail locations from the player's FlashVars,
      # the title from the page heading, and the first link ending in ".3gp".
      #
      # Raises FileNotFound when the page has no FlashVars <param>.
      def scrape
        doc = Hpricot(http_get(@page_url).toutf8)
        param = doc.at('//object //param[@name="FlashVars"]')
        raise FileNotFound unless param

        vars = CGI.parse(param.attributes['value'])
        @video_url = vars['videoUrl'][0]

        page_uri = URI.parse(@page_url)
        site_root = "#{page_uri.scheme}://#{page_uri.host}"
        @thumb_url = URI.join(site_root, vars['imageUrl'][0]).to_s

        @title = begin
          doc.at('//h1[@class="text"]').inner_html
        rescue StandardError
          nil
        end

        doc.search('//a').each do |anchor|
          href = anchor.attributes['href']
          next unless href && href.match(/\.3gp$/)
          @video_url_3gp = href
          break
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for veoh.com. Accepts direct video URLs as well as collection
    # and browse URLs that carry the video id.
    class Veoh < Base
      url_regex [%r!\Ahttp://www\.veoh\.com/videos/(v\d+[[:alnum:]]+)!,
                 %r!\Ahttp://www\.veoh\.com/collection/\w+/watch/.*#watch%3[Dd](v\d+[[:alnum:]]+)!,
                 %r!\Ahttp://www\.veoh\.com/(?:browse|collection)/(?:[\w]+/)+watch/(v\d+[[:alnum:]]+)!]

      # Normalises @page_url to the canonical /videos/<id> form, reads the
      # video URL, title and thumbnail from the REST details response, and
      # takes the embed tag from the canonical page.
      def scrape
        @id = url_regex_match[1]
        @page_url = "http://www.veoh.com/videos/#{@id}"

        details = http_get("http://www.veoh.com/rest/video/#{@id}/details")
        # First capture of a pattern in the details response, or nil.
        capture = lambda { |pattern| details.match(pattern).to_a[1] }
        @video_url = capture.call(/fullPreviewHashPath="([^"]+)"/)
        @title = capture.call(/title="([^"]+)"/)
        @thumb_url = capture.call(/fullMedResImagePath="([^"]+)"/)

        page_html = http_get(@page_url)
        escaped_embed = page_html.match(/class="embedinput"\s[^>]*value="([^"]+)"/).to_a[1]
        @embed_tag = CGI.unescapeHTML(escaped_embed) if escaped_embed
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for youporn.com watch pages.
    class YouPorn < Base
      url_regex %r!\Ahttp://(?:www\.)?youporn\.com/watch/(\d+)!

      # Requests the page with the age-check cookie and "user_choice=Enter"
      # query, then reads the download link, title and thumbnail.
      # @title and @thumb_url stay unset when the heading is missing.
      def scrape
        # NOTE(review): the captured id is not used below; the lookup is kept
        # in case callers rely on it failing early for a non-matching URL.
        id = url_regex_match[1]

        # Replaces any existing query string with the age-gate answer.
        request_url = @page_url.sub(/(\?.*)?$/, '?user_choice=Enter')
        html = http_get(request_url, 'Cookie' => 'age_check=1')
        doc = Hpricot(html)

        doc.search('//div[@id="download"]//a').each do |elem|
          href = elem.attributes['href']
          next unless href =~ %r!^http://download\.youporn\.com/download/.*!
          @video_url = href
          break
        end

        # Previously a page without the heading crashed with NoMethodError.
        h1 = doc.at('//div[@id="videoArea"]/h1')
        if h1
          @title = h1.inner_html.gsub(/<[^>]*>/, '').strip
          img = h1.at('/img')
          # The large thumbnail shares the small one's name apart from the suffix.
          @thumb_url = img.attributes['src'].sub(/(\d+)_small\.jpg$/, '\1_large.jpg') if img
        end
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
module WWW
  module VideoScraper
    # Scraper for youtube.com watch pages (www and jp hosts).
    class YouTube < Base
      url_regex %r!\Ahttp://(?:www|jp)\.youtube\.com/watch.*[?&]v=([[:alnum:]]+)!

      # Loads the watch page (passing the age check if redirected to it) and
      # reads the title, embed code and the player's swfArgs, from which
      # @video_url and @thumb_url are built.
      #
      # Raises FileNotFound when no script on the page defines swfArgs.
      def scrape
        page = pass_verify_age

        @title = begin
          page.root.at('//head/title').inner_html.sub(/^YouTube[\s-]*/, '')
        rescue StandardError
          ''
        end
        @embed_tag = begin
          page.root.at('//input[@id="embed_code"]').attributes['value']
        rescue StandardError
          nil
        end

        page.root.search('//script').each do |script|
          args_match = script.inner_html.match(/var\s+swfArgs\s*=\s*([^;]+);/)
          next unless args_match

          swf_args = JSON::parse(args_match[1])
          download_uri = URI.parse(@page_url)
          download_uri.path = '/get_video'
          download_uri.query = "video_id=#{swf_args['video_id']}&t=#{swf_args['t']}"
          @video_url = download_uri.to_s
          @thumb_url = "http://i.ytimg.com/vi/#{swf_args['video_id']}/default.jpg"
        end

        raise FileNotFound, 'file not found' if @video_url.nil?
      end

      private

      # Signs in through the 'gaia_loginform' form on the site's /login page
      # using @opt[:you_tube_username] and @opt[:you_tube_password].
      def login
        page_uri = URI.parse(@page_url)
        login_page = agent.get("#{page_uri.scheme}://#{page_uri.host}/login")
        form = login_page.form('gaia_loginform')
        form.email = @opt[:you_tube_username]
        form.passwd = @opt[:you_tube_password]
        agent.submit(form)
      end

      # Fetches @page_url. When the response lands on a verify_age path,
      # logs in and posts the birth-date confirmation, returning the page
      # that results; otherwise returns the fetched page as is.
      def pass_verify_age
        target = URI.parse(@page_url)
        page = agent.get(target)
        return page unless page.uri.path =~ /verify_age/

        login
        agent.post(page.uri,
                   'next_url' => "#{target.path}?#{target.query}",
                   'action_confirm' => 'Confirm Birth Date')
      end
    end
  end
end
+ end