video_scraper 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. data/ChangeLog +4 -0
  2. data/README +71 -0
  3. data/Rakefile +146 -0
  4. data/lib/www/video_scraper.rb +88 -0
  5. data/lib/www/video_scraper/adult_satellites.rb +27 -0
  6. data/lib/www/video_scraper/age_sage.rb +28 -0
  7. data/lib/www/video_scraper/ameba_vision.rb +22 -0
  8. data/lib/www/video_scraper/base.rb +88 -0
  9. data/lib/www/video_scraper/dailymotion.rb +30 -0
  10. data/lib/www/video_scraper/eic_book.rb +34 -0
  11. data/lib/www/video_scraper/moro_tube.rb +31 -0
  12. data/lib/www/video_scraper/nico_video.rb +68 -0
  13. data/lib/www/video_scraper/pornhub.rb +24 -0
  14. data/lib/www/video_scraper/pornotube.rb +39 -0
  15. data/lib/www/video_scraper/red_tube.rb +89 -0
  16. data/lib/www/video_scraper/tube8.rb +31 -0
  17. data/lib/www/video_scraper/veoh.rb +28 -0
  18. data/lib/www/video_scraper/you_porn.rb +26 -0
  19. data/lib/www/video_scraper/you_tube.rb +53 -0
  20. data/lib/www/video_scraper/your_file_host.rb +54 -0
  21. data/test/test_helper.rb +23 -0
  22. data/test/www/test_video_scraper.rb +43 -0
  23. data/test/www/video_scraper/test_adult_satellites.rb +13 -0
  24. data/test/www/video_scraper/test_age_sage.rb +13 -0
  25. data/test/www/video_scraper/test_ameba_vision.rb +12 -0
  26. data/test/www/video_scraper/test_base.rb +14 -0
  27. data/test/www/video_scraper/test_dailymotion.rb +14 -0
  28. data/test/www/video_scraper/test_eic_book.rb +14 -0
  29. data/test/www/video_scraper/test_moro_tube.rb +13 -0
  30. data/test/www/video_scraper/test_nico_video.rb +23 -0
  31. data/test/www/video_scraper/test_pornhub.rb +14 -0
  32. data/test/www/video_scraper/test_pornotube.rb +21 -0
  33. data/test/www/video_scraper/test_red_tube.rb +13 -0
  34. data/test/www/video_scraper/test_tube8.rb +14 -0
  35. data/test/www/video_scraper/test_veoh.rb +24 -0
  36. data/test/www/video_scraper/test_you_porn.rb +13 -0
  37. data/test/www/video_scraper/test_you_tube.rb +32 -0
  38. data/test/www/video_scraper/test_your_file_host.rb +14 -0
  39. metadata +133 -0
@@ -0,0 +1,30 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
+ module WWW
6
+ module VideoScraper
7
+ class Dailymotion < Base
8
+ url_regex %r!\Ahttp://www\.dailymotion\.com/.*?/video/([\w/-]+)!
9
+
10
# Scrapes a Dailymotion watch page.
#
# Every <script> element is searched for the flash player's
# addVariable("video", ...) and addVariable("preview", ...) calls. The
# value is URL-decoded, split on "||" or "@@", and the first piece is
# resolved against the page's scheme and host to produce @video_url and
# @thumb_url. @title and @embed_tag are taken from the page markup and
# fall back to nil when the lookup raises.
def scrape
  page_uri = URI.parse(@page_url)
  site_root = "#{page_uri.scheme}://#{page_uri.host}"
  document = Hpricot(http_get(@page_url).toutf8)

  document.search('//script').each do |script|
    video = script.inner_html.match(/\.addVariable\("video",\s*"([^"]+)"/i)
    if video
      video_path = CGI.unescape(video[1]).split(/\|\||@@/).first
      @video_url = URI.join(site_root, video_path).to_s
    end

    preview = script.inner_html.match(/\.addVariable\("preview",\s+"([^"]+)"/)
    if preview
      preview_path = CGI.unescape(preview[1]).split(/\|\||@@/).first
      @thumb_url = URI.join(site_root, preview_path).to_s
    end
  end

  @title = begin
    document.at('//h1[@class="nav"]').inner_html
  rescue StandardError
    nil
  end
  @embed_tag = begin
    CGI.unescapeHTML(document.at('//textarea[@id="video_player_embed_code_text"]').inner_html)
  rescue StandardError
    nil
  end
end
27
+ end
28
+ end
29
+ end
30
+
@@ -0,0 +1,34 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
+ module WWW
6
+ module VideoScraper
7
+ class EicBook < Base
8
+ attr_reader :capture_urls
9
+ url_regex %r!\Ahttp://www\.eic-book\.com/(detail_\d+\.html).*!
10
+
11
+ def scrape
12
+ uri = URI.parse(@page_url)
13
+ @page_url = "#{uri.scheme}://#{uri.host}#{uri.path}?flg=sm"
14
+ html = http_get(@page_url)
15
+ doc = Hpricot(html.toutf8)
16
+ raise FileNotFound unless flashvars = doc.at('//object //param[@name="FlashVars"]')
17
+ flashvars = CGI.parse(flashvars.attributes['value'])
18
+ @video_url = flashvars['flv'][0]
19
+ @title = CGI.unescapeHTML(doc.at('//h2[@class="detailTtl"]').inner_html).gsub('&nbsp;', ' ') rescue nil
20
+ html = http_get("#{uri.scheme}://#{uri.host}#{uri.path}?flg=h4")
21
+ doc = Hpricot(html.toutf8)
22
+ if img = doc.at('//div[@class="detailMN"]/img[@class="waku01"]')
23
+ @thumb_url = URI.join("#{uri.scheme}://#{uri.host}", img.attributes['src']).to_s
24
+ end
25
+ html = http_get("#{uri.scheme}://#{uri.host}#{uri.path}?flg=cp")
26
+ doc = Hpricot(html.toutf8)
27
+ @capture_urls = []
28
+ doc.search('//div[@class="detailMN"]/img[@class="waku01"]') do |img|
29
+ @capture_urls << URI.join("#{uri.scheme}://#{uri.host}", img.attributes['src']).to_s
30
+ end
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,31 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
+ module WWW
6
+ module VideoScraper
7
+ class MoroTube < Base
8
+ url_regex %r!\Ahttp://www\.morotube\.com/watch\.php\?clip=([[:alnum:]]{8})!
9
+ attr_reader :author, :duration
10
+
11
# Scrapes a MoroTube clip.
#
# Metadata comes from the site's /gen_xml.php feed, requested with the clip
# id captured by the URL regex: title, file, image, author and duration are
# copied into @title, @video_url, @thumb_url, @author and @duration.
# The watch page itself is then fetched for the embed code held in the
# input#inpVdoEmbed field.
def scrape
  feed_uri = URI.parse(@page_url)
  feed_uri.path = '/gen_xml.php'
  feed_uri.query = "type=o&id=#{url_regex_match[1]}"
  feed = Hpricot.XML(http_get(feed_uri.to_s).toutf8)

  [
    ['@title',     '/root/video/title'],
    ['@video_url', '/root/video/file'],
    ['@thumb_url', '/root/video/image'],
    ['@author',    '/root/video/author'],
    ['@duration',  '/root/video/duration']
  ].each do |ivar, xpath|
    instance_variable_set(ivar, feed.search(xpath).inner_html)
  end

  watch_page = Hpricot(http_get(@page_url))
  watch_page.search('//input#inpVdoEmbed') do |input|
    @embed_tag = input.attributes['value']
  end
end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,68 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
+ module WWW
6
+ module VideoScraper
7
+ class NicoVideo < Base
8
+ url_regex %r!\Ahttp://www\.nicovideo\.jp/watch/([[:alnum:]]+)!
9
+
10
# Logs in, then collects the video URL, thumbnail/title and embed tag for
# the video id captured by the URL regex.
#
# Error translation:
# * Timeout::Error                      -> TryAgainLater
# * Mechanize response code 404 or 403  -> FileNotFound
# * any other Mechanize response code   -> TryAgainLater
# The original exception's message is carried over in every case.
def scrape
  login
  video_id = url_regex_match[1]
  get_flv(video_id)
  get_thumb(video_id)
  get_embed_tag(video_id)
rescue Timeout::Error => e
  raise TryAgainLater, e.to_s
rescue WWW::Mechanize::ResponseCodeError => e
  error_class = %w[404 403].include?(e.response_code) ? FileNotFound : TryAgainLater
  raise error_class, e.to_s
end
30
+
31
+ private
32
# Posts the configured credentials (@opt[:nico_video_mail] and
# @opt[:nico_video_password]) to the secure login endpoint.
#
# Raises RuntimeError ('login failure') unless the response carries the
# header x-niconico-authflag set to '1'.
def login
  credentials = {
    'mail'     => @opt[:nico_video_mail],
    'password' => @opt[:nico_video_password]
  }
  response = agent.post('https://secure.nicovideo.jp/secure/login?site=niconico', credentials)
  return if response.header['x-niconico-authflag'] == '1'

  raise RuntimeError, 'login failure'
end
38
+
39
+ def get_flv(id)
40
+ request_url = "http://www.nicovideo.jp/api/getflv?v=#{id}"
41
+ page = agent.get(request_url)
42
+ q = CGI.parse(page.body)
43
+ raise FileNotFound unless q['url']
44
+ @video_url = q['url'].first
45
+ end
46
+
47
# Reads the getthumbinfo XML for video +id+ and copies the thumbnail URL
# and the title into @thumb_url and @title. Either value is left untouched
# when the corresponding element is not found; when several elements match,
# the last one wins.
def get_thumb(id)
  info = agent.get("http://www.nicovideo.jp/api/getthumbinfo/#{id}")
  thumb_doc = Hpricot.XML(info.body.toutf8)
  thumb_doc.search('//thumbnail_url') { |node| @thumb_url = node.inner_html }
  thumb_doc.search('//thumb/title') { |node| @title = node.inner_html }
end
57
+
58
# Fetches the watch page (@page_url) and stores the value of the
# "input_iframe" field inside the "form_iframe" form in @embed_tag.
# The +id+ argument is accepted but not used by this method.
def get_embed_tag(id)
  watch_page = Hpricot(agent.get(@page_url).body)
  watch_page.search('//form[@name="form_iframe"] //input[@name="input_iframe"]') do |field|
    @embed_tag = field.attributes['value']
  end
end
66
+ end
67
+ end
68
+ end
@@ -0,0 +1,24 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
+ module WWW
6
+ module VideoScraper
7
+ class Pornhub < Base
8
+ url_regex %r|\Ahttp://www\.pornhub\.com/view_video\.php.*viewkey=[[:alnum:]]{20}|
9
+
10
# Scrapes a Pornhub watch page.
#
# The page's addVariable("options", ...) call holds a percent-encoded URL
# of an XML document; that document's <flv_url> element is the video URL.
# The thumbnail URL is derived from the numeric directory part of the video
# URL, and the embed tag and title are cut out of the watch page's HTML.
#
# Raises FileNotFound when the options variable is missing from the page,
# or when the options document contains no <flv_url> (previously the latter
# case surfaced as a NoMethodError on nil).
def scrape
  html = http_get(@page_url)
  options = html.match(/\.addVariable\("options",\s*"([^"]+)"\);/i)
  raise FileNotFound unless options

  # URI.decode is not available on current Ruby versions; fall back to the
  # URI module's form decoder there so the options URL is still unescaped.
  request_url = if URI.respond_to?(:decode)
                  URI.decode(options[1])
                else
                  URI.decode_www_form_component(options[1])
                end
  response_body = http_get(request_url)

  @video_url = response_body.match(%r|<flv_url>([^<]+)</flv_url>|).to_a[1]
  raise FileNotFound unless @video_url

  # The dot before "flv" is escaped so that only a literal ".flv" matches.
  if m = @video_url.match(%r|videos/(\d{3}/\d{3}/\d{3})/\d+\.flv|)
    @thumb_url = "http://p1.pornhub.com/thumbs/#{m[1]}/small.jpg"
  end
  @embed_tag = html.match(%r|<textarea[^>]+class="share-flag-embed">(<object type="application/x-shockwave-flash".*?</object>)</textarea>|).to_a[1]
  @title = html.match(%r|<title>(.*) - Pornhub\.com</title>|).to_a[1]
end
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,39 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
+ module WWW
6
+ module VideoScraper
7
+ class Pornotube < Base
8
+ url_regex %r!\Ahttp://(?:www\.)?pornotube\.com/(?:media|channels)\.php\?.*m=(\d+)!
9
+
10
+ def scrape
11
+ id = url_regex_match[1]
12
+
13
+ login
14
+ page = agent.get(@page_url)
15
+ raise FileNotFound unless embed = page.root.at('//object/embed')
16
+ src = embed.attributes['src']
17
+ hash = src.to_s.match(/\?v=(.*)$/)[1]
18
+ t = page.at('//div[@class="contentheader"]//span[@class="blue"]')
19
+ @title = t.inner_html.gsub(/<[^>]*>/, '').strip
20
+ page = agent.get("http://pornotube.com/player/player.php?#{hash}")
21
+ q = CGI::parse(page.body)
22
+ @video_url = "http://#{q['mediaDomain'][0]}.pornotube.com/#{q['userId'][0]}/#{q['mediaId'][0]}.flv"
23
+ @thumb_url = "http://photo.pornotube.com/thumbnails/video/#{q['userId'][0]}/#{q['mediaId'][0]}.jpg";
24
+ @image_url = "http://photo.pornotube.com/thumbnails/video/#{q['userId'][0]}/#{q['mediaId'][0]}_full.jpg";
25
+ @embed_tag = q['embedCode'][0]
26
+ end
27
+
28
+ private
29
# Submits the age-verification form on the site's index page with a fixed
# birth date (1970-01-01) and returns whatever the agent's post returns.
def login
  age_gate = {
    'verifyAge' => 'true',
    'bMonth'    => '01',
    'bDay'      => '01',
    'bYear'     => '1970',
    'submit'    => 'View All Content'
  }
  agent.post("http://pornotube.com/index.php", age_gate)
end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,89 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
+ module WWW
6
+ module VideoScraper
7
+ class RedTube < Base
8
+ url_regex %r|\Ahttp://www\.redtube\.com/(\d+)|
9
+
10
# Derives the RedTube video URL from the numeric content id alone (no HTTP
# request is made here).
#
# Steps, as implemented below:
# 1. The id is zero-padded to seven digits (s); id / 1000, padded the same
#    way, is the directory name (pathnr).
# 2. A weighted digit sum (digit * 1-based position) is computed, and the
#    digits of that sum are added up again (qsum).
# 3. Nine characters are picked: seven from the lookup table xc, indexed by
#    digit + qsum + a per-position offset, and the two digits of the
#    zero-padded qsum.
#
# Digits are read with s[i, 1].to_i. The earlier form (s[i] - 48) depended
# on String#[] returning a byte value, which holds only on Ruby 1.8; on
# later versions s[i] is a one-character String and the subtraction raises
# NoMethodError.
#
# Side effects: sets @pathnr and @s (used by thumb_url) and @video_url.
def scrape
  s = content_id || '0'
  s = '1' if s.empty?
  pathnr = '%07d' % (s.to_i / 1000)
  s = '%07d' % s.to_i
  logger.debug s
  logger.debug pathnr

  xc = %w!R 1 5 3 4 2 O 7 K 9 H B C D X F G A I J 8 L M Z 6 P Q 0 S T U V W E Y N!
  digits = (0...s.length).map { |i| s[i, 1].to_i }

  weighted = 0
  digits.each_with_index { |digit, i| weighted += digit * (i + 1) }
  qsum = weighted.to_s.split(//).inject(0) { |sum, ch| sum + ch.to_i }
  qstr = '%02d' % qsum

  code = ''
  code << xc[digits[3] + qsum + 3]
  code << qstr[1, 1]
  code << xc[digits[0] + qsum + 2]
  code << xc[digits[2] + qsum + 1]
  code << xc[digits[5] + qsum + 6]
  code << xc[digits[1] + qsum + 5]
  code << qstr[0, 1]
  code << xc[digits[4] + qsum + 7]
  code << xc[digits[6] + qsum + 4]

  @pathnr = pathnr
  @s = s
  @video_url = "http://dl.redtube.com/_videos_t4vn23s9jc5498tgj49icfj4678/#{pathnr}/#{code}.flv"
end
44
+
45
# Returns the thumbnail URL, probing for it on first use.
#
# Candidate URLs numbered 001..010 are built from @pathnr and @s (both set
# by scrape) and checked with an HTTP HEAD request; the first one answering
# 200 is memoized in @thumb_url and returned. Timeouts on a candidate are
# ignored and the next candidate is tried. Returns nil when none matches.
#
# Only Timeout::Error and Errno::ETIMEDOUT are rescued. The top-level
# TimeoutError constant that used to be listed as well no longer exists on
# current Ruby versions, where naming it in the rescue clause turns every
# timeout into a NameError.
def thumb_url
  return @thumb_url if @thumb_url
  1.upto(10) do |i|
    url = "http://thumbs.redtube.com/_thumbs/#{@pathnr}/#{@s}/#{@s}_#{'%03d' % i}.jpg"
    logger.debug url
    begin
      uri = URI.parse(url)
      Net::HTTP.start(uri.host, uri.port) do |http|
        response = http.head(uri.request_uri,
                             {"User-Agent" => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)"})
        logger.debug response.code
        if 200 == response.code.to_i
          @thumb_url = url
          return @thumb_url
        end
      end
    rescue Timeout::Error, Errno::ETIMEDOUT
      # best effort: a timed-out candidate is skipped
    end
  end
  nil
end
66
+
67
# Returns the video title, fetching the watch page on first use.
#
# The title is read from a fixed position in the page's nested table
# layout, with markup stripped and surrounding whitespace removed, and is
# memoized in @title. When that table cell is not present the method now
# returns nil instead of raising NoMethodError on nil.
def title
  return @title if @title
  html = http_get(@page_url)
  doc = Hpricot(html.toutf8)
  cell = doc.at("//table/tr[2]/td/table/tr[3]/td/table/tr/td")
  @title = cell.inner_html.gsub(/<[^>]*>/, '').strip if cell
  @title
end
73
+
74
# Returns the embed code, loading it from the site's /embed/<id> page on
# first use. The content of the textarea#cpf element is memoized in
# @embed_tag; nil is returned when the element is not found.
def embed_tag
  unless @embed_tag
    embed_page = Hpricot(http_get("http://www.redtube.com/embed/#{content_id}"))
    embed_page.search('//textarea#cpf') { |area| @embed_tag = area.inner_html }
  end
  @embed_tag
end
84
+
85
+ private
86
# The numeric video id: first capture group of the URL regex match.
def content_id
  url_regex_match[1]
end
87
+ end
88
+ end
89
+ end
@@ -0,0 +1,31 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
+ module WWW
6
+ module VideoScraper
7
+ class Tube8 < Base
8
+ attr_reader :video_url_3gp
9
+ url_regex %r!\Ahttp://www\.tube8\.com/.*/(\d+)(?:/|$)!
10
+
11
+ def scrape
12
+ html = http_get(@page_url)
13
+ doc = Hpricot(html.toutf8)
14
+ raise FileNotFound unless flashvars = doc.at('//object //param[@name="FlashVars"]')
15
+ flashvars = CGI.parse(flashvars.attributes['value'])
16
+ @video_url = flashvars['videoUrl'][0]
17
+ uri = URI.parse(@page_url)
18
+ @thumb_url = URI.join("#{uri.scheme}://#{uri.host}", flashvars['imageUrl'][0]).to_s
19
+ @title = doc.at('//h1[@class="text"]').inner_html rescue nil
20
+ doc.search('//a').each do |elem|
21
+ if href = elem.attributes['href']
22
+ if href.match(/\.3gp$/)
23
+ @video_url_3gp = href
24
+ break
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,28 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
+ module WWW
6
+ module VideoScraper
7
+ class Veoh < Base
8
+ url_regex [%r!\Ahttp://www\.veoh\.com/videos/(v\d+[[:alnum:]]+)!,
9
+ %r!\Ahttp://www\.veoh\.com/collection/\w+/watch/.*#watch%3[Dd](v\d+[[:alnum:]]+)!,
10
+ %r!\Ahttp://www\.veoh\.com/(?:browse|collection)/(?:[\w]+/)+watch/(v\d+[[:alnum:]]+)!]
11
+
12
# Scrapes a Veoh video.
#
# The id captured by the URL regex is stored in @id and @page_url is
# normalized to the canonical /videos/<id> form. Video URL, title and
# thumbnail URL are read from attributes of the REST "details" document;
# the embed code is the HTML-unescaped value of the watch page's
# "embedinput" field, when present.
def scrape
  @id = url_regex_match[1]
  @page_url = "http://www.veoh.com/videos/#{@id}"

  details = http_get("http://www.veoh.com/rest/video/#{@id}/details")
  @video_url = details[/fullPreviewHashPath="([^"]+)"/, 1]
  @title = details[/title="([^"]+)"/, 1]
  @thumb_url = details[/fullMedResImagePath="([^"]+)"/, 1]

  escaped = http_get(@page_url)[/class="embedinput"\s[^>]*value="([^"]+)"/, 1]
  @embed_tag = CGI.unescapeHTML(escaped) if escaped
end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,26 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
+ module WWW
6
+ module VideoScraper
7
+ class YouPorn < Base
8
+ url_regex %r!\Ahttp://(?:www\.)?youporn\.com/watch/(\d+)!
9
+
10
# Scrapes a YouPorn watch page.
#
# The page is requested with its query string replaced by
# "?user_choice=Enter" and an "age_check=1" cookie. @video_url is the first
# link in the download box that points at download.youporn.com. @title is
# the text of the video area's heading with markup stripped, and @thumb_url
# is the heading's image with "_small.jpg" swapped for "_large.jpg".
#
# A page without the heading no longer raises NoMethodError on nil: title
# and thumbnail are simply left unset in that case.
def scrape
  request_url = @page_url.sub(/(\?.*)?$/, '?user_choice=Enter')
  html = http_get(request_url, 'Cookie' => 'age_check=1')
  doc = Hpricot(html)

  doc.search('//div[@id="download"]//a').each do |elem|
    href = elem.attributes['href']
    if href =~ %r!^http://download\.youporn\.com/download/.*!
      @video_url = href
      break
    end
  end

  h1 = doc.at('//div[@id="videoArea"]/h1')
  return unless h1

  @title = h1.inner_html.gsub(/<[^>]*>/, '').strip
  img = h1.at('/img')
  @thumb_url = img.attributes['src'].sub(/(\d+)_small\.jpg$/, '\1_large.jpg') if img
end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,53 @@
1
+ # -*- mode:ruby; coding:utf-8 -*-
2
+
3
+ require File.expand_path(File.dirname(__FILE__) + '/base')
4
+
5
+ module WWW
6
+ module VideoScraper
7
+ class YouTube < Base
8
+ url_regex %r!\Ahttp://(?:www|jp)\.youtube\.com/watch.*[?&]v=([[:alnum:]]+)!
9
+
10
# Scrapes a YouTube watch page (after passing the age check if needed).
#
# @title is the page title with a leading "YouTube" and any following
# whitespace or hyphens removed ('' if the lookup raises); @embed_tag is the
# value of the embed_code input (nil if the lookup raises). The swfArgs
# object literal found in a <script> element is parsed as JSON; its
# video_id and t entries build the /get_video URL on the page's host and
# the default thumbnail URL.
#
# Raises FileNotFound ('file not found') when no video URL could be built.
def scrape
  page = pass_verify_age

  @title = begin
    page.root.at('//head/title').inner_html.sub(/^YouTube[\s-]*/, '')
  rescue StandardError
    ''
  end
  @embed_tag = begin
    page.root.at('//input[@id="embed_code"]').attributes['value']
  rescue StandardError
    nil
  end

  page.root.search('//script').each do |script|
    found = script.inner_html.match(/var\s+swfArgs\s*=\s*([^;]+);/)
    next unless found

    args = JSON::parse(found[1])
    video_uri = URI.parse(@page_url)
    video_uri.path = '/get_video'
    video_uri.query = "video_id=#{args['video_id']}&t=#{args['t']}"
    @video_url = video_uri.to_s
    @thumb_url = "http://i.ytimg.com/vi/#{args['video_id']}/default.jpg"
  end

  raise FileNotFound, 'file not found' if @video_url.nil?
end
26
+
27
+ private
28
# Signs in through the /login page on the watch page's host: fills the
# 'gaia_loginform' form with @opt[:you_tube_username] and
# @opt[:you_tube_password] and submits it. Returns the agent's submit
# result.
#
# The original source keeps a commented-out variant that targeted a form
# named 'loginForm' with username/password fields; it is not used.
def login
  page_uri = URI.parse(@page_url)
  form = agent.get("#{page_uri.scheme}://#{page_uri.host}/login").form('gaia_loginform')
  form.email = @opt[:you_tube_username]
  form.passwd = @opt[:you_tube_password]
  agent.submit(form)
end
39
+
40
# Fetches the watch page and returns it. When the request ends up on a URL
# whose path contains "verify_age", the method logs in first and then posts
# the birth-date confirmation back to that URL, asking to continue to the
# original path and query; the page returned by that post is the result.
def pass_verify_age
  target = URI.parse(@page_url)
  page = agent.get(target)
  return page unless page.uri.path =~ /verify_age/

  login
  agent.post(page.uri,
             'next_url' => "#{target.path}?#{target.query}",
             'action_confirm' => 'Confirm Birth Date')
end
51
+ end
52
+ end
53
+ end