video_scraper 1.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +4 -0
- data/README +71 -0
- data/Rakefile +146 -0
- data/lib/www/video_scraper.rb +88 -0
- data/lib/www/video_scraper/adult_satellites.rb +27 -0
- data/lib/www/video_scraper/age_sage.rb +28 -0
- data/lib/www/video_scraper/ameba_vision.rb +22 -0
- data/lib/www/video_scraper/base.rb +88 -0
- data/lib/www/video_scraper/dailymotion.rb +30 -0
- data/lib/www/video_scraper/eic_book.rb +34 -0
- data/lib/www/video_scraper/moro_tube.rb +31 -0
- data/lib/www/video_scraper/nico_video.rb +68 -0
- data/lib/www/video_scraper/pornhub.rb +24 -0
- data/lib/www/video_scraper/pornotube.rb +39 -0
- data/lib/www/video_scraper/red_tube.rb +89 -0
- data/lib/www/video_scraper/tube8.rb +31 -0
- data/lib/www/video_scraper/veoh.rb +28 -0
- data/lib/www/video_scraper/you_porn.rb +26 -0
- data/lib/www/video_scraper/you_tube.rb +53 -0
- data/lib/www/video_scraper/your_file_host.rb +54 -0
- data/test/test_helper.rb +23 -0
- data/test/www/test_video_scraper.rb +43 -0
- data/test/www/video_scraper/test_adult_satellites.rb +13 -0
- data/test/www/video_scraper/test_age_sage.rb +13 -0
- data/test/www/video_scraper/test_ameba_vision.rb +12 -0
- data/test/www/video_scraper/test_base.rb +14 -0
- data/test/www/video_scraper/test_dailymotion.rb +14 -0
- data/test/www/video_scraper/test_eic_book.rb +14 -0
- data/test/www/video_scraper/test_moro_tube.rb +13 -0
- data/test/www/video_scraper/test_nico_video.rb +23 -0
- data/test/www/video_scraper/test_pornhub.rb +14 -0
- data/test/www/video_scraper/test_pornotube.rb +21 -0
- data/test/www/video_scraper/test_red_tube.rb +13 -0
- data/test/www/video_scraper/test_tube8.rb +14 -0
- data/test/www/video_scraper/test_veoh.rb +24 -0
- data/test/www/video_scraper/test_you_porn.rb +13 -0
- data/test/www/video_scraper/test_you_tube.rb +32 -0
- data/test/www/video_scraper/test_your_file_host.rb +14 -0
- metadata +133 -0
@@ -0,0 +1,30 @@
|
|
1
|
+
# -*- mode:ruby; coding:utf-8 -*-
|
2
|
+
|
3
|
+
require File.expand_path(File.dirname(__FILE__) + '/base')
|
4
|
+
|
5
|
+
module WWW
  module VideoScraper
    # Scraper for Dailymotion video pages.
    #
    # The video and thumbnail locations are read from the flash player's
    # addVariable("video", ...) / addVariable("preview", ...) calls found in
    # the page's <script> elements.
    class Dailymotion < Base
      url_regex(%r!\Ahttp://www\.dailymotion\.com/.*?/video/([\w/-]+)!)

      # Fetches @page_url and fills in @video_url, @thumb_url, @title and
      # @embed_tag.
      #
      # @video_url / @thumb_url are only assigned when a matching
      # addVariable call is found; if several scripts match, the last one
      # wins. @title and @embed_tag fall back to nil on any error while
      # reading them.
      def scrape
        page_uri = URI.parse(@page_url)
        site_root = "#{page_uri.scheme}://#{page_uri.host}"

        # The player variable holds several URL-escaped paths separated by
        # "||" or "@@"; only the first one is used, resolved against the
        # site root.
        first_url = lambda do |escaped_paths|
          leading_path = CGI.unescape(escaped_paths).split(/\|\||@@/).first
          URI.join(site_root, leading_path).to_s
        end

        document = Hpricot(http_get(@page_url).toutf8)
        document.search('//script').each do |script|
          source = script.inner_html
          if (found = source.match(/\.addVariable\("video",\s*"([^"]+)"/i))
            @video_url = first_url.call(found[1])
          end
          if (found = source.match(/\.addVariable\("preview",\s+"([^"]+)"/))
            @thumb_url = first_url.call(found[1])
          end
        end

        @title = document.at('//h1[@class="nav"]').inner_html rescue nil
        @embed_tag = CGI.unescapeHTML(document.at('//textarea[@id="video_player_embed_code_text"]').inner_html) rescue nil
      end
    end
  end
end
|
30
|
+
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# -*- mode:ruby; coding:utf-8 -*-
|
2
|
+
|
3
|
+
require File.expand_path(File.dirname(__FILE__) + '/base')
|
4
|
+
|
5
|
+
module WWW
  module VideoScraper
    # Scraper for eic-book.com detail pages.
    #
    # The same detail page is requested three times with different "flg"
    # query values: "sm" (player / title), "h4" (thumbnail) and "cp"
    # (capture images).
    class EicBook < Base
      attr_reader :capture_urls
      url_regex(%r!\Ahttp://www\.eic-book\.com/(detail_\d+\.html).*!)

      # Fills in @video_url, @title, @thumb_url and @capture_urls.
      #
      # Note that @page_url is rewritten to the "?flg=sm" variant of the
      # original URL.
      #
      # Raises FileNotFound when the page has no FlashVars <param>.
      def scrape
        uri = URI.parse(@page_url)
        site_root = "#{uri.scheme}://#{uri.host}"
        detail_url = "#{site_root}#{uri.path}"
        load_doc = lambda { |url| Hpricot(http_get(url).toutf8) }

        @page_url = "#{detail_url}?flg=sm"
        doc = load_doc.call(@page_url)
        flashvars_param = doc.at('//object //param[@name="FlashVars"]')
        raise FileNotFound unless flashvars_param
        flashvars = CGI.parse(flashvars_param.attributes['value'])
        @video_url = flashvars['flv'][0]
        # NOTE(review): as written this gsub replaces a space with a space;
        # the first argument was presumably an entity or non-breaking space
        # lost in transcription -- confirm against the released gem.
        @title = CGI.unescapeHTML(doc.at('//h2[@class="detailTtl"]').inner_html).gsub(' ', ' ') rescue nil

        # Thumbnail: first framed image of the "h4" view, if any.
        doc = load_doc.call("#{detail_url}?flg=h4")
        if (thumb = doc.at('//div[@class="detailMN"]/img[@class="waku01"]'))
          @thumb_url = URI.join(site_root, thumb.attributes['src']).to_s
        end

        # Captures: every framed image of the "cp" view.
        doc = load_doc.call("#{detail_url}?flg=cp")
        @capture_urls = []
        doc.search('//div[@class="detailMN"]/img[@class="waku01"]') do |capture|
          @capture_urls << URI.join(site_root, capture.attributes['src']).to_s
        end
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# -*- mode:ruby; coding:utf-8 -*-
|
2
|
+
|
3
|
+
require File.expand_path(File.dirname(__FILE__) + '/base')
|
4
|
+
|
5
|
+
module WWW
  module VideoScraper
    # Scraper for morotube.com watch pages.
    #
    # Metadata comes from the site's /gen_xml.php endpoint; the embed tag is
    # read from the watch page itself.
    class MoroTube < Base
      url_regex(%r!\Ahttp://www\.morotube\.com/watch\.php\?clip=([[:alnum:]]{8})!)
      attr_reader :author, :duration

      # Fills in @title, @video_url, @thumb_url, @author and @duration from
      # the XML description of the clip, then @embed_tag from the watch
      # page's #inpVdoEmbed input (left untouched when no such input exists).
      def scrape
        xml_uri = URI.parse(@page_url)
        xml_uri.path = '/gen_xml.php'
        xml_uri.query = "type=o&id=#{url_regex_match[1]}"
        xdoc = Hpricot.XML(http_get(xml_uri.to_s).toutf8)
        text_at = lambda { |xpath| xdoc.search(xpath).inner_html }

        @title     = text_at.call('/root/video/title')
        @video_url = text_at.call('/root/video/file')
        @thumb_url = text_at.call('/root/video/image')
        @author    = text_at.call('/root/video/author')
        @duration  = text_at.call('/root/video/duration')

        watch_page = Hpricot(http_get(@page_url))
        watch_page.search('//input#inpVdoEmbed') do |input|
          @embed_tag = input.attributes['value']
        end
      end
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# -*- mode:ruby; coding:utf-8 -*-
|
2
|
+
|
3
|
+
require File.expand_path(File.dirname(__FILE__) + '/base')
|
4
|
+
|
5
|
+
module WWW
  module VideoScraper
    # Scraper for nicovideo.jp watch pages.
    #
    # Logs in with the :nico_video_mail / :nico_video_password options, then
    # queries the getflv and getthumbinfo APIs and the watch page itself.
    class NicoVideo < Base
      url_regex(%r!\Ahttp://www\.nicovideo\.jp/watch/([[:alnum:]]+)!)

      # Fills in @video_url, @thumb_url, @title and @embed_tag.
      #
      # Raises:
      #   TryAgainLater - on Timeout::Error, or on any HTTP error response
      #                   other than 403/404.
      #   FileNotFound  - on a 403/404 response, or when getflv returns no
      #                   video URL.
      #   RuntimeError  - when the login is rejected (see #login).
      def scrape
        login
        id = url_regex_match[1]
        get_flv(id)
        get_thumb(id)
        get_embed_tag(id)
      rescue Timeout::Error => e
        raise TryAgainLater, e.to_s
      rescue WWW::Mechanize::ResponseCodeError => e
        case e.response_code
        when '404', '403'
          raise FileNotFound, e.to_s
        else
          # Includes 502: every other status is treated as temporary.
          raise TryAgainLater, e.to_s
        end
      end

      private

      # Posts the credentials from @opt to the secure login endpoint.
      # Raises RuntimeError unless the response carries the
      # x-niconico-authflag header with value '1'.
      def login
        page = agent.post('https://secure.nicovideo.jp/secure/login?site=niconico',
                          'mail' => @opt[:nico_video_mail],
                          'password' => @opt[:nico_video_password])
        raise RuntimeError, 'login failure' unless page.header['x-niconico-authflag'] == '1'
      end

      # Reads the video URL from the getflv API, whose body is a
      # query-string style list of key=value pairs.
      #
      # Raises FileNotFound when the response has no "url" entry.
      def get_flv(id)
        request_url = "http://www.nicovideo.jp/api/getflv?v=#{id}"
        page = agent.get(request_url)
        urls = CGI.parse(page.body)['url']
        # CGI.parse yields an empty array (never nil) for a missing key, so
        # a plain truthiness test would never detect the missing URL.
        raise FileNotFound if urls.nil? || urls.empty?
        @video_url = urls.first
      end

      # Reads the thumbnail URL and title from the getthumbinfo XML.
      # Both are left untouched when the corresponding element is absent.
      def get_thumb(id)
        page = agent.get("http://www.nicovideo.jp/api/getthumbinfo/#{id}")
        xdoc = Hpricot.XML(page.body.toutf8)
        xdoc.search('//thumbnail_url') do |elem|
          @thumb_url = elem.inner_html
        end
        xdoc.search('//thumb/title') do |elem|
          @title = elem.inner_html
        end
      end

      # Reads the embed snippet from the watch page's form_iframe form.
      # The id argument is not used; the page is fetched from @page_url.
      def get_embed_tag(id)
        page = agent.get(@page_url)
        doc = Hpricot(page.body)
        doc.search('//form[@name="form_iframe"] //input[@name="input_iframe"]') do |elem|
          @embed_tag = elem.attributes['value']
        end
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- mode:ruby; coding:utf-8 -*-
|
2
|
+
|
3
|
+
require File.expand_path(File.dirname(__FILE__) + '/base')
|
4
|
+
|
5
|
+
module WWW
  module VideoScraper
    # Scraper for pornhub.com view_video pages.
    #
    # The watch page names an "options" URL in an addVariable call; that URL
    # returns XML containing the <flv_url> of the video.
    class Pornhub < Base
      url_regex(%r|\Ahttp://www\.pornhub\.com/view_video\.php.*viewkey=[[:alnum:]]{20}|)

      # Fills in @video_url, @thumb_url, @embed_tag and @title.
      #
      # @thumb_url is derived from the numeric directory part of the video
      # URL and is only set when the URL has the expected shape. @embed_tag
      # and @title are nil when the page does not contain them.
      #
      # Raises FileNotFound when the page has no options URL, or when the
      # options XML has no <flv_url> element.
      def scrape
        html = http_get(@page_url)
        raise FileNotFound unless m = html.match(/\.addVariable\("options",\s*"([^"]+)"\);/i)
        # NOTE(review): URI.decode is not available on Ruby 3.0+ -- confirm
        # the Ruby versions this gem must support before replacing it.
        request_url = URI.decode m[1]
        response_body = http_get(request_url)
        @video_url = response_body.match(%r|<flv_url>([^<]+)</flv_url>|).to_a[1]
        # Without this guard a missing <flv_url> surfaced as a NoMethodError
        # on nil in the thumbnail match below.
        raise FileNotFound unless @video_url
        if m = @video_url.match(%r|videos/(\d{3}/\d{3}/\d{3})/\d+\.flv|)
          @thumb_url = "http://p1.pornhub.com/thumbs/#{m[1]}/small.jpg"
        end
        @embed_tag = html.match(%r|<textarea[^>]+class="share-flag-embed">(<object type="application/x-shockwave-flash".*?</object>)</textarea>|).to_a[1]
        @title = html.match(%r|<title>(.*) - Pornhub\.com</title>|).to_a[1]
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# -*- mode:ruby; coding:utf-8 -*-
|
2
|
+
|
3
|
+
require File.expand_path(File.dirname(__FILE__) + '/base')
|
4
|
+
|
5
|
+
module WWW
  module VideoScraper
    # Scraper for pornotube.com media / channel pages.
    #
    # After passing the age form, the page's <embed> src supplies a "?v="
    # token; the player endpoint is queried with that token and answers with
    # query-string style key=value pairs describing the media.
    class Pornotube < Base
      url_regex(%r!\Ahttp://(?:www\.)?pornotube\.com/(?:media|channels)\.php\?.*m=(\d+)!)

      # Fills in @title, @video_url, @thumb_url, @image_url and @embed_tag.
      #
      # Raises FileNotFound when the page has no <object>/<embed> element.
      # NOTE(review): a src without "?v=" or a page without the title span
      # fails with NoMethodError on nil.
      def scrape
        # Not used below; the lookup is kept as in the original code.
        media_number = url_regex_match[1]

        login
        page = agent.get(@page_url)
        embed = page.root.at('//object/embed')
        raise FileNotFound unless embed
        token = embed.attributes['src'].to_s.match(/\?v=(.*)$/)[1]
        heading = page.at('//div[@class="contentheader"]//span[@class="blue"]')
        # Drop any markup nested inside the heading.
        @title = heading.inner_html.gsub(/<[^>]*>/, '').strip

        player = agent.get("http://pornotube.com/player/player.php?#{token}")
        info = CGI::parse(player.body)
        user_id = info['userId'][0]
        media_id = info['mediaId'][0]
        @video_url = "http://#{info['mediaDomain'][0]}.pornotube.com/#{user_id}/#{media_id}.flv"
        @thumb_url = "http://photo.pornotube.com/thumbnails/video/#{user_id}/#{media_id}.jpg"
        @image_url = "http://photo.pornotube.com/thumbnails/video/#{user_id}/#{media_id}_full.jpg"
        @embed_tag = info['embedCode'][0]
      end

      private

      # Submits the age-verification form with a fixed birth date
      # (1970-01-01) through the shared agent.
      def login
        age_form = {
          'verifyAge' => 'true',
          'bMonth' => '01',
          'bDay' => '01',
          'bYear' => '1970',
          'submit' => 'View All Content'
        }
        agent.post("http://pornotube.com/index.php", age_form)
      end
    end
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
# -*- mode:ruby; coding:utf-8 -*-
|
2
|
+
|
3
|
+
require File.expand_path(File.dirname(__FILE__) + '/base')
|
4
|
+
|
5
|
+
module WWW
  module VideoScraper
    # Scraper for redtube.com video pages.
    #
    # The video URL is computed from the numeric id in the page URL without
    # any network access; thumbnail, title and embed tag are fetched lazily
    # the first time their reader is called.
    class RedTube < Base
      url_regex(%r|\Ahttp://www\.redtube\.com/(\d+)|)

      # Substitution table used to build the obfuscated video file name.
      CODE_CHARS = %w!R 1 5 3 4 2 O 7 K 9 H B C D X F G A I J 8 L M Z 6 P Q 0 S T U V W E Y N!.freeze

      # Computes @video_url from the content id.
      #
      # Also stores the zero-padded id (@s) and the zero-padded id / 1000
      # (@pathnr), which #thumb_url needs.
      def scrape
        s = content_id || '0'
        s = '1' if s.empty?
        pathnr = '%07d' % (s.to_i / 1000)
        s = '%07d' % s.to_i
        logger.debug s
        logger.debug pathnr
        content_video = pathnr + '/' + video_code(s) + '.flv'
        @pathnr = pathnr
        @s = s
        @video_url = "http://dl.redtube.com/_videos_t4vn23s9jc5498tgj49icfj4678/#{content_video}"
      end

      # Returns the URL of the first of ten candidate thumbnails that
      # answers a HEAD request with status 200, or nil when none does.
      # The result is cached in @thumb_url. Timeouts on a candidate are
      # ignored and the next candidate is tried.
      #
      # Relies on @pathnr and @s as set by #scrape.
      def thumb_url
        return @thumb_url if @thumb_url
        1.upto(10) do |i|
          url = "http://thumbs.redtube.com/_thumbs/#{@pathnr}/#{@s}/#{@s}_#{'%03d' % i}.jpg"
          logger.debug url
          begin
            uri = URI.parse(url)
            Net::HTTP.start(uri.host, uri.port) do |http|
              response = http.head(uri.request_uri,
                                   {"User-Agent" => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)"})
              logger.debug response.code
              if 200 == response.code.to_i
                @thumb_url = url
                return @thumb_url
              end
            end
          # The top-level TimeoutError constant was only an alias of
          # Timeout::Error and no longer exists on current Rubies, so it is
          # not listed here.
          rescue Timeout::Error, Errno::ETIMEDOUT
          end
        end
        nil
      end

      # Returns the page title with markup stripped, fetching and caching it
      # on first use.
      def title
        return @title if @title
        html = http_get(@page_url)
        doc = Hpricot(html.toutf8)
        @title = doc.at("//table/tr[2]/td/table/tr[3]/td/table/tr/td").inner_html.gsub(/<[^>]*>/, '').strip
      end

      # Returns the embed snippet from the site's /embed/<id> page (the
      # contents of textarea#cpf), fetching and caching it on first use.
      # Returns nil when the textarea is absent.
      def embed_tag
        return @embed_tag if @embed_tag
        url = "http://www.redtube.com/embed/#{content_id}"
        response_body = http_get(url)
        doc = Hpricot(response_body)
        doc.search('//textarea#cpf') do |elem|
          @embed_tag = elem.inner_html
        end
        @embed_tag
      end

      private

      def content_id; url_regex_match[1]; end

      # Builds the 9-character file code for a zero-padded numeric id.
      #
      # qsum is the digit sum of the position-weighted digit sum of the id.
      # Seven characters are CODE_CHARS lookups at (digit + qsum + offset);
      # the two digits of the zero-padded qsum fill positions 2 and 7.
      #
      # Digits are read with s[i, 1].to_i rather than s[i] - 48: indexing a
      # String with a single Integer yields a character code only on Ruby
      # 1.8 and a one-character String on later versions.
      def video_code(s)
        digit = lambda { |i| s[i, 1].to_i }

        weighted = 0
        s.length.times { |i| weighted += digit.call(i) * (i + 1) }
        qsum = 0
        weighted.to_s.split(//).each { |ch| qsum += ch.to_i }
        qstr = '%02d' % qsum

        code = ''
        code += CODE_CHARS[digit.call(3) + qsum + 3]
        code += qstr[1, 1]
        code += CODE_CHARS[digit.call(0) + qsum + 2]
        code += CODE_CHARS[digit.call(2) + qsum + 1]
        code += CODE_CHARS[digit.call(5) + qsum + 6]
        code += CODE_CHARS[digit.call(1) + qsum + 5]
        code += qstr[0, 1]
        code += CODE_CHARS[digit.call(4) + qsum + 7]
        code += CODE_CHARS[digit.call(6) + qsum + 4]
        code
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# -*- mode:ruby; coding:utf-8 -*-
|
2
|
+
|
3
|
+
require File.expand_path(File.dirname(__FILE__) + '/base')
|
4
|
+
|
5
|
+
module WWW
  module VideoScraper
    # Scraper for tube8.com video pages.
    #
    # Video and thumbnail locations come from the player's FlashVars
    # parameter; a separate .3gp download link is picked up when present.
    class Tube8 < Base
      attr_reader :video_url_3gp
      url_regex(%r!\Ahttp://www\.tube8\.com/.*/(\d+)(?:/|$)!)

      # Fills in @video_url, @thumb_url, @title and, if the page links to a
      # file ending in ".3gp", @video_url_3gp (first such link).
      # @title falls back to nil on any error while reading it.
      #
      # Raises FileNotFound when the page has no FlashVars <param>.
      def scrape
        doc = Hpricot(http_get(@page_url).toutf8)
        flashvars_param = doc.at('//object //param[@name="FlashVars"]')
        raise FileNotFound unless flashvars_param
        flashvars = CGI.parse(flashvars_param.attributes['value'])
        @video_url = flashvars['videoUrl'][0]

        page_uri = URI.parse(@page_url)
        # imageUrl is resolved against the page's scheme and host.
        @thumb_url = URI.join("#{page_uri.scheme}://#{page_uri.host}", flashvars['imageUrl'][0]).to_s
        @title = doc.at('//h1[@class="text"]').inner_html rescue nil

        doc.search('//a').each do |anchor|
          href = anchor.attributes['href']
          next unless href && href.match(/\.3gp$/)
          @video_url_3gp = href
          break
        end
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# -*- mode:ruby; coding:utf-8 -*-
|
2
|
+
|
3
|
+
require File.expand_path(File.dirname(__FILE__) + '/base')
|
4
|
+
|
5
|
+
module WWW
  module VideoScraper
    # Scraper for veoh.com videos.
    #
    # Accepts direct /videos/ URLs as well as collection and browse "watch"
    # URLs; all of them are normalised to the /videos/<id> page.
    class Veoh < Base
      url_regex([%r!\Ahttp://www\.veoh\.com/videos/(v\d+[[:alnum:]]+)!,
                 %r!\Ahttp://www\.veoh\.com/collection/\w+/watch/.*#watch%3[Dd](v\d+[[:alnum:]]+)!,
                 %r!\Ahttp://www\.veoh\.com/(?:browse|collection)/(?:[\w]+/)+watch/(v\d+[[:alnum:]]+)!])

      # Rewrites @page_url to the canonical video page, then fills in
      # @video_url, @title and @thumb_url from attributes of the REST
      # "details" response, and @embed_tag from the page's "embedinput"
      # field. Each value is nil (or, for @embed_tag, left untouched) when
      # its pattern is not found.
      def scrape
        @id = url_regex_match[1]
        @page_url = "http://www.veoh.com/videos/#{@id}"

        details_xml = http_get("http://www.veoh.com/rest/video/#{@id}/details")
        @video_url = details_xml[/fullPreviewHashPath="([^"]+)"/, 1]
        @title = details_xml[/title="([^"]+)"/, 1]
        @thumb_url = details_xml[/fullMedResImagePath="([^"]+)"/, 1]

        page_html = http_get(@page_url)
        escaped_embed = page_html[/class="embedinput"\s[^>]*value="([^"]+)"/, 1]
        @embed_tag = CGI.unescapeHTML(escaped_embed) if escaped_embed
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- mode:ruby; coding:utf-8 -*-
|
2
|
+
|
3
|
+
require File.expand_path(File.dirname(__FILE__) + '/base')
|
4
|
+
|
5
|
+
module WWW
  module VideoScraper
    # Scraper for youporn.com watch pages.
    #
    # The page is requested with "?user_choice=Enter" and an age_check
    # cookie; the video URL is taken from the download links.
    class YouPorn < Base
      url_regex(%r!\Ahttp://(?:www\.)?youporn\.com/watch/(\d+)!)

      # Fills in @video_url (first download link on download.youporn.com),
      # @title (heading text with markup stripped) and, when the heading
      # contains an image, @thumb_url (its "_small.jpg" suffix swapped for
      # "_large.jpg").
      #
      # NOTE(review): a page without the videoArea heading fails with
      # NoMethodError on nil.
      def scrape
        # Not used below; the lookup is kept as in the original code.
        video_id = url_regex_match[1]

        # Any existing query string is replaced.
        entry_url = @page_url.sub(/(\?.*)?$/, '?user_choice=Enter')
        doc = Hpricot(http_get(entry_url, 'Cookie' => 'age_check=1'))

        doc.search('//div[@id="download"]//a').each do |link|
          href = link.attributes['href']
          next unless href =~ %r!^http://download\.youporn\.com/download/.*!
          @video_url = href
          break
        end

        heading = doc.at('//div[@id="videoArea"]/h1')
        @title = heading.inner_html.gsub(/<[^>]*>/, '').strip
        thumb = heading.at('/img')
        @thumb_url = thumb.attributes['src'].sub(/(\d+)_small\.jpg$/, '\1_large.jpg') unless thumb.nil?
      end
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# -*- mode:ruby; coding:utf-8 -*-
|
2
|
+
|
3
|
+
require File.expand_path(File.dirname(__FILE__) + '/base')
|
4
|
+
|
5
|
+
module WWW
  module VideoScraper
    # Scraper for youtube.com watch pages (www and jp hosts).
    #
    # The video location is derived from the "swfArgs" JavaScript object
    # embedded in the watch page.
    class YouTube < Base
      url_regex(%r!\Ahttp://(?:www|jp)\.youtube\.com/watch.*[?&]v=([[:alnum:]]+)!)

      # Fills in @title, @embed_tag, @video_url and @thumb_url.
      #
      # @title falls back to '' and @embed_tag to nil on any error while
      # reading them. If several scripts define swfArgs, the last one wins.
      #
      # Raises FileNotFound when no script yields a video URL.
      def scrape
        page = pass_verify_age
        @title = page.root.at('//head/title').inner_html.sub(/^YouTube[\s-]*/, '') rescue ''
        @embed_tag = page.root.at('//input[@id="embed_code"]').attributes['value'] rescue nil

        page.root.search('//script').each do |script|
          found = script.inner_html.match(/var\s+swfArgs\s*=\s*([^;]+);/)
          next unless found
          swf_args = JSON::parse(found[1])
          # Same scheme and host as the watch page, pointing at /get_video.
          video_uri = URI.parse(@page_url)
          video_uri.path = '/get_video'
          video_uri.query = "video_id=#{swf_args['video_id']}&t=#{swf_args['t']}"
          @video_url = video_uri.to_s
          @thumb_url = "http://i.ytimg.com/vi/#{swf_args['video_id']}/default.jpg"
        end

        raise FileNotFound, 'file not found' if @video_url.nil?
      end

      private

      # Signs in through the 'gaia_loginform' form on the site's /login page
      # using the :you_tube_username / :you_tube_password options.
      # (An earlier revision targeted a form named 'loginForm' with
      # username/password fields.)
      def login
        page_uri = URI.parse(@page_url)
        login_page = agent.get("#{page_uri.scheme}://#{page_uri.host}/login")
        form = login_page.form('gaia_loginform')
        form.email = @opt[:you_tube_username]
        form.passwd = @opt[:you_tube_password]
        agent.submit(form)
      end

      # Fetches the watch page. When the request ends up on a verify_age
      # URL, logs in and confirms the birth date, asking to be sent back to
      # the original path and query. Returns the resulting page.
      def pass_verify_age
        watch_uri = URI.parse(@page_url)
        page = agent.get(watch_uri)
        return page unless page.uri.path =~ /verify_age/

        login
        agent.post(page.uri,
                   'next_url' => "#{watch_uri.path}?#{watch_uri.query}",
                   'action_confirm' => 'Confirm Birth Date')
      end
    end
  end
end
|