valda-video_scraper 1.0.4 → 1.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/www/video_scraper.rb +1 -1
- data/lib/www/video_scraper/base.rb +5 -2
- data/lib/www/video_scraper/eic_book.rb +2 -1
- data/lib/www/video_scraper/pornhub.rb +4 -3
- data/lib/www/video_scraper/pornotube.rb +3 -1
- data/lib/www/video_scraper/red_tube.rb +7 -0
- data/lib/www/video_scraper/veoh.rb +6 -2
- data/lib/www/video_scraper/you_porn.rb +3 -3
- data/lib/www/video_scraper/you_tube.rb +6 -3
- data/test/www/test_video_scraper.rb +2 -0
- data/test/www/video_scraper/test_eic_book.rb +1 -2
- data/test/www/video_scraper/test_pornhub.rb +1 -0
- data/test/www/video_scraper/test_veoh.rb +12 -1
- metadata +6 -5
data/lib/www/video_scraper.rb
CHANGED
@@ -19,7 +19,7 @@ module WWW
|
|
19
19
|
end
|
20
20
|
|
21
21
|
def valid_url?(url)
|
22
|
-
|
22
|
+
Array(@url_regex).any? { |r| r.match(url) }
|
23
23
|
end
|
24
24
|
|
25
25
|
def scrape(url, opt = nil)
|
@@ -32,7 +32,10 @@ module WWW
|
|
32
32
|
def initialize(url, opt = nil)
|
33
33
|
@page_url = url
|
34
34
|
@opt = (opt || {})
|
35
|
-
|
35
|
+
url_regex = self.class.instance_variable_get(:@url_regex)
|
36
|
+
Array(url_regex).any? do |r|
|
37
|
+
@url_regex_match = r.match(@page_url).freeze
|
38
|
+
end
|
36
39
|
raise StandardError, "url is not #{self.class.name} link: #{url}" if @url_regex_match.nil?
|
37
40
|
end
|
38
41
|
|
@@ -10,7 +10,8 @@ module WWW
|
|
10
10
|
|
11
11
|
def scrape
|
12
12
|
uri = URI.parse(@page_url)
|
13
|
-
|
13
|
+
@page_url = "#{uri.scheme}://#{uri.host}#{uri.path}?flg=sm"
|
14
|
+
html = http_get(@page_url)
|
14
15
|
doc = Hpricot(html.toutf8)
|
15
16
|
raise FileNotFound unless flashvars = doc.at('//object //param[@name="FlashVars"]')
|
16
17
|
flashvars = CGI.parse(flashvars.attributes['value'])
|
@@ -10,13 +10,14 @@ module WWW
|
|
10
10
|
def scrape
|
11
11
|
html = http_get(@page_url)
|
12
12
|
raise FileNotFound unless m = html.match(/\.addVariable\("options",\s*"([^"]+)"\);/i)
|
13
|
-
|
14
|
-
|
15
|
-
@video_url =
|
13
|
+
request_url = URI.decode m[1]
|
14
|
+
response_body = http_get(request_url)
|
15
|
+
@video_url = response_body.match(%r|<flv_url>([^<]+)</flv_url>|).to_a[1]
|
16
16
|
if m = @video_url.match(%r|videos/(\d{3}/\d{3}/\d{3})/\d+.flv|)
|
17
17
|
@thumb_url = "http://p1.pornhub.com/thumbs/#{m[1]}/small.jpg"
|
18
18
|
end
|
19
19
|
@embed_tag = html.match(%r|<textarea[^>]+class="share-flag-embed">(<object type="application/x-shockwave-flash".*?</object>)</textarea>|).to_a[1]
|
20
|
+
@title = html.match(%r|<title>(.*) - Pornhub\.com</title>|).to_a[1]
|
20
21
|
end
|
21
22
|
end
|
22
23
|
end
|
@@ -14,7 +14,9 @@ module WWW
|
|
14
14
|
page = agent.get(@page_url)
|
15
15
|
raise FileNotFound unless embed = page.root.at('//object/embed')
|
16
16
|
src = embed.attributes['src']
|
17
|
-
hash = src.match(/\?v=(.*)$/)[1]
|
17
|
+
hash = src.to_s.match(/\?v=(.*)$/)[1]
|
18
|
+
t = page.at('//div[@class="contentheader"]//span[@class="blue"]')
|
19
|
+
@title = t.inner_html.gsub(/<[^>]*>/, '').strip
|
18
20
|
page = agent.get("http://pornotube.com/player/player.php?#{hash}")
|
19
21
|
q = CGI::parse(page.body)
|
20
22
|
@video_url = "http://#{q['mediaDomain'][0]}.pornotube.com/#{q['userId'][0]}/#{q['mediaId'][0]}.flv"
|
@@ -64,6 +64,13 @@ module WWW
|
|
64
64
|
nil
|
65
65
|
end
|
66
66
|
|
67
|
+
def title
|
68
|
+
return @title if @title
|
69
|
+
html = http_get(@page_url)
|
70
|
+
doc = Hpricot(html.toutf8)
|
71
|
+
@title = doc.at("//table/tr[2]/td/table/tr[3]/td/table/tr/td").inner_html.gsub(/<[^>]*>/, '').strip
|
72
|
+
end
|
73
|
+
|
67
74
|
def embed_tag
|
68
75
|
return @embed_tag if @embed_tag
|
69
76
|
url = "http://www.redtube.com/embed/#{content_id}"
|
@@ -5,17 +5,21 @@ require File.expand_path(File.dirname(__FILE__) + '/base')
|
|
5
5
|
module WWW
|
6
6
|
module VideoScraper
|
7
7
|
class Veoh < Base
|
8
|
-
url_regex %r!\Ahttp://www\.veoh\.com/videos/([[:alnum:]]+)!
|
8
|
+
url_regex [%r!\Ahttp://www\.veoh\.com/videos/(v\d+[[:alnum:]]+)!,
|
9
|
+
%r!\Ahttp://www\.veoh\.com/collection/\w+/watch/.*#watch%3[Dd](v\d+[[:alnum:]]+)!,
|
10
|
+
%r!\Ahttp://www\.veoh\.com/(?:browse|collection)/(?:[\w]+/)+watch/(v\d+[[:alnum:]]+)!]
|
9
11
|
|
10
12
|
def scrape
|
11
13
|
@id = url_regex_match[1]
|
14
|
+
@page_url = "http://www.veoh.com/videos/#{@id}"
|
12
15
|
request_url = "http://www.veoh.com/rest/video/#{@id}/details"
|
13
16
|
xml = http_get(request_url)
|
14
17
|
@video_url = xml.match(/fullPreviewHashPath="([^"]+)"/).to_a[1]
|
15
18
|
@title = xml.match(/title="([^"]+)"/).to_a[1]
|
16
19
|
@thumb_url = xml.match(/fullMedResImagePath="([^"]+)"/).to_a[1]
|
17
20
|
html = http_get(@page_url)
|
18
|
-
|
21
|
+
#logger.debug html
|
22
|
+
if embed_tag = html.match(/class="embedinput"\s[^>]*value="([^"]+)"/).to_a[1]
|
19
23
|
@embed_tag = CGI.unescapeHTML(embed_tag)
|
20
24
|
end
|
21
25
|
end
|
@@ -5,7 +5,7 @@ require File.expand_path(File.dirname(__FILE__) + '/base')
|
|
5
5
|
module WWW
|
6
6
|
module VideoScraper
|
7
7
|
class YouPorn < Base
|
8
|
-
url_regex %r!\Ahttp://youporn\.com/watch/(\d+)!
|
8
|
+
url_regex %r!\Ahttp://(?:www\.)?youporn\.com/watch/(\d+)!
|
9
9
|
|
10
10
|
def scrape
|
11
11
|
id = url_regex_match[1]
|
@@ -15,11 +15,11 @@ module WWW
|
|
15
15
|
doc = Hpricot(html)
|
16
16
|
doc.search('//div[@id="download"]//a').each do |elem|
|
17
17
|
href = elem.attributes['href']
|
18
|
-
(@video_url = href; break) if href =~ %r!^http://download\.youporn\.com/download
|
18
|
+
(@video_url = href; break) if href =~ %r!^http://download\.youporn\.com/download/.*!
|
19
19
|
end
|
20
20
|
h1 = doc.at('//div[@id="videoArea"]/h1')
|
21
21
|
@title = h1.inner_html.gsub(/<[^>]*>/, '').strip
|
22
|
-
@thumb_url = h1.at('/img').attributes['src'].sub(/(\d+)_small\.jpg$/, '\1_large.jpg')
|
22
|
+
@thumb_url = h1.at('/img').attributes['src'].sub(/(\d+)_small\.jpg$/, '\1_large.jpg') if h1.at('/img') != nil
|
23
23
|
end
|
24
24
|
end
|
25
25
|
end
|
@@ -28,9 +28,12 @@ module WWW
|
|
28
28
|
def login
|
29
29
|
uri = URI.parse(@page_url)
|
30
30
|
page = agent.get("#{uri.scheme}://#{uri.host}/login")
|
31
|
-
login_form = page.form('loginForm')
|
32
|
-
login_form.username = @opt[:you_tube_username]
|
33
|
-
login_form.password = @opt[:you_tube_password]
|
31
|
+
#login_form = page.form('loginForm')
|
32
|
+
#login_form.username = @opt[:you_tube_username]
|
33
|
+
#login_form.password = @opt[:you_tube_password]
|
34
|
+
login_form = page.form('gaia_loginform')
|
35
|
+
login_form.email = @opt[:you_tube_username]
|
36
|
+
login_form.passwd = @opt[:you_tube_password]
|
34
37
|
agent.submit(login_form)
|
35
38
|
end
|
36
39
|
|
@@ -34,6 +34,8 @@ class TestVideoScraper < Test::Unit::TestCase
|
|
34
34
|
end
|
35
35
|
|
36
36
|
def test_scrape
|
37
|
+
mod = WWW::VideoScraper.find_module('http://www.yourfilehost.com/media.php?cat=video&file=XV436__03.wmv')
|
38
|
+
assert_equal WWW::VideoScraper::YourFileHost, mod
|
37
39
|
vs = WWW::VideoScraper.scrape('http://www.yourfilehost.com/media.php?cat=video&file=XV436__03.wmv')
|
38
40
|
assert_kind_of WWW::VideoScraper::YourFileHost, vs
|
39
41
|
end
|
@@ -5,11 +5,10 @@ require File.dirname(__FILE__) + '/../../test_helper'
|
|
5
5
|
class EicBook < Test::Unit::TestCase
|
6
6
|
def test_scrape
|
7
7
|
vs = WWW::VideoScraper::EicBook.scrape('http://www.eic-book.com/detail_12759.html', default_opt)
|
8
|
-
assert_equal 'http://www.eic-book.com/detail_12759.html', vs.page_url
|
8
|
+
assert_equal 'http://www.eic-book.com/detail_12759.html?flg=sm', vs.page_url
|
9
9
|
assert_equal 'http://flv.idol-mile.com/book/12759.flv', vs.video_url
|
10
10
|
assert_equal 'http://www.eic-book.com/img/product/h4/pp_12759.jpg', vs.thumb_url
|
11
11
|
assert_equal '藤木あやか DVD 「お蔵入り寸前!藤木あやか A面」', vs.title
|
12
12
|
assert_equal 24, vs.capture_urls.count
|
13
13
|
end
|
14
14
|
end
|
15
|
-
|
@@ -9,5 +9,6 @@ class TestPornhub < Test::Unit::TestCase
|
|
9
9
|
assert_match %r|http://media1.pornhub.com/dl/[[:alnum:]]{32}/[[:alnum:]]{8}/videos/000/191/743/191743\.flv|, vs.video_url
|
10
10
|
assert_equal 'http://p1.pornhub.com/thumbs/000/191/743/small.jpg', vs.thumb_url
|
11
11
|
assert_match %r|^<object type=\"application/x-shockwave-flash\" data=\".*</object>$|, vs.embed_tag
|
12
|
+
assert_equal 'Liliane Tiger and Jane Darling to hot to handle', vs.title
|
12
13
|
end
|
13
14
|
end
|
@@ -8,6 +8,17 @@ class TestVeoh < Test::Unit::TestCase
|
|
8
8
|
assert_equal 'http://www.veoh.com/videos/v6245232rh8aGEM9', vs.page_url
|
9
9
|
assert_match %r|http://content\.veoh\.com/flash/p/\d/[[:alnum:]]{16}/[[:alnum:]]{40}\.fll\?ct=[[:alnum:]]{48}|, vs.video_url
|
10
10
|
assert_match %r|http://p-images\.veoh\.com/image\.out\?imageId=media-[[:alnum:]]+.jpg|, vs.thumb_url
|
11
|
-
assert_match %r|^<
|
11
|
+
assert_match %r|^<object\s.*>$|, vs.embed_tag
|
12
|
+
end
|
13
|
+
|
14
|
+
def test_canonical_url
|
15
|
+
vs = WWW::VideoScraper::Veoh.scrape('http://www.veoh.com/collection/maysaku/watch/v19937773gwSJPMk', default_opt)
|
16
|
+
assert_equal 'http://www.veoh.com/videos/v19937773gwSJPMk', vs.page_url
|
17
|
+
vs = WWW::VideoScraper::Veoh.scrape('http://www.veoh.com/collection/maysaku/watch/v19937773gwSJPMk#watch%3Dv16112008KGD7Pg2n', default_opt)
|
18
|
+
assert_equal 'http://www.veoh.com/videos/v16112008KGD7Pg2n', vs.page_url
|
19
|
+
vs = WWW::VideoScraper::Veoh.scrape('http://www.veoh.com/videos/v19937773gwSJPMk?rank=0&jsonParams=%7B%22numResults%22%3A20%2C%22rlmin%22%3A0%2C%22query%22%3A%22Shaman+King+01%22%2C%22rlmax%22%3Anull%2C%22veohOnly%22%3Atrue%2C%22order%22%3A%22default%22%2C%22range%22%3A%22a%22%2C%22sId%22%3A%22192998624295114150%22%7D&searchId=192998624295114150&rank=1', default_opt)
|
20
|
+
assert_equal 'http://www.veoh.com/videos/v19937773gwSJPMk', vs.page_url
|
21
|
+
vs = WWW::VideoScraper::Veoh.scrape('http://www.veoh.com/browse/videos/category/comedy/watch/v17078605sszQzbBF')
|
22
|
+
assert_equal 'http://www.veoh.com/videos/v17078605sszQzbBF', vs.page_url
|
12
23
|
end
|
13
24
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: valda-video_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.4
|
4
|
+
version: 1.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- YAMAGUCHI Seiji
|
@@ -9,11 +9,12 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-06-10 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: mechanize
|
17
|
+
type: :runtime
|
17
18
|
version_requirement:
|
18
19
|
version_requirements: !ruby/object:Gem::Requirement
|
19
20
|
requirements:
|
@@ -23,6 +24,7 @@ dependencies:
|
|
23
24
|
version:
|
24
25
|
- !ruby/object:Gem::Dependency
|
25
26
|
name: hpricot
|
27
|
+
type: :runtime
|
26
28
|
version_requirement:
|
27
29
|
version_requirements: !ruby/object:Gem::Requirement
|
28
30
|
requirements:
|
@@ -63,7 +65,6 @@ files:
|
|
63
65
|
- test/www/video_scraper/test_red_tube.rb
|
64
66
|
- test/www/video_scraper/test_base.rb
|
65
67
|
- test/www/test_video_scraper.rb
|
66
|
-
- test/www/test_video_scraper_flymake.rb
|
67
68
|
- lib/www
|
68
69
|
- lib/www/video_scraper
|
69
70
|
- lib/www/video_scraper/nico_video.rb
|
@@ -83,7 +84,7 @@ files:
|
|
83
84
|
- lib/www/video_scraper/your_file_host.rb
|
84
85
|
- lib/www/video_scraper/tube8.rb
|
85
86
|
- lib/www/video_scraper.rb
|
86
|
-
has_rdoc:
|
87
|
+
has_rdoc: false
|
87
88
|
homepage: http://github.com/valda/video_scraper
|
88
89
|
post_install_message:
|
89
90
|
rdoc_options:
|
@@ -118,7 +119,7 @@ requirements: []
|
|
118
119
|
rubyforge_project: video_scraper
|
119
120
|
rubygems_version: 1.2.0
|
120
121
|
signing_key:
|
121
|
-
specification_version:
|
122
|
+
specification_version: 3
|
122
123
|
summary: Web scraping library for video sharing sites.
|
123
124
|
test_files: []
|
124
125
|
|