valda-video_scraper 1.0.4 → 1.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -15,7 +15,7 @@ end
15
15
 
16
16
  module WWW
17
17
  module VideoScraper
18
- VERSION = '1.0.4'
18
+ VERSION = '1.0.5'
19
19
 
20
20
  MODULES_NAME = %w(adult_satellites age_sage ameba_vision dailymotion eic_book
21
21
  moro_tube nico_video pornhub pornotube red_tube tube8 veoh
@@ -19,7 +19,7 @@ module WWW
19
19
  end
20
20
 
21
21
  def valid_url?(url)
22
- not (url =~ @url_regex).nil?
22
+ Array(@url_regex).any? { |r| r.match(url) }
23
23
  end
24
24
 
25
25
  def scrape(url, opt = nil)
@@ -32,7 +32,10 @@ module WWW
32
32
  def initialize(url, opt = nil)
33
33
  @page_url = url
34
34
  @opt = (opt || {})
35
- @url_regex_match = self.class.instance_variable_get(:@url_regex).match(@page_url).freeze
35
+ url_regex = self.class.instance_variable_get(:@url_regex)
36
+ Array(url_regex).any? do |r|
37
+ @url_regex_match = r.match(@page_url).freeze
38
+ end
36
39
  raise StandardError, "url is not #{self.class.name} link: #{url}" if @url_regex_match.nil?
37
40
  end
38
41
 
@@ -10,7 +10,8 @@ module WWW
10
10
 
11
11
  def scrape
12
12
  uri = URI.parse(@page_url)
13
- html = http_get("#{uri.scheme}://#{uri.host}#{uri.path}?flg=sm")
13
+ @page_url = "#{uri.scheme}://#{uri.host}#{uri.path}?flg=sm"
14
+ html = http_get(@page_url)
14
15
  doc = Hpricot(html.toutf8)
15
16
  raise FileNotFound unless flashvars = doc.at('//object //param[@name="FlashVars"]')
16
17
  flashvars = CGI.parse(flashvars.attributes['value'])
@@ -10,13 +10,14 @@ module WWW
10
10
  def scrape
11
11
  html = http_get(@page_url)
12
12
  raise FileNotFound unless m = html.match(/\.addVariable\("options",\s*"([^"]+)"\);/i)
13
- @request_url = URI.decode m[1]
14
- @response_body = http_get(@request_url)
15
- @video_url = @response_body.match(%r|<flv_url>([^<]+)</flv_url>|).to_a[1]
13
+ request_url = URI.decode m[1]
14
+ response_body = http_get(request_url)
15
+ @video_url = response_body.match(%r|<flv_url>([^<]+)</flv_url>|).to_a[1]
16
16
  if m = @video_url.match(%r|videos/(\d{3}/\d{3}/\d{3})/\d+.flv|)
17
17
  @thumb_url = "http://p1.pornhub.com/thumbs/#{m[1]}/small.jpg"
18
18
  end
19
19
  @embed_tag = html.match(%r|<textarea[^>]+class="share-flag-embed">(<object type="application/x-shockwave-flash".*?</object>)</textarea>|).to_a[1]
20
+ @title = html.match(%r|<title>(.*) - Pornhub\.com</title>|).to_a[1]
20
21
  end
21
22
  end
22
23
  end
@@ -14,7 +14,9 @@ module WWW
14
14
  page = agent.get(@page_url)
15
15
  raise FileNotFound unless embed = page.root.at('//object/embed')
16
16
  src = embed.attributes['src']
17
- hash = src.match(/\?v=(.*)$/)[1]
17
+ hash = src.to_s.match(/\?v=(.*)$/)[1]
18
+ t = page.at('//div[@class="contentheader"]//span[@class="blue"]')
19
+ @title = t.inner_html.gsub(/<[^>]*>/, '').strip
18
20
  page = agent.get("http://pornotube.com/player/player.php?#{hash}")
19
21
  q = CGI::parse(page.body)
20
22
  @video_url = "http://#{q['mediaDomain'][0]}.pornotube.com/#{q['userId'][0]}/#{q['mediaId'][0]}.flv"
@@ -64,6 +64,13 @@ module WWW
64
64
  nil
65
65
  end
66
66
 
67
+ def title
68
+ return @title if @title
69
+ html = http_get(@page_url)
70
+ doc = Hpricot(html.toutf8)
71
+ @title = doc.at("//table/tr[2]/td/table/tr[3]/td/table/tr/td").inner_html.gsub(/<[^>]*>/, '').strip
72
+ end
73
+
67
74
  def embed_tag
68
75
  return @embed_tag if @embed_tag
69
76
  url = "http://www.redtube.com/embed/#{content_id}"
@@ -5,17 +5,21 @@ require File.expand_path(File.dirname(__FILE__) + '/base')
5
5
  module WWW
6
6
  module VideoScraper
7
7
  class Veoh < Base
8
- url_regex %r!\Ahttp://www\.veoh\.com/videos/([[:alnum:]]+)!
8
+ url_regex [%r!\Ahttp://www\.veoh\.com/videos/(v\d+[[:alnum:]]+)!,
9
+ %r!\Ahttp://www\.veoh\.com/collection/\w+/watch/.*#watch%3[Dd](v\d+[[:alnum:]]+)!,
10
+ %r!\Ahttp://www\.veoh\.com/(?:browse|collection)/(?:[\w]+/)+watch/(v\d+[[:alnum:]]+)!]
9
11
 
10
12
  def scrape
11
13
  @id = url_regex_match[1]
14
+ @page_url = "http://www.veoh.com/videos/#{@id}"
12
15
  request_url = "http://www.veoh.com/rest/video/#{@id}/details"
13
16
  xml = http_get(request_url)
14
17
  @video_url = xml.match(/fullPreviewHashPath="([^"]+)"/).to_a[1]
15
18
  @title = xml.match(/title="([^"]+)"/).to_a[1]
16
19
  @thumb_url = xml.match(/fullMedResImagePath="([^"]+)"/).to_a[1]
17
20
  html = http_get(@page_url)
18
- if embed_tag = html.match(/\sid="embed"\s[^>]*value="([^"]+)"/).to_a[1]
21
+ #logger.debug html
22
+ if embed_tag = html.match(/class="embedinput"\s[^>]*value="([^"]+)"/).to_a[1]
19
23
  @embed_tag = CGI.unescapeHTML(embed_tag)
20
24
  end
21
25
  end
@@ -5,7 +5,7 @@ require File.expand_path(File.dirname(__FILE__) + '/base')
5
5
  module WWW
6
6
  module VideoScraper
7
7
  class YouPorn < Base
8
- url_regex %r!\Ahttp://youporn\.com/watch/(\d+)!
8
+ url_regex %r!\Ahttp://(?:www\.)?youporn\.com/watch/(\d+)!
9
9
 
10
10
  def scrape
11
11
  id = url_regex_match[1]
@@ -15,11 +15,11 @@ module WWW
15
15
  doc = Hpricot(html)
16
16
  doc.search('//div[@id="download"]//a').each do |elem|
17
17
  href = elem.attributes['href']
18
- (@video_url = href; break) if href =~ %r!^http://download\.youporn\.com/download/.*\.flv!
18
+ (@video_url = href; break) if href =~ %r!^http://download\.youporn\.com/download/.*!
19
19
  end
20
20
  h1 = doc.at('//div[@id="videoArea"]/h1')
21
21
  @title = h1.inner_html.gsub(/<[^>]*>/, '').strip
22
- @thumb_url = h1.at('/img').attributes['src'].sub(/(\d+)_small\.jpg$/, '\1_large.jpg')
22
+ @thumb_url = h1.at('/img').attributes['src'].sub(/(\d+)_small\.jpg$/, '\1_large.jpg') if h1.at('/img') != nil
23
23
  end
24
24
  end
25
25
  end
@@ -28,9 +28,12 @@ module WWW
28
28
  def login
29
29
  uri = URI.parse(@page_url)
30
30
  page = agent.get("#{uri.scheme}://#{uri.host}/login")
31
- login_form = page.form('loginForm')
32
- login_form.username = @opt[:you_tube_username]
33
- login_form.password = @opt[:you_tube_password]
31
+ #login_form = page.form('loginForm')
32
+ #login_form.username = @opt[:you_tube_username]
33
+ #login_form.password = @opt[:you_tube_password]
34
+ login_form = page.form('gaia_loginform')
35
+ login_form.email = @opt[:you_tube_username]
36
+ login_form.passwd = @opt[:you_tube_password]
34
37
  agent.submit(login_form)
35
38
  end
36
39
 
@@ -34,6 +34,8 @@ class TestVideoScraper < Test::Unit::TestCase
34
34
  end
35
35
 
36
36
  def test_scrape
37
+ mod = WWW::VideoScraper.find_module('http://www.yourfilehost.com/media.php?cat=video&file=XV436__03.wmv')
38
+ assert_equal WWW::VideoScraper::YourFileHost, mod
37
39
  vs = WWW::VideoScraper.scrape('http://www.yourfilehost.com/media.php?cat=video&file=XV436__03.wmv')
38
40
  assert_kind_of WWW::VideoScraper::YourFileHost, vs
39
41
  end
@@ -5,11 +5,10 @@ require File.dirname(__FILE__) + '/../../test_helper'
5
5
  class EicBook < Test::Unit::TestCase
6
6
  def test_scrape
7
7
  vs = WWW::VideoScraper::EicBook.scrape('http://www.eic-book.com/detail_12759.html', default_opt)
8
- assert_equal 'http://www.eic-book.com/detail_12759.html', vs.page_url
8
+ assert_equal 'http://www.eic-book.com/detail_12759.html?flg=sm', vs.page_url
9
9
  assert_equal 'http://flv.idol-mile.com/book/12759.flv', vs.video_url
10
10
  assert_equal 'http://www.eic-book.com/img/product/h4/pp_12759.jpg', vs.thumb_url
11
11
  assert_equal '藤木あやか DVD 「お蔵入り寸前!藤木あやか A面」', vs.title
12
12
  assert_equal 24, vs.capture_urls.count
13
13
  end
14
14
  end
15
-
@@ -9,5 +9,6 @@ class TestPornhub < Test::Unit::TestCase
9
9
  assert_match %r|http://media1.pornhub.com/dl/[[:alnum:]]{32}/[[:alnum:]]{8}/videos/000/191/743/191743\.flv|, vs.video_url
10
10
  assert_equal 'http://p1.pornhub.com/thumbs/000/191/743/small.jpg', vs.thumb_url
11
11
  assert_match %r|^<object type=\"application/x-shockwave-flash\" data=\".*</object>$|, vs.embed_tag
12
+ assert_equal 'Liliane Tiger and Jane Darling to hot to handle', vs.title
12
13
  end
13
14
  end
@@ -8,6 +8,17 @@ class TestVeoh < Test::Unit::TestCase
8
8
  assert_equal 'http://www.veoh.com/videos/v6245232rh8aGEM9', vs.page_url
9
9
  assert_match %r|http://content\.veoh\.com/flash/p/\d/[[:alnum:]]{16}/[[:alnum:]]{40}\.fll\?ct=[[:alnum:]]{48}|, vs.video_url
10
10
  assert_match %r|http://p-images\.veoh\.com/image\.out\?imageId=media-[[:alnum:]]+.jpg|, vs.thumb_url
11
- assert_match %r|^<embed\s.*>$|, vs.embed_tag
11
+ assert_match %r|^<object\s.*>$|, vs.embed_tag
12
+ end
13
+
14
+ def test_canonical_url
15
+ vs = WWW::VideoScraper::Veoh.scrape('http://www.veoh.com/collection/maysaku/watch/v19937773gwSJPMk', default_opt)
16
+ assert_equal 'http://www.veoh.com/videos/v19937773gwSJPMk', vs.page_url
17
+ vs = WWW::VideoScraper::Veoh.scrape('http://www.veoh.com/collection/maysaku/watch/v19937773gwSJPMk#watch%3Dv16112008KGD7Pg2n', default_opt)
18
+ assert_equal 'http://www.veoh.com/videos/v16112008KGD7Pg2n', vs.page_url
19
+ vs = WWW::VideoScraper::Veoh.scrape('http://www.veoh.com/videos/v19937773gwSJPMk?rank=0&jsonParams=%7B%22numResults%22%3A20%2C%22rlmin%22%3A0%2C%22query%22%3A%22Shaman+King+01%22%2C%22rlmax%22%3Anull%2C%22veohOnly%22%3Atrue%2C%22order%22%3A%22default%22%2C%22range%22%3A%22a%22%2C%22sId%22%3A%22192998624295114150%22%7D&searchId=192998624295114150&rank=1', default_opt)
20
+ assert_equal 'http://www.veoh.com/videos/v19937773gwSJPMk', vs.page_url
21
+ vs = WWW::VideoScraper::Veoh.scrape('http://www.veoh.com/browse/videos/category/comedy/watch/v17078605sszQzbBF')
22
+ assert_equal 'http://www.veoh.com/videos/v17078605sszQzbBF', vs.page_url
12
23
  end
13
24
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: valda-video_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - YAMAGUCHI Seiji
@@ -9,11 +9,12 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-02-10 00:00:00 -08:00
12
+ date: 2009-06-10 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: mechanize
17
+ type: :runtime
17
18
  version_requirement:
18
19
  version_requirements: !ruby/object:Gem::Requirement
19
20
  requirements:
@@ -23,6 +24,7 @@ dependencies:
23
24
  version:
24
25
  - !ruby/object:Gem::Dependency
25
26
  name: hpricot
27
+ type: :runtime
26
28
  version_requirement:
27
29
  version_requirements: !ruby/object:Gem::Requirement
28
30
  requirements:
@@ -63,7 +65,6 @@ files:
63
65
  - test/www/video_scraper/test_red_tube.rb
64
66
  - test/www/video_scraper/test_base.rb
65
67
  - test/www/test_video_scraper.rb
66
- - test/www/test_video_scraper_flymake.rb
67
68
  - lib/www
68
69
  - lib/www/video_scraper
69
70
  - lib/www/video_scraper/nico_video.rb
@@ -83,7 +84,7 @@ files:
83
84
  - lib/www/video_scraper/your_file_host.rb
84
85
  - lib/www/video_scraper/tube8.rb
85
86
  - lib/www/video_scraper.rb
86
- has_rdoc: true
87
+ has_rdoc: false
87
88
  homepage: http://github.com/valda/video_scraper
88
89
  post_install_message:
89
90
  rdoc_options:
@@ -118,7 +119,7 @@ requirements: []
118
119
  rubyforge_project: video_scraper
119
120
  rubygems_version: 1.2.0
120
121
  signing_key:
121
- specification_version: 2
122
+ specification_version: 3
122
123
  summary: Web scraping library for video sharing sites.
123
124
  test_files: []
124
125