valda-video_scraper 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,7 +15,7 @@ end
15
15
 
16
16
  module WWW
17
17
  module VideoScraper
18
- VERSION = '1.0.4'
18
+ VERSION = '1.0.5'
19
19
 
20
20
  MODULES_NAME = %w(adult_satellites age_sage ameba_vision dailymotion eic_book
21
21
  moro_tube nico_video pornhub pornotube red_tube tube8 veoh
@@ -19,7 +19,7 @@ module WWW
19
19
  end
20
20
 
21
21
  def valid_url?(url)
22
- not (url =~ @url_regex).nil?
22
+ Array(@url_regex).any? { |r| r.match(url) }
23
23
  end
24
24
 
25
25
  def scrape(url, opt = nil)
@@ -32,7 +32,10 @@ module WWW
32
32
  def initialize(url, opt = nil)
33
33
  @page_url = url
34
34
  @opt = (opt || {})
35
- @url_regex_match = self.class.instance_variable_get(:@url_regex).match(@page_url).freeze
35
+ url_regex = self.class.instance_variable_get(:@url_regex)
36
+ Array(url_regex).any? do |r|
37
+ @url_regex_match = r.match(@page_url).freeze
38
+ end
36
39
  raise StandardError, "url is not #{self.class.name} link: #{url}" if @url_regex_match.nil?
37
40
  end
38
41
 
@@ -10,7 +10,8 @@ module WWW
10
10
 
11
11
  def scrape
12
12
  uri = URI.parse(@page_url)
13
- html = http_get("#{uri.scheme}://#{uri.host}#{uri.path}?flg=sm")
13
+ @page_url = "#{uri.scheme}://#{uri.host}#{uri.path}?flg=sm"
14
+ html = http_get(@page_url)
14
15
  doc = Hpricot(html.toutf8)
15
16
  raise FileNotFound unless flashvars = doc.at('//object //param[@name="FlashVars"]')
16
17
  flashvars = CGI.parse(flashvars.attributes['value'])
@@ -10,13 +10,14 @@ module WWW
10
10
  def scrape
11
11
  html = http_get(@page_url)
12
12
  raise FileNotFound unless m = html.match(/\.addVariable\("options",\s*"([^"]+)"\);/i)
13
- @request_url = URI.decode m[1]
14
- @response_body = http_get(@request_url)
15
- @video_url = @response_body.match(%r|<flv_url>([^<]+)</flv_url>|).to_a[1]
13
+ request_url = URI.decode m[1]
14
+ response_body = http_get(request_url)
15
+ @video_url = response_body.match(%r|<flv_url>([^<]+)</flv_url>|).to_a[1]
16
16
  if m = @video_url.match(%r|videos/(\d{3}/\d{3}/\d{3})/\d+.flv|)
17
17
  @thumb_url = "http://p1.pornhub.com/thumbs/#{m[1]}/small.jpg"
18
18
  end
19
19
  @embed_tag = html.match(%r|<textarea[^>]+class="share-flag-embed">(<object type="application/x-shockwave-flash".*?</object>)</textarea>|).to_a[1]
20
+ @title = html.match(%r|<title>(.*) - Pornhub\.com</title>|).to_a[1]
20
21
  end
21
22
  end
22
23
  end
@@ -14,7 +14,9 @@ module WWW
14
14
  page = agent.get(@page_url)
15
15
  raise FileNotFound unless embed = page.root.at('//object/embed')
16
16
  src = embed.attributes['src']
17
- hash = src.match(/\?v=(.*)$/)[1]
17
+ hash = src.to_s.match(/\?v=(.*)$/)[1]
18
+ t = page.at('//div[@class="contentheader"]//span[@class="blue"]')
19
+ @title = t.inner_html.gsub(/<[^>]*>/, '').strip
18
20
  page = agent.get("http://pornotube.com/player/player.php?#{hash}")
19
21
  q = CGI::parse(page.body)
20
22
  @video_url = "http://#{q['mediaDomain'][0]}.pornotube.com/#{q['userId'][0]}/#{q['mediaId'][0]}.flv"
@@ -64,6 +64,13 @@ module WWW
64
64
  nil
65
65
  end
66
66
 
67
+ def title
68
+ return @title if @title
69
+ html = http_get(@page_url)
70
+ doc = Hpricot(html.toutf8)
71
+ @title = doc.at("//table/tr[2]/td/table/tr[3]/td/table/tr/td").inner_html.gsub(/<[^>]*>/, '').strip
72
+ end
73
+
67
74
  def embed_tag
68
75
  return @embed_tag if @embed_tag
69
76
  url = "http://www.redtube.com/embed/#{content_id}"
@@ -5,17 +5,21 @@ require File.expand_path(File.dirname(__FILE__) + '/base')
5
5
  module WWW
6
6
  module VideoScraper
7
7
  class Veoh < Base
8
- url_regex %r!\Ahttp://www\.veoh\.com/videos/([[:alnum:]]+)!
8
+ url_regex [%r!\Ahttp://www\.veoh\.com/videos/(v\d+[[:alnum:]]+)!,
9
+ %r!\Ahttp://www\.veoh\.com/collection/\w+/watch/.*#watch%3[Dd](v\d+[[:alnum:]]+)!,
10
+ %r!\Ahttp://www\.veoh\.com/(?:browse|collection)/(?:[\w]+/)+watch/(v\d+[[:alnum:]]+)!]
9
11
 
10
12
  def scrape
11
13
  @id = url_regex_match[1]
14
+ @page_url = "http://www.veoh.com/videos/#{@id}"
12
15
  request_url = "http://www.veoh.com/rest/video/#{@id}/details"
13
16
  xml = http_get(request_url)
14
17
  @video_url = xml.match(/fullPreviewHashPath="([^"]+)"/).to_a[1]
15
18
  @title = xml.match(/title="([^"]+)"/).to_a[1]
16
19
  @thumb_url = xml.match(/fullMedResImagePath="([^"]+)"/).to_a[1]
17
20
  html = http_get(@page_url)
18
- if embed_tag = html.match(/\sid="embed"\s[^>]*value="([^"]+)"/).to_a[1]
21
+ #logger.debug html
22
+ if embed_tag = html.match(/class="embedinput"\s[^>]*value="([^"]+)"/).to_a[1]
19
23
  @embed_tag = CGI.unescapeHTML(embed_tag)
20
24
  end
21
25
  end
@@ -5,7 +5,7 @@ require File.expand_path(File.dirname(__FILE__) + '/base')
5
5
  module WWW
6
6
  module VideoScraper
7
7
  class YouPorn < Base
8
- url_regex %r!\Ahttp://youporn\.com/watch/(\d+)!
8
+ url_regex %r!\Ahttp://(?:www\.)?youporn\.com/watch/(\d+)!
9
9
 
10
10
  def scrape
11
11
  id = url_regex_match[1]
@@ -15,11 +15,11 @@ module WWW
15
15
  doc = Hpricot(html)
16
16
  doc.search('//div[@id="download"]//a').each do |elem|
17
17
  href = elem.attributes['href']
18
- (@video_url = href; break) if href =~ %r!^http://download\.youporn\.com/download/.*\.flv!
18
+ (@video_url = href; break) if href =~ %r!^http://download\.youporn\.com/download/.*!
19
19
  end
20
20
  h1 = doc.at('//div[@id="videoArea"]/h1')
21
21
  @title = h1.inner_html.gsub(/<[^>]*>/, '').strip
22
- @thumb_url = h1.at('/img').attributes['src'].sub(/(\d+)_small\.jpg$/, '\1_large.jpg')
22
+ @thumb_url = h1.at('/img').attributes['src'].sub(/(\d+)_small\.jpg$/, '\1_large.jpg') if h1.at('/img') != nil
23
23
  end
24
24
  end
25
25
  end
@@ -28,9 +28,12 @@ module WWW
28
28
  def login
29
29
  uri = URI.parse(@page_url)
30
30
  page = agent.get("#{uri.scheme}://#{uri.host}/login")
31
- login_form = page.form('loginForm')
32
- login_form.username = @opt[:you_tube_username]
33
- login_form.password = @opt[:you_tube_password]
31
+ #login_form = page.form('loginForm')
32
+ #login_form.username = @opt[:you_tube_username]
33
+ #login_form.password = @opt[:you_tube_password]
34
+ login_form = page.form('gaia_loginform')
35
+ login_form.email = @opt[:you_tube_username]
36
+ login_form.passwd = @opt[:you_tube_password]
34
37
  agent.submit(login_form)
35
38
  end
36
39
 
@@ -34,6 +34,8 @@ class TestVideoScraper < Test::Unit::TestCase
34
34
  end
35
35
 
36
36
  def test_scrape
37
+ mod = WWW::VideoScraper.find_module('http://www.yourfilehost.com/media.php?cat=video&file=XV436__03.wmv')
38
+ assert_equal WWW::VideoScraper::YourFileHost, mod
37
39
  vs = WWW::VideoScraper.scrape('http://www.yourfilehost.com/media.php?cat=video&file=XV436__03.wmv')
38
40
  assert_kind_of WWW::VideoScraper::YourFileHost, vs
39
41
  end
@@ -5,11 +5,10 @@ require File.dirname(__FILE__) + '/../../test_helper'
5
5
  class EicBook < Test::Unit::TestCase
6
6
  def test_scrape
7
7
  vs = WWW::VideoScraper::EicBook.scrape('http://www.eic-book.com/detail_12759.html', default_opt)
8
- assert_equal 'http://www.eic-book.com/detail_12759.html', vs.page_url
8
+ assert_equal 'http://www.eic-book.com/detail_12759.html?flg=sm', vs.page_url
9
9
  assert_equal 'http://flv.idol-mile.com/book/12759.flv', vs.video_url
10
10
  assert_equal 'http://www.eic-book.com/img/product/h4/pp_12759.jpg', vs.thumb_url
11
11
  assert_equal '藤木あやか DVD 「お蔵入り寸前!藤木あやか A面」', vs.title
12
12
  assert_equal 24, vs.capture_urls.count
13
13
  end
14
14
  end
15
-
@@ -9,5 +9,6 @@ class TestPornhub < Test::Unit::TestCase
9
9
  assert_match %r|http://media1.pornhub.com/dl/[[:alnum:]]{32}/[[:alnum:]]{8}/videos/000/191/743/191743\.flv|, vs.video_url
10
10
  assert_equal 'http://p1.pornhub.com/thumbs/000/191/743/small.jpg', vs.thumb_url
11
11
  assert_match %r|^<object type=\"application/x-shockwave-flash\" data=\".*</object>$|, vs.embed_tag
12
+ assert_equal 'Liliane Tiger and Jane Darling to hot to handle', vs.title
12
13
  end
13
14
  end
@@ -8,6 +8,17 @@ class TestVeoh < Test::Unit::TestCase
8
8
  assert_equal 'http://www.veoh.com/videos/v6245232rh8aGEM9', vs.page_url
9
9
  assert_match %r|http://content\.veoh\.com/flash/p/\d/[[:alnum:]]{16}/[[:alnum:]]{40}\.fll\?ct=[[:alnum:]]{48}|, vs.video_url
10
10
  assert_match %r|http://p-images\.veoh\.com/image\.out\?imageId=media-[[:alnum:]]+.jpg|, vs.thumb_url
11
- assert_match %r|^<embed\s.*>$|, vs.embed_tag
11
+ assert_match %r|^<object\s.*>$|, vs.embed_tag
12
+ end
13
+
14
+ def test_canonical_url
15
+ vs = WWW::VideoScraper::Veoh.scrape('http://www.veoh.com/collection/maysaku/watch/v19937773gwSJPMk', default_opt)
16
+ assert_equal 'http://www.veoh.com/videos/v19937773gwSJPMk', vs.page_url
17
+ vs = WWW::VideoScraper::Veoh.scrape('http://www.veoh.com/collection/maysaku/watch/v19937773gwSJPMk#watch%3Dv16112008KGD7Pg2n', default_opt)
18
+ assert_equal 'http://www.veoh.com/videos/v16112008KGD7Pg2n', vs.page_url
19
+ vs = WWW::VideoScraper::Veoh.scrape('http://www.veoh.com/videos/v19937773gwSJPMk?rank=0&jsonParams=%7B%22numResults%22%3A20%2C%22rlmin%22%3A0%2C%22query%22%3A%22Shaman+King+01%22%2C%22rlmax%22%3Anull%2C%22veohOnly%22%3Atrue%2C%22order%22%3A%22default%22%2C%22range%22%3A%22a%22%2C%22sId%22%3A%22192998624295114150%22%7D&searchId=192998624295114150&rank=1', default_opt)
20
+ assert_equal 'http://www.veoh.com/videos/v19937773gwSJPMk', vs.page_url
21
+ vs = WWW::VideoScraper::Veoh.scrape('http://www.veoh.com/browse/videos/category/comedy/watch/v17078605sszQzbBF')
22
+ assert_equal 'http://www.veoh.com/videos/v17078605sszQzbBF', vs.page_url
12
23
  end
13
24
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: valda-video_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - YAMAGUCHI Seiji
@@ -9,11 +9,12 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-02-10 00:00:00 -08:00
12
+ date: 2009-06-10 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: mechanize
17
+ type: :runtime
17
18
  version_requirement:
18
19
  version_requirements: !ruby/object:Gem::Requirement
19
20
  requirements:
@@ -23,6 +24,7 @@ dependencies:
23
24
  version:
24
25
  - !ruby/object:Gem::Dependency
25
26
  name: hpricot
27
+ type: :runtime
26
28
  version_requirement:
27
29
  version_requirements: !ruby/object:Gem::Requirement
28
30
  requirements:
@@ -63,7 +65,6 @@ files:
63
65
  - test/www/video_scraper/test_red_tube.rb
64
66
  - test/www/video_scraper/test_base.rb
65
67
  - test/www/test_video_scraper.rb
66
- - test/www/test_video_scraper_flymake.rb
67
68
  - lib/www
68
69
  - lib/www/video_scraper
69
70
  - lib/www/video_scraper/nico_video.rb
@@ -83,7 +84,7 @@ files:
83
84
  - lib/www/video_scraper/your_file_host.rb
84
85
  - lib/www/video_scraper/tube8.rb
85
86
  - lib/www/video_scraper.rb
86
- has_rdoc: true
87
+ has_rdoc: false
87
88
  homepage: http://github.com/valda/video_scraper
88
89
  post_install_message:
89
90
  rdoc_options:
@@ -118,7 +119,7 @@ requirements: []
118
119
  rubyforge_project: video_scraper
119
120
  rubygems_version: 1.2.0
120
121
  signing_key:
121
- specification_version: 2
122
+ specification_version: 3
122
123
  summary: Web scraping library for video sharing sites.
123
124
  test_files: []
124
125