video_grabber 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 3a484f6d9d5b763ae659792576bb8cc1d4e3a9ca
4
+ data.tar.gz: 8e7c4c25c9cfb9f06baf50f14f1e4e1c7ea0b37f
5
+ SHA512:
6
+ metadata.gz: 0c92fa9ac5e5c9de371d2753cd70ae284f4daec88f038ac5df3f6d1a74262b73248c8166e710e7cc0c1c49cdab2a86ddde8b1a44204b96c004ff421b2f49f142
7
+ data.tar.gz: dad4f99ccd6fd9a64672645c84e6ed013daa00f300186d668f39ee9d3f4175005cc1df238ff77e88abd6ab47dce1a52e33b9c93fa20969aaf6357b1b25902a86
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.1.2
4
+ before_install: gem install bundler -v 1.11.2
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in video_grabber.gemspec
4
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,66 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ video_grabber (1.4.0)
5
+ headless
6
+ nokogiri
7
+ selenium-webdriver (~> 2.53.4)
8
+ watir
9
+
10
+ GEM
11
+ remote: https://rubygems.org/
12
+ specs:
13
+ childprocess (0.6.2)
14
+ ffi (~> 1.0, >= 1.0.11)
15
+ coderay (1.1.1)
16
+ commonwatir (4.0.0)
17
+ diff-lcs (1.2.5)
18
+ ffi (1.9.18)
19
+ headless (2.3.1)
20
+ method_source (0.8.2)
21
+ mini_portile2 (2.1.0)
22
+ nokogiri (1.7.1)
23
+ mini_portile2 (~> 2.1.0)
24
+ pry (0.10.4)
25
+ coderay (~> 1.1.0)
26
+ method_source (~> 0.8.1)
27
+ slop (~> 3.4)
28
+ rake (10.5.0)
29
+ rspec (3.4.0)
30
+ rspec-core (~> 3.4.0)
31
+ rspec-expectations (~> 3.4.0)
32
+ rspec-mocks (~> 3.4.0)
33
+ rspec-core (3.4.4)
34
+ rspec-support (~> 3.4.0)
35
+ rspec-expectations (3.4.0)
36
+ diff-lcs (>= 1.2.0, < 2.0)
37
+ rspec-support (~> 3.4.0)
38
+ rspec-mocks (3.4.1)
39
+ diff-lcs (>= 1.2.0, < 2.0)
40
+ rspec-support (~> 3.4.0)
41
+ rspec-support (3.4.1)
42
+ rubyzip (1.2.1)
43
+ selenium-webdriver (2.53.4)
44
+ childprocess (~> 0.5)
45
+ rubyzip (~> 1.0)
46
+ websocket (~> 1.0)
47
+ slop (3.6.0)
48
+ watir (5.0.0)
49
+ commonwatir (~> 4)
50
+ watir-webdriver
51
+ watir-webdriver (0.9.9)
52
+ selenium-webdriver (>= 2.46.2)
53
+ websocket (1.2.4)
54
+
55
+ PLATFORMS
56
+ ruby
57
+
58
+ DEPENDENCIES
59
+ bundler (~> 1.11)
60
+ pry
61
+ rake (~> 10.0)
62
+ rspec (~> 3.0)
63
+ video_grabber!
64
+
65
+ BUNDLED WITH
66
+ 1.12.5
data/LICENSE.md ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2017 Bridge2Think AG
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,96 @@
1
+ # Video Grabber
2
+
3
+ __Video Grabber__ is a simple tool to get video tags from a given URL. It includes advanced techniques to deobfuscate advertisements, run through embedded iframes, run JavaScript, and deal with other frivolities that would otherwise prevent you from fetching the wanted video.
4
+
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ ```ruby
11
+ gem 'video_grabber'
12
+ ```
13
+
14
+ And then execute:
15
+
16
+ ```shell
17
+ $ bundle
18
+ ```
19
+ Or install it yourself as:
20
+
21
+ ```shell
22
+ $ gem install video_grabber
23
+ ```
24
+ And require it in your application:
25
+
26
+ ```ruby
27
+ irb(main):001:0> require 'video_grabber'
28
+ => true
29
+ ```
30
+
31
+ ## Requirements
32
+
33
+ `video_grabber` relies on Firefox and [Headless](https://github.com/leonid-shevtsov/headless) (unless you specifically disable it). To get Headless working you will need Linux and `xvfb`.
34
+
35
+ Install `xvfb` on Debian:
36
+
37
+ ```shell
38
+ sudo apt-get install xvfb
39
+ ```
40
+
41
+ Headless is used to run Firefox inside a headless display (in background).
42
+
43
+
44
+ ## Usage
45
+
46
+ Start the Scraper
47
+
48
+ ```ruby
49
+ video_grabber = VideoGrabber.new(url: 'https://en.wikipedia.org/wiki/Big_Buck_Bunny').call
50
+ ```
51
+ Fetch your links and shut down your scraper:
52
+
53
+ ```ruby
54
+ video_grabber.fetch_videos
55
+ => ["<video id=\"mwe_player_1\" poster=\"//upload.wikimedia.org/wikipedia/com...
56
+ ```
57
+
58
+ If you want to fetch the data again, you can manually restart the scraper using:
59
+
60
+ ```ruby
61
+ video_grabber.start
62
+ ```
63
+
64
+ Or you can directly pass the param `keep_browser_open` during initialization.
65
+
66
+ ## Parameters
67
+
68
+ - **url:** The url of the resource containing the video(s)
69
+ - **timeout:** *(default: 60)* The timeout for the scraper. Will raise a `VideoGrabber::TimeOut` if the delay is exceeded.
70
+ - **keep_browser_open:** *(default: false)* If activated, will keep the scraper's browser open until you stop it (using the public `stop` method).
71
+ - **headless_enabled** *(default: true)* If disabled, will open your Firefox browser to crawl your links.
72
+ - **attributes** This option enables you to pass html attributes that will be passed to your crawled links elements.
73
+ - **firefox_extension_path** If passed, your Scraper instance will run using the given extension (`.xpi` file). Useful if you want to benefit from an Adblocker for instance
74
+
75
+ ## Versioning
76
+
77
+ __Video Grabber__ follows [Semantic Versioning 2.0](http://semver.org/).
78
+
79
+ ## Contributing
80
+
81
+ 1. Fork it ( https://github.com/bridge2think/video_grabber/fork )
82
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
83
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
84
+ 4. Push to the branch (`git push origin my-new-feature`)
85
+ 5. Create a new Pull Request
86
+
87
+ ## Contact
88
+
89
+ Any questions? Feel free to contact me at `ss(at)bridge2think.com`.
90
+ Any issues? Open a [ticket](https://github.com/bridge2think/video_grabber/issues)!
91
+
92
+ ## License
93
+
94
+ Copyright (c) 2017 Bridge2Think AG
95
+
96
+ Released under the MIT license. See [LICENSE.md](https://github.com/bridge2think/video_grabber/blob/master/LICENSE.md) for more details.
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+ # Defines the `spec` rake task through RSpec's rake task class.
4
+ RSpec::Core::RakeTask.new(:spec)
5
+ # Running `rake` without arguments runs the `spec` task.
6
+ task :default => :spec
data/bin/video_grabber ADDED
File without changes
@@ -0,0 +1 @@
1
+ <html><body><iframe><video src="iframe.mp4"></video></iframe></body></html>
@@ -0,0 +1 @@
1
+ <html><body><video src='normal.mp4'></video></body></html>
@@ -0,0 +1,26 @@
1
+ require 'video_grabber/application'
2
+ require 'video_grabber/config.rb'
3
+ require 'video_grabber/scraper.rb'
4
+ require 'video_grabber/exceptions.rb'
5
+ require 'selenium-webdriver'
6
+ require 'headless'
7
+ require 'nokogiri'
8
+ require 'watir'
9
+ require 'cgi'
10
+ require 'pry'
11
+
12
+ module VideoGrabber
13
+ # Module-level entry points: VideoGrabber.new builds an Application and remembers it.
14
+ class << self
15
+ # The Application built by the most recent VideoGrabber.new (nil before the first call).
16
+ attr_reader :application
17
+ # Builds a VideoGrabber::Application from opts, stores it in @application and returns it.
18
+ def new(opts)
19
+ @application = VideoGrabber::Application.new(opts)
20
+ end
21
+ # Delegates to the stored application's #call; raises NoMethodError if .new was never called.
22
+ def call
23
+ application.call
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,22 @@
1
+ module VideoGrabber
2
+ class Application
3
+
4
+ def initialize(options)
5
+ set_options(options)
6
+ end
7
+
8
+ def call
9
+ scraper = Scraper.new(config).start
10
+ end
11
+
12
+ private
13
+
14
+ def config
15
+ @config ||= ::VideoGrabber.config.dup
16
+ end
17
+
18
+ def set_options(options)
19
+ options.each { |k, v| config.send("#{k}=", v) }
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,34 @@
1
+ module VideoGrabber
2
+
3
+ # Access point for the gem configuration. Memoized: every call returns the same instance.
4
+ #
5
+ # @return [VideoGrabber::Configuration] a configuration instance.
6
+ def self.config
7
+ @config ||= Configuration.new
8
+ end
9
+
10
+ # Configure hook used in the gem initializer. Convenient way to set all the
11
+ # gem configurations. Does nothing when called without a block.
12
+ #
13
+ # example:
14
+ # VideoGrabber.configure do |config|
15
+ # config.timeout = 60
16
+ # end
17
+ #
18
+ # @return [void]
19
+ def self.configure
20
+ yield config if block_given?
21
+ end
22
+ # Plain settings holder exposing a reader and a writer for every option.
23
+ class Configuration
24
+
25
+ attr_accessor :url, :keep_browser_open, :timeout, :headless_enabled,
26
+ :firefox_extension_path, :attributes
27
+ # Sets the defaults: keep_browser_open=false, timeout=60, headless_enabled=true (other attributes start as nil).
28
+ def initialize
29
+ @keep_browser_open = false
30
+ @timeout = 60
31
+ @headless_enabled = true
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,6 @@
1
+ module VideoGrabber
2
+ class Exception < StandardError; end # common parent of the error classes below; note it shadows ::Exception inside this module
3
+ class BrowserIsClosed < Exception; end # raised by Scraper#fetch_videos when Watir reports an error
4
+ class ExtensionError < Exception; end # raised by Scraper when adding the Firefox extension fails
5
+ class TimeOut < Exception; end # raised by Scraper#start on Net::ReadTimeout
6
+ end
@@ -0,0 +1,91 @@
1
+ module VideoGrabber
2
+ class Scraper
3
+ # Drives a Firefox browser through Watir and collects the <video> markup of the page at +url+.
4
+ attr_reader :url, :browser, :timeout, :keep_browser_open, :headless_enabled,
5
+ :firefox_extension_path, :profile, :attributes
6
+ # config: object exposing the readers used below (a VideoGrabber::Configuration in this gem).
7
+ def initialize(config)
8
+ @keep_browser_open = config.keep_browser_open
9
+ @url = config.url
10
+ @timeout = config.timeout
11
+ @headless_enabled = config.headless_enabled
12
+ @firefox_extension_path = config.firefox_extension_path
13
+ @attributes = config.attributes
14
+ end
15
+ # Opens the browser and navigates to +url+; returns self. A Net::ReadTimeout closes the browser and is re-raised as TimeOut.
16
+ def start
17
+ open_browser
18
+ browser.goto(url) ; self
19
+ rescue ::Net::ReadTimeout
20
+ stop
21
+ raise ::VideoGrabber::TimeOut
22
+ end
23
+ # Closes the browser. No-op when no browser was created, so the rescue in #start still raises TimeOut instead of NoMethodError.
24
+ def stop
25
+ browser.close if browser
26
+ end
27
+ # Returns the de-duplicated <video> snippets found on the page as Strings, with +attributes+ applied when configured.
28
+ def fetch_videos
29
+ links_list = []
30
+ links_list += browser.videos.map(&:html) # <video> elements as reported by Watir
31
+ # Videos Nokogiri finds inside the HTML-unescaped markup of each <iframe> node.
32
+ links_list += ::Nokogiri::HTML(browser.html).xpath('//iframe').map do |iframe_node|
33
+ ::Nokogiri::HTML(::CGI.unescapeHTML(iframe_node.to_s)).xpath('.//video').map{ |element| element.to_s }
34
+ end.flatten
35
+ # Plain-text pass over the unescaped page source: each '<video' chunk, cut after its last '</video>'.
36
+ links_list += begin
37
+ html = ::CGI.unescapeHTML(browser.html)
38
+ html = html.split('<video').map{|e| '<video ' + e if e.match('</video>')}.compact
39
+ html = html.map{|e| e.split('</video>')[0..-2].join('</video>') + '</video>' }
40
+ end
41
+
42
+ stop unless keep_browser_open
43
+ # Collapse whitespace runs so the same tag found by several passes compares equal for uniq.
44
+ links_list = links_list.map{|element| element.split.join(" ") }.reject(&:empty?).uniq
45
+
46
+ add_attributes(links_list) || links_list
47
+ rescue ::Watir::Exception::Error
48
+ raise ::VideoGrabber::BrowserIsClosed, 'Please restart the scraper (scraper_instance.start), or keep the browser open'
49
+ end
50
+
51
+ private
52
+ # Starts Headless when enabled, then launches Firefox with a fresh profile and an HTTP client using +timeout+.
53
+ def open_browser
54
+ start_headless
55
+
56
+ @profile = ::Selenium::WebDriver::Firefox::Profile.new ; load_extension
57
+ client = Selenium::WebDriver::Remote::Http::Default.new
58
+ client.timeout = timeout
59
+ @browser = ::Watir::Browser.new(:firefox, profile: profile, http_client: client)
60
+ end
61
+ # NOTE(review): the Headless instance is not kept, so nothing in this class ever shuts it down — confirm this is intended.
62
+ def start_headless
63
+ return unless headless_enabled
64
+
65
+ ::Headless.new.start
66
+ end
67
+ # Adds the configured extension to the profile; WebDriver errors are re-raised as ExtensionError.
68
+ def load_extension
69
+ return unless firefox_extension_path
70
+
71
+ @profile.add_extension(firefox_extension_path)
72
+ rescue Selenium::WebDriver::Error::WebDriverError => e
73
+ raise ::VideoGrabber::ExtensionError, e
74
+ end
75
+ # Returns +list+ with every configured attribute set on each element's first <video> node, or nil when no attributes are configured.
76
+ def add_attributes(list)
77
+ return unless attributes
78
+
79
+ list.map do |element|
80
+ parsed_element = Nokogiri::XML(element)
81
+ video_node = parsed_element.xpath('//video').first
82
+ next element unless video_node # fragment without a parsable <video>: keep it untouched instead of failing on nil
83
+
84
+ attributes.each { |key, value| video_node.set_attribute(key, value) }
85
+
86
+ # Serialize every <video> node of the parsed fragment, not only the first one.
87
+ parsed_element.xpath('//video').to_s
88
+ end
89
+ end
90
+ end
91
+ end
@@ -0,0 +1,3 @@
1
+ module VideoGrabber
2
+ VERSION = "1.5.0"
3
+ end
Binary file
data/pkg_checksum ADDED
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env ruby
2
+ # Writes the SHA-512 of pkg/video_grabber-<version>.gem to checksum/video_grabber-<version>.gem.sha512.
3
+ require 'digest/sha2'
4
+ # The version is the first command-line argument, falling back to the contents of VERSION.semver.
5
+ gemname = :video_grabber
6
+ ARGV[0] = File.read('VERSION.semver').chomp if ARGV[0].nil?
7
+ built_gem_path = "pkg/#{gemname}-#{ARGV[0]}.gem"
8
+ checksum = Digest::SHA512.file(built_gem_path).hexdigest # binary-safe, streamed read of the gem archive
9
+ checksum_path = "checksum/#{gemname}-#{ARGV[0]}.gem.sha512"
10
+
11
+ File.open(checksum_path, 'w') { |f| f.write("#{checksum}\n") }
Binary file
@@ -0,0 +1,37 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'video_grabber/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'video_grabber'
8
+ spec.version = VideoGrabber::VERSION
9
+ spec.authors = ['sidney']
10
+ spec.email = ['ss@bridge2think.com']
11
+ spec.summary = 'VideoGrabber is a simple tool to get video tags from a given URL'
12
+ spec.description = 'VideoGrabber crawl headlessly websites to extract their videos'
13
+ spec.homepage = 'https://github.com/bridge2think/video_grabber'
14
+ spec.license = 'MIT'
15
+ spec.required_ruby_version = '>= 1.9.3'
16
+
17
+ # Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
18
+ # delete this section to allow pushing this gem to any host.
19
+ if spec.respond_to?(:metadata)
20
+ spec.metadata['allowed_push_host'] = 'https://rubygems.org'
21
+ else
22
+ raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.'
23
+ end
24
+
25
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
26
+ spec.executables = ['video_grabber']
27
+ spec.require_paths = ['lib']
28
+
29
+ spec.add_dependency 'selenium-webdriver', '~> 2.53.4'
30
+ spec.add_dependency 'watir'
31
+ spec.add_dependency 'headless'
32
+ spec.add_dependency 'nokogiri'
33
+ spec.add_development_dependency 'bundler', '~> 1.11'
34
+ spec.add_development_dependency 'rake', '~> 10.0'
35
+ spec.add_development_dependency 'rspec', '~> 3.0'
36
+ spec.add_development_dependency 'pry'
37
+ end
metadata ADDED
@@ -0,0 +1,179 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: video_grabber
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.5.0
5
+ platform: ruby
6
+ authors:
7
+ - sidney
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-03-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: selenium-webdriver
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 2.53.4
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 2.53.4
27
+ - !ruby/object:Gem::Dependency
28
+ name: watir
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: headless
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: nokogiri
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.11'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.11'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '10.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '10.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '3.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '3.0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: pry
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description: VideoGrabber crawl headlessly websites to extract their videos
126
+ email:
127
+ - ss@bridge2think.com
128
+ executables:
129
+ - video_grabber
130
+ extensions: []
131
+ extra_rdoc_files: []
132
+ files:
133
+ - ".rspec"
134
+ - ".travis.yml"
135
+ - Gemfile
136
+ - Gemfile.lock
137
+ - LICENSE.md
138
+ - README.md
139
+ - Rakefile
140
+ - bin/video_grabber
141
+ - fixtures/iframe_embedded_video.html
142
+ - fixtures/normal.html
143
+ - lib/video_grabber.rb
144
+ - lib/video_grabber/application.rb
145
+ - lib/video_grabber/config.rb
146
+ - lib/video_grabber/exceptions.rb
147
+ - lib/video_grabber/scraper.rb
148
+ - lib/video_grabber/version.rb
149
+ - pkg/video_grabber-1.1.0.gem
150
+ - pkg_checksum
151
+ - video_grabber-1.0.0.gem
152
+ - video_grabber.gemspec
153
+ homepage: https://github.com/bridge2think/video_grabber
154
+ licenses:
155
+ - MIT
156
+ metadata:
157
+ allowed_push_host: https://rubygems.org
158
+ post_install_message:
159
+ rdoc_options: []
160
+ require_paths:
161
+ - lib
162
+ required_ruby_version: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: 1.9.3
167
+ required_rubygems_version: !ruby/object:Gem::Requirement
168
+ requirements:
169
+ - - ">="
170
+ - !ruby/object:Gem::Version
171
+ version: '0'
172
+ requirements: []
173
+ rubyforge_project:
174
+ rubygems_version: 2.5.1
175
+ signing_key:
176
+ specification_version: 4
177
+ summary: VideoGrabber is a simple tool to get video tags from a given URL
178
+ test_files: []
179
+ has_rdoc: