youtube-data 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +21 -0
- data/README.md +61 -0
- data/Rakefile +4 -0
- data/TODO.md +12 -0
- data/lib/youtube/extractor.rb +138 -0
- data/lib/youtube/thumbnail.rb +91 -0
- data/lib/youtube/version.rb +5 -0
- data/lib/youtube-data.rb +13 -0
- data/sig/youtube.rbs +4 -0
- data/tests/data/raw_video.html +89 -0
- data/tests/mocks.rb +29 -0
- data/tests/test_all.rb +3 -0
- data/tests/test_extractor.rb +32 -0
- metadata +60 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 9d7d85c28c1258272db9500f0aaf57b66cd831b8d8134b411a0e25ced0cd4912
|
|
4
|
+
data.tar.gz: 2de11bacc9a8ebd4b075d23c6da2b6b2f2e0f8898e8ee1f70ae2ebac30216a21
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: b85bf4e5f90c9bf857f2853fb971c8d61b53f9fff978d318c02f70f4458bbf1bda169954c2de6f22d2a6aa42f6c25d114cba51b760b9fd472c7dee24aa5cd96a
|
|
7
|
+
data.tar.gz: 9d2fb4892bda2a428c0da64fc27def5cf243b39fba5fec8245f178da8937c05a16f2afadcbce621ea72c271c748ca3bc604847e8b799063b02567b1748a47e7a
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 https://github.com/boddz
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# Youtube
|
|
2
|
+
|
|
3
|
+
A Ruby gem for extracting YouTube video data.
|
|
4
|
+
|
|
5
|
+
Currently a work in progress. This is just a mirror at the moment, so support will be limited for now,
|
|
6
|
+
and I am still new to Ruby, so some things will most likely change a lot in the future.
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
Written and tested with `ruby 3.0.2` on `Ubuntu 22.04.3 LTS`.
|
|
12
|
+
|
|
13
|
+
Install required gems:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
bundle install
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Build and install gem file locally:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
gem build && gem install youtube-data-[version].gem
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Or install from gem server:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
gem install youtube-data
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
## Usage
|
|
33
|
+
|
|
34
|
+
TODO: Write usage instructions here...
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
## Development
|
|
38
|
+
|
|
39
|
+
After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive
|
|
40
|
+
prompt that will allow you to experiment.
|
|
41
|
+
|
|
42
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the
|
|
43
|
+
version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version,
|
|
44
|
+
push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
|
45
|
+
|
|
46
|
+
Running tests:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
cd tests # These commands must be run in ./tests for relative path reasons.
|
|
50
|
+
ruby test_all.rb # To run all tests, or you can just call an individual test instead.
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
## Contributing
|
|
55
|
+
|
|
56
|
+
Bug reports and pull requests are welcome on [GitHub](https://github.com/boddz/youtube).
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
## License
|
|
60
|
+
|
|
61
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/TODO.md
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Youtube
  # Raised while initialising a `DataExtractor` when the fetched video page does not contain the
  # `ytInitialPlayerResponse` data the extractor depends on.
  class InitExtractorError < RuntimeError
  end

  # Raised when the video ID handed to a `DataExtractor` does not have the fixed video ID length.
  class InvalidVideoIDError < StandardError
  end

  # Raised when a request path is not a `String`, or is non-empty and not prefixed with `/`.
  class InvalidPathError < StandardError
  end

  # For extracting the data needed from a video in a native format.
  #
  # The video page is requested once in the constructor and the raw html, the raw player json and the path to the
  # `base.js` player script are cached on the instance.
  #
  # @param video_id [String] The video ID in which to scrape data from.
  #
  # Options
  # =======
  #
  # :mock_session => [MockSession] The mock session to use when testing. It must respond to `get(path)` and return
  #                  an object responding to `body`. When given, no real network session is opened.
  #
  # @raise [InvalidVideoIDError] If the video ID is not exactly 11 characters long.
  # @raise [InitExtractorError] If the player response json cannot be found in the video page.
  #
  class DataExtractor
    HOMEPAGE = URI('https://www.youtube.com')

    # 11 is the fixed length of a video ID on youtube.
    VIDEO_ID_LENGTH = 11

    def initialize(video_id, opts = {})
      @video_id = video_id
      @video_path = "/watch?v=#{@video_id}"

      # The session to use for the extractor allows information to persist during requests/session. The real
      # session is only started when no mock was supplied, so tests never open (and leak) a network connection.
      @session = opts.fetch(:mock_session) { Net::HTTP.start(HOMEPAGE.hostname, { use_ssl: true }) }

      # Cache variables for raw data, saves overhead and means fewer requests from the client session.
      # Required requests.
      @video_html = get_raw_html
      # Required non-requests.
      @video_json_raw = find_raw_json_in_html(@video_html)
      @video_player_path = find_player_base_js_path_in_html(@video_html)
    end

    # Full url to the video's html page returned/yielded as a `URI::HTTPS` instance.
    #
    # @raise [InvalidVideoIDError] If the video ID is not exactly 11 characters long.
    def video_uri
      uri = URI.join(HOMEPAGE, video_path)
      return uri unless block_given?

      yield uri
    end

    # Full url to the `base.js` video player script returned/yielded as a `URI::HTTPS` instance.
    def player_uri
      uri = URI.join(HOMEPAGE, @video_player_path)
      return uri unless block_given?

      yield uri
    end

    # Send a simple get request using the extractor session. This should be how the module sends all further
    # requests to the `youtube.com` hostname outside of this class also.
    #
    # @param path [String] Any valid path on the server, prefixed with `/`.
    # @return [Net::HTTPResponse] The untouched response sent back from the request.
    # @yield [Net::HTTPResponse] Same as return but yields to block if present.
    # @raise [InvalidPathError] If the path is not a `String`, or is non-empty and not prefixed with `/`.
    #
    def get_raw(path)
      res = get_request_path(path)
      return res unless block_given?

      yield res
    end

    # The raw html of the video page, as cached by the constructor.
    #
    # @return [String] The body of the video page response.
    # @yield [String] Same as return but yields to block if present.
    #
    def video_raw_html
      return @video_html unless block_given?

      yield @video_html
    end

    # This is the json data for the video that has not yet been altered in terms of its underlying structure, and
    # which is in its purest form, but has been processed in a way that makes it easy to work with either through a
    # file or a hash that is returned to the caller. The raw json is parsed again on every call, so each caller
    # receives its own hash.
    #
    # @param dump_file [String] File path to write to (don't open if not provided).
    # @param opt [Hash] `:pretty` if a dump file is specified, when set to true then format pretty else raw.
    #
    # @return [Hash] A hash representing the untouched json data parsed from the raw json html data of a video.
    # @yield [Hash] Same as return but yields to block if provided.
    #
    def video_json_untouched(dump_file = nil, opt = {:pretty => true})
      parsed_json = JSON.parse(@video_json_raw)

      unless dump_file.nil?
        File.open(dump_file.to_s, 'w') do |json_file|
          # Only a literal `true` selects pretty output; anything else dumps the compact form.
          if opt[:pretty] == true
            json_file.write(JSON.pretty_generate(parsed_json))
          else
            JSON.dump(parsed_json, json_file)
          end
        end
      end

      return parsed_json unless block_given?

      yield parsed_json
    end

    private

    # Path from the homepage to the video's page.
    #
    # @raise [InvalidVideoIDError] If the video ID is not exactly 11 characters long.
    def video_path
      # `to_s` lets a nil ID fail with the dedicated error instead of a `NoMethodError`.
      if @video_id.to_s.length != VIDEO_ID_LENGTH
        raise InvalidVideoIDError,
              "The video id `#{@video_id}' is not valid (must be exactly #{VIDEO_ID_LENGTH} characters)"
      end

      @video_path
    end

    # Sends a GET request to a specified path on the server using the request handler's session.
    #
    # An empty path is deliberately passed through to the session untouched, as before.
    def get_request_path(path)
      raise InvalidPathError, 'Path on server must be type `String\'' unless path.is_a?(String)
      raise InvalidPathError, 'Path must be prefixed with `/\'' unless path.empty? || path.start_with?('/')

      @session.get(path)
    end

    # Requests the video page and returns the body of the response.
    def get_raw_html
      get_request_path(video_path).body
    end

    # Extracts the js object assigned to `ytInitialPlayerResponse` from the page html.
    #
    # @raise [InitExtractorError] If the assignment cannot be found in the html.
    def find_raw_json_in_html(html)
      assignment = html.to_s[/ytInitialPlayerResponse.*=.*\{.*\};/] # Regex match containing var.
      raw_json = assignment && assignment[/\{.*\}/] # From matched var, extract the valid js object.
      return raw_json if raw_json

      raise InitExtractorError, "Could not find `ytInitialPlayerResponse' in the page for video id `#{@video_id}'"
    end

    # Finds the relative path of the `base.js` player script in the page html (nil when there is no match).
    def find_player_base_js_path_in_html(html)
      html[/([A-Za-z0-9]+(\/[A-Za-z0-9]+)+)_[A-Za-z0-9]+\.[A-Za-z0-9]+\/[A-Za-z0-9]+_[A-Za-z0-9]+\/base\.js/]
    end
  end
end
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Youtube
  # Used for extracting thumbnails/images data and bytes stored on `i.ytimg.com`.
  #
  # @param extractor [Youtube::DataExtractor] The extractor to use when getting initial required video data. It
  #                  only needs to respond to `video_json_untouched`.
  #
  # Options
  # =======
  #
  # :mock_session => [MockSession] The mock session to use when testing. It must respond to `get(path)` and return
  #                  an object responding to `body`. When given, no real network session is opened.
  #
  class Thumbnail
    BASEHOST = URI('https://i.ytimg.com/')

    def initialize(extractor, opts = {})
      @extractor = extractor

      # Only start a real session when no mock was supplied, mirroring `DataExtractor`.
      @session = opts.fetch(:mock_session) { Net::HTTP.start(BASEHOST.hostname, { use_ssl: true }) }

      @thumb_json_array = jump_to_in_json
      @thumb_default_json = @thumb_json_array.first
    end

    # An array of thumbnail(s) json data (stored as a hash).
    #
    # @return [Array] The array of hashes about the thumbnail(s); empty when the video json lists none.
    # @yield [Array] The same as return, but yields to block if given.
    #
    def thumbnails_json
      return @thumb_json_array unless block_given?

      yield @thumb_json_array
    end

    # The json data for the default thumbnail (stored as a hash). The default is the first listed thumbnail.
    #
    # @return [Hash, nil] The hash form of the default thumbnail's json, or nil when there are no thumbnails.
    # @yield [Hash, nil] The same as return, but yields to block if given.
    #
    def default_json
      return @thumb_default_json unless block_given?

      yield @thumb_default_json
    end

    # Destination URL to default thumbnail file on the server which is returned/yielded as a `URI::HTTPS` instance.
    # Requires at least one thumbnail to be present.
    def default_url
      url = URI(@thumb_default_json['url'])
      return url unless block_given?

      yield url
    end

    # The file name for the default thumbnail which is returned/yielded as `String` instance.
    #
    # The last path segment is used as-is, so names containing `_` or `-` (e.g. `hqdefault_live.jpg`) are kept whole.
    def default_filename
      filename = File.basename(default_url.path)
      return filename unless block_given?

      yield filename
    end

    # The bytes contained in the default thumbnail file stored on the server returned/yielded as `String` instance.
    def default_bytes
      res = get_raw(default_url.path)
      return res.body unless block_given?

      yield res.body
    end

    # Send a simple get request using the thumbnail session. This should be how the module sends all further
    # requests to the `i.ytimg.com` hostname outside of this class also.
    #
    # @param path [String] Any valid path on the server, prefixed with `/`.
    # @return [Net::HTTPResponse] The untouched response sent back from the request.
    # @yield [Net::HTTPResponse] Same as return but yielded to block.
    # @raise [InvalidPathError] If the path is not a `String`, or is non-empty and not prefixed with `/`.
    #
    def get_raw(path)
      res = get_request_path(path)
      return res unless block_given?

      yield res
    end

    private

    # Use `get_raw` method it's the same, but is shorter and can yield.
    #
    # An empty path is deliberately passed through to the session untouched, as before.
    def get_request_path(path)
      raise InvalidPathError, 'Path on server must be type `String\'' unless path.is_a?(String)
      raise InvalidPathError, 'Path must be prefixed with `/\'' unless path.empty? || path.start_with?('/')

      @session.get(path)
    end

    # Walks the untouched video json down to the list of thumbnails. A missing key anywhere along the way yields
    # an empty list instead of a `NoMethodError` on nil.
    def jump_to_in_json
      @extractor.video_json_untouched.dig('microformat', 'playerMicroformatRenderer', 'thumbnail', 'thumbnails') || []
    end
  end
end
|
data/lib/youtube-data.rb
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'uri'
|
|
4
|
+
require 'net/http'
|
|
5
|
+
require 'json'
|
|
6
|
+
|
|
7
|
+
require_relative 'youtube/version'
|
|
8
|
+
require_relative 'youtube/extractor'
|
|
9
|
+
require_relative 'youtube/thumbnail'
|
|
10
|
+
|
|
11
|
+
module Youtube
|
|
12
|
+
# Generic error class declared directly under the gem's namespace.
class Error < StandardError
end
|
|
13
|
+
end
|
data/sig/youtube.rbs
ADDED