youtube-transcript-rb 0.1.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -0
- data/.rubocop_todo.yml +166 -0
- data/README.md +42 -42
- data/lib/youtube-transcript-rb.rb +4 -0
- data/lib/youtube_rb/formatters.rb +263 -0
- data/lib/youtube_rb/transcript/api.rb +144 -0
- data/lib/youtube_rb/transcript/errors.rb +215 -0
- data/lib/youtube_rb/transcript/settings.rb +26 -0
- data/lib/youtube_rb/transcript/transcript.rb +237 -0
- data/lib/youtube_rb/transcript/transcript_list.rb +168 -0
- data/lib/youtube_rb/transcript/transcript_list_fetcher.rb +220 -0
- data/lib/youtube_rb/transcript/transcript_parser.rb +81 -0
- data/lib/youtube_rb/transcript.rb +33 -0
- data/lib/youtube_rb/version.rb +5 -0
- data/sig/youtube_rb/transcript.rbs +4 -0
- data/spec/api_spec.rb +27 -27
- data/spec/errors_spec.rb +41 -41
- data/spec/formatters_spec.rb +45 -46
- data/spec/integration_spec.rb +39 -48
- data/spec/settings_spec.rb +16 -16
- data/spec/spec_helper.rb +52 -52
- data/spec/transcript_list_fetcher_spec.rb +38 -33
- data/spec/transcript_list_spec.rb +16 -19
- data/spec/transcript_parser_spec.rb +3 -3
- data/spec/transcript_spec.rb +23 -24
- metadata +17 -13
- data/lib/youtube/transcript/rb/api.rb +0 -150
- data/lib/youtube/transcript/rb/errors.rb +0 -217
- data/lib/youtube/transcript/rb/formatters.rb +0 -269
- data/lib/youtube/transcript/rb/settings.rb +0 -28
- data/lib/youtube/transcript/rb/transcript.rb +0 -239
- data/lib/youtube/transcript/rb/transcript_list.rb +0 -170
- data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +0 -225
- data/lib/youtube/transcript/rb/transcript_parser.rb +0 -83
- data/lib/youtube/transcript/rb/version.rb +0 -9
- data/lib/youtube/transcript/rb.rb +0 -37
- data/sig/youtube/transcript/rb.rbs +0 -8
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 60cad31d1d80bf186d231cf3eed48cd1599f41000a3de1a185e24480421ea0dd
|
|
4
|
+
data.tar.gz: cc370e6e42208f18a0ed456800de0f2e8b470754c63908149171c5558e15500a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 42f16cf9961a05528f4289886ebb08b2b06cb9060fbecfc6b41ffd4267920ef0d2123afec048c231a4f507bd44a8f1df9e383addbb3390e4e9076d9617bb22ba
|
|
7
|
+
data.tar.gz: b529273917d15dca2f50d28b5c7ea6f04d359c111346bd0ef4547db86b7a016dae83fdefce80b75784b43849931c6595c67833d4977e8b816176bca027883491
|
data/.rubocop.yml
ADDED
data/.rubocop_todo.yml
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# This configuration was generated by
|
|
2
|
+
# `rubocop --auto-gen-config`
|
|
3
|
+
# on 2026-01-09 13:39:24 UTC using RuboCop version 1.82.1.
|
|
4
|
+
# The point is for the user to remove these configuration records
|
|
5
|
+
# one by one as the offenses are removed from the code base.
|
|
6
|
+
# Note that changes in the inspected code, or installation of new
|
|
7
|
+
# versions of RuboCop, may require this file to be generated again.
|
|
8
|
+
|
|
9
|
+
# Offense count: 3
|
|
10
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
11
|
+
# Configuration parameters: AllowUnusedKeywordArguments, IgnoreEmptyMethods, IgnoreNotImplementedMethods, NotImplementedExceptions.
|
|
12
|
+
# NotImplementedExceptions: NotImplementedError
|
|
13
|
+
Lint/UnusedMethodArgument:
|
|
14
|
+
Exclude:
|
|
15
|
+
- 'lib/youtube_rb/formatters.rb'
|
|
16
|
+
|
|
17
|
+
# Offense count: 3
|
|
18
|
+
# Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes.
|
|
19
|
+
Metrics/AbcSize:
|
|
20
|
+
Max: 25
|
|
21
|
+
|
|
22
|
+
# Offense count: 1
|
|
23
|
+
# Configuration parameters: CountComments, CountAsOne.
|
|
24
|
+
Metrics/ClassLength:
|
|
25
|
+
Max: 103
|
|
26
|
+
|
|
27
|
+
# Offense count: 2
|
|
28
|
+
# Configuration parameters: AllowedMethods, AllowedPatterns.
|
|
29
|
+
Metrics/CyclomaticComplexity:
|
|
30
|
+
Max: 14
|
|
31
|
+
|
|
32
|
+
# Offense count: 7
|
|
33
|
+
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
|
34
|
+
Metrics/MethodLength:
|
|
35
|
+
Max: 29
|
|
36
|
+
|
|
37
|
+
# Offense count: 1
|
|
38
|
+
# Configuration parameters: CountKeywordArgs, MaxOptionalParameters.
|
|
39
|
+
Metrics/ParameterLists:
|
|
40
|
+
Max: 7
|
|
41
|
+
|
|
42
|
+
# Offense count: 2
|
|
43
|
+
# Configuration parameters: AllowedMethods, AllowedPatterns.
|
|
44
|
+
Metrics/PerceivedComplexity:
|
|
45
|
+
Max: 15
|
|
46
|
+
|
|
47
|
+
# Offense count: 1
|
|
48
|
+
# Configuration parameters: ExpectMatchingDefinition, CheckDefinitionPathHierarchy, CheckDefinitionPathHierarchyRoots, Regex, IgnoreExecutableScripts, AllowedAcronyms.
|
|
49
|
+
# CheckDefinitionPathHierarchyRoots: lib, spec, test, src
|
|
50
|
+
# AllowedAcronyms: CLI, DSL, ACL, API, ASCII, CPU, CSS, DNS, EOF, GUID, HTML, HTTP, HTTPS, ID, IP, JSON, LHS, QPS, RAM, RHS, RPC, SLA, SMTP, SQL, SSH, TCP, TLS, TTL, UDP, UI, UID, UUID, URI, URL, UTF8, VM, XML, XMPP, XSRF, XSS
|
|
51
|
+
Naming/FileName:
|
|
52
|
+
Exclude:
|
|
53
|
+
- 'Rakefile.rb'
|
|
54
|
+
- 'lib/youtube-transcript-rb.rb'
|
|
55
|
+
|
|
56
|
+
# Offense count: 3
|
|
57
|
+
# Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
|
|
58
|
+
# AllowedNames: as, at, by, cc, db, id, if, in, io, ip, of, on, os, pp, to
|
|
59
|
+
Naming/MethodParameterName:
|
|
60
|
+
Exclude:
|
|
61
|
+
- 'lib/youtube_rb/formatters.rb'
|
|
62
|
+
|
|
63
|
+
# Offense count: 2
|
|
64
|
+
RSpec/BeforeAfterAll:
|
|
65
|
+
Exclude:
|
|
66
|
+
- '**/spec/spec_helper.rb'
|
|
67
|
+
- '**/spec/rails_helper.rb'
|
|
68
|
+
- '**/spec/support/**/*.rb'
|
|
69
|
+
- 'spec/integration_spec.rb'
|
|
70
|
+
|
|
71
|
+
# Offense count: 2
|
|
72
|
+
# Configuration parameters: IgnoredMetadata.
|
|
73
|
+
RSpec/DescribeClass:
|
|
74
|
+
Exclude:
|
|
75
|
+
- '**/spec/features/**/*'
|
|
76
|
+
- '**/spec/requests/**/*'
|
|
77
|
+
- '**/spec/routing/**/*'
|
|
78
|
+
- '**/spec/system/**/*'
|
|
79
|
+
- '**/spec/views/**/*'
|
|
80
|
+
- 'spec/integration_spec.rb'
|
|
81
|
+
- 'spec/settings_spec.rb'
|
|
82
|
+
|
|
83
|
+
# Offense count: 30
|
|
84
|
+
# Configuration parameters: CountAsOne.
|
|
85
|
+
RSpec/ExampleLength:
|
|
86
|
+
Max: 22
|
|
87
|
+
|
|
88
|
+
# Offense count: 4
|
|
89
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
90
|
+
RSpec/ExpectActual:
|
|
91
|
+
Exclude:
|
|
92
|
+
- '**/spec/routing/**/*'
|
|
93
|
+
- 'spec/integration_spec.rb'
|
|
94
|
+
|
|
95
|
+
# Offense count: 2
|
|
96
|
+
# Configuration parameters: Max, AllowedIdentifiers, AllowedPatterns.
|
|
97
|
+
RSpec/IndexedLet:
|
|
98
|
+
Exclude:
|
|
99
|
+
- 'spec/transcript_spec.rb'
|
|
100
|
+
|
|
101
|
+
# Offense count: 91
|
|
102
|
+
RSpec/MultipleExpectations:
|
|
103
|
+
Max: 7
|
|
104
|
+
|
|
105
|
+
# Offense count: 44
|
|
106
|
+
# Configuration parameters: AllowSubject.
|
|
107
|
+
RSpec/MultipleMemoizedHelpers:
|
|
108
|
+
Max: 11
|
|
109
|
+
|
|
110
|
+
# Offense count: 3
|
|
111
|
+
# Configuration parameters: AllowedGroups.
|
|
112
|
+
RSpec/NestedGroups:
|
|
113
|
+
Max: 4
|
|
114
|
+
|
|
115
|
+
# Offense count: 7
|
|
116
|
+
# Configuration parameters: CustomTransform, IgnoreMethods, IgnoreMetadata, InflectorPath, EnforcedInflector.
|
|
117
|
+
# SupportedInflectors: default, active_support
|
|
118
|
+
RSpec/SpecFilePathFormat:
|
|
119
|
+
Exclude:
|
|
120
|
+
- '**/spec/routing/**/*'
|
|
121
|
+
- 'spec/api_spec.rb'
|
|
122
|
+
- 'spec/errors_spec.rb'
|
|
123
|
+
- 'spec/formatters_spec.rb'
|
|
124
|
+
- 'spec/transcript_list_fetcher_spec.rb'
|
|
125
|
+
- 'spec/transcript_list_spec.rb'
|
|
126
|
+
- 'spec/transcript_parser_spec.rb'
|
|
127
|
+
- 'spec/transcript_spec.rb'
|
|
128
|
+
|
|
129
|
+
# Offense count: 10
|
|
130
|
+
# Configuration parameters: IgnoreNameless, IgnoreSymbolicNames.
|
|
131
|
+
RSpec/VerifiedDoubles:
|
|
132
|
+
Exclude:
|
|
133
|
+
- 'spec/api_spec.rb'
|
|
134
|
+
- 'spec/errors_spec.rb'
|
|
135
|
+
- 'spec/transcript_list_fetcher_spec.rb'
|
|
136
|
+
- 'spec/transcript_spec.rb'
|
|
137
|
+
|
|
138
|
+
# Offense count: 1
|
|
139
|
+
# Configuration parameters: AllowedConstants.
|
|
140
|
+
Style/Documentation:
|
|
141
|
+
Exclude:
|
|
142
|
+
- 'spec/**/*'
|
|
143
|
+
- 'test/**/*'
|
|
144
|
+
- 'lib/youtube_rb/transcript.rb'
|
|
145
|
+
|
|
146
|
+
# Offense count: 8
|
|
147
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
148
|
+
# Configuration parameters: EnforcedStyle, MaxUnannotatedPlaceholdersAllowed, Mode, AllowedMethods, AllowedPatterns.
|
|
149
|
+
# SupportedStyles: annotated, template, unannotated
|
|
150
|
+
Style/FormatStringToken:
|
|
151
|
+
Exclude:
|
|
152
|
+
- 'lib/youtube_rb/formatters.rb'
|
|
153
|
+
|
|
154
|
+
# Offense count: 1168
|
|
155
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
156
|
+
# Configuration parameters: EnforcedStyle, ConsistentQuotesInMultiline.
|
|
157
|
+
# SupportedStyles: single_quotes, double_quotes
|
|
158
|
+
Style/StringLiterals:
|
|
159
|
+
Enabled: false
|
|
160
|
+
|
|
161
|
+
# Offense count: 6
|
|
162
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
163
|
+
# Configuration parameters: AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
|
|
164
|
+
# URISchemes: http, https
|
|
165
|
+
Layout/LineLength:
|
|
166
|
+
Max: 142
|
data/README.md
CHANGED
|
@@ -47,9 +47,9 @@ gem install youtube-transcript-rb
|
|
|
47
47
|
The easiest way to get a transcript for a given video is to execute:
|
|
48
48
|
|
|
49
49
|
```ruby
|
|
50
|
-
require '
|
|
50
|
+
require 'youtube_rb/transcript'
|
|
51
51
|
|
|
52
|
-
api =
|
|
52
|
+
api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
|
|
53
53
|
api.fetch(video_id)
|
|
54
54
|
```
|
|
55
55
|
|
|
@@ -62,14 +62,14 @@ api.fetch(video_id)
|
|
|
62
62
|
This will return a `FetchedTranscript` object looking somewhat like this:
|
|
63
63
|
|
|
64
64
|
```ruby
|
|
65
|
-
#<
|
|
65
|
+
#<YoutubeRb::Transcript::FetchedTranscript
|
|
66
66
|
@video_id="12345",
|
|
67
67
|
@language="English",
|
|
68
68
|
@language_code="en",
|
|
69
69
|
@is_generated=false,
|
|
70
70
|
@snippets=[
|
|
71
|
-
#<
|
|
72
|
-
#<
|
|
71
|
+
#<YoutubeRb::Transcript::TranscriptSnippet @text="Hey there", @start=0.0, @duration=1.54>,
|
|
72
|
+
#<YoutubeRb::Transcript::TranscriptSnippet @text="how are you", @start=1.54, @duration=4.16>,
|
|
73
73
|
# ...
|
|
74
74
|
]
|
|
75
75
|
>
|
|
@@ -78,7 +78,7 @@ This will return a `FetchedTranscript` object looking somewhat like this:
|
|
|
78
78
|
This object implements `Enumerable`, so you can iterate over it:
|
|
79
79
|
|
|
80
80
|
```ruby
|
|
81
|
-
api =
|
|
81
|
+
api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
|
|
82
82
|
fetched_transcript = api.fetch(video_id)
|
|
83
83
|
|
|
84
84
|
# is iterable
|
|
@@ -117,13 +117,13 @@ an array of hashes:
|
|
|
117
117
|
You can also use the convenience methods on the module directly:
|
|
118
118
|
|
|
119
119
|
```ruby
|
|
120
|
-
require '
|
|
120
|
+
require 'youtube_rb/transcript'
|
|
121
121
|
|
|
122
122
|
# Fetch a transcript
|
|
123
|
-
transcript =
|
|
123
|
+
transcript = YoutubeRb::Transcript.fetch(video_id)
|
|
124
124
|
|
|
125
125
|
# List available transcripts
|
|
126
|
-
transcript_list =
|
|
126
|
+
transcript_list = YoutubeRb::Transcript.list(video_id)
|
|
127
127
|
```
|
|
128
128
|
|
|
129
129
|
### Retrieve different languages
|
|
@@ -132,7 +132,7 @@ You can add the `languages` param if you want to make sure the transcripts are r
|
|
|
132
132
|
(it defaults to English).
|
|
133
133
|
|
|
134
134
|
```ruby
|
|
135
|
-
|
|
135
|
+
YoutubeRb::Transcript::YouTubeTranscriptApi.new.fetch(video_id, languages: ['de', 'en'])
|
|
136
136
|
```
|
|
137
137
|
|
|
138
138
|
It's an array of language codes in descending priority. In this example it will first try to fetch the German
|
|
@@ -142,7 +142,7 @@ which languages are available first, [have a look at `list`](#list-available-tra
|
|
|
142
142
|
If you only want one language, you still need to format the `languages` argument as an array:
|
|
143
143
|
|
|
144
144
|
```ruby
|
|
145
|
-
|
|
145
|
+
YoutubeRb::Transcript::YouTubeTranscriptApi.new.fetch(video_id, languages: ['de'])
|
|
146
146
|
```
|
|
147
147
|
|
|
148
148
|
### Preserve formatting
|
|
@@ -151,7 +151,7 @@ You can also add `preserve_formatting: true` if you'd like to keep HTML formatti
|
|
|
151
151
|
and `<b>` (bold).
|
|
152
152
|
|
|
153
153
|
```ruby
|
|
154
|
-
|
|
154
|
+
YoutubeRb::Transcript::YouTubeTranscriptApi.new.fetch(video_id, languages: ['de', 'en'], preserve_formatting: true)
|
|
155
155
|
```
|
|
156
156
|
|
|
157
157
|
### List available transcripts
|
|
@@ -159,7 +159,7 @@ Youtube::Transcript::Rb::YouTubeTranscriptApi.new.fetch(video_id, languages: ['d
|
|
|
159
159
|
If you want to list all transcripts which are available for a given video you can call:
|
|
160
160
|
|
|
161
161
|
```ruby
|
|
162
|
-
api =
|
|
162
|
+
api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
|
|
163
163
|
transcript_list = api.list(video_id)
|
|
164
164
|
```
|
|
165
165
|
|
|
@@ -220,9 +220,9 @@ puts translated_transcript.fetch
|
|
|
220
220
|
### By example
|
|
221
221
|
|
|
222
222
|
```ruby
|
|
223
|
-
require '
|
|
223
|
+
require 'youtube_rb/transcript'
|
|
224
224
|
|
|
225
|
-
api =
|
|
225
|
+
api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
|
|
226
226
|
|
|
227
227
|
# retrieve the available transcripts
|
|
228
228
|
transcript_list = api.list('video_id')
|
|
@@ -262,7 +262,7 @@ transcript = transcript_list.find_generated_transcript(['de', 'en'])
|
|
|
262
262
|
You can fetch transcripts for multiple videos at once:
|
|
263
263
|
|
|
264
264
|
```ruby
|
|
265
|
-
api =
|
|
265
|
+
api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
|
|
266
266
|
|
|
267
267
|
# Fetch multiple videos
|
|
268
268
|
transcripts = api.fetch_all(['video1', 'video2', 'video3'])
|
|
@@ -297,14 +297,14 @@ The `Formatters` module provides a few basic formatters:
|
|
|
297
297
|
Here is how to import from the `Formatters` module:
|
|
298
298
|
|
|
299
299
|
```ruby
|
|
300
|
-
require '
|
|
300
|
+
require 'youtube_rb/transcript'
|
|
301
301
|
|
|
302
302
|
# Some provided formatter classes, each outputs a different string format.
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
303
|
+
YoutubeRb::Formatters::JSONFormatter
|
|
304
|
+
YoutubeRb::Formatters::TextFormatter
|
|
305
|
+
YoutubeRb::Formatters::PrettyPrintFormatter
|
|
306
|
+
YoutubeRb::Formatters::WebVTTFormatter
|
|
307
|
+
YoutubeRb::Formatters::SRTFormatter
|
|
308
308
|
```
|
|
309
309
|
|
|
310
310
|
### Formatter Example
|
|
@@ -312,12 +312,12 @@ Youtube::Transcript::Rb::Formatters::SRTFormatter
|
|
|
312
312
|
Let's say we wanted to retrieve a transcript and store it to a JSON file. That would look something like this:
|
|
313
313
|
|
|
314
314
|
```ruby
|
|
315
|
-
require '
|
|
315
|
+
require 'youtube_rb/transcript'
|
|
316
316
|
|
|
317
|
-
api =
|
|
317
|
+
api = YoutubeRb::Transcript::YouTubeTranscriptApi.new
|
|
318
318
|
transcript = api.fetch(video_id)
|
|
319
319
|
|
|
320
|
-
formatter =
|
|
320
|
+
formatter = YoutubeRb::Formatters::JSONFormatter.new
|
|
321
321
|
|
|
322
322
|
# .format_transcript(transcript) turns the transcript into a JSON string.
|
|
323
323
|
json_formatted = formatter.format_transcript(transcript)
|
|
@@ -334,7 +334,7 @@ Since `JSONFormatter` leverages `JSON.generate` you can also forward keyword arg
|
|
|
334
334
|
`.format_transcript(transcript)` such as making your file output prettier:
|
|
335
335
|
|
|
336
336
|
```ruby
|
|
337
|
-
json_formatted =
|
|
337
|
+
json_formatted = YoutubeRb::Formatters::JSONFormatter.new.format_transcript(
|
|
338
338
|
transcript,
|
|
339
339
|
indent: ' ',
|
|
340
340
|
space: ' '
|
|
@@ -346,9 +346,9 @@ json_formatted = Youtube::Transcript::Rb::Formatters::JSONFormatter.new.format_t
|
|
|
346
346
|
You can also use the `FormatterLoader` to dynamically load formatters by name:
|
|
347
347
|
|
|
348
348
|
```ruby
|
|
349
|
-
require '
|
|
349
|
+
require 'youtube_rb/transcript'
|
|
350
350
|
|
|
351
|
-
loader =
|
|
351
|
+
loader = YoutubeRb::Formatters::FormatterLoader.new
|
|
352
352
|
|
|
353
353
|
# Load by type name: "json", "pretty", "text", "webvtt", "srt"
|
|
354
354
|
formatter = loader.load("json")
|
|
@@ -364,7 +364,7 @@ You can implement your own formatter class. Just inherit from the `Formatter` ba
|
|
|
364
364
|
`format_transcript` and `format_transcripts` methods which should ultimately return a string:
|
|
365
365
|
|
|
366
366
|
```ruby
|
|
367
|
-
class MyCustomFormatter <
|
|
367
|
+
class MyCustomFormatter < YoutubeRb::Formatters::Formatter
|
|
368
368
|
def format_transcript(transcript, **options)
|
|
369
369
|
# Do your custom work in here, but return a string.
|
|
370
370
|
'your processed output data as a string.'
|
|
@@ -382,28 +382,28 @@ end
|
|
|
382
382
|
The library provides a comprehensive set of exceptions for different error scenarios:
|
|
383
383
|
|
|
384
384
|
```ruby
|
|
385
|
-
require '
|
|
385
|
+
require 'youtube_rb/transcript'
|
|
386
386
|
|
|
387
387
|
begin
|
|
388
|
-
transcript =
|
|
389
|
-
rescue
|
|
388
|
+
transcript = YoutubeRb::Transcript.fetch(video_id)
|
|
389
|
+
rescue YoutubeRb::Transcript::TranscriptsDisabled => e
|
|
390
390
|
puts "Subtitles are disabled for this video"
|
|
391
|
-
rescue
|
|
391
|
+
rescue YoutubeRb::Transcript::NoTranscriptFound => e
|
|
392
392
|
puts "No transcript found for the requested languages"
|
|
393
393
|
puts e.requested_language_codes
|
|
394
|
-
rescue
|
|
394
|
+
rescue YoutubeRb::Transcript::NoTranscriptAvailable => e
|
|
395
395
|
puts "No transcripts are available for this video"
|
|
396
|
-
rescue
|
|
396
|
+
rescue YoutubeRb::Transcript::VideoUnavailable => e
|
|
397
397
|
puts "The video is no longer available"
|
|
398
|
-
rescue
|
|
398
|
+
rescue YoutubeRb::Transcript::TooManyRequests => e
|
|
399
399
|
puts "Rate limited by YouTube"
|
|
400
|
-
rescue
|
|
400
|
+
rescue YoutubeRb::Transcript::RequestBlocked => e
|
|
401
401
|
puts "Request blocked by YouTube"
|
|
402
|
-
rescue
|
|
402
|
+
rescue YoutubeRb::Transcript::IpBlocked => e
|
|
403
403
|
puts "Your IP has been blocked by YouTube"
|
|
404
|
-
rescue
|
|
404
|
+
rescue YoutubeRb::Transcript::PoTokenRequired => e
|
|
405
405
|
puts "PO token required - this is a YouTube limitation"
|
|
406
|
-
rescue
|
|
406
|
+
rescue YoutubeRb::Transcript::CouldNotRetrieveTranscript => e
|
|
407
407
|
puts "Could not retrieve transcript: #{e.message}"
|
|
408
408
|
end
|
|
409
409
|
```
|
|
@@ -456,11 +456,11 @@ http_client = Faraday.new do |conn|
|
|
|
456
456
|
conn.adapter Faraday.default_adapter
|
|
457
457
|
end
|
|
458
458
|
|
|
459
|
-
api =
|
|
459
|
+
api = YoutubeRb::Transcript::YouTubeTranscriptApi.new(http_client: http_client)
|
|
460
460
|
api.fetch(video_id)
|
|
461
461
|
|
|
462
462
|
# Share same connection between two instances
|
|
463
|
-
api_2 =
|
|
463
|
+
api_2 = YoutubeRb::Transcript::YouTubeTranscriptApi.new(http_client: http_client)
|
|
464
464
|
api_2.fetch(video_id)
|
|
465
465
|
```
|
|
466
466
|
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module YoutubeRb
  # Namespace for every transcript formatter shipped with the library.
  #
  # A formatter turns one transcript (or a list of transcripts) into a String.
  # The formatters only rely on duck typing: a transcript must respond to
  # +to_raw_data+ (JSON / pretty-print formatters) or be enumerable over
  # snippets responding to +text+, +start+ and +duration+ (text-based ones).
  module Formatters
    # Abstract base class. Subclasses must override both #format_transcript
    # and #format_transcripts.
    class Formatter
      # Format a single transcript.
      #
      # @param transcript [FetchedTranscript] The transcript to format
      # @param options [Hash] Additional formatting options
      # @return [String] The formatted transcript
      # @raise [NotImplementedError] Always, unless overridden by a subclass
      def format_transcript(transcript, **options)
        raise NotImplementedError, "Subclass must implement #format_transcript"
      end

      # Format multiple transcripts.
      #
      # @param transcripts [Array<FetchedTranscript>] The transcripts to format
      # @param options [Hash] Additional formatting options
      # @return [String] The formatted transcripts
      # @raise [NotImplementedError] Always, unless overridden by a subclass
      def format_transcripts(transcripts, **options)
        raise NotImplementedError, "Subclass must implement #format_transcripts"
      end
    end

    # Formats transcripts as pretty-printed Ruby data structures.
    class PrettyPrintFormatter < Formatter
      # Line width handed to PP when the caller does not pass +:width+.
      DEFAULT_WIDTH = 79

      # Format a single transcript as pretty-printed output.
      #
      # @param transcript [FetchedTranscript] The transcript to format
      # @param options [Hash] Only +:width+ (maximum line width) is honoured
      # @return [String] Pretty-printed transcript data
      def format_transcript(transcript, **options)
        pretty_print(transcript.to_raw_data, options)
      end

      # Format multiple transcripts as pretty-printed output.
      #
      # @param transcripts [Array<FetchedTranscript>] The transcripts to format
      # @param options [Hash] Only +:width+ (maximum line width) is honoured
      # @return [String] Pretty-printed array of transcript data
      def format_transcripts(transcripts, **options)
        pretty_print(transcripts.map(&:to_raw_data), options)
      end

      private

      # Pretty-print +data+ into a fresh, unfrozen String.
      #
      # @param data [Object] The raw data to render
      # @param options [Hash] Formatting options (+:width+)
      # @return [String] The pretty-printed representation
      def pretty_print(data, options)
        # Loaded lazily so that only users of this formatter pay for it.
        require "pp"
        # +"" yields a mutable buffer despite frozen_string_literal.
        PP.pp(data, +"", options[:width] || DEFAULT_WIDTH)
      end
    end

    # Formats transcripts as JSON.
    class JSONFormatter < Formatter
      # Format a single transcript as JSON.
      #
      # @param transcript [FetchedTranscript] The transcript to format
      # @param options [Hash] Options passed to JSON.generate (e.g., :indent, :space)
      # @return [String] JSON representation of the transcript
      def format_transcript(transcript, **options)
        JSON.generate(transcript.to_raw_data, options)
      end

      # Format multiple transcripts as a JSON array.
      #
      # @param transcripts [Array<FetchedTranscript>] The transcripts to format
      # @param options [Hash] Options passed to JSON.generate
      # @return [String] JSON array representation of the transcripts
      def format_transcripts(transcripts, **options)
        JSON.generate(transcripts.map(&:to_raw_data), options)
      end
    end

    # Formats transcripts as plain text (snippet text only, no timestamps).
    class TextFormatter < Formatter
      # Format a single transcript as plain text.
      #
      # @param transcript [FetchedTranscript] The transcript to format
      # @param options [Hash] Unused options
      # @return [String] Snippet texts joined by single newlines
      def format_transcript(transcript, **options)
        transcript.map(&:text).join("\n")
      end

      # Format multiple transcripts as plain text.
      #
      # @param transcripts [Array<FetchedTranscript>] The transcripts to format
      # @param options [Hash] Forwarded to #format_transcript
      # @return [String] Formatted transcripts separated by triple newlines
      def format_transcripts(transcripts, **options)
        transcripts.map { |t| format_transcript(t, **options) }.join("\n\n\n")
      end
    end

    # Base class for timestamp-based formatters (SRT, WebVTT).
    #
    # Subclasses supply #format_timestamp, #format_transcript_header and
    # #format_transcript_helper; this class computes the cue time ranges.
    class TextBasedFormatter < TextFormatter
      # Format a single transcript with timestamps.
      #
      # @param transcript [FetchedTranscript] The transcript to format
      # @param options [Hash] Unused options
      # @return [String] Formatted transcript with timestamps
      def format_transcript(transcript, **options)
        snippets = transcript.to_a

        cues = snippets.each_with_index.map do |snippet, i|
          end_time = snippet.start + snippet.duration

          # Cut the cue short when the next snippet starts before this one
          # would end, so consecutive cues never overlap.
          following = snippets[i + 1]
          end_time = following.start if following && following.start < end_time

          time_text = "#{seconds_to_timestamp(snippet.start)} --> #{seconds_to_timestamp(end_time)}"
          format_transcript_helper(i, time_text, snippet)
        end

        format_transcript_header(cues)
      end

      protected

      # Format a timestamp from its components.
      #
      # @param hours [Integer] Hours component
      # @param mins [Integer] Minutes component (0..59)
      # @param secs [Integer] Seconds component (0..59)
      # @param ms [Integer] Milliseconds component (0..999)
      # @return [String] Formatted timestamp
      # @raise [NotImplementedError] Always, unless overridden by a subclass
      def format_timestamp(hours, mins, secs, ms)
        raise NotImplementedError, "Subclass must implement #format_timestamp"
      end

      # Wrap the formatted entries into the final document.
      #
      # @param lines [Array<String>] The formatted entries
      # @return [String] The complete formatted transcript
      # @raise [NotImplementedError] Always, unless overridden by a subclass
      def format_transcript_header(lines)
        raise NotImplementedError, "Subclass must implement #format_transcript_header"
      end

      # Format a single transcript entry.
      #
      # @param index [Integer] The entry index (0-based)
      # @param time_text [String] The formatted time range
      # @param snippet [TranscriptSnippet] The snippet to format
      # @return [String] The formatted entry
      # @raise [NotImplementedError] Always, unless overridden by a subclass
      def format_transcript_helper(index, time_text, snippet)
        raise NotImplementedError, "Subclass must implement #format_transcript_helper"
      end

      private

      # Convert a time in seconds to a timestamp string.
      #
      # The value is rounded once to whole milliseconds and then split with
      # integer arithmetic. Rounding the fractional part separately from the
      # (truncated) seconds could produce a millisecond field of 1000, e.g.
      # 1.9996s rendered as "00:00:01,1000" instead of "00:00:02,000".
      #
      # @param time [Numeric, #to_f] Time in seconds
      # @return [String] Timestamp as produced by #format_timestamp
      def seconds_to_timestamp(time)
        total_ms = (time.to_f * 1000).round
        total_secs, ms = total_ms.divmod(1000)
        total_mins, secs = total_secs.divmod(60)
        hours, mins = total_mins.divmod(60)

        format_timestamp(hours, mins, secs, ms)
      end
    end

    # Formats transcripts in the SRT (SubRip) subtitle format.
    #
    # @example SRT format
    #   1
    #   00:00:00,000 --> 00:00:02,500
    #   Hello world
    #
    #   2
    #   00:00:02,500 --> 00:00:05,000
    #   This is a test
    #
    class SRTFormatter < TextBasedFormatter
      protected

      # SRT separates seconds and milliseconds with a comma.
      def format_timestamp(hours, mins, secs, ms)
        format("%02d:%02d:%02d,%03d", hours, mins, secs, ms)
      end

      # Entries are separated by a blank line; the document ends with a newline.
      def format_transcript_header(lines)
        "#{lines.join("\n\n")}\n"
      end

      # Each SRT entry starts with its 1-based sequence number.
      def format_transcript_helper(index, time_text, snippet)
        "#{index + 1}\n#{time_text}\n#{snippet.text}"
      end
    end

    # Formats transcripts in the WebVTT (Web Video Text Tracks) format.
    #
    # @example WebVTT format
    #   WEBVTT
    #
    #   00:00:00.000 --> 00:00:02.500
    #   Hello world
    #
    #   00:00:02.500 --> 00:00:05.000
    #   This is a test
    #
    class WebVTTFormatter < TextBasedFormatter
      protected

      # WebVTT separates seconds and milliseconds with a period.
      def format_timestamp(hours, mins, secs, ms)
        format("%02d:%02d:%02d.%03d", hours, mins, secs, ms)
      end

      # The document starts with the mandatory "WEBVTT" signature line.
      def format_transcript_header(lines)
        "WEBVTT\n\n#{lines.join("\n\n")}\n"
      end

      # WebVTT entries carry no sequence number, so +index+ is ignored.
      def format_transcript_helper(index, time_text, snippet)
        "#{time_text}\n#{snippet.text}"
      end
    end

    # Utility class that instantiates formatters by type name.
    class FormatterLoader
      # Mapping of format names to formatter classes.
      TYPES = {
        "json" => JSONFormatter,
        "pretty" => PrettyPrintFormatter,
        "text" => TextFormatter,
        "webvtt" => WebVTTFormatter,
        "srt" => SRTFormatter
      }.freeze

      # Error raised when an unknown formatter type is requested.
      class UnknownFormatterType < StandardError
        # @param formatter_type [String] The unsupported type that was requested
        def initialize(formatter_type)
          super(
            "The format '#{formatter_type}' is not supported. " \
            "Choose one of the following formats: #{TYPES.keys.join(', ')}"
          )
        end
      end

      # Load a formatter by type name.
      #
      # @param formatter_type [String, Symbol] The formatter type
      #   (json, pretty, text, webvtt, srt); converted with +to_s+
      # @return [Formatter] A new instance of the requested formatter
      # @raise [UnknownFormatterType] If the formatter type is not supported
      #
      # @example
      #   loader = FormatterLoader.new
      #   formatter = loader.load("json")
      #   output = formatter.format_transcript(transcript)
      #
      def load(formatter_type = "pretty")
        formatter_type = formatter_type.to_s
        raise UnknownFormatterType, formatter_type unless TYPES.key?(formatter_type)

        TYPES[formatter_type].new
      end
    end
  end
end
|