youtube-transcript-rb 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of youtube-transcript-rb might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/README.md +42 -42
- data/lib/youtube-transcript-rb.rb +3 -0
- data/lib/youtube_rb/transcript/api.rb +148 -0
- data/lib/youtube_rb/transcript/errors.rb +215 -0
- data/lib/youtube_rb/transcript/formatters.rb +267 -0
- data/lib/youtube_rb/transcript/settings.rb +26 -0
- data/lib/youtube_rb/transcript/transcript.rb +237 -0
- data/lib/youtube_rb/transcript/transcript_list.rb +168 -0
- data/lib/youtube_rb/transcript/transcript_list_fetcher.rb +223 -0
- data/lib/youtube_rb/transcript/transcript_parser.rb +81 -0
- data/lib/{youtube/transcript/rb → youtube_rb/transcript}/version.rb +2 -4
- data/lib/youtube_rb/transcript.rb +35 -0
- data/sig/youtube_rb/transcript.rbs +6 -0
- data/spec/api_spec.rb +20 -20
- data/spec/errors_spec.rb +39 -39
- data/spec/formatters_spec.rb +36 -36
- data/spec/integration_spec.rb +32 -32
- data/spec/settings_spec.rb +16 -16
- data/spec/spec_helper.rb +1 -1
- data/spec/transcript_list_fetcher_spec.rb +27 -27
- data/spec/transcript_list_spec.rb +6 -6
- data/spec/transcript_parser_spec.rb +3 -3
- data/spec/transcript_spec.rb +16 -16
- metadata +13 -12
- data/lib/youtube/transcript/rb/api.rb +0 -150
- data/lib/youtube/transcript/rb/errors.rb +0 -217
- data/lib/youtube/transcript/rb/formatters.rb +0 -269
- data/lib/youtube/transcript/rb/settings.rb +0 -28
- data/lib/youtube/transcript/rb/transcript.rb +0 -239
- data/lib/youtube/transcript/rb/transcript_list.rb +0 -170
- data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +0 -225
- data/lib/youtube/transcript/rb/transcript_parser.rb +0 -83
- data/lib/youtube/transcript/rb.rb +0 -37
- data/sig/youtube/transcript/rb.rbs +0 -8
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module YoutubeRb
|
|
6
|
+
module Transcript
|
|
7
|
+
# Module containing all transcript formatters
|
|
8
|
+
module Formatters
|
|
9
|
+
# Abstract parent of every transcript formatter. Concrete formatters
# subclass it and override both #format_transcript and #format_transcripts.
class Formatter
  # Render one transcript.
  #
  # @param transcript [FetchedTranscript] The transcript to render
  # @param options [Hash] Formatter-specific options
  # @return [String] The rendered transcript
  # @raise [NotImplementedError] Always, unless a subclass overrides it
  def format_transcript(transcript, **options)
    # __method__ expands to the name of this method, giving "#format_transcript".
    raise NotImplementedError, "Subclass must implement ##{__method__}"
  end

  # Render several transcripts at once.
  #
  # @param transcripts [Array<FetchedTranscript>] The transcripts to render
  # @param options [Hash] Formatter-specific options
  # @return [String] The rendered transcripts
  # @raise [NotImplementedError] Always, unless a subclass overrides it
  def format_transcripts(transcripts, **options)
    raise NotImplementedError, "Subclass must implement ##{__method__}"
  end
end
|
|
30
|
+
|
|
31
|
+
# Renders transcripts as pretty-printed Ruby data structures, using the
# value returned by each transcript's #to_raw_data.
class PrettyPrintFormatter < Formatter
  # Pretty-print the raw data of a single transcript.
  #
  # @param transcript [FetchedTranscript] The transcript to render
  # @param options [Hash] Accepts :width, the maximum line width (79 when absent)
  # @return [String] Pretty-printed transcript data
  def format_transcript(transcript, **options)
    pretty_print(transcript.to_raw_data, options)
  end

  # Pretty-print the raw data of several transcripts as one array.
  #
  # @param transcripts [Array<FetchedTranscript>] The transcripts to render
  # @param options [Hash] Accepts :width, the maximum line width (79 when absent)
  # @return [String] Pretty-printed transcripts data
  def format_transcripts(transcripts, **options)
    pretty_print(transcripts.map { |transcript| transcript.to_raw_data }, options)
  end

  private

  # Shared rendering step: pretty-print +data+ into a fresh mutable string.
  #
  # @param data [Object] The structure to print
  # @param options [Hash] Options hash; only :width is consulted
  # @return [String] The pretty-printed text
  def pretty_print(data, options)
    require "pp" # loaded lazily, only when this formatter is actually used
    line_width = options[:width] || 79
    PP.pp(data, "".dup, line_width)
  end
end
|
|
54
|
+
|
|
55
|
+
# Renders transcripts as JSON documents built from #to_raw_data.
class JSONFormatter < Formatter
  # Serialize a single transcript to JSON.
  #
  # @param transcript [FetchedTranscript] The transcript to serialize
  # @param options [Hash] Forwarded unchanged to JSON.generate (e.g., :indent, :space)
  # @return [String] JSON representation of the transcript
  def format_transcript(transcript, **options)
    raw_data = transcript.to_raw_data
    JSON.generate(raw_data, options)
  end

  # Serialize several transcripts to a single JSON array.
  #
  # @param transcripts [Array<FetchedTranscript>] The transcripts to serialize
  # @param options [Hash] Forwarded unchanged to JSON.generate
  # @return [String] JSON array with one entry per transcript
  def format_transcripts(transcripts, **options)
    raw_data = transcripts.map { |transcript| transcript.to_raw_data }
    JSON.generate(raw_data, options)
  end
end
|
|
76
|
+
|
|
77
|
+
# Renders transcripts as plain text: snippet text only, no timestamps.
class TextFormatter < Formatter
  # Join the text of every snippet, one snippet per line.
  #
  # @param transcript [FetchedTranscript] The transcript to render
  # @param options [Hash] Unused options
  # @return [String] Snippet texts separated by single newlines
  def format_transcript(transcript, **options)
    texts = transcript.map { |snippet| snippet.text }
    texts.join("\n")
  end

  # Render each transcript with #format_transcript and join the results.
  #
  # @param transcripts [Array<FetchedTranscript>] The transcripts to render
  # @param options [Hash] Forwarded to #format_transcript
  # @return [String] Rendered transcripts separated by triple newlines
  def format_transcripts(transcripts, **options)
    rendered = transcripts.map { |transcript| format_transcript(transcript, **options) }
    rendered.join("\n\n\n")
  end
end
|
|
97
|
+
|
|
98
|
+
# Base class for timestamp-based formatters (SRT, WebVTT)
|
|
99
|
+
class TextBasedFormatter < TextFormatter
|
|
100
|
+
# Render one transcript as a sequence of timestamped entries.
#
# Each entry normally ends at start + duration; when the following snippet
# begins earlier than that, the entry is cut short at the following
# snippet's start so that consecutive time ranges never overlap.
#
# @param transcript [FetchedTranscript] The transcript to format
# @param options [Hash] Unused options
# @return [String] Formatted transcript with timestamps
def format_transcript(transcript, **options)
  entries = transcript.to_a

  cues = entries.each_with_index.map do |entry, position|
    following = entries[position + 1]
    finish = entry.start + entry.duration
    finish = following.start if following && following.start < finish

    range_text = "#{seconds_to_timestamp(entry.start)} --> #{seconds_to_timestamp(finish)}"
    format_transcript_helper(position, range_text, entry)
  end

  format_transcript_header(cues)
end
|
|
123
|
+
|
|
124
|
+
protected
|
|
125
|
+
|
|
126
|
+
# Hook: build the timestamp text from its already-split components.
#
# @param hours [Integer] Hours component
# @param mins [Integer] Minutes component
# @param secs [Integer] Seconds component
# @param ms [Integer] Milliseconds component
# @return [String] Formatted timestamp
# @raise [NotImplementedError] Always, unless a subclass overrides it
def format_timestamp(hours, mins, secs, ms)
  # __method__ expands to this method's own name.
  raise NotImplementedError, "Subclass must implement ##{__method__}"
end
|
|
136
|
+
|
|
137
|
+
# Hook: assemble the already-formatted entries into the final document,
# adding whatever header or wrapper the concrete format needs.
#
# @param lines [Array<String>] The formatted entries
# @return [String] The complete formatted transcript
# @raise [NotImplementedError] Always, unless a subclass overrides it
def format_transcript_header(lines)
  # __method__ expands to this method's own name.
  raise NotImplementedError, "Subclass must implement ##{__method__}"
end
|
|
144
|
+
|
|
145
|
+
# Hook: render one entry of the transcript.
#
# @param index [Integer] The entry index (0-based)
# @param time_text [String] The formatted time range
# @param snippet [TranscriptSnippet] The snippet to render
# @return [String] The formatted entry
# @raise [NotImplementedError] Always, unless a subclass overrides it
def format_transcript_helper(index, time_text, snippet)
  # __method__ expands to this method's own name.
  raise NotImplementedError, "Subclass must implement ##{__method__}"
end
|
|
154
|
+
|
|
155
|
+
private
|
|
156
|
+
|
|
157
|
+
# Convert a time offset in seconds to a timestamp string.
#
# The offset is rounded to whole milliseconds first and only then split
# into hours/minutes/seconds/milliseconds. Rounding the fractional part on
# its own can yield 1000 ms without carrying into the seconds (1.9996 s
# would render as "00:00:01,1000" instead of "00:00:02,000").
#
# @param time [Float] Time in seconds (anything responding to #to_f)
# @return [String] Timestamp produced by #format_timestamp
def seconds_to_timestamp(time)
  total_ms = (time.to_f * 1000).round

  total_secs, ms = total_ms.divmod(1000)
  total_mins, secs = total_secs.divmod(60)
  hours, mins = total_mins.divmod(60)

  format_timestamp(hours, mins, secs, ms)
end
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
# Renders transcripts in the SRT (SubRip) subtitle format: numbered
# entries, comma as the millisecond separator, blank line between entries.
#
# @example SRT format
#   1
#   00:00:00,000 --> 00:00:02,500
#   Hello world
#
#   2
#   00:00:02,500 --> 00:00:05,000
#   This is a test
#
class SRTFormatter < TextBasedFormatter
  protected

  # @param hours [Integer] Hours component
  # @param mins [Integer] Minutes component
  # @param secs [Integer] Seconds component
  # @param ms [Integer] Milliseconds component
  # @return [String] Timestamp in HH:MM:SS,mmm form
  def format_timestamp(hours, mins, secs, ms)
    "%02d:%02d:%02d,%03d" % [hours, mins, secs, ms]
  end

  # @param lines [Array<String>] The formatted entries
  # @return [String] Entries separated by blank lines, newline-terminated
  def format_transcript_header(lines)
    lines.join("\n\n") << "\n"
  end

  # @param index [Integer] The entry index (0-based; SRT numbering starts at 1)
  # @param time_text [String] The formatted time range
  # @param snippet [TranscriptSnippet] The snippet to render
  # @return [String] Sequence number, time range and text on separate lines
  def format_transcript_helper(index, time_text, snippet)
    sequence_number = index + 1
    "#{sequence_number}\n#{time_text}\n#{snippet.text}"
  end
end
|
|
198
|
+
|
|
199
|
+
# Renders transcripts in the WebVTT (Web Video Text Tracks) format: a
# "WEBVTT" header, unnumbered entries, dot as the millisecond separator.
#
# @example WebVTT format
#   WEBVTT
#
#   00:00:00.000 --> 00:00:02.500
#   Hello world
#
#   00:00:02.500 --> 00:00:05.000
#   This is a test
#
class WebVTTFormatter < TextBasedFormatter
  protected

  # @param hours [Integer] Hours component
  # @param mins [Integer] Minutes component
  # @param secs [Integer] Seconds component
  # @param ms [Integer] Milliseconds component
  # @return [String] Timestamp in HH:MM:SS.mmm form
  def format_timestamp(hours, mins, secs, ms)
    "%02d:%02d:%02d.%03d" % [hours, mins, secs, ms]
  end

  # @param lines [Array<String>] The formatted entries
  # @return [String] Header, then entries separated by blank lines, newline-terminated
  def format_transcript_header(lines)
    document = +"WEBVTT\n\n"
    document << lines.join("\n\n") << "\n"
  end

  # @param index [Integer] The entry index (unused: WebVTT entries are not numbered here)
  # @param time_text [String] The formatted time range
  # @param snippet [TranscriptSnippet] The snippet to render
  # @return [String] Time range and text on separate lines
  def format_transcript_helper(index, time_text, snippet)
    [time_text, "#{snippet.text}"].join("\n")
  end
end
|
|
225
|
+
|
|
226
|
+
# Looks up a formatter class by its short type name and instantiates it.
class FormatterLoader
  # Supported type names mapped to the formatter class that handles them.
  TYPES = {
    "json" => JSONFormatter,
    "pretty" => PrettyPrintFormatter,
    "text" => TextFormatter,
    "webvtt" => WebVTTFormatter,
    "srt" => SRTFormatter
  }.freeze

  # Raised when #load is asked for a type name missing from TYPES.
  class UnknownFormatterType < StandardError
    # @param formatter_type [String] The rejected type name
    def initialize(formatter_type)
      supported = TYPES.keys.join(", ")
      super(
        "The format '#{formatter_type}' is not supported. " \
        "Choose one of the following formats: #{supported}"
      )
    end
  end

  # Instantiate the formatter registered under the given type name.
  #
  # @param formatter_type [String] The formatter type (json, pretty, text, webvtt, srt);
  #   converted with #to_s before the lookup
  # @return [Formatter] A new instance of the requested formatter
  # @raise [UnknownFormatterType] If the formatter type is not supported
  #
  # @example
  #   loader = FormatterLoader.new
  #   formatter = loader.load("json")
  #   output = formatter.format_transcript(transcript)
  #
  def load(formatter_type = "pretty")
    formatter_class = TYPES.fetch(formatter_type.to_s) do |unknown_name|
      raise UnknownFormatterType, unknown_name
    end

    formatter_class.new
  end
end
|
|
265
|
+
end
|
|
266
|
+
end
|
|
267
|
+
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module YoutubeRb
  module Transcript
    # YouTube watch URL template; fill in with Kernel#format.
    # @example
    #   format(WATCH_URL, video_id: "abc123")
    #   # => "https://www.youtube.com/watch?v=abc123"
    WATCH_URL = "https://www.youtube.com/watch?v=%<video_id>s"

    # YouTube Innertube API URL template; fill in with Kernel#format.
    # @example
    #   format(INNERTUBE_API_URL, api_key: "key123")
    #   # => "https://www.youtube.com/youtubei/v1/player?key=key123"
    INNERTUBE_API_URL = "https://www.youtube.com/youtubei/v1/player?key=%<api_key>s"

    # Innertube API context for the Android client.
    # Used in POST requests to the Innertube API.
    #
    # Both levels are frozen: Hash#freeze is shallow, so freezing only the
    # outer hash would leave the nested "client" hash of this shared
    # constant open to accidental mutation.
    INNERTUBE_CONTEXT = {
      "client" => {
        "clientName" => "ANDROID",
        "clientVersion" => "20.10.38"
      }.freeze
    }.freeze
  end
end
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module YoutubeRb
|
|
4
|
+
module Transcript
|
|
5
|
+
# A language a transcript can be translated into: display name plus code.
class TranslationLanguage
  # @!attribute [r] language
  #   @return [String] the language name (e.g., "Spanish")
  # @!attribute [r] language_code
  #   @return [String] the language code (e.g., "es")
  attr_reader :language, :language_code

  # @param language [String] the language name
  # @param language_code [String] the language code
  def initialize(language:, language_code:)
    @language, @language_code = language, language_code
  end
end
|
|
20
|
+
|
|
21
|
+
# One segment of a transcript: its text and where it sits on the timeline.
class TranscriptSnippet
  # @!attribute [r] text
  #   @return [String] the text content of the snippet
  # @!attribute [r] start
  #   @return [Float] the start time in seconds
  # @!attribute [r] duration
  #   @return [Float] the duration in seconds
  attr_reader :text, :start, :duration

  # Both timing values are converted with #to_f, so numeric strings and
  # integers are accepted.
  #
  # @param text [String] the text content
  # @param start [Float] the start time in seconds
  # @param duration [Float] the duration in seconds
  def initialize(text:, start:, duration:)
    @text = text
    @start, @duration = start.to_f, duration.to_f
  end

  # Hash form of the snippet, keyed by strings.
  #
  # @return [Hash] hash with "text", "start" and "duration" keys, in that order
  def to_h
    %w[text start duration].zip([@text, @start, @duration]).to_h
  end
end
|
|
51
|
+
|
|
52
|
+
# A downloaded transcript: identifying metadata plus its ordered snippets.
# Includes Enumerable, iterating over the snippets.
class FetchedTranscript
  include Enumerable

  # @!attribute [r] video_id
  #   @return [String] the video ID
  # @!attribute [r] language
  #   @return [String] the language name (e.g., "English")
  # @!attribute [r] language_code
  #   @return [String] the language code (e.g., "en")
  # @!attribute [r] is_generated
  #   @return [Boolean] whether the transcript was auto-generated
  # @!attribute [r] snippets
  #   @return [Array<TranscriptSnippet>] the transcript snippets
  attr_reader :video_id, :language, :language_code, :is_generated, :snippets

  # @param video_id [String] the YouTube video ID
  # @param language [String] the language name
  # @param language_code [String] the language code
  # @param is_generated [Boolean] whether auto-generated
  # @param snippets [Array<TranscriptSnippet>] the snippets (optional, empty by default)
  def initialize(video_id:, language:, language_code:, is_generated:, snippets: [])
    @video_id = video_id
    @language = language
    @language_code = language_code
    @is_generated = is_generated
    @snippets = snippets
  end

  # Append a snippet; returns the transcript itself so calls can be chained.
  #
  # @param snippet [TranscriptSnippet] the snippet to add
  # @return [self]
  def add_snippet(snippet)
    @snippets.push(snippet)
    self
  end

  # Iterate over the snippets in order (delegates to the snippet array).
  #
  # @yield [TranscriptSnippet] each snippet in the transcript
  def each(&block)
    @snippets.each(&block)
  end

  # Look up a snippet by position.
  #
  # @param index [Integer] the index
  # @return [TranscriptSnippet] the snippet at the given index
  def [](index)
    @snippets.slice(index)
  end

  # Number of snippets in the transcript.
  #
  # @return [Integer] the count of snippets
  def length
    @snippets.size
  end
  alias_method :size, :length

  # Raw form of the transcript: each snippet converted with #to_h.
  #
  # @return [Array<Hash>] array of snippet hashes
  def to_raw_data
    @snippets.map { |snippet| snippet.to_h }
  end

  # Whether the transcript was auto-generated.
  #
  # @return [Boolean]
  def generated?
    @is_generated
  end
end
|
|
125
|
+
|
|
126
|
+
# Describes one available transcript of a video and knows how to download
# it (#fetch) or derive a translated variant of it (#translate).
class TranscriptMetadata
  # @!attribute [r] video_id
  #   @return [String] the video ID
  # @!attribute [r] language
  #   @return [String] the language name
  # @!attribute [r] language_code
  #   @return [String] the language code
  # @!attribute [r] is_generated
  #   @return [Boolean] whether auto-generated
  # @!attribute [r] translation_languages
  #   @return [Array<TranslationLanguage>] available translation languages
  attr_reader :video_id, :language, :language_code, :is_generated, :translation_languages

  # @param http_client [Faraday::Connection] the HTTP client
  # @param video_id [String] the YouTube video ID
  # @param url [String] the transcript URL
  # @param language [String] the language name
  # @param language_code [String] the language code
  # @param is_generated [Boolean] whether auto-generated
  # @param translation_languages [Array<TranslationLanguage>] available translations
  def initialize(http_client:, video_id:, url:, language:, language_code:, is_generated:, translation_languages:)
    @http_client = http_client
    @video_id = video_id
    @url = url
    @language = language
    @language_code = language_code
    @is_generated = is_generated
    @translation_languages = translation_languages
    # Lookup table: language code => language name.
    @translation_languages_dict =
      translation_languages.map { |entry| [entry.language_code, entry.language] }.to_h
  end

  # Download and parse the transcript.
  #
  # @param preserve_formatting [Boolean] whether to preserve HTML formatting
  # @return [FetchedTranscript] the fetched transcript
  # @raise [PoTokenRequired] if the URL contains "&exp=xpe"
  # @raise [IpBlocked] if the server answers with HTTP 429
  # @raise [YouTubeRequestFailed] for any other HTTP status in 400..599
  def fetch(preserve_formatting: false)
    raise PoTokenRequired, @video_id if @url.include?("&exp=xpe")

    response = @http_client.get(@url)
    raise_http_errors(response)

    parsed_snippets = TranscriptParser.new(preserve_formatting: preserve_formatting).parse(response.body)

    FetchedTranscript.new(
      video_id: @video_id,
      language: @language,
      language_code: @language_code,
      is_generated: @is_generated,
      snippets: parsed_snippets
    )
  end

  # Whether at least one translation language is available.
  #
  # @return [Boolean]
  def translatable?
    return false if @translation_languages.empty?

    true
  end
  alias_method :is_translatable, :translatable?

  # Build the metadata of this transcript translated into another language.
  # The result points at this transcript's URL with "&tlang=<code>"
  # appended, is flagged as generated and offers no further translations.
  #
  # @param language_code [String] the target language code
  # @return [TranscriptMetadata] a new TranscriptMetadata object for the translated version
  # @raise [NotTranslatable] if the transcript cannot be translated
  # @raise [TranslationLanguageNotAvailable] if the language is not available
  def translate(language_code)
    raise NotTranslatable, @video_id unless translatable?

    target_language = @translation_languages_dict.fetch(language_code) do
      raise TranslationLanguageNotAvailable, @video_id
    end

    TranscriptMetadata.new(
      http_client: @http_client,
      video_id: @video_id,
      url: "#{@url}&tlang=#{language_code}",
      language: target_language,
      language_code: language_code,
      is_generated: true,
      translation_languages: []
    )
  end

  # Whether the transcript was auto-generated.
  #
  # @return [Boolean]
  def generated?
    @is_generated
  end

  # Short description: code, quoted name, and a marker when translatable.
  #
  # @return [String]
  def to_s
    label = %(#{@language_code} ("#{@language}"))
    translatable? ? "#{label}[TRANSLATABLE]" : label
  end

  private

  # Turn an error status of the HTTP response into the matching exception.
  #
  # @param response [#status] the HTTP response
  # @return [nil] when the status is outside 400..599
  # @raise [IpBlocked] for status 429
  # @raise [YouTubeRequestFailed] for any other status in 400..599
  def raise_http_errors(response)
    status = response.status
    raise IpBlocked, @video_id if status == 429
    return unless (400..599).cover?(status)

    raise YouTubeRequestFailed.new(@video_id, StandardError.new("HTTP #{status}"))
  end
end
|
|
236
|
+
end
|
|
237
|
+
end
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module YoutubeRb
|
|
4
|
+
module Transcript
|
|
5
|
+
# The transcripts available for one YouTube video.
# Includes Enumerable (manually created transcripts first, then generated
# ones) and offers lookups by language code.
class TranscriptList
  include Enumerable

  # @return [String] the video ID this TranscriptList is for
  attr_reader :video_id

  # Build a TranscriptList from captions JSON data.
  #
  # Tracks whose "kind" is "asr" are stored as generated, all others as
  # manually created. "&fmt=srv3" is removed from each track's base URL,
  # and only tracks flagged "isTranslatable" receive the translation
  # languages.
  #
  # @param http_client [Faraday::Connection] the HTTP client for fetching transcripts
  # @param video_id [String] the YouTube video ID
  # @param captions_json [Hash] the captions JSON parsed from YouTube
  # @return [TranscriptList] the created TranscriptList
  def self.build(http_client:, video_id:, captions_json:)
    language_entries = captions_json["translationLanguages"] || []
    translation_languages = language_entries.map do |entry|
      TranslationLanguage.new(
        language: entry.dig("languageName", "runs", 0, "text") || "",
        language_code: entry["languageCode"]
      )
    end

    manual = {}
    generated = {}
    tracks = captions_json["captionTracks"] || []

    tracks.each do |track|
      asr = track.fetch("kind", "") == "asr"
      code = track["languageCode"]
      offered_translations = track.fetch("isTranslatable", false) ? translation_languages : []

      metadata = TranscriptMetadata.new(
        http_client: http_client,
        video_id: video_id,
        url: track["baseUrl"].to_s.gsub("&fmt=srv3", ""),
        language: track.dig("name", "runs", 0, "text") || "",
        language_code: code,
        is_generated: asr,
        translation_languages: offered_translations
      )

      (asr ? generated : manual)[code] = metadata
    end

    new(
      video_id: video_id,
      manually_created_transcripts: manual,
      generated_transcripts: generated,
      translation_languages: translation_languages
    )
  end

  # @param video_id [String] the YouTube video ID
  # @param manually_created_transcripts [Hash<String, TranscriptMetadata>] manually created transcripts by language code
  # @param generated_transcripts [Hash<String, TranscriptMetadata>] auto-generated transcripts by language code
  # @param translation_languages [Array<TranslationLanguage>] available translation languages
  def initialize(video_id:, manually_created_transcripts:, generated_transcripts:, translation_languages:)
    @video_id = video_id
    @manually_created_transcripts = manually_created_transcripts
    @generated_transcripts = generated_transcripts
    @translation_languages = translation_languages
  end

  # Iterate over all transcripts (manually created first, then generated).
  #
  # @yield [TranscriptMetadata] each available transcript
  # @return [Enumerator] if no block given
  def each(&block)
    return enum_for(:each) unless block

    @manually_created_transcripts.each_value(&block)
    @generated_transcripts.each_value(&block)
  end

  # Find a transcript for the given language codes.
  # For each code, a manually created transcript wins over a generated one.
  #
  # @param language_codes [Array<String>] language codes in descending priority
  # @return [TranscriptMetadata] the found transcript
  # @raise [NoTranscriptFound] if no transcript matches the requested languages
  def find_transcript(language_codes)
    search_order = [@manually_created_transcripts, @generated_transcripts]
    find_transcript_in(language_codes, search_order)
  end

  # Find an automatically generated transcript for the given language codes.
  #
  # @param language_codes [Array<String>] language codes in descending priority
  # @return [TranscriptMetadata] the found transcript
  # @raise [NoTranscriptFound] if no generated transcript matches
  def find_generated_transcript(language_codes)
    find_transcript_in(language_codes, [@generated_transcripts])
  end

  # Find a manually created transcript for the given language codes.
  #
  # @param language_codes [Array<String>] language codes in descending priority
  # @return [TranscriptMetadata] the found transcript
  # @raise [NoTranscriptFound] if no manually created transcript matches
  def find_manually_created_transcript(language_codes)
    find_transcript_in(language_codes, [@manually_created_transcripts])
  end

  # Human-readable overview of the available transcripts and translations.
  #
  # @return [String] multi-line, newline-terminated description
  def to_s
    sections = [
      "For this video (#{@video_id}) transcripts are available in the following languages:",
      "",
      "(MANUALLY CREATED)",
      format_language_list(@manually_created_transcripts.values),
      "",
      "(GENERATED)",
      format_language_list(@generated_transcripts.values),
      "",
      "(TRANSLATION LANGUAGES)",
      format_translation_languages
    ]
    sections.join("\n") << "\n"
  end

  private

  # Return the first match, trying the language codes in order and, for
  # each code, the dictionaries in order.
  #
  # @param language_codes [Array<String>] language codes to search for
  # @param transcript_dicts [Array<Hash>] transcript dictionaries to search
  # @return [TranscriptMetadata] the found transcript
  # @raise [NoTranscriptFound] if no transcript matches
  def find_transcript_in(language_codes, transcript_dicts)
    language_codes.each do |code|
      holder = transcript_dicts.find { |dict| dict.key?(code) }
      return holder[code] if holder
    end

    raise NoTranscriptFound.new(@video_id, language_codes, self)
  end

  # One " - <transcript>" line per transcript.
  #
  # @param transcripts [Array<TranscriptMetadata>] transcripts to format
  # @return [String] formatted list or "None"
  def format_language_list(transcripts)
    return "None" if transcripts.empty?

    bullet_lines = transcripts.map { |transcript| " - #{transcript}" }
    bullet_lines.join("\n")
  end

  # One " - <code> ("<name>")" line per translation language.
  #
  # @return [String] formatted list or "None"
  def format_translation_languages
    return "None" if @translation_languages.empty?

    bullet_lines = @translation_languages.map do |entry|
      " - #{entry.language_code} (\"#{entry.language}\")"
    end
    bullet_lines.join("\n")
  end
end
|
|
167
|
+
end
|
|
168
|
+
end
|