youtube-transcript-rb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +1 -0
- data/.serena/.gitignore +1 -0
- data/.serena/memories/code_style_and_conventions.md +35 -0
- data/.serena/memories/project_overview.md +40 -0
- data/.serena/memories/suggested_commands.md +50 -0
- data/.serena/memories/task_completion_checklist.md +25 -0
- data/.serena/memories/tech_stack.md +20 -0
- data/.serena/project.yml +84 -0
- data/LICENSE +21 -0
- data/PLAN.md +422 -0
- data/README.md +496 -0
- data/Rakefile +4 -0
- data/lib/youtube/transcript/rb/api.rb +150 -0
- data/lib/youtube/transcript/rb/errors.rb +217 -0
- data/lib/youtube/transcript/rb/formatters.rb +269 -0
- data/lib/youtube/transcript/rb/settings.rb +28 -0
- data/lib/youtube/transcript/rb/transcript.rb +239 -0
- data/lib/youtube/transcript/rb/transcript_list.rb +170 -0
- data/lib/youtube/transcript/rb/transcript_list_fetcher.rb +225 -0
- data/lib/youtube/transcript/rb/transcript_parser.rb +83 -0
- data/lib/youtube/transcript/rb/version.rb +9 -0
- data/lib/youtube/transcript/rb.rb +37 -0
- data/sig/youtube/transcript/rb.rbs +8 -0
- data/spec/api_spec.rb +397 -0
- data/spec/errors_spec.rb +240 -0
- data/spec/formatters_spec.rb +436 -0
- data/spec/integration_spec.rb +363 -0
- data/spec/settings_spec.rb +67 -0
- data/spec/spec_helper.rb +109 -0
- data/spec/transcript_list_fetcher_spec.rb +520 -0
- data/spec/transcript_list_spec.rb +380 -0
- data/spec/transcript_parser_spec.rb +355 -0
- data/spec/transcript_spec.rb +435 -0
- metadata +118 -0
data/spec/errors_spec.rb
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
require "youtube/transcript/rb"
|
|
5
|
+
|
|
6
|
+
RSpec.describe Youtube::Transcript::Rb do
|
|
7
|
+
# Checks the inheritance chain of the gem's error classes.
describe "Error hierarchy" do
  it "has Error as the base class" do
    base_error = Youtube::Transcript::Rb::Error
    expect(base_error).to be < StandardError
  end

  it "has CouldNotRetrieveTranscript inheriting from Error" do
    retrieval_error = Youtube::Transcript::Rb::CouldNotRetrieveTranscript
    expect(retrieval_error).to be < Youtube::Transcript::Rb::Error
  end

  describe "error classes inherit from CouldNotRetrieveTranscript" do
    # One example is generated per class name listed here.
    %i[
      YouTubeDataUnparsable
      YouTubeRequestFailed
      VideoUnplayable
      VideoUnavailable
      InvalidVideoId
      RequestBlocked
      IpBlocked
      TooManyRequests
      TranscriptsDisabled
      AgeRestricted
      NotTranslatable
      TranslationLanguageNotAvailable
      FailedToCreateConsentCookie
      NoTranscriptFound
      NoTranscriptAvailable
      PoTokenRequired
    ].each do |class_name|
      error_class = Youtube::Transcript::Rb.const_get(class_name)

      it "#{error_class} inherits from CouldNotRetrieveTranscript" do
        expect(error_class).to be < Youtube::Transcript::Rb::CouldNotRetrieveTranscript
      end
    end
  end
end
|
|
41
|
+
|
|
42
|
+
# Exercises the behaviour shared by all retrieval errors through a concrete subclass.
describe Youtube::Transcript::Rb::CouldNotRetrieveTranscript do
  let(:video_id) { "test_video_123" }

  # Per the original author's note, a subclass is used because
  # CouldNotRetrieveTranscript itself needs CAUSE_MESSAGE.
  let(:error) { Youtube::Transcript::Rb::VideoUnavailable.new(video_id) }

  it "stores the video_id" do
    expect(error.video_id).to eq(video_id)
  end

  it "includes video URL in error message" do
    watch_url = "https://www.youtube.com/watch?v=#{video_id}"
    expect(error.message).to include(watch_url)
  end

  it "includes cause message in error message" do
    expect(error.message).to include("The video is no longer available")
  end
end
|
|
61
|
+
|
|
62
|
+
# Pins the exact cause message of VideoUnavailable.
describe Youtube::Transcript::Rb::VideoUnavailable do
  subject(:error) { described_class.new("unavailable_video") }

  it "has the correct cause message" do
    expect(error.cause_message).to eq("The video is no longer available")
  end
end
|
|
70
|
+
|
|
71
|
+
# Pins the exact cause message of TranscriptsDisabled.
describe Youtube::Transcript::Rb::TranscriptsDisabled do
  subject(:error) { described_class.new("disabled_video") }

  it "has the correct cause message" do
    expect(error.cause_message).to eq("Subtitles are disabled for this video")
  end
end
|
|
79
|
+
|
|
80
|
+
# Checks that the TooManyRequests cause message mentions rate limiting.
describe Youtube::Transcript::Rb::TooManyRequests do
  subject(:error) { described_class.new("rate_limited") }

  it "has the correct cause message" do
    expect(error.cause_message).to include("rate limiting")
  end
end
|
|
88
|
+
|
|
89
|
+
# Checks that the PoTokenRequired cause message mentions the PO Token.
describe Youtube::Transcript::Rb::PoTokenRequired do
  subject(:error) { described_class.new("po_token_video") }

  it "has the correct cause message" do
    expect(error.cause_message).to include("PO Token")
  end
end
|
|
97
|
+
|
|
98
|
+
# Checks the guidance text produced when a full URL is passed as the video id.
describe Youtube::Transcript::Rb::InvalidVideoId do
  subject(:error) { described_class.new("https://www.youtube.com/watch?v=1234") }

  it "includes usage instructions in cause message" do
    expect(error.cause_message).to include("Do NOT run").and include("Instead run")
  end
end
|
|
107
|
+
|
|
108
|
+
# Checks that the wrapped HTTP error's message is exposed as the reason.
describe Youtube::Transcript::Rb::YouTubeRequestFailed do
  subject(:error) { described_class.new("failed_request", http_error) }

  let(:http_error) { StandardError.new("Connection refused") }

  it "stores the reason" do
    expect(error.reason).to eq("Connection refused")
  end

  it "includes the reason in cause message" do
    expect(error.cause_message).to include("Connection refused")
  end
end
|
|
121
|
+
|
|
122
|
+
# Covers VideoUnplayable with a reason, without one, and with sub-reasons.
describe Youtube::Transcript::Rb::VideoUnplayable do
  let(:video_id) { "unplayable_video" }

  context "with reason only" do
    subject(:error) { described_class.new(video_id, "Video is private") }

    it "stores the reason" do
      expect(error.reason).to eq("Video is private")
    end

    it "includes reason in cause message" do
      expect(error.cause_message).to include("Video is private")
    end
  end

  context "with no reason" do
    subject(:error) { described_class.new(video_id) }

    it "uses default reason text" do
      expect(error.cause_message).to include("No reason specified!")
    end
  end

  context "with sub_reasons" do
    subject(:error) do
      described_class.new(video_id, "Video is restricted", ["Region blocked", "Age restricted"])
    end

    it "stores sub_reasons" do
      expect(error.sub_reasons).to eq(["Region blocked", "Age restricted"])
    end

    it "includes sub_reasons in cause message" do
      expect(error.cause_message).to include("Region blocked", "Age restricted", "Additional Details")
    end
  end
end
|
|
159
|
+
|
|
160
|
+
# Checks that NoTranscriptFound keeps the requested languages and the transcript
# listing, and that both show up in the cause message.
describe Youtube::Transcript::Rb::NoTranscriptFound do
  subject(:error) { described_class.new("no_transcript", requested_languages, transcript_data) }

  let(:requested_languages) { ["ko", "ja"] }
  # Stand-in for a transcript list; only its #to_s is stubbed.
  let(:transcript_data) { double("TranscriptList", to_s: "Available: en, es") }

  it "stores requested_language_codes" do
    expect(error.requested_language_codes).to eq(["ko", "ja"])
  end

  it "stores transcript_data" do
    expect(error.transcript_data).to eq(transcript_data)
  end

  it "includes requested languages in cause message" do
    expect(error.cause_message).to include("ko", "ja")
  end

  it "includes transcript data in cause message" do
    expect(error.cause_message).to include("Available: en, es")
  end
end
|
|
183
|
+
|
|
184
|
+
# Checks the key phrases of the RequestBlocked cause message.
describe Youtube::Transcript::Rb::RequestBlocked do
  subject(:error) { described_class.new("blocked_video") }

  it "mentions IP blocking" do
    message = error.cause_message
    expect(message).to include("YouTube is blocking requests from your IP")
  end

  it "mentions cloud providers" do
    message = error.cause_message
    expect(message).to include("cloud provider")
  end
end
|
|
196
|
+
|
|
197
|
+
# Checks that IpBlocked specialises RequestBlocked and that its message names a workaround.
describe Youtube::Transcript::Rb::IpBlocked do
  subject(:error) { described_class.new("ip_blocked") }

  it "inherits from RequestBlocked" do
    parent = Youtube::Transcript::Rb::RequestBlocked
    expect(described_class).to be < parent
  end

  it "mentions IP or proxies as workaround" do
    mentions_ip = include("IP")
    mentions_proxy = include("proxy")
    expect(error.cause_message).to mentions_ip.or(mentions_proxy)
  end
end
|
|
209
|
+
|
|
210
|
+
# Checks the key phrases of the AgeRestricted cause message.
describe Youtube::Transcript::Rb::AgeRestricted do
  subject(:error) { described_class.new("age_restricted") }

  it "mentions age restriction" do
    message = error.cause_message
    expect(message).to include("age-restricted")
  end

  it "mentions authentication limitation" do
    message = error.cause_message
    expect(message).to include("Cookie Authentication is temporarily unsupported")
  end
end
|
|
222
|
+
|
|
223
|
+
# Checks that the NotTranslatable cause message says the transcript is not translatable.
describe Youtube::Transcript::Rb::NotTranslatable do
  subject(:error) { described_class.new("not_translatable") }

  it "has the correct cause message" do
    expect(error.cause_message).to include("not translatable")
  end
end
|
|
231
|
+
|
|
232
|
+
# Checks the TranslationLanguageNotAvailable cause message.
describe Youtube::Transcript::Rb::TranslationLanguageNotAvailable do
  subject(:error) { described_class.new("translation_unavailable") }

  it "has the correct cause message" do
    expect(error.cause_message).to include("translation language is not available")
  end
end
|
|
240
|
+
end
|
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
|
|
5
|
+
RSpec.describe Youtube::Transcript::Rb::Formatters do
|
|
6
|
+
# Builds a FetchedTranscript for the examples in this spec.
#
# When no snippets are given, three default snippets are used
# ("Hello world", "This is a test", "Thank you"). An explicitly passed
# array, including an empty one, is used as-is.
def create_transcript(video_id: "test123", language: "English", language_code: "en", is_generated: false, snippets: nil)
  default_rows = [
    ["Hello world", 0.0, 2.5],
    ["This is a test", 2.5, 3.0],
    ["Thank you", 5.5, 2.0]
  ]

  snippets ||= default_rows.map do |text, start, duration|
    Youtube::Transcript::Rb::TranscriptSnippet.new(text: text, start: start, duration: duration)
  end

  attributes = {
    video_id: video_id,
    language: language,
    language_code: language_code,
    is_generated: is_generated,
    snippets: snippets
  }
  Youtube::Transcript::Rb::FetchedTranscript.new(**attributes)
end

# Shared fixtures: a default English transcript, a Spanish one, and both together.
let(:transcript) { create_transcript }

let(:transcript2) do
  create_transcript(video_id: "video2", language_code: "es", language: "Spanish")
end

let(:transcripts) { [transcript, transcript2] }
|
|
26
|
+
|
|
27
|
+
# The base Formatter is expected to raise NotImplementedError from both formatting methods.
describe Youtube::Transcript::Rb::Formatters::Formatter do
  subject(:formatter) { described_class.new }

  describe "#format_transcript" do
    it "raises NotImplementedError" do
      expect do
        formatter.format_transcript(transcript)
      end.to raise_error(NotImplementedError)
    end
  end

  describe "#format_transcripts" do
    it "raises NotImplementedError" do
      expect do
        formatter.format_transcripts(transcripts)
      end.to raise_error(NotImplementedError)
    end
  end
end
|
|
42
|
+
|
|
43
|
+
# Checks that JSONFormatter output parses as JSON and carries every snippet.
describe Youtube::Transcript::Rb::Formatters::JSONFormatter do
  subject(:formatter) { described_class.new }

  describe "#format_transcript" do
    let(:json) { formatter.format_transcript(transcript) }

    it "returns valid JSON" do
      expect { JSON.parse(json) }.not_to raise_error
    end

    it "contains all snippets" do
      expect(JSON.parse(json).length).to eq(3)
    end

    it "includes text, start, and duration for each snippet" do
      first_snippet = JSON.parse(json).first

      expect(first_snippet["text"]).to eq("Hello world")
      expect(first_snippet["start"]).to eq(0.0)
      expect(first_snippet["duration"]).to eq(2.5)
    end

    it "supports JSON options" do
      # JSON.generate with indent requires array_nl and object_nl for newlines
      options = { indent: " ", array_nl: "\n", object_nl: "\n" }
      expect(formatter.format_transcript(transcript, **options)).to include("\n")
    end
  end

  describe "#format_transcripts" do
    let(:parsed) { JSON.parse(formatter.format_transcripts(transcripts)) }

    it "returns valid JSON array" do
      expect(parsed).to be_an(Array)
      expect(parsed.length).to eq(2)
    end

    it "contains all transcripts" do
      first_transcript, second_transcript = parsed
      expect(first_transcript.length).to eq(3)
      expect(second_transcript.length).to eq(3)
    end
  end
end
|
|
90
|
+
|
|
91
|
+
# Checks that TextFormatter emits snippet text only, one snippet per line.
describe Youtube::Transcript::Rb::Formatters::TextFormatter do
  subject(:formatter) { described_class.new }

  describe "#format_transcript" do
    let(:text) { formatter.format_transcript(transcript) }

    it "returns plain text with newlines" do
      expect(text).to eq("Hello world\nThis is a test\nThank you")
    end

    it "contains only text, no timestamps" do
      expect(text).not_to include("0.0")
      expect(text).not_to include("-->")
    end
  end

  describe "#format_transcripts" do
    let(:text) { formatter.format_transcripts(transcripts) }

    it "separates transcripts with triple newlines" do
      expect(text).to include("\n\n\n")
    end

    it "contains all transcript texts" do
      expect(text).to include("Hello world", "Thank you")
    end
  end
end
|
|
120
|
+
|
|
121
|
+
# Checks that PrettyPrintFormatter returns a string containing the snippet data.
describe Youtube::Transcript::Rb::Formatters::PrettyPrintFormatter do
  subject(:formatter) { described_class.new }

  describe "#format_transcript" do
    it "returns a string" do
      expect(formatter.format_transcript(transcript)).to be_a(String)
    end

    it "contains transcript data" do
      output = formatter.format_transcript(transcript)
      expect(output).to include("Hello world", "text", "start", "duration")
    end

    it "is formatted with indentation" do
      output = formatter.format_transcript(transcript)
      # PP output typically has newlines for arrays
      # NOTE(review): this expectation only runs when transcript.length > 1;
      # presumably always true for the default three-snippet fixture — confirm.
      if transcript.length > 1
        expect(output).to include("\n")
      end
    end

    it "accepts width option" do
      expect(formatter.format_transcript(transcript, width: 40)).to be_a(String)
    end
  end

  describe "#format_transcripts" do
    it "returns a string containing all transcripts" do
      output = formatter.format_transcripts(transcripts)
      expect(output).to be_a(String)
      expect(output).to include("Hello world")
    end
  end
end
|
|
158
|
+
|
|
159
|
+
# Checks the SRT output: numbered entries, comma millisecond separator,
# blank-line separation and timestamp edge cases.
describe Youtube::Transcript::Rb::Formatters::SRTFormatter do
  subject(:formatter) { described_class.new }

  describe "#format_transcript" do
    let(:result) { formatter.format_transcript(transcript) }

    it "includes sequence numbers starting from 1" do
      expect(result).to include("1\n", "2\n", "3\n")
    end

    it "uses comma as millisecond separator" do
      dotted_timestamp = /\d{2}:\d{2}:\d{2}\.\d{3}/
      expect(result).to include(",")
      expect(result).not_to match(dotted_timestamp)
    end

    it "formats timestamps correctly" do
      expect(result).to include(
        "00:00:00,000 --> 00:00:02,500",
        "00:00:02,500 --> 00:00:05,500"
      )
    end

    it "includes the text content" do
      expect(result).to include("Hello world", "This is a test", "Thank you")
    end

    it "separates entries with blank lines" do
      expect(result).to include("\n\n")
    end

    it "ends with a newline" do
      expect(result).to end_with("\n")
    end

    it "follows SRT format structure" do
      # First entry: sequence number, timing line, then the text.
      number, timing, text = result.split("\n\n").first.split("\n")

      expect(number).to eq("1")
      expect(timing).to match(/\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}/)
      expect(text).to eq("Hello world")
    end
  end

  describe "timestamp edge cases" do
    it "handles hours correctly" do
      long_video = create_transcript(
        snippets: [
          Youtube::Transcript::Rb::TranscriptSnippet.new(text: "Long video", start: 3661.5, duration: 2.0)
        ]
      )

      expect(formatter.format_transcript(long_video)).to include("01:01:01,500")
    end

    it "handles overlapping timestamps" do
      overlapping = create_transcript(
        snippets: [
          Youtube::Transcript::Rb::TranscriptSnippet.new(text: "First", start: 0.0, duration: 5.0),
          Youtube::Transcript::Rb::TranscriptSnippet.new(text: "Second", start: 2.0, duration: 3.0)
        ]
      )

      # First snippet should end at second snippet's start
      expect(formatter.format_transcript(overlapping)).to include("00:00:00,000 --> 00:00:02,000")
    end
  end
end
|
|
229
|
+
|
|
230
|
+
# Checks the WebVTT output: header, period millisecond separator,
# absence of sequence numbers, and hour-long timestamps.
describe Youtube::Transcript::Rb::Formatters::WebVTTFormatter do
  let(:formatter) { described_class.new }

  describe "#format_transcript" do
    let(:result) { formatter.format_transcript(transcript) }

    it "starts with WEBVTT header" do
      expect(result).to start_with("WEBVTT\n\n")
    end

    it "uses period as millisecond separator" do
      expect(result).to match(/\d{2}:\d{2}:\d{2}\.\d{3}/)
      expect(result).not_to match(/\d{2}:\d{2}:\d{2},\d{3}/)
    end

    it "formats timestamps correctly" do
      expect(result).to include("00:00:00.000 --> 00:00:02.500")
      expect(result).to include("00:00:02.500 --> 00:00:05.500")
    end

    it "does not include sequence numbers" do
      lines = result.split("\n")

      # Pair every line with its predecessor so each cue's timing line is
      # checked against the line actually before it. A text search for the
      # line would find only its first occurrence and would wrap to the last
      # line for index 0.
      lines_before_cues = lines.each_cons(2)
                               .select { |_previous, current| current.include?("-->") }
                               .map(&:first)

      # Guard against a vacuous pass when no cue timing lines are found.
      expect(lines_before_cues).not_to be_empty
      # The line before a timing line must not be a bare number.
      expect(lines_before_cues.grep(/\A\d+\z/)).to be_empty
    end

    it "includes the text content" do
      expect(result).to include("Hello world")
      expect(result).to include("This is a test")
      expect(result).to include("Thank you")
    end

    it "ends with a newline" do
      expect(result).to end_with("\n")
    end
  end

  describe "timestamp edge cases" do
    it "handles hours correctly" do
      snippets = [
        Youtube::Transcript::Rb::TranscriptSnippet.new(text: "Long video", start: 3661.5, duration: 2.0)
      ]
      transcript = create_transcript(snippets: snippets)
      result = formatter.format_transcript(transcript)

      expect(result).to include("01:01:01.500")
    end
  end
end
|
|
284
|
+
|
|
285
|
+
# Checks that FormatterLoader maps each type name to its formatter class,
# defaults to the pretty printer, and rejects unknown types.
describe Youtube::Transcript::Rb::Formatters::FormatterLoader do
  let(:loader) { described_class.new }

  describe "#load" do
    # One example per supported type name and the formatter class it should yield.
    {
      "json" => :JSONFormatter,
      "text" => :TextFormatter,
      "pretty" => :PrettyPrintFormatter,
      "srt" => :SRTFormatter,
      "webvtt" => :WebVTTFormatter
    }.each do |type, class_name|
      it "loads #{class_name} for '#{type}'" do
        expected_class = Youtube::Transcript::Rb::Formatters.const_get(class_name)
        expect(loader.load(type)).to be_a(expected_class)
      end
    end

    it "defaults to PrettyPrintFormatter" do
      formatter = loader.load
      expect(formatter).to be_a(Youtube::Transcript::Rb::Formatters::PrettyPrintFormatter)
    end

    it "accepts symbol as formatter type" do
      formatter = loader.load(:json)
      expect(formatter).to be_a(Youtube::Transcript::Rb::Formatters::JSONFormatter)
    end

    it "raises UnknownFormatterType for invalid type" do
      expect { loader.load("invalid") }.to raise_error(
        Youtube::Transcript::Rb::Formatters::FormatterLoader::UnknownFormatterType
      )
    end

    it "includes available formats in error message" do
      # raise_error with a block fails when nothing is raised. A bare
      # begin/rescue would pass without running any expectation.
      expect { loader.load("invalid") }.to raise_error(
        Youtube::Transcript::Rb::Formatters::FormatterLoader::UnknownFormatterType
      ) { |error|
        expect(error.message).to include("json")
        expect(error.message).to include("text")
        expect(error.message).to include("srt")
        expect(error.message).to include("webvtt")
        expect(error.message).to include("pretty")
      }
    end
  end

  describe "TYPES constant" do
    it "contains all expected formatter types" do
      expect(described_class::TYPES.keys).to contain_exactly("json", "pretty", "text", "webvtt", "srt")
    end

    it "is frozen" do
      expect(described_class::TYPES).to be_frozen
    end
  end
end
|
|
353
|
+
|
|
354
|
+
# Loads each formatter by name and checks that it produces non-empty string output.
describe "integration tests" do
  let(:loader) { Youtube::Transcript::Rb::Formatters::FormatterLoader.new }

  it "can format transcript with each formatter type" do
    %w[json text pretty srt webvtt].each do |type|
      output = loader.load(type).format_transcript(transcript)

      expect(output).to be_a(String)
      expect(output.length).to be > 0
    end
  end

  it "can format multiple transcripts with each formatter type" do
    # NOTE(review): only json/text/pretty are exercised here; whether srt and
    # webvtt support format_transcripts is not visible from this spec — confirm.
    %w[json text pretty].each do |type|
      output = loader.load(type).format_transcripts(transcripts)

      expect(output).to be_a(String)
      expect(output.length).to be > 0
    end
  end
end
|
|
375
|
+
|
|
376
|
+
# Pins the exact output of each formatter for a transcript with no snippets.
describe "empty transcript handling" do
  let(:empty_transcript) { create_transcript(snippets: []) }

  # Formats the empty transcript with a fresh instance of the given formatter class.
  def format_empty(formatter_class)
    formatter_class.new.format_transcript(empty_transcript)
  end

  it "JSONFormatter handles empty transcript" do
    output = format_empty(Youtube::Transcript::Rb::Formatters::JSONFormatter)
    expect(JSON.parse(output)).to eq([])
  end

  it "TextFormatter handles empty transcript" do
    output = format_empty(Youtube::Transcript::Rb::Formatters::TextFormatter)
    expect(output).to eq("")
  end

  it "SRTFormatter handles empty transcript" do
    output = format_empty(Youtube::Transcript::Rb::Formatters::SRTFormatter)
    expect(output).to eq("\n")
  end

  it "WebVTTFormatter handles empty transcript" do
    output = format_empty(Youtube::Transcript::Rb::Formatters::WebVTTFormatter)
    expect(output).to eq("WEBVTT\n\n\n")
  end
end
|
|
404
|
+
|
|
405
|
+
# Checks that HTML tags, double quotes and embedded newlines in snippet text
# survive formatting.
describe "special character handling" do
  let(:special_transcript) do
    rows = [
      ["Hello <b>world</b>", 0.0],
      ['Quote: "test"', 2.0],
      ["Line1\nLine2", 4.0]
    ]
    special_snippets = rows.map do |text, start|
      Youtube::Transcript::Rb::TranscriptSnippet.new(text: text, start: start, duration: 2.0)
    end
    create_transcript(snippets: special_snippets)
  end

  # Formats the special-character transcript with a fresh formatter instance.
  def format_special(formatter_class)
    formatter_class.new.format_transcript(special_transcript)
  end

  it "JSONFormatter escapes special characters" do
    parsed = JSON.parse(format_special(Youtube::Transcript::Rb::Formatters::JSONFormatter))

    expect(parsed[0]["text"]).to eq("Hello <b>world</b>")
    expect(parsed[1]["text"]).to eq('Quote: "test"')
  end

  it "TextFormatter preserves special characters" do
    output = format_special(Youtube::Transcript::Rb::Formatters::TextFormatter)
    expect(output).to include("<b>world</b>", '"test"')
  end

  it "SRTFormatter preserves HTML tags in text" do
    output = format_special(Youtube::Transcript::Rb::Formatters::SRTFormatter)
    expect(output).to include("<b>world</b>")
  end
end
|
|
436
|
+
end
|