@livekit/agents 1.0.40 → 1.0.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs +20 -18
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +20 -18
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +5 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -1
- package/dist/inference/stt.cjs +2 -1
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +2 -1
- package/dist/inference/stt.js.map +1 -1
- package/dist/llm/realtime.cjs.map +1 -1
- package/dist/llm/realtime.d.cts +5 -1
- package/dist/llm/realtime.d.ts +5 -1
- package/dist/llm/realtime.d.ts.map +1 -1
- package/dist/llm/realtime.js.map +1 -1
- package/dist/tts/stream_adapter.cjs +15 -1
- package/dist/tts/stream_adapter.cjs.map +1 -1
- package/dist/tts/stream_adapter.d.ts.map +1 -1
- package/dist/tts/stream_adapter.js +15 -1
- package/dist/tts/stream_adapter.js.map +1 -1
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.d.cts +9 -1
- package/dist/tts/tts.d.ts +9 -1
- package/dist/tts/tts.d.ts.map +1 -1
- package/dist/tts/tts.js.map +1 -1
- package/dist/types.cjs +3 -0
- package/dist/types.cjs.map +1 -1
- package/dist/types.d.cts +4 -0
- package/dist/types.d.ts +4 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -1
- package/dist/voice/agent.cjs +11 -1
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +7 -3
- package/dist/voice/agent.d.ts +7 -3
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +11 -1
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +30 -14
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +1 -0
- package/dist/voice/agent_activity.d.ts +1 -0
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +30 -14
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +5 -1
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +2 -0
- package/dist/voice/agent_session.d.ts +2 -0
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +5 -1
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/background_audio.cjs +2 -1
- package/dist/voice/background_audio.cjs.map +1 -1
- package/dist/voice/background_audio.d.cts +4 -2
- package/dist/voice/background_audio.d.ts +4 -2
- package/dist/voice/background_audio.d.ts.map +1 -1
- package/dist/voice/background_audio.js +2 -1
- package/dist/voice/background_audio.js.map +1 -1
- package/dist/voice/generation.cjs +58 -5
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +17 -3
- package/dist/voice/generation.d.ts +17 -3
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +63 -6
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/index.cjs.map +1 -1
- package/dist/voice/index.d.cts +1 -1
- package/dist/voice/index.d.ts +1 -1
- package/dist/voice/index.d.ts.map +1 -1
- package/dist/voice/index.js.map +1 -1
- package/dist/voice/io.cjs +22 -2
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +21 -5
- package/dist/voice/io.d.ts +21 -5
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +18 -1
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +3 -2
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +3 -3
- package/dist/voice/room_io/_output.d.ts +3 -3
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +4 -3
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/transcription/synchronizer.cjs +137 -13
- package/dist/voice/transcription/synchronizer.cjs.map +1 -1
- package/dist/voice/transcription/synchronizer.d.cts +34 -4
- package/dist/voice/transcription/synchronizer.d.ts +34 -4
- package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
- package/dist/voice/transcription/synchronizer.js +141 -14
- package/dist/voice/transcription/synchronizer.js.map +1 -1
- package/dist/voice/transcription/synchronizer.test.cjs +151 -0
- package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
- package/dist/voice/transcription/synchronizer.test.js +150 -0
- package/dist/voice/transcription/synchronizer.test.js.map +1 -0
- package/package.json +1 -1
- package/src/cli.ts +20 -18
- package/src/index.ts +1 -0
- package/src/inference/stt.ts +9 -8
- package/src/llm/realtime.ts +5 -1
- package/src/tts/stream_adapter.ts +23 -1
- package/src/tts/tts.ts +10 -1
- package/src/types.ts +5 -0
- package/src/voice/agent.ts +19 -4
- package/src/voice/agent_activity.ts +38 -13
- package/src/voice/agent_session.ts +6 -0
- package/src/voice/background_audio.ts +6 -3
- package/src/voice/generation.ts +115 -10
- package/src/voice/index.ts +1 -1
- package/src/voice/io.ts +40 -5
- package/src/voice/room_io/_output.ts +6 -5
- package/src/voice/transcription/synchronizer.test.ts +206 -0
- package/src/voice/transcription/synchronizer.ts +202 -17
package/src/voice/transcription/synchronizer.test.ts
@@ -0,0 +1,206 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { describe, expect, it } from 'vitest';
+import { SpeakingRateData } from './synchronizer.js';
+
+describe('SpeakingRateData', () => {
+  describe('constructor', () => {
+    it('should initialize with empty arrays', () => {
+      const data = new SpeakingRateData();
+      expect(data.timestamps).toEqual([]);
+      expect(data.speakingRate).toEqual([]);
+      expect(data.speakIntegrals).toEqual([]);
+      expect(data.pushedDuration).toBe(0);
+    });
+  });
+
+  describe('addByRate', () => {
+    it('should add a single rate entry', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 5.0);
+
+      expect(data.timestamps).toEqual([1.0]);
+      expect(data.speakingRate).toEqual([5.0]);
+      // integral = 0 + 5.0 * (1.0 - 0) = 5.0
+      expect(data.speakIntegrals).toEqual([5.0]);
+      expect(data.pushedDuration).toBe(1.0);
+    });
+
+    it('should accumulate integrals across multiple entries', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 4.0); // integral = 0 + 4.0 * 1.0 = 4.0
+      data.addByRate(2.0, 6.0); // integral = 4.0 + 6.0 * 1.0 = 10.0
+      data.addByRate(3.5, 2.0); // integral = 10.0 + 2.0 * 1.5 = 13.0
+
+      expect(data.timestamps).toEqual([1.0, 2.0, 3.5]);
+      expect(data.speakingRate).toEqual([4.0, 6.0, 2.0]);
+      expect(data.speakIntegrals).toEqual([4.0, 10.0, 13.0]);
+      expect(data.pushedDuration).toBe(3.5);
+    });
+
+    it('should handle zero rate', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 0.0);
+
+      expect(data.timestamps).toEqual([1.0]);
+      expect(data.speakingRate).toEqual([0.0]);
+      expect(data.speakIntegrals).toEqual([0.0]);
+    });
+  });
+
+  describe('addByAnnotation', () => {
+    it('should buffer text without startTime', () => {
+      const data = new SpeakingRateData();
+      data.addByAnnotation('hello', undefined, undefined);
+
+      // Text is buffered, no timestamp entry yet
+      expect(data.timestamps).toEqual([]);
+      expect(data.pushedDuration).toBe(0);
+    });
+
+    it('should add entry when startTime is provided', () => {
+      const data = new SpeakingRateData();
+      data.addByAnnotation('hello', undefined, undefined); // buffer "hello"
+      data.addByAnnotation('world', 1.0, undefined); // flush with startTime
+
+      expect(data.timestamps).toEqual([1.0]);
+      // textLen = 5 (hello), dt = 1.0, rate = 5/1 = 5.0
+      expect(data.speakingRate).toEqual([5.0]);
+      expect(data.speakIntegrals).toEqual([5.0]);
+    });
+
+    it('should handle startTime and endTime together', () => {
+      const data = new SpeakingRateData();
+      data.addByAnnotation('hello ', 0.0, 0.5);
+      data.addByAnnotation('world', 0.5, 1.0);
+
+      // First annotation: startTime=0.0, text="hello ", then recursively calls with endTime=0.5
+      // Second annotation: startTime=0.5, text="world", then recursively calls with endTime=1.0
+      expect(data.timestamps.length).toBeGreaterThanOrEqual(2);
+      expect(data.pushedDuration).toBe(1.0);
+    });
+
+    it('should calculate rate based on buffered text length', () => {
+      const data = new SpeakingRateData();
+      data.addByAnnotation('ab', undefined, undefined); // buffer 2 chars
+      data.addByAnnotation('cde', undefined, undefined); // buffer 3 more chars
+      data.addByAnnotation('', 2.0, undefined); // flush: textLen=5, dt=2.0, rate=2.5
+
+      expect(data.timestamps).toEqual([2.0]);
+      expect(data.speakingRate).toEqual([2.5]);
+      expect(data.speakIntegrals).toEqual([5.0]);
+    });
+
+    it('should handle zero time delta gracefully', () => {
+      const data = new SpeakingRateData();
+      data.addByAnnotation('hello', 0.0, undefined); // dt=0, rate should be 0
+
+      expect(data.timestamps).toEqual([0.0]);
+      expect(data.speakingRate).toEqual([0.0]);
+      expect(data.speakIntegrals).toEqual([0.0]);
+    });
+  });
+
+  describe('accumulateTo', () => {
+    it('should return 0 for empty data', () => {
+      const data = new SpeakingRateData();
+      expect(data.accumulateTo(1.0)).toBe(0);
+    });
+
+    it('should return 0 for timestamp before first entry', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 5.0);
+      expect(data.accumulateTo(0.5)).toBe(0);
+    });
+
+    it('should return exact integral at timestamp', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 4.0); // integral = 4.0
+      data.addByRate(2.0, 6.0); // integral = 10.0
+
+      expect(data.accumulateTo(1.0)).toBe(4.0);
+      expect(data.accumulateTo(2.0)).toBe(10.0);
+    });
+
+    it('should interpolate between timestamps', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 4.0); // integral = 4.0
+      data.addByRate(2.0, 6.0); // integral = 10.0
+
+      // At 1.5: integral = 4.0 + 6.0 * 0.5 = 7.0
+      expect(data.accumulateTo(1.5)).toBe(7.0);
+    });
+
+    it('should extrapolate beyond last timestamp', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 4.0); // integral = 4.0
+      data.addByRate(2.0, 6.0); // integral = 10.0
+
+      // At 3.0: integral = 10.0 + 6.0 * 1.0 = 16.0
+      expect(data.accumulateTo(3.0)).toBe(16.0);
+    });
+
+    it('should not exceed next integral when interpolating', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 100.0); // integral = 100.0 (very high rate)
+      data.addByRate(2.0, 1.0); // integral = 101.0
+
+      // At 1.5 with rate 1.0: would be 100.0 + 1.0 * 0.5 = 100.5
+      // But capped at next integral 101.0, so result is min(100.5, 101.0) = 100.5
+      expect(data.accumulateTo(1.5)).toBe(100.5);
+    });
+  });
+
+  describe('pushedDuration', () => {
+    it('should return 0 when empty', () => {
+      const data = new SpeakingRateData();
+      expect(data.pushedDuration).toBe(0);
+    });
+
+    it('should return last timestamp', () => {
+      const data = new SpeakingRateData();
+      data.addByRate(1.0, 5.0);
+      data.addByRate(2.5, 3.0);
+      data.addByRate(4.0, 7.0);
+
+      expect(data.pushedDuration).toBe(4.0);
+    });
+  });
+
+  describe('integration scenarios', () => {
+    it('should handle typical TTS word timing scenario', () => {
+      const data = new SpeakingRateData();
+
+      // Simulating words with timing: "Hello " at 0-0.3s, "world" at 0.3-0.6s
+      data.addByAnnotation('Hello ', 0.0, 0.3);
+      data.addByAnnotation('world', 0.3, 0.6);
+
+      // Should have accumulated text lengths at each timestamp
+      expect(data.pushedDuration).toBe(0.6);
+
+      // At 0.15s (middle of first word), should be partway through
+      const mid1 = data.accumulateTo(0.15);
+      expect(mid1).toBeGreaterThan(0);
+      expect(mid1).toBeLessThan(6); // "Hello " is 6 chars
+
+      // At 0.45s (middle of second word), should be past first word
+      const mid2 = data.accumulateTo(0.45);
+      expect(mid2).toBeGreaterThan(6);
+    });
+
+    it('should handle mixed rate and annotation data', () => {
+      const data = new SpeakingRateData();
+
+      // Start with rate-based data
+      data.addByRate(0.5, 4.0); // integral = 2.0
+
+      // Then add annotation
+      data.addByAnnotation('test', undefined, undefined);
+      data.addByAnnotation('', 1.0, undefined); // textLen=4, dt=0.5, rate=8.0, integral = 2.0 + 4.0 = 6.0
+
+      expect(data.timestamps).toEqual([0.5, 1.0]);
+      expect(data.speakIntegrals).toEqual([2.0, 6.0]);
+    });
+  });
+});
package/src/voice/transcription/synchronizer.ts
@@ -8,7 +8,13 @@ import { IdentityTransform } from '../../stream/identity_transform.js';
 import type { SentenceStream, SentenceTokenizer } from '../../tokenize/index.js';
 import { basic } from '../../tokenize/index.js';
 import { Future, Task, delay } from '../../utils.js';
-import {
+import {
+  AudioOutput,
+  type PlaybackFinishedEvent,
+  TextOutput,
+  type TimedString,
+  isTimedString,
+} from '../io.js';
 
 const STANDARD_SPEECH_RATE = 3.83; // hyphens (syllables) per second
 
@@ -27,9 +33,110 @@ interface TextData {
   forwardedText: string;
 }
 
+/**
+ * Tracks speaking rate data from TTS timing annotations.
+ * @internal Exported for testing purposes.
+ */
+export class SpeakingRateData {
+  /** Timestamps of the speaking rate. */
+  timestamps: number[] = [];
+  /** Speed at the timestamp. */
+  speakingRate: number[] = [];
+  /** Accumulated speaking units up to the timestamp. */
+  speakIntegrals: number[] = [];
+  /** Buffer for text without timing annotations yet. */
+  private textBuffer: string[] = [];
+
+  /**
+   * Add by speaking rate estimation.
+   */
+  addByRate(timestamp: number, speakingRate: number): void {
+    const integral =
+      this.speakIntegrals.length > 0 ? this.speakIntegrals[this.speakIntegrals.length - 1]! : 0;
+    const dt = timestamp - this.pushedDuration;
+    const newIntegral = integral + speakingRate * dt;
+
+    this.timestamps.push(timestamp);
+    this.speakingRate.push(speakingRate);
+    this.speakIntegrals.push(newIntegral);
+  }
+
+  /**
+   * Add annotation from TimedString with start_time/end_time.
+   */
+  addByAnnotation(text: string, startTime: number | undefined, endTime: number | undefined): void {
+    if (startTime !== undefined) {
+      // Calculate the integral of the speaking rate up to the start time
+      const integral =
+        this.speakIntegrals.length > 0 ? this.speakIntegrals[this.speakIntegrals.length - 1]! : 0;
+
+      const dt = startTime - this.pushedDuration;
+      // Use the length of the text directly instead of hyphens
+      const textLen = this.textBuffer.reduce((sum, t) => sum + t.length, 0);
+      const newIntegral = integral + textLen;
+      const rate = dt > 0 ? textLen / dt : 0;
+
+      this.timestamps.push(startTime);
+      this.speakingRate.push(rate);
+      this.speakIntegrals.push(newIntegral);
+      this.textBuffer = [];
+    }
+
+    this.textBuffer.push(text);
+
+    if (endTime !== undefined) {
+      this.addByAnnotation('', endTime, undefined);
+    }
+  }
+
+  /**
+   * Get accumulated speaking units up to the given timestamp.
+   */
+  accumulateTo(timestamp: number): number {
+    if (this.timestamps.length === 0) {
+      return 0;
+    }
+
+    // Binary search for the right position (equivalent to np.searchsorted with side="right")
+    let idx = 0;
+    for (let i = 0; i < this.timestamps.length; i++) {
+      if (this.timestamps[i]! <= timestamp) {
+        idx = i + 1;
+      } else {
+        break;
+      }
+    }
+
+    if (idx === 0) {
+      return 0;
+    }
+
+    let integralT = this.speakIntegrals[idx - 1]!;
+
+    // Fill the tail assuming the speaking rate is constant
+    const dt = timestamp - this.timestamps[idx - 1]!;
+    const rate =
+      idx < this.speakingRate.length ? this.speakingRate[idx]! : this.speakingRate[idx - 1]!;
+    integralT += rate * dt;
+
+    // If there is a next timestamp, make sure the integral does not exceed the next
+    if (idx < this.timestamps.length) {
+      integralT = Math.min(integralT, this.speakIntegrals[idx]!);
+    }
+
+    return integralT;
+  }
+
+  /** Get the last pushed timestamp. */
+  get pushedDuration(): number {
+    return this.timestamps.length > 0 ? this.timestamps[this.timestamps.length - 1]! : 0;
+  }
+}
+
 interface AudioData {
   pushedDuration: number;
   done: boolean;
+  annotatedRate: SpeakingRateData | null;
 }
 
 class SegmentSynchronizerImpl {
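For orientation, `SpeakingRateData` is a piecewise-constant-rate integrator: each `addByRate`/`addByAnnotation` call records a timestamp, the rate observed since the previous timestamp, and the running integral of "speaking units" (characters or hyphens), and `accumulateTo` interpolates or extrapolates that integral at an arbitrary playback time. The sketch below mirrors only the `accumulateTo` interpolation so it can be sanity-checked against the numbers used in the new unit tests; `RatePoint` and the standalone `accumulateTo` helper are hypothetical names used for illustration, not part of the package (the real class is marked `@internal`).

// Illustrative only: a standalone mirror of SpeakingRateData.accumulateTo's
// interpolation logic, checked against the same numbers as the unit tests.
interface RatePoint {
  timestamp: number; // seconds since segment start
  rate: number; // speaking units per second since the previous point
  integral: number; // accumulated speaking units up to `timestamp`
}

function accumulateTo(points: RatePoint[], timestamp: number): number {
  // Index of the first point strictly after `timestamp` (searchsorted, side="right").
  let idx = 0;
  while (idx < points.length && points[idx]!.timestamp <= timestamp) idx++;
  if (idx === 0) return 0; // before the first data point

  const prev = points[idx - 1]!;
  // Extend the tail assuming the next known rate (or the last one) stays constant...
  const rate = idx < points.length ? points[idx]!.rate : prev.rate;
  let integral = prev.integral + rate * (timestamp - prev.timestamp);
  // ...but never overshoot the next recorded integral.
  if (idx < points.length) integral = Math.min(integral, points[idx]!.integral);
  return integral;
}

// Same data as the "interpolate between timestamps" test:
const points: RatePoint[] = [
  { timestamp: 1.0, rate: 4.0, integral: 4.0 },
  { timestamp: 2.0, rate: 6.0, integral: 10.0 },
];
console.log(accumulateTo(points, 1.5)); // 7  (4.0 + 6.0 * 0.5)
console.log(accumulateTo(points, 3.0)); // 16 (10.0 + 6.0 * 1.0, extrapolated)

The `Math.min` cap at the next recorded integral is the behavior exercised by the "should not exceed next integral when interpolating" test.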
@@ -62,6 +169,7 @@ class SegmentSynchronizerImpl {
     this.audioData = {
       pushedDuration: 0,
       done: false,
+      annotatedRate: null,
     };
     this.outputStream = new IdentityTransform();
     this.outputStreamWriter = this.outputStream.writable.getWriter();
@@ -88,6 +196,10 @@ class SegmentSynchronizerImpl {
     return this.textData.done;
   }
 
+  get hasPendingText(): boolean {
+    return this.textData.pushedText.length > this.textData.forwardedText.length;
+  }
+
   get readable(): ReadableStream<string> {
     return this.outputStream.readable;
   }
@@ -117,14 +229,36 @@ class SegmentSynchronizerImpl {
     this.audioData.done = true;
   }
 
-  pushText(text: string) {
+  pushText(text: string | TimedString) {
     if (this.closed) {
       this.logger.warn('SegmentSynchronizerImpl.pushText called after close');
       return;
    }
 
-
-
+    // Check if text is a TimedString (has timing information)
+    let textStr: string;
+    let startTime: number | undefined;
+    let endTime: number | undefined;
+
+    if (isTimedString(text)) {
+      // This is a TimedString
+      textStr = text.text;
+      startTime = text.startTime;
+      endTime = text.endTime;
+
+      // Create annotatedRate if it doesn't exist
+      if (!this.audioData.annotatedRate) {
+        this.audioData.annotatedRate = new SpeakingRateData();
+      }
+
+      // Add the timing annotation
+      this.audioData.annotatedRate.addByAnnotation(textStr, startTime, endTime);
+    } else {
+      textStr = text;
+    }
+
+    this.textData.sentenceStream.pushText(textStr);
+    this.textData.pushedText += textStr;
   }
 
   endTextInput() {
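The buffering in `addByAnnotation` is the subtle part of this path: a chunk's `startTime` closes out whatever text was buffered before it, the chunk's own text is then buffered, and an `endTime` immediately flushes it again through the recursive call with an empty string. Below is a worked trace, purely illustrative and derived from the code above, for the two timed chunks used in the integration test ("Hello " over 0-0.3 s, "world" over 0.3-0.6 s).

// Worked trace of addByAnnotation (illustrative; follows the implementation above).
//
// addByAnnotation('Hello ', 0.0, 0.3)
//   startTime=0.0 -> flush empty buffer: push t=0.0, rate=0, integral=0
//   buffer 'Hello ' (6 chars)
//   endTime=0.3   -> addByAnnotation('', 0.3, undefined):
//                    dt=0.3, textLen=6, rate=20, integral=6; buffer now ['']
//
// addByAnnotation('world', 0.3, 0.6)
//   startTime=0.3 -> dt=0, textLen=0, rate=0, integral stays 6 (t=0.3 pushed again)
//   buffer 'world' (5 chars)
//   endTime=0.6   -> flush: dt=0.3, textLen=5, rate ≈ 16.7, integral=11
//
// Final state: timestamps=[0.0, 0.3, 0.3, 0.6], speakIntegrals=[0, 6, 6, 11],
// pushedDuration=0.6, and accumulateTo(0.45) = 8.5 (already past "Hello "),
// which is what the 'typical TTS word timing scenario' test asserts.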
@@ -148,6 +282,10 @@ class SegmentSynchronizerImpl {
         { textDone: this.textData.done, audioDone: this.audioData.done },
         'SegmentSynchronizerImpl.markPlaybackFinished called before text/audio input is done',
       );
+      // This allows mainTask to flush remaining text even if audio wasn't formally ended
+      if (!interrupted) {
+        this.playbackCompleted = true;
+      }
       return;
     }
 
@@ -166,13 +304,13 @@ class SegmentSynchronizerImpl {
   private async captureTaskImpl() {
     // Don't use a for-await loop here, because exiting the loop will close the writer in the
     // outputStream, which will cause an error in the mainTask.then method.
+    // NOTE: forwardedText is updated in mainTask, NOT here
     const reader = this.outputStream.readable.getReader();
     while (true) {
       const { done, value: text } = await reader.read();
       if (done) {
         break;
       }
-      this.textData.forwardedText += text;
       await this.nextInChain.captureText(text);
     }
     reader.releaseLock();
@@ -211,19 +349,42 @@ class SegmentSynchronizerImpl {
 
         const wordHphens = this.options.hyphenateWord(word).length;
         const elapsedSeconds = (Date.now() - this.startWallTime) / 1000;
-
-
-
+
+        let dHyphens = 0;
+        const annotated = this.audioData.annotatedRate;
+
+        if (annotated && annotated.pushedDuration >= elapsedSeconds) {
+          // Use actual TTS timing annotations for accurate sync
+          const targetLen = Math.floor(annotated.accumulateTo(elapsedSeconds));
+          const forwardedLen = this.textData.forwardedText.length;
+
+          if (targetLen >= forwardedLen) {
+            const dText = this.textData.pushedText.slice(forwardedLen, targetLen);
+            dHyphens = this.calcHyphens(dText).length;
+          } else {
+            const dText = this.textData.pushedText.slice(targetLen, forwardedLen);
+            dHyphens = -this.calcHyphens(dText).length;
+          }
+        } else {
+          // Fall back to estimated hyphens-per-second calculation
+          const targetHyphens = elapsedSeconds * this.options.speed;
+          dHyphens = Math.max(0, targetHyphens - this.textData.forwardedHyphens);
+        }
+
+        let delayTime = Math.max(0, wordHphens - dHyphens) / this.speed;
 
         if (this.playbackCompleted) {
-
+          delayTime = 0;
         }
 
-        await this.sleepIfNotClosed(
-
-
+        await this.sleepIfNotClosed(delayTime / 2);
+        const forwardedWord = sentence.slice(textCursor, endPos);
+        this.outputStreamWriter.write(forwardedWord);
+
+        await this.sleepIfNotClosed(delayTime / 2);
 
         this.textData.forwardedHyphens += wordHphens;
+        this.textData.forwardedText += forwardedWord;
         textCursor = endPos;
       }
 
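To make the pacing arithmetic in the loop above concrete: the annotated branch turns "characters the TTS reports as spoken by now" minus "characters already forwarded" into a hyphen count, the fallback branch estimates the same thing from a constant hyphens-per-second speed, and in both cases the loop only sleeps for the part of the current word not yet covered, split into two halves around the write. The numbers below are invented for illustration, and a single `speed` constant stands in for both `this.options.speed` and `this.speed` from the real code.

// Illustrative arithmetic for the word-pacing computation above (values are invented).
const elapsedSeconds = 1.2; // wall-clock seconds since playback started
const wordHyphens = 2; // hyphens in the word about to be forwarded
const speed = 3.83; // hyphens per second (the STANDARD_SPEECH_RATE fallback)

// Annotated branch: suppose the gap between accumulateTo(elapsedSeconds) and the
// already-forwarded text works out to 3 hyphens.
const dHyphensAnnotated = 3;
const delayAnnotated = Math.max(0, wordHyphens - dHyphensAnnotated) / speed;
console.log(delayAnnotated); // 0 -> behind the audio, forward the word immediately

// Fallback branch: no annotations, estimate progress from the constant rate.
const forwardedHyphens = 4; // hyphens forwarded so far
const dHyphensEstimated = Math.max(0, elapsedSeconds * speed - forwardedHyphens);
const delayEstimated = Math.max(0, wordHyphens - dHyphensEstimated) / speed;
console.log(delayEstimated.toFixed(3)); // "0.367" -> slept as two delayTime / 2 halves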
@@ -234,6 +395,15 @@ class SegmentSynchronizerImpl {
     }
   }
 
+  private calcHyphens(text: string): string[] {
+    const words = this.options.splitWords(text);
+    const hyphens: string[] = [];
+    for (const [word] of words) {
+      hyphens.push(...this.options.hyphenateWord(word));
+    }
+    return hyphens;
+  }
+
   private async sleepIfNotClosed(sleepTimeSeconds: number) {
     if (this.closed) {
       return;
@@ -350,6 +520,7 @@ export class TranscriptionSynchronizer {
       if (abort.aborted) {
         return;
       }
+
      await this._impl.close();
      this._impl = new SegmentSynchronizerImpl(this.options, this.textOutput.nextInChain);
    }
@@ -399,7 +570,15 @@ class SyncedAudioOutput extends AudioOutput {
     }
 
     if (!this.pushedDuration) {
-      //
+      // For timed texts, audio goes directly to room without going through synchronizer.
+      // If text was pushed but no audio, still end audio input so text can be processed.
+      // Only rotate if there's also no text (truly empty segment).
+      if (this.synchronizer._impl.hasPendingText) {
+        // Text is pending - end audio input to allow text processing
+        this.synchronizer._impl.endAudioInput();
+        return;
+      }
+      // No text and no audio - rotate the segment
       this.synchronizer.rotateSegment();
       return;
     }
@@ -441,12 +620,14 @@ class SyncedTextOutput extends TextOutput {
     super(nextInChain);
   }
 
-  async captureText(text: string): Promise<void> {
+  async captureText(text: string | TimedString): Promise<void> {
     await this.synchronizer.barrier();
 
+    const textStr = isTimedString(text) ? text.text : text;
+
     if (!this.synchronizer.enabled) {
-      // pass through to the next in chain
-      await this.nextInChain.captureText(
+      // pass through to the next in chain (extract string from TimedString if needed)
+      await this.nextInChain.captureText(textStr);
       return;
     }
 
@@ -458,10 +639,14 @@ class SyncedTextOutput extends TextOutput {
       this.synchronizer.rotateSegment();
       await this.synchronizer.barrier();
     }
+    // Pass the TimedString to pushText for timing extraction
     this.synchronizer._impl.pushText(text);
   }
 
-  flush() {
+  async flush() {
+    // Wait for any pending rotation to complete before accessing _impl
+    await this.synchronizer.barrier();
+
     if (!this.synchronizer.enabled) {
       this.nextInChain.flush(); // passthrough text if the synchronizer is disabled
       return;