@livekit/agents 1.0.39 → 1.0.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154) hide show
  1. package/dist/cli.cjs +20 -18
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +20 -18
  5. package/dist/cli.js.map +1 -1
  6. package/dist/http_server.cjs +9 -6
  7. package/dist/http_server.cjs.map +1 -1
  8. package/dist/http_server.d.cts +5 -1
  9. package/dist/http_server.d.ts +5 -1
  10. package/dist/http_server.d.ts.map +1 -1
  11. package/dist/http_server.js +9 -6
  12. package/dist/http_server.js.map +1 -1
  13. package/dist/index.cjs +5 -0
  14. package/dist/index.cjs.map +1 -1
  15. package/dist/index.d.cts +1 -0
  16. package/dist/index.d.ts +1 -0
  17. package/dist/index.d.ts.map +1 -1
  18. package/dist/index.js +3 -0
  19. package/dist/index.js.map +1 -1
  20. package/dist/inference/stt.cjs +2 -1
  21. package/dist/inference/stt.cjs.map +1 -1
  22. package/dist/inference/stt.d.ts.map +1 -1
  23. package/dist/inference/stt.js +2 -1
  24. package/dist/inference/stt.js.map +1 -1
  25. package/dist/ipc/supervised_proc.cjs +4 -0
  26. package/dist/ipc/supervised_proc.cjs.map +1 -1
  27. package/dist/ipc/supervised_proc.d.cts +1 -0
  28. package/dist/ipc/supervised_proc.d.ts +1 -0
  29. package/dist/ipc/supervised_proc.d.ts.map +1 -1
  30. package/dist/ipc/supervised_proc.js +4 -0
  31. package/dist/ipc/supervised_proc.js.map +1 -1
  32. package/dist/llm/realtime.cjs.map +1 -1
  33. package/dist/llm/realtime.d.cts +5 -1
  34. package/dist/llm/realtime.d.ts +5 -1
  35. package/dist/llm/realtime.d.ts.map +1 -1
  36. package/dist/llm/realtime.js.map +1 -1
  37. package/dist/tokenize/basic/sentence.cjs +3 -3
  38. package/dist/tokenize/basic/sentence.cjs.map +1 -1
  39. package/dist/tokenize/basic/sentence.js +3 -3
  40. package/dist/tokenize/basic/sentence.js.map +1 -1
  41. package/dist/tokenize/tokenizer.test.cjs +3 -1
  42. package/dist/tokenize/tokenizer.test.cjs.map +1 -1
  43. package/dist/tokenize/tokenizer.test.js +3 -1
  44. package/dist/tokenize/tokenizer.test.js.map +1 -1
  45. package/dist/tts/stream_adapter.cjs +15 -1
  46. package/dist/tts/stream_adapter.cjs.map +1 -1
  47. package/dist/tts/stream_adapter.d.ts.map +1 -1
  48. package/dist/tts/stream_adapter.js +15 -1
  49. package/dist/tts/stream_adapter.js.map +1 -1
  50. package/dist/tts/tts.cjs.map +1 -1
  51. package/dist/tts/tts.d.cts +9 -1
  52. package/dist/tts/tts.d.ts +9 -1
  53. package/dist/tts/tts.d.ts.map +1 -1
  54. package/dist/tts/tts.js.map +1 -1
  55. package/dist/types.cjs +3 -0
  56. package/dist/types.cjs.map +1 -1
  57. package/dist/types.d.cts +4 -0
  58. package/dist/types.d.ts +4 -0
  59. package/dist/types.d.ts.map +1 -1
  60. package/dist/types.js +2 -0
  61. package/dist/types.js.map +1 -1
  62. package/dist/voice/agent.cjs +11 -1
  63. package/dist/voice/agent.cjs.map +1 -1
  64. package/dist/voice/agent.d.cts +7 -3
  65. package/dist/voice/agent.d.ts +7 -3
  66. package/dist/voice/agent.d.ts.map +1 -1
  67. package/dist/voice/agent.js +11 -1
  68. package/dist/voice/agent.js.map +1 -1
  69. package/dist/voice/agent_activity.cjs +30 -14
  70. package/dist/voice/agent_activity.cjs.map +1 -1
  71. package/dist/voice/agent_activity.d.cts +1 -0
  72. package/dist/voice/agent_activity.d.ts +1 -0
  73. package/dist/voice/agent_activity.d.ts.map +1 -1
  74. package/dist/voice/agent_activity.js +30 -14
  75. package/dist/voice/agent_activity.js.map +1 -1
  76. package/dist/voice/agent_session.cjs +5 -1
  77. package/dist/voice/agent_session.cjs.map +1 -1
  78. package/dist/voice/agent_session.d.cts +2 -0
  79. package/dist/voice/agent_session.d.ts +2 -0
  80. package/dist/voice/agent_session.d.ts.map +1 -1
  81. package/dist/voice/agent_session.js +5 -1
  82. package/dist/voice/agent_session.js.map +1 -1
  83. package/dist/voice/background_audio.cjs +2 -1
  84. package/dist/voice/background_audio.cjs.map +1 -1
  85. package/dist/voice/background_audio.d.cts +4 -2
  86. package/dist/voice/background_audio.d.ts +4 -2
  87. package/dist/voice/background_audio.d.ts.map +1 -1
  88. package/dist/voice/background_audio.js +2 -1
  89. package/dist/voice/background_audio.js.map +1 -1
  90. package/dist/voice/generation.cjs +58 -5
  91. package/dist/voice/generation.cjs.map +1 -1
  92. package/dist/voice/generation.d.cts +17 -3
  93. package/dist/voice/generation.d.ts +17 -3
  94. package/dist/voice/generation.d.ts.map +1 -1
  95. package/dist/voice/generation.js +63 -6
  96. package/dist/voice/generation.js.map +1 -1
  97. package/dist/voice/index.cjs.map +1 -1
  98. package/dist/voice/index.d.cts +1 -1
  99. package/dist/voice/index.d.ts +1 -1
  100. package/dist/voice/index.d.ts.map +1 -1
  101. package/dist/voice/index.js.map +1 -1
  102. package/dist/voice/io.cjs +22 -2
  103. package/dist/voice/io.cjs.map +1 -1
  104. package/dist/voice/io.d.cts +21 -5
  105. package/dist/voice/io.d.ts +21 -5
  106. package/dist/voice/io.d.ts.map +1 -1
  107. package/dist/voice/io.js +18 -1
  108. package/dist/voice/io.js.map +1 -1
  109. package/dist/voice/room_io/_output.cjs +3 -2
  110. package/dist/voice/room_io/_output.cjs.map +1 -1
  111. package/dist/voice/room_io/_output.d.cts +3 -3
  112. package/dist/voice/room_io/_output.d.ts +3 -3
  113. package/dist/voice/room_io/_output.d.ts.map +1 -1
  114. package/dist/voice/room_io/_output.js +4 -3
  115. package/dist/voice/room_io/_output.js.map +1 -1
  116. package/dist/voice/transcription/synchronizer.cjs +137 -13
  117. package/dist/voice/transcription/synchronizer.cjs.map +1 -1
  118. package/dist/voice/transcription/synchronizer.d.cts +34 -4
  119. package/dist/voice/transcription/synchronizer.d.ts +34 -4
  120. package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
  121. package/dist/voice/transcription/synchronizer.js +141 -14
  122. package/dist/voice/transcription/synchronizer.js.map +1 -1
  123. package/dist/voice/transcription/synchronizer.test.cjs +151 -0
  124. package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
  125. package/dist/voice/transcription/synchronizer.test.js +150 -0
  126. package/dist/voice/transcription/synchronizer.test.js.map +1 -0
  127. package/dist/worker.cjs +12 -2
  128. package/dist/worker.cjs.map +1 -1
  129. package/dist/worker.d.ts.map +1 -1
  130. package/dist/worker.js +12 -2
  131. package/dist/worker.js.map +1 -1
  132. package/package.json +1 -1
  133. package/src/cli.ts +20 -18
  134. package/src/http_server.ts +18 -6
  135. package/src/index.ts +1 -0
  136. package/src/inference/stt.ts +9 -8
  137. package/src/ipc/supervised_proc.ts +4 -0
  138. package/src/llm/realtime.ts +5 -1
  139. package/src/tokenize/basic/sentence.ts +3 -3
  140. package/src/tokenize/tokenizer.test.ts +4 -0
  141. package/src/tts/stream_adapter.ts +23 -1
  142. package/src/tts/tts.ts +10 -1
  143. package/src/types.ts +5 -0
  144. package/src/voice/agent.ts +19 -4
  145. package/src/voice/agent_activity.ts +38 -13
  146. package/src/voice/agent_session.ts +6 -0
  147. package/src/voice/background_audio.ts +6 -3
  148. package/src/voice/generation.ts +115 -10
  149. package/src/voice/index.ts +1 -1
  150. package/src/voice/io.ts +40 -5
  151. package/src/voice/room_io/_output.ts +6 -5
  152. package/src/voice/transcription/synchronizer.test.ts +206 -0
  153. package/src/voice/transcription/synchronizer.ts +202 -17
  154. package/src/worker.ts +24 -2
@@ -0,0 +1,206 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { describe, expect, it } from 'vitest';
5
+ import { SpeakingRateData } from './synchronizer.js';
6
+
7
+ describe('SpeakingRateData', () => {
8
+ describe('constructor', () => {
9
+ it('should initialize with empty arrays', () => {
10
+ const data = new SpeakingRateData();
11
+ expect(data.timestamps).toEqual([]);
12
+ expect(data.speakingRate).toEqual([]);
13
+ expect(data.speakIntegrals).toEqual([]);
14
+ expect(data.pushedDuration).toBe(0);
15
+ });
16
+ });
17
+
18
+ describe('addByRate', () => {
19
+ it('should add a single rate entry', () => {
20
+ const data = new SpeakingRateData();
21
+ data.addByRate(1.0, 5.0);
22
+
23
+ expect(data.timestamps).toEqual([1.0]);
24
+ expect(data.speakingRate).toEqual([5.0]);
25
+ // integral = 0 + 5.0 * (1.0 - 0) = 5.0
26
+ expect(data.speakIntegrals).toEqual([5.0]);
27
+ expect(data.pushedDuration).toBe(1.0);
28
+ });
29
+
30
+ it('should accumulate integrals across multiple entries', () => {
31
+ const data = new SpeakingRateData();
32
+ data.addByRate(1.0, 4.0); // integral = 0 + 4.0 * 1.0 = 4.0
33
+ data.addByRate(2.0, 6.0); // integral = 4.0 + 6.0 * 1.0 = 10.0
34
+ data.addByRate(3.5, 2.0); // integral = 10.0 + 2.0 * 1.5 = 13.0
35
+
36
+ expect(data.timestamps).toEqual([1.0, 2.0, 3.5]);
37
+ expect(data.speakingRate).toEqual([4.0, 6.0, 2.0]);
38
+ expect(data.speakIntegrals).toEqual([4.0, 10.0, 13.0]);
39
+ expect(data.pushedDuration).toBe(3.5);
40
+ });
41
+
42
+ it('should handle zero rate', () => {
43
+ const data = new SpeakingRateData();
44
+ data.addByRate(1.0, 0.0);
45
+
46
+ expect(data.timestamps).toEqual([1.0]);
47
+ expect(data.speakingRate).toEqual([0.0]);
48
+ expect(data.speakIntegrals).toEqual([0.0]);
49
+ });
50
+ });
51
+
52
+ describe('addByAnnotation', () => {
53
+ it('should buffer text without startTime', () => {
54
+ const data = new SpeakingRateData();
55
+ data.addByAnnotation('hello', undefined, undefined);
56
+
57
+ // Text is buffered, no timestamp entry yet
58
+ expect(data.timestamps).toEqual([]);
59
+ expect(data.pushedDuration).toBe(0);
60
+ });
61
+
62
+ it('should add entry when startTime is provided', () => {
63
+ const data = new SpeakingRateData();
64
+ data.addByAnnotation('hello', undefined, undefined); // buffer "hello"
65
+ data.addByAnnotation('world', 1.0, undefined); // flush with startTime
66
+
67
+ expect(data.timestamps).toEqual([1.0]);
68
+ // textLen = 5 (hello), dt = 1.0, rate = 5/1 = 5.0
69
+ expect(data.speakingRate).toEqual([5.0]);
70
+ expect(data.speakIntegrals).toEqual([5.0]);
71
+ });
72
+
73
+ it('should handle startTime and endTime together', () => {
74
+ const data = new SpeakingRateData();
75
+ data.addByAnnotation('hello ', 0.0, 0.5);
76
+ data.addByAnnotation('world', 0.5, 1.0);
77
+
78
+ // First annotation: startTime=0.0, text="hello ", then recursively calls with endTime=0.5
79
+ // Second annotation: startTime=0.5, text="world", then recursively calls with endTime=1.0
80
+ expect(data.timestamps.length).toBeGreaterThanOrEqual(2);
81
+ expect(data.pushedDuration).toBe(1.0);
82
+ });
83
+
84
+ it('should calculate rate based on buffered text length', () => {
85
+ const data = new SpeakingRateData();
86
+ data.addByAnnotation('ab', undefined, undefined); // buffer 2 chars
87
+ data.addByAnnotation('cde', undefined, undefined); // buffer 3 more chars
88
+ data.addByAnnotation('', 2.0, undefined); // flush: textLen=5, dt=2.0, rate=2.5
89
+
90
+ expect(data.timestamps).toEqual([2.0]);
91
+ expect(data.speakingRate).toEqual([2.5]);
92
+ expect(data.speakIntegrals).toEqual([5.0]);
93
+ });
94
+
95
+ it('should handle zero time delta gracefully', () => {
96
+ const data = new SpeakingRateData();
97
+ data.addByAnnotation('hello', 0.0, undefined); // dt=0, rate should be 0
98
+
99
+ expect(data.timestamps).toEqual([0.0]);
100
+ expect(data.speakingRate).toEqual([0.0]);
101
+ expect(data.speakIntegrals).toEqual([0.0]);
102
+ });
103
+ });
104
+
105
+ describe('accumulateTo', () => {
106
+ it('should return 0 for empty data', () => {
107
+ const data = new SpeakingRateData();
108
+ expect(data.accumulateTo(1.0)).toBe(0);
109
+ });
110
+
111
+ it('should return 0 for timestamp before first entry', () => {
112
+ const data = new SpeakingRateData();
113
+ data.addByRate(1.0, 5.0);
114
+ expect(data.accumulateTo(0.5)).toBe(0);
115
+ });
116
+
117
+ it('should return exact integral at timestamp', () => {
118
+ const data = new SpeakingRateData();
119
+ data.addByRate(1.0, 4.0); // integral = 4.0
120
+ data.addByRate(2.0, 6.0); // integral = 10.0
121
+
122
+ expect(data.accumulateTo(1.0)).toBe(4.0);
123
+ expect(data.accumulateTo(2.0)).toBe(10.0);
124
+ });
125
+
126
+ it('should interpolate between timestamps', () => {
127
+ const data = new SpeakingRateData();
128
+ data.addByRate(1.0, 4.0); // integral = 4.0
129
+ data.addByRate(2.0, 6.0); // integral = 10.0
130
+
131
+ // At 1.5: integral = 4.0 + 6.0 * 0.5 = 7.0
132
+ expect(data.accumulateTo(1.5)).toBe(7.0);
133
+ });
134
+
135
+ it('should extrapolate beyond last timestamp', () => {
136
+ const data = new SpeakingRateData();
137
+ data.addByRate(1.0, 4.0); // integral = 4.0
138
+ data.addByRate(2.0, 6.0); // integral = 10.0
139
+
140
+ // At 3.0: integral = 10.0 + 6.0 * 1.0 = 16.0
141
+ expect(data.accumulateTo(3.0)).toBe(16.0);
142
+ });
143
+
144
+ it('should not exceed next integral when interpolating', () => {
145
+ const data = new SpeakingRateData();
146
+ data.addByRate(1.0, 100.0); // integral = 100.0 (very high rate)
147
+ data.addByRate(2.0, 1.0); // integral = 101.0
148
+
149
+ // At 1.5 with rate 1.0: would be 100.0 + 1.0 * 0.5 = 100.5
150
+ // But capped at next integral 101.0, so result is min(100.5, 101.0) = 100.5
151
+ expect(data.accumulateTo(1.5)).toBe(100.5);
152
+ });
153
+ });
154
+
155
+ describe('pushedDuration', () => {
156
+ it('should return 0 when empty', () => {
157
+ const data = new SpeakingRateData();
158
+ expect(data.pushedDuration).toBe(0);
159
+ });
160
+
161
+ it('should return last timestamp', () => {
162
+ const data = new SpeakingRateData();
163
+ data.addByRate(1.0, 5.0);
164
+ data.addByRate(2.5, 3.0);
165
+ data.addByRate(4.0, 7.0);
166
+
167
+ expect(data.pushedDuration).toBe(4.0);
168
+ });
169
+ });
170
+
171
+ describe('integration scenarios', () => {
172
+ it('should handle typical TTS word timing scenario', () => {
173
+ const data = new SpeakingRateData();
174
+
175
+ // Simulating words with timing: "Hello " at 0-0.3s, "world" at 0.3-0.6s
176
+ data.addByAnnotation('Hello ', 0.0, 0.3);
177
+ data.addByAnnotation('world', 0.3, 0.6);
178
+
179
+ // Should have accumulated text lengths at each timestamp
180
+ expect(data.pushedDuration).toBe(0.6);
181
+
182
+ // At 0.15s (middle of first word), should be partway through
183
+ const mid1 = data.accumulateTo(0.15);
184
+ expect(mid1).toBeGreaterThan(0);
185
+ expect(mid1).toBeLessThan(6); // "Hello " is 6 chars
186
+
187
+ // At 0.45s (middle of second word), should be past first word
188
+ const mid2 = data.accumulateTo(0.45);
189
+ expect(mid2).toBeGreaterThan(6);
190
+ });
191
+
192
+ it('should handle mixed rate and annotation data', () => {
193
+ const data = new SpeakingRateData();
194
+
195
+ // Start with rate-based data
196
+ data.addByRate(0.5, 4.0); // integral = 2.0
197
+
198
+ // Then add annotation
199
+ data.addByAnnotation('test', undefined, undefined);
200
+ data.addByAnnotation('', 1.0, undefined); // textLen=4, dt=0.5, rate=8.0, integral = 2.0 + 4.0 = 6.0
201
+
202
+ expect(data.timestamps).toEqual([0.5, 1.0]);
203
+ expect(data.speakIntegrals).toEqual([2.0, 6.0]);
204
+ });
205
+ });
206
+ });
@@ -8,7 +8,13 @@ import { IdentityTransform } from '../../stream/identity_transform.js';
8
8
  import type { SentenceStream, SentenceTokenizer } from '../../tokenize/index.js';
9
9
  import { basic } from '../../tokenize/index.js';
10
10
  import { Future, Task, delay } from '../../utils.js';
11
- import { AudioOutput, type PlaybackFinishedEvent, TextOutput } from '../io.js';
11
+ import {
12
+ AudioOutput,
13
+ type PlaybackFinishedEvent,
14
+ TextOutput,
15
+ type TimedString,
16
+ isTimedString,
17
+ } from '../io.js';
12
18
 
13
19
  const STANDARD_SPEECH_RATE = 3.83; // hyphens (syllables) per second
14
20
 
@@ -27,9 +33,110 @@ interface TextData {
27
33
  forwardedText: string;
28
34
  }
29
35
 
36
+ /**
37
+ * Tracks speaking rate data from TTS timing annotations.
38
+ * @internal Exported for testing purposes.
39
+ */
40
+ export class SpeakingRateData {
41
+ /** Timestamps of the speaking rate. */
42
+ timestamps: number[] = [];
43
+ /** Speed at the timestamp. */
44
+ speakingRate: number[] = [];
45
+ /** Accumulated speaking units up to the timestamp. */
46
+ speakIntegrals: number[] = [];
47
+ /** Buffer for text without timing annotations yet. */
48
+ private textBuffer: string[] = [];
49
+
50
+ /**
51
+ * Add by speaking rate estimation.
52
+ */
53
+ addByRate(timestamp: number, speakingRate: number): void {
54
+ const integral =
55
+ this.speakIntegrals.length > 0 ? this.speakIntegrals[this.speakIntegrals.length - 1]! : 0;
56
+ const dt = timestamp - this.pushedDuration;
57
+ const newIntegral = integral + speakingRate * dt;
58
+
59
+ this.timestamps.push(timestamp);
60
+ this.speakingRate.push(speakingRate);
61
+ this.speakIntegrals.push(newIntegral);
62
+ }
63
+
64
+ /**
65
+ * Add annotation from a TimedString with startTime/endTime.
66
+ */
67
+ addByAnnotation(text: string, startTime: number | undefined, endTime: number | undefined): void {
68
+ if (startTime !== undefined) {
69
+ // Calculate the integral of the speaking rate up to the start time
70
+ const integral =
71
+ this.speakIntegrals.length > 0 ? this.speakIntegrals[this.speakIntegrals.length - 1]! : 0;
72
+
73
+ const dt = startTime - this.pushedDuration;
74
+ // Use the length of the text directly instead of hyphens
75
+ const textLen = this.textBuffer.reduce((sum, t) => sum + t.length, 0);
76
+ const newIntegral = integral + textLen;
77
+ const rate = dt > 0 ? textLen / dt : 0;
78
+
79
+ this.timestamps.push(startTime);
80
+ this.speakingRate.push(rate);
81
+ this.speakIntegrals.push(newIntegral);
82
+ this.textBuffer = [];
83
+ }
84
+
85
+ this.textBuffer.push(text);
86
+
87
+ if (endTime !== undefined) {
88
+ this.addByAnnotation('', endTime, undefined);
89
+ }
90
+ }
91
+
92
+ /**
93
+ * Get accumulated speaking units up to the given timestamp.
94
+ */
95
+ accumulateTo(timestamp: number): number {
96
+ if (this.timestamps.length === 0) {
97
+ return 0;
98
+ }
99
+
100
+ // Linear scan for the insertion position (equivalent to np.searchsorted with side="right")
101
+ let idx = 0;
102
+ for (let i = 0; i < this.timestamps.length; i++) {
103
+ if (this.timestamps[i]! <= timestamp) {
104
+ idx = i + 1;
105
+ } else {
106
+ break;
107
+ }
108
+ }
109
+
110
+ if (idx === 0) {
111
+ return 0;
112
+ }
113
+
114
+ let integralT = this.speakIntegrals[idx - 1]!;
115
+
116
+ // Fill the tail assuming the speaking rate is constant
117
+ const dt = timestamp - this.timestamps[idx - 1]!;
118
+ const rate =
119
+ idx < this.speakingRate.length ? this.speakingRate[idx]! : this.speakingRate[idx - 1]!;
120
+ integralT += rate * dt;
121
+
122
+ // If there is a next timestamp, make sure the integral does not exceed the next
123
+ if (idx < this.timestamps.length) {
124
+ integralT = Math.min(integralT, this.speakIntegrals[idx]!);
125
+ }
126
+
127
+ return integralT;
128
+ }
129
+
130
+ /** Get the last pushed timestamp. */
131
+ get pushedDuration(): number {
132
+ return this.timestamps.length > 0 ? this.timestamps[this.timestamps.length - 1]! : 0;
133
+ }
134
+ }
135
+
30
136
  interface AudioData {
31
137
  pushedDuration: number;
32
138
  done: boolean;
139
+ annotatedRate: SpeakingRateData | null;
33
140
  }
34
141
 
35
142
  class SegmentSynchronizerImpl {
@@ -62,6 +169,7 @@ class SegmentSynchronizerImpl {
62
169
  this.audioData = {
63
170
  pushedDuration: 0,
64
171
  done: false,
172
+ annotatedRate: null,
65
173
  };
66
174
  this.outputStream = new IdentityTransform();
67
175
  this.outputStreamWriter = this.outputStream.writable.getWriter();
@@ -88,6 +196,10 @@ class SegmentSynchronizerImpl {
88
196
  return this.textData.done;
89
197
  }
90
198
 
199
+ get hasPendingText(): boolean {
200
+ return this.textData.pushedText.length > this.textData.forwardedText.length;
201
+ }
202
+
91
203
  get readable(): ReadableStream<string> {
92
204
  return this.outputStream.readable;
93
205
  }
@@ -117,14 +229,36 @@ class SegmentSynchronizerImpl {
117
229
  this.audioData.done = true;
118
230
  }
119
231
 
120
- pushText(text: string) {
232
+ pushText(text: string | TimedString) {
121
233
  if (this.closed) {
122
234
  this.logger.warn('SegmentSynchronizerImpl.pushText called after close');
123
235
  return;
124
236
  }
125
237
 
126
- this.textData.sentenceStream.pushText(text);
127
- this.textData.pushedText += text;
238
+ // Check if text is a TimedString (has timing information)
239
+ let textStr: string;
240
+ let startTime: number | undefined;
241
+ let endTime: number | undefined;
242
+
243
+ if (isTimedString(text)) {
244
+ // This is a TimedString
245
+ textStr = text.text;
246
+ startTime = text.startTime;
247
+ endTime = text.endTime;
248
+
249
+ // Create annotatedRate if it doesn't exist
250
+ if (!this.audioData.annotatedRate) {
251
+ this.audioData.annotatedRate = new SpeakingRateData();
252
+ }
253
+
254
+ // Add the timing annotation
255
+ this.audioData.annotatedRate.addByAnnotation(textStr, startTime, endTime);
256
+ } else {
257
+ textStr = text;
258
+ }
259
+
260
+ this.textData.sentenceStream.pushText(textStr);
261
+ this.textData.pushedText += textStr;
128
262
  }
129
263
 
130
264
  endTextInput() {
@@ -148,6 +282,10 @@ class SegmentSynchronizerImpl {
148
282
  { textDone: this.textData.done, audioDone: this.audioData.done },
149
283
  'SegmentSynchronizerImpl.markPlaybackFinished called before text/audio input is done',
150
284
  );
285
+ // This allows mainTask to flush remaining text even if audio wasn't formally ended
286
+ if (!interrupted) {
287
+ this.playbackCompleted = true;
288
+ }
151
289
  return;
152
290
  }
153
291
 
@@ -166,13 +304,13 @@ class SegmentSynchronizerImpl {
166
304
  private async captureTaskImpl() {
167
305
  // Don't use a for-await loop here, because exiting the loop will close the writer in the
168
306
  // outputStream, which will cause an error in the mainTask.then method.
307
+ // NOTE: forwardedText is updated in mainTask, NOT here
169
308
  const reader = this.outputStream.readable.getReader();
170
309
  while (true) {
171
310
  const { done, value: text } = await reader.read();
172
311
  if (done) {
173
312
  break;
174
313
  }
175
- this.textData.forwardedText += text;
176
314
  await this.nextInChain.captureText(text);
177
315
  }
178
316
  reader.releaseLock();
@@ -211,19 +349,42 @@ class SegmentSynchronizerImpl {
211
349
 
212
350
  const wordHphens = this.options.hyphenateWord(word).length;
213
351
  const elapsedSeconds = (Date.now() - this.startWallTime) / 1000;
214
- const targetHyphens = elapsedSeconds * this.options.speed;
215
- const hyphensBehind = Math.max(0, targetHyphens - this.textData.forwardedHyphens);
216
- let delay = Math.max(0, wordHphens - hyphensBehind) / this.speed;
352
+
353
+ let dHyphens = 0;
354
+ const annotated = this.audioData.annotatedRate;
355
+
356
+ if (annotated && annotated.pushedDuration >= elapsedSeconds) {
357
+ // Use actual TTS timing annotations for accurate sync
358
+ const targetLen = Math.floor(annotated.accumulateTo(elapsedSeconds));
359
+ const forwardedLen = this.textData.forwardedText.length;
360
+
361
+ if (targetLen >= forwardedLen) {
362
+ const dText = this.textData.pushedText.slice(forwardedLen, targetLen);
363
+ dHyphens = this.calcHyphens(dText).length;
364
+ } else {
365
+ const dText = this.textData.pushedText.slice(targetLen, forwardedLen);
366
+ dHyphens = -this.calcHyphens(dText).length;
367
+ }
368
+ } else {
369
+ // Fall back to estimated hyphens-per-second calculation
370
+ const targetHyphens = elapsedSeconds * this.options.speed;
371
+ dHyphens = Math.max(0, targetHyphens - this.textData.forwardedHyphens);
372
+ }
373
+
374
+ let delayTime = Math.max(0, wordHphens - dHyphens) / this.speed;
217
375
 
218
376
  if (this.playbackCompleted) {
219
- delay = 0;
377
+ delayTime = 0;
220
378
  }
221
379
 
222
- await this.sleepIfNotClosed(delay / 2);
223
- this.outputStreamWriter.write(sentence.slice(textCursor, endPos));
224
- await this.sleepIfNotClosed(delay / 2);
380
+ await this.sleepIfNotClosed(delayTime / 2);
381
+ const forwardedWord = sentence.slice(textCursor, endPos);
382
+ this.outputStreamWriter.write(forwardedWord);
383
+
384
+ await this.sleepIfNotClosed(delayTime / 2);
225
385
 
226
386
  this.textData.forwardedHyphens += wordHphens;
387
+ this.textData.forwardedText += forwardedWord;
227
388
  textCursor = endPos;
228
389
  }
229
390
 
@@ -234,6 +395,15 @@ class SegmentSynchronizerImpl {
234
395
  }
235
396
  }
236
397
 
398
+ private calcHyphens(text: string): string[] {
399
+ const words = this.options.splitWords(text);
400
+ const hyphens: string[] = [];
401
+ for (const [word] of words) {
402
+ hyphens.push(...this.options.hyphenateWord(word));
403
+ }
404
+ return hyphens;
405
+ }
406
+
237
407
  private async sleepIfNotClosed(sleepTimeSeconds: number) {
238
408
  if (this.closed) {
239
409
  return;
@@ -350,6 +520,7 @@ export class TranscriptionSynchronizer {
350
520
  if (abort.aborted) {
351
521
  return;
352
522
  }
523
+
353
524
  await this._impl.close();
354
525
  this._impl = new SegmentSynchronizerImpl(this.options, this.textOutput.nextInChain);
355
526
  }
@@ -399,7 +570,15 @@ class SyncedAudioOutput extends AudioOutput {
399
570
  }
400
571
 
401
572
  if (!this.pushedDuration) {
402
- // in case there is no audio after the text was pushed, rotate the segment
573
+ // For timed texts, audio goes directly to room without going through synchronizer.
574
+ // If text was pushed but no audio, still end audio input so text can be processed.
575
+ // Only rotate if there's also no text (truly empty segment).
576
+ if (this.synchronizer._impl.hasPendingText) {
577
+ // Text is pending - end audio input to allow text processing
578
+ this.synchronizer._impl.endAudioInput();
579
+ return;
580
+ }
581
+ // No text and no audio - rotate the segment
403
582
  this.synchronizer.rotateSegment();
404
583
  return;
405
584
  }
@@ -441,12 +620,14 @@ class SyncedTextOutput extends TextOutput {
441
620
  super(nextInChain);
442
621
  }
443
622
 
444
- async captureText(text: string): Promise<void> {
623
+ async captureText(text: string | TimedString): Promise<void> {
445
624
  await this.synchronizer.barrier();
446
625
 
626
+ const textStr = isTimedString(text) ? text.text : text;
627
+
447
628
  if (!this.synchronizer.enabled) {
448
- // pass through to the next in chain
449
- await this.nextInChain.captureText(text);
629
+ // pass through to the next in chain (extract string from TimedString if needed)
630
+ await this.nextInChain.captureText(textStr);
450
631
  return;
451
632
  }
452
633
 
@@ -458,10 +639,14 @@ class SyncedTextOutput extends TextOutput {
458
639
  this.synchronizer.rotateSegment();
459
640
  await this.synchronizer.barrier();
460
641
  }
642
+ // Pass the TimedString to pushText for timing extraction
461
643
  this.synchronizer._impl.pushText(text);
462
644
  }
463
645
 
464
- flush() {
646
+ async flush() {
647
+ // Wait for any pending rotation to complete before accessing _impl
648
+ await this.synchronizer.barrier();
649
+
465
650
  if (!this.synchronizer.enabled) {
466
651
  this.nextInChain.flush(); // passthrough text if the synchronizer is disabled
467
652
  return;
package/src/worker.ts CHANGED
@@ -339,13 +339,35 @@ export class AgentServer {
339
339
  );
340
340
 
341
341
  this.#opts = opts;
342
- this.#httpServer = new HTTPServer(opts.host, opts.port, () => ({
342
+
343
+ const healthCheck = () => {
344
+ // Check if inference executor exists and is not alive
345
+ if (this.#inferenceExecutor && !this.#inferenceExecutor.isAlive) {
346
+ return { healthy: false, message: 'inference process not running' };
347
+ }
348
+
349
+ // Only healthy when fully connected with an active WebSocket
350
+ if (
351
+ this.#closed ||
352
+ this.#connecting ||
353
+ !this.#session ||
354
+ this.#session.readyState !== WebSocket.OPEN
355
+ ) {
356
+ return { healthy: false, message: 'not connected to livekit' };
357
+ }
358
+
359
+ return { healthy: true, message: 'OK' };
360
+ };
361
+
362
+ const getWorkerInfo = () => ({
343
363
  agent_name: opts.agentName,
344
364
  worker_type: JobType[opts.serverType],
345
365
  active_jobs: this.activeJobs.length,
346
366
  sdk_version: version,
347
367
  project_type: PROJECT_TYPE,
348
- }));
368
+ });
369
+
370
+ this.#httpServer = new HTTPServer(opts.host, opts.port, healthCheck, getWorkerInfo);
349
371
  }
350
372
 
351
373
  /** @throws {@link WorkerError} if worker failed to connect or already running */