@livekit/agents 1.0.40 → 1.0.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. package/dist/cli.cjs +20 -18
  2. package/dist/cli.cjs.map +1 -1
  3. package/dist/cli.d.ts.map +1 -1
  4. package/dist/cli.js +20 -18
  5. package/dist/cli.js.map +1 -1
  6. package/dist/index.cjs +5 -0
  7. package/dist/index.cjs.map +1 -1
  8. package/dist/index.d.cts +1 -0
  9. package/dist/index.d.ts +1 -0
  10. package/dist/index.d.ts.map +1 -1
  11. package/dist/index.js +3 -0
  12. package/dist/index.js.map +1 -1
  13. package/dist/inference/stt.cjs +2 -1
  14. package/dist/inference/stt.cjs.map +1 -1
  15. package/dist/inference/stt.d.ts.map +1 -1
  16. package/dist/inference/stt.js +2 -1
  17. package/dist/inference/stt.js.map +1 -1
  18. package/dist/llm/realtime.cjs.map +1 -1
  19. package/dist/llm/realtime.d.cts +5 -1
  20. package/dist/llm/realtime.d.ts +5 -1
  21. package/dist/llm/realtime.d.ts.map +1 -1
  22. package/dist/llm/realtime.js.map +1 -1
  23. package/dist/tts/stream_adapter.cjs +15 -1
  24. package/dist/tts/stream_adapter.cjs.map +1 -1
  25. package/dist/tts/stream_adapter.d.ts.map +1 -1
  26. package/dist/tts/stream_adapter.js +15 -1
  27. package/dist/tts/stream_adapter.js.map +1 -1
  28. package/dist/tts/tts.cjs.map +1 -1
  29. package/dist/tts/tts.d.cts +9 -1
  30. package/dist/tts/tts.d.ts +9 -1
  31. package/dist/tts/tts.d.ts.map +1 -1
  32. package/dist/tts/tts.js.map +1 -1
  33. package/dist/types.cjs +3 -0
  34. package/dist/types.cjs.map +1 -1
  35. package/dist/types.d.cts +4 -0
  36. package/dist/types.d.ts +4 -0
  37. package/dist/types.d.ts.map +1 -1
  38. package/dist/types.js +2 -0
  39. package/dist/types.js.map +1 -1
  40. package/dist/voice/agent.cjs +11 -1
  41. package/dist/voice/agent.cjs.map +1 -1
  42. package/dist/voice/agent.d.cts +7 -3
  43. package/dist/voice/agent.d.ts +7 -3
  44. package/dist/voice/agent.d.ts.map +1 -1
  45. package/dist/voice/agent.js +11 -1
  46. package/dist/voice/agent.js.map +1 -1
  47. package/dist/voice/agent_activity.cjs +30 -14
  48. package/dist/voice/agent_activity.cjs.map +1 -1
  49. package/dist/voice/agent_activity.d.cts +1 -0
  50. package/dist/voice/agent_activity.d.ts +1 -0
  51. package/dist/voice/agent_activity.d.ts.map +1 -1
  52. package/dist/voice/agent_activity.js +30 -14
  53. package/dist/voice/agent_activity.js.map +1 -1
  54. package/dist/voice/agent_session.cjs +5 -1
  55. package/dist/voice/agent_session.cjs.map +1 -1
  56. package/dist/voice/agent_session.d.cts +2 -0
  57. package/dist/voice/agent_session.d.ts +2 -0
  58. package/dist/voice/agent_session.d.ts.map +1 -1
  59. package/dist/voice/agent_session.js +5 -1
  60. package/dist/voice/agent_session.js.map +1 -1
  61. package/dist/voice/background_audio.cjs +2 -1
  62. package/dist/voice/background_audio.cjs.map +1 -1
  63. package/dist/voice/background_audio.d.cts +4 -2
  64. package/dist/voice/background_audio.d.ts +4 -2
  65. package/dist/voice/background_audio.d.ts.map +1 -1
  66. package/dist/voice/background_audio.js +2 -1
  67. package/dist/voice/background_audio.js.map +1 -1
  68. package/dist/voice/generation.cjs +58 -5
  69. package/dist/voice/generation.cjs.map +1 -1
  70. package/dist/voice/generation.d.cts +17 -3
  71. package/dist/voice/generation.d.ts +17 -3
  72. package/dist/voice/generation.d.ts.map +1 -1
  73. package/dist/voice/generation.js +63 -6
  74. package/dist/voice/generation.js.map +1 -1
  75. package/dist/voice/index.cjs.map +1 -1
  76. package/dist/voice/index.d.cts +1 -1
  77. package/dist/voice/index.d.ts +1 -1
  78. package/dist/voice/index.d.ts.map +1 -1
  79. package/dist/voice/index.js.map +1 -1
  80. package/dist/voice/io.cjs +22 -2
  81. package/dist/voice/io.cjs.map +1 -1
  82. package/dist/voice/io.d.cts +21 -5
  83. package/dist/voice/io.d.ts +21 -5
  84. package/dist/voice/io.d.ts.map +1 -1
  85. package/dist/voice/io.js +18 -1
  86. package/dist/voice/io.js.map +1 -1
  87. package/dist/voice/room_io/_output.cjs +3 -2
  88. package/dist/voice/room_io/_output.cjs.map +1 -1
  89. package/dist/voice/room_io/_output.d.cts +3 -3
  90. package/dist/voice/room_io/_output.d.ts +3 -3
  91. package/dist/voice/room_io/_output.d.ts.map +1 -1
  92. package/dist/voice/room_io/_output.js +4 -3
  93. package/dist/voice/room_io/_output.js.map +1 -1
  94. package/dist/voice/transcription/synchronizer.cjs +137 -13
  95. package/dist/voice/transcription/synchronizer.cjs.map +1 -1
  96. package/dist/voice/transcription/synchronizer.d.cts +34 -4
  97. package/dist/voice/transcription/synchronizer.d.ts +34 -4
  98. package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
  99. package/dist/voice/transcription/synchronizer.js +141 -14
  100. package/dist/voice/transcription/synchronizer.js.map +1 -1
  101. package/dist/voice/transcription/synchronizer.test.cjs +151 -0
  102. package/dist/voice/transcription/synchronizer.test.cjs.map +1 -0
  103. package/dist/voice/transcription/synchronizer.test.js +150 -0
  104. package/dist/voice/transcription/synchronizer.test.js.map +1 -0
  105. package/package.json +1 -1
  106. package/src/cli.ts +20 -18
  107. package/src/index.ts +1 -0
  108. package/src/inference/stt.ts +9 -8
  109. package/src/llm/realtime.ts +5 -1
  110. package/src/tts/stream_adapter.ts +23 -1
  111. package/src/tts/tts.ts +10 -1
  112. package/src/types.ts +5 -0
  113. package/src/voice/agent.ts +19 -4
  114. package/src/voice/agent_activity.ts +38 -13
  115. package/src/voice/agent_session.ts +6 -0
  116. package/src/voice/background_audio.ts +6 -3
  117. package/src/voice/generation.ts +115 -10
  118. package/src/voice/index.ts +1 -1
  119. package/src/voice/io.ts +40 -5
  120. package/src/voice/room_io/_output.ts +6 -5
  121. package/src/voice/transcription/synchronizer.test.ts +206 -0
  122. package/src/voice/transcription/synchronizer.ts +202 -17
@@ -0,0 +1,151 @@
1
+ "use strict";
2
+ var import_vitest = require("vitest");
3
+ var import_synchronizer = require("./synchronizer.cjs");
4
+ (0, import_vitest.describe)("SpeakingRateData", () => {
5
+ (0, import_vitest.describe)("constructor", () => {
6
+ (0, import_vitest.it)("should initialize with empty arrays", () => {
7
+ const data = new import_synchronizer.SpeakingRateData();
8
+ (0, import_vitest.expect)(data.timestamps).toEqual([]);
9
+ (0, import_vitest.expect)(data.speakingRate).toEqual([]);
10
+ (0, import_vitest.expect)(data.speakIntegrals).toEqual([]);
11
+ (0, import_vitest.expect)(data.pushedDuration).toBe(0);
12
+ });
13
+ });
14
+ (0, import_vitest.describe)("addByRate", () => {
15
+ (0, import_vitest.it)("should add a single rate entry", () => {
16
+ const data = new import_synchronizer.SpeakingRateData();
17
+ data.addByRate(1, 5);
18
+ (0, import_vitest.expect)(data.timestamps).toEqual([1]);
19
+ (0, import_vitest.expect)(data.speakingRate).toEqual([5]);
20
+ (0, import_vitest.expect)(data.speakIntegrals).toEqual([5]);
21
+ (0, import_vitest.expect)(data.pushedDuration).toBe(1);
22
+ });
23
+ (0, import_vitest.it)("should accumulate integrals across multiple entries", () => {
24
+ const data = new import_synchronizer.SpeakingRateData();
25
+ data.addByRate(1, 4);
26
+ data.addByRate(2, 6);
27
+ data.addByRate(3.5, 2);
28
+ (0, import_vitest.expect)(data.timestamps).toEqual([1, 2, 3.5]);
29
+ (0, import_vitest.expect)(data.speakingRate).toEqual([4, 6, 2]);
30
+ (0, import_vitest.expect)(data.speakIntegrals).toEqual([4, 10, 13]);
31
+ (0, import_vitest.expect)(data.pushedDuration).toBe(3.5);
32
+ });
33
+ (0, import_vitest.it)("should handle zero rate", () => {
34
+ const data = new import_synchronizer.SpeakingRateData();
35
+ data.addByRate(1, 0);
36
+ (0, import_vitest.expect)(data.timestamps).toEqual([1]);
37
+ (0, import_vitest.expect)(data.speakingRate).toEqual([0]);
38
+ (0, import_vitest.expect)(data.speakIntegrals).toEqual([0]);
39
+ });
40
+ });
41
+ (0, import_vitest.describe)("addByAnnotation", () => {
42
+ (0, import_vitest.it)("should buffer text without startTime", () => {
43
+ const data = new import_synchronizer.SpeakingRateData();
44
+ data.addByAnnotation("hello", void 0, void 0);
45
+ (0, import_vitest.expect)(data.timestamps).toEqual([]);
46
+ (0, import_vitest.expect)(data.pushedDuration).toBe(0);
47
+ });
48
+ (0, import_vitest.it)("should add entry when startTime is provided", () => {
49
+ const data = new import_synchronizer.SpeakingRateData();
50
+ data.addByAnnotation("hello", void 0, void 0);
51
+ data.addByAnnotation("world", 1, void 0);
52
+ (0, import_vitest.expect)(data.timestamps).toEqual([1]);
53
+ (0, import_vitest.expect)(data.speakingRate).toEqual([5]);
54
+ (0, import_vitest.expect)(data.speakIntegrals).toEqual([5]);
55
+ });
56
+ (0, import_vitest.it)("should handle startTime and endTime together", () => {
57
+ const data = new import_synchronizer.SpeakingRateData();
58
+ data.addByAnnotation("hello ", 0, 0.5);
59
+ data.addByAnnotation("world", 0.5, 1);
60
+ (0, import_vitest.expect)(data.timestamps.length).toBeGreaterThanOrEqual(2);
61
+ (0, import_vitest.expect)(data.pushedDuration).toBe(1);
62
+ });
63
+ (0, import_vitest.it)("should calculate rate based on buffered text length", () => {
64
+ const data = new import_synchronizer.SpeakingRateData();
65
+ data.addByAnnotation("ab", void 0, void 0);
66
+ data.addByAnnotation("cde", void 0, void 0);
67
+ data.addByAnnotation("", 2, void 0);
68
+ (0, import_vitest.expect)(data.timestamps).toEqual([2]);
69
+ (0, import_vitest.expect)(data.speakingRate).toEqual([2.5]);
70
+ (0, import_vitest.expect)(data.speakIntegrals).toEqual([5]);
71
+ });
72
+ (0, import_vitest.it)("should handle zero time delta gracefully", () => {
73
+ const data = new import_synchronizer.SpeakingRateData();
74
+ data.addByAnnotation("hello", 0, void 0);
75
+ (0, import_vitest.expect)(data.timestamps).toEqual([0]);
76
+ (0, import_vitest.expect)(data.speakingRate).toEqual([0]);
77
+ (0, import_vitest.expect)(data.speakIntegrals).toEqual([0]);
78
+ });
79
+ });
80
+ (0, import_vitest.describe)("accumulateTo", () => {
81
+ (0, import_vitest.it)("should return 0 for empty data", () => {
82
+ const data = new import_synchronizer.SpeakingRateData();
83
+ (0, import_vitest.expect)(data.accumulateTo(1)).toBe(0);
84
+ });
85
+ (0, import_vitest.it)("should return 0 for timestamp before first entry", () => {
86
+ const data = new import_synchronizer.SpeakingRateData();
87
+ data.addByRate(1, 5);
88
+ (0, import_vitest.expect)(data.accumulateTo(0.5)).toBe(0);
89
+ });
90
+ (0, import_vitest.it)("should return exact integral at timestamp", () => {
91
+ const data = new import_synchronizer.SpeakingRateData();
92
+ data.addByRate(1, 4);
93
+ data.addByRate(2, 6);
94
+ (0, import_vitest.expect)(data.accumulateTo(1)).toBe(4);
95
+ (0, import_vitest.expect)(data.accumulateTo(2)).toBe(10);
96
+ });
97
+ (0, import_vitest.it)("should interpolate between timestamps", () => {
98
+ const data = new import_synchronizer.SpeakingRateData();
99
+ data.addByRate(1, 4);
100
+ data.addByRate(2, 6);
101
+ (0, import_vitest.expect)(data.accumulateTo(1.5)).toBe(7);
102
+ });
103
+ (0, import_vitest.it)("should extrapolate beyond last timestamp", () => {
104
+ const data = new import_synchronizer.SpeakingRateData();
105
+ data.addByRate(1, 4);
106
+ data.addByRate(2, 6);
107
+ (0, import_vitest.expect)(data.accumulateTo(3)).toBe(16);
108
+ });
109
+ (0, import_vitest.it)("should not exceed next integral when interpolating", () => {
110
+ const data = new import_synchronizer.SpeakingRateData();
111
+ data.addByRate(1, 100);
112
+ data.addByRate(2, 1);
113
+ (0, import_vitest.expect)(data.accumulateTo(1.5)).toBe(100.5);
114
+ });
115
+ });
116
+ (0, import_vitest.describe)("pushedDuration", () => {
117
+ (0, import_vitest.it)("should return 0 when empty", () => {
118
+ const data = new import_synchronizer.SpeakingRateData();
119
+ (0, import_vitest.expect)(data.pushedDuration).toBe(0);
120
+ });
121
+ (0, import_vitest.it)("should return last timestamp", () => {
122
+ const data = new import_synchronizer.SpeakingRateData();
123
+ data.addByRate(1, 5);
124
+ data.addByRate(2.5, 3);
125
+ data.addByRate(4, 7);
126
+ (0, import_vitest.expect)(data.pushedDuration).toBe(4);
127
+ });
128
+ });
129
+ (0, import_vitest.describe)("integration scenarios", () => {
130
+ (0, import_vitest.it)("should handle typical TTS word timing scenario", () => {
131
+ const data = new import_synchronizer.SpeakingRateData();
132
+ data.addByAnnotation("Hello ", 0, 0.3);
133
+ data.addByAnnotation("world", 0.3, 0.6);
134
+ (0, import_vitest.expect)(data.pushedDuration).toBe(0.6);
135
+ const mid1 = data.accumulateTo(0.15);
136
+ (0, import_vitest.expect)(mid1).toBeGreaterThan(0);
137
+ (0, import_vitest.expect)(mid1).toBeLessThan(6);
138
+ const mid2 = data.accumulateTo(0.45);
139
+ (0, import_vitest.expect)(mid2).toBeGreaterThan(6);
140
+ });
141
+ (0, import_vitest.it)("should handle mixed rate and annotation data", () => {
142
+ const data = new import_synchronizer.SpeakingRateData();
143
+ data.addByRate(0.5, 4);
144
+ data.addByAnnotation("test", void 0, void 0);
145
+ data.addByAnnotation("", 1, void 0);
146
+ (0, import_vitest.expect)(data.timestamps).toEqual([0.5, 1]);
147
+ (0, import_vitest.expect)(data.speakIntegrals).toEqual([2, 6]);
148
+ });
149
+ });
150
+ });
151
+ //# sourceMappingURL=synchronizer.test.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../../src/voice/transcription/synchronizer.test.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { describe, expect, it } from 'vitest';\nimport { SpeakingRateData } from './synchronizer.js';\n\ndescribe('SpeakingRateData', () => {\n describe('constructor', () => {\n it('should initialize with empty arrays', () => {\n const data = new SpeakingRateData();\n expect(data.timestamps).toEqual([]);\n expect(data.speakingRate).toEqual([]);\n expect(data.speakIntegrals).toEqual([]);\n expect(data.pushedDuration).toBe(0);\n });\n });\n\n describe('addByRate', () => {\n it('should add a single rate entry', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 5.0);\n\n expect(data.timestamps).toEqual([1.0]);\n expect(data.speakingRate).toEqual([5.0]);\n // integral = 0 + 5.0 * (1.0 - 0) = 5.0\n expect(data.speakIntegrals).toEqual([5.0]);\n expect(data.pushedDuration).toBe(1.0);\n });\n\n it('should accumulate integrals across multiple entries', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 4.0); // integral = 0 + 4.0 * 1.0 = 4.0\n data.addByRate(2.0, 6.0); // integral = 4.0 + 6.0 * 1.0 = 10.0\n data.addByRate(3.5, 2.0); // integral = 10.0 + 2.0 * 1.5 = 13.0\n\n expect(data.timestamps).toEqual([1.0, 2.0, 3.5]);\n expect(data.speakingRate).toEqual([4.0, 6.0, 2.0]);\n expect(data.speakIntegrals).toEqual([4.0, 10.0, 13.0]);\n expect(data.pushedDuration).toBe(3.5);\n });\n\n it('should handle zero rate', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 0.0);\n\n expect(data.timestamps).toEqual([1.0]);\n expect(data.speakingRate).toEqual([0.0]);\n expect(data.speakIntegrals).toEqual([0.0]);\n });\n });\n\n describe('addByAnnotation', () => {\n it('should buffer text without startTime', () => {\n const data = new SpeakingRateData();\n data.addByAnnotation('hello', undefined, undefined);\n\n // Text is buffered, no timestamp entry yet\n expect(data.timestamps).toEqual([]);\n expect(data.pushedDuration).toBe(0);\n });\n\n it('should add entry when startTime is provided', () => {\n const data = new SpeakingRateData();\n data.addByAnnotation('hello', undefined, undefined); // buffer \"hello\"\n data.addByAnnotation('world', 1.0, undefined); // flush with startTime\n\n expect(data.timestamps).toEqual([1.0]);\n // textLen = 5 (hello), dt = 1.0, rate = 5/1 = 5.0\n expect(data.speakingRate).toEqual([5.0]);\n expect(data.speakIntegrals).toEqual([5.0]);\n });\n\n it('should handle startTime and endTime together', () => {\n const data = new SpeakingRateData();\n data.addByAnnotation('hello ', 0.0, 0.5);\n data.addByAnnotation('world', 0.5, 1.0);\n\n // First annotation: startTime=0.0, text=\"hello \", then recursively calls with endTime=0.5\n // Second annotation: startTime=0.5, text=\"world\", then recursively calls with endTime=1.0\n expect(data.timestamps.length).toBeGreaterThanOrEqual(2);\n expect(data.pushedDuration).toBe(1.0);\n });\n\n it('should calculate rate based on buffered text length', () => {\n const data = new SpeakingRateData();\n data.addByAnnotation('ab', undefined, undefined); // buffer 2 chars\n data.addByAnnotation('cde', undefined, undefined); // buffer 3 more chars\n data.addByAnnotation('', 2.0, undefined); // flush: textLen=5, dt=2.0, rate=2.5\n\n expect(data.timestamps).toEqual([2.0]);\n expect(data.speakingRate).toEqual([2.5]);\n expect(data.speakIntegrals).toEqual([5.0]);\n });\n\n it('should handle zero time delta gracefully', () => {\n const data = new SpeakingRateData();\n data.addByAnnotation('hello', 0.0, undefined); // dt=0, rate should be 0\n\n expect(data.timestamps).toEqual([0.0]);\n expect(data.speakingRate).toEqual([0.0]);\n expect(data.speakIntegrals).toEqual([0.0]);\n });\n });\n\n describe('accumulateTo', () => {\n it('should return 0 for empty data', () => {\n const data = new SpeakingRateData();\n expect(data.accumulateTo(1.0)).toBe(0);\n });\n\n it('should return 0 for timestamp before first entry', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 5.0);\n expect(data.accumulateTo(0.5)).toBe(0);\n });\n\n it('should return exact integral at timestamp', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 4.0); // integral = 4.0\n data.addByRate(2.0, 6.0); // integral = 10.0\n\n expect(data.accumulateTo(1.0)).toBe(4.0);\n expect(data.accumulateTo(2.0)).toBe(10.0);\n });\n\n it('should interpolate between timestamps', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 4.0); // integral = 4.0\n data.addByRate(2.0, 6.0); // integral = 10.0\n\n // At 1.5: integral = 4.0 + 6.0 * 0.5 = 7.0\n expect(data.accumulateTo(1.5)).toBe(7.0);\n });\n\n it('should extrapolate beyond last timestamp', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 4.0); // integral = 4.0\n data.addByRate(2.0, 6.0); // integral = 10.0\n\n // At 3.0: integral = 10.0 + 6.0 * 1.0 = 16.0\n expect(data.accumulateTo(3.0)).toBe(16.0);\n });\n\n it('should not exceed next integral when interpolating', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 100.0); // integral = 100.0 (very high rate)\n data.addByRate(2.0, 1.0); // integral = 101.0\n\n // At 1.5 with rate 1.0: would be 100.0 + 1.0 * 0.5 = 100.5\n // But capped at next integral 101.0, so result is min(100.5, 101.0) = 100.5\n expect(data.accumulateTo(1.5)).toBe(100.5);\n });\n });\n\n describe('pushedDuration', () => {\n it('should return 0 when empty', () => {\n const data = new SpeakingRateData();\n expect(data.pushedDuration).toBe(0);\n });\n\n it('should return last timestamp', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 5.0);\n data.addByRate(2.5, 3.0);\n data.addByRate(4.0, 7.0);\n\n expect(data.pushedDuration).toBe(4.0);\n });\n });\n\n describe('integration scenarios', () => {\n it('should handle typical TTS word timing scenario', () => {\n const data = new SpeakingRateData();\n\n // Simulating words with timing: \"Hello \" at 0-0.3s, \"world\" at 0.3-0.6s\n data.addByAnnotation('Hello ', 0.0, 0.3);\n data.addByAnnotation('world', 0.3, 0.6);\n\n // Should have accumulated text lengths at each timestamp\n expect(data.pushedDuration).toBe(0.6);\n\n // At 0.15s (middle of first word), should be partway through\n const mid1 = data.accumulateTo(0.15);\n expect(mid1).toBeGreaterThan(0);\n expect(mid1).toBeLessThan(6); // \"Hello \" is 6 chars\n\n // At 0.45s (middle of second word), should be past first word\n const mid2 = data.accumulateTo(0.45);\n expect(mid2).toBeGreaterThan(6);\n });\n\n it('should handle mixed rate and annotation data', () => {\n const data = new SpeakingRateData();\n\n // Start with rate-based data\n data.addByRate(0.5, 4.0); // integral = 2.0\n\n // Then add annotation\n data.addByAnnotation('test', undefined, undefined);\n data.addByAnnotation('', 1.0, undefined); // textLen=4, dt=0.5, rate=8.0, integral = 2.0 + 4.0 = 6.0\n\n expect(data.timestamps).toEqual([0.5, 1.0]);\n expect(data.speakIntegrals).toEqual([2.0, 6.0]);\n });\n });\n});\n"],"mappings":";AAGA,oBAAqC;AACrC,0BAAiC;AAAA,IAEjC,wBAAS,oBAAoB,MAAM;AACjC,8BAAS,eAAe,MAAM;AAC5B,0BAAG,uCAAuC,MAAM;AAC9C,YAAM,OAAO,IAAI,qCAAiB;AAClC,gCAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,CAAC;AAClC,gCAAO,KAAK,YAAY,EAAE,QAAQ,CAAC,CAAC;AACpC,gCAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,CAAC;AACtC,gCAAO,KAAK,cAAc,EAAE,KAAK,CAAC;AAAA,IACpC,CAAC;AAAA,EACH,CAAC;AAED,8BAAS,aAAa,MAAM;AAC1B,0BAAG,kCAAkC,MAAM;AACzC,YAAM,OAAO,IAAI,qCAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AAEvB,gCAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,CAAG,CAAC;AACrC,gCAAO,KAAK,YAAY,EAAE,QAAQ,CAAC,CAAG,CAAC;AAEvC,gCAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,CAAG,CAAC;AACzC,gCAAO,KAAK,cAAc,EAAE,KAAK,CAAG;AAAA,IACtC,CAAC;AAED,0BAAG,uDAAuD,MAAM;AAC9D,YAAM,OAAO,IAAI,qCAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AACvB,WAAK,UAAU,GAAK,CAAG;AACvB,WAAK,UAAU,KAAK,CAAG;AAEvB,gCAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,GAAK,GAAK,GAAG,CAAC;AAC/C,gCAAO,KAAK,YAAY,EAAE,QAAQ,CAAC,GAAK,GAAK,CAAG,CAAC;AACjD,gCAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,GAAK,IAAM,EAAI,CAAC;AACrD,gCAAO,KAAK,cAAc,EAAE,KAAK,GAAG;AAAA,IACtC,CAAC;AAED,0BAAG,2BAA2B,MAAM;AAClC,YAAM,OAAO,IAAI,qCAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AAEvB,gCAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,CAAG,CAAC;AACrC,gCAAO,KAAK,YAAY,EAAE,QAAQ,CAAC,CAAG,CAAC;AACvC,gCAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,CAAG,CAAC;AAAA,IAC3C,CAAC;AAAA,EACH,CAAC;AAED,8BAAS,mBAAmB,MAAM;AAChC,0BAAG,wCAAwC,MAAM;AAC/C,YAAM,OAAO,IAAI,qCAAiB;AAClC,WAAK,gBAAgB,SAAS,QAAW,MAAS;AAGlD,gCAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,CAAC;AAClC,gCAAO,KAAK,cAAc,EAAE,KAAK,CAAC;AAAA,IACpC,CAAC;AAED,0BAAG,+CAA+C,MAAM;AACtD,YAAM,OAAO,IAAI,qCAAiB;AAClC,WAAK,gBAAgB,SAAS,QAAW,MAAS;AAClD,WAAK,gBAAgB,SAAS,GAAK,MAAS;AAE5C,gCAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,CAAG,CAAC;AAErC,gCAAO,KAAK,YAAY,EAAE,QAAQ,CAAC,CAAG,CAAC;AACvC,gCAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,CAAG,CAAC;AAAA,IAC3C,CAAC;AAED,0BAAG,gDAAgD,MAAM;AACvD,YAAM,OAAO,IAAI,qCAAiB;AAClC,WAAK,gBAAgB,UAAU,GAAK,GAAG;AACvC,WAAK,gBAAgB,SAAS,KAAK,CAAG;AAItC,gCAAO,KAAK,WAAW,MAAM,EAAE,uBAAuB,CAAC;AACvD,gCAAO,KAAK,cAAc,EAAE,KAAK,CAAG;AAAA,IACtC,CAAC;AAED,0BAAG,uDAAuD,MAAM;AAC9D,YAAM,OAAO,IAAI,qCAAiB;AAClC,WAAK,gBAAgB,MAAM,QAAW,MAAS;AAC/C,WAAK,gBAAgB,OAAO,QAAW,MAAS;AAChD,WAAK,gBAAgB,IAAI,GAAK,MAAS;AAEvC,gCAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,CAAG,CAAC;AACrC,gCAAO,KAAK,YAAY,EAAE,QAAQ,CAAC,GAAG,CAAC;AACvC,gCAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,CAAG,CAAC;AAAA,IAC3C,CAAC;AAED,0BAAG,4CAA4C,MAAM;AACnD,YAAM,OAAO,IAAI,qCAAiB;AAClC,WAAK,gBAAgB,SAAS,GAAK,MAAS;AAE5C,gCAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,CAAG,CAAC;AACrC,gCAAO,KAAK,YAAY,EAAE,QAAQ,CAAC,CAAG,CAAC;AACvC,gCAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,CAAG,CAAC;AAAA,IAC3C,CAAC;AAAA,EACH,CAAC;AAED,8BAAS,gBAAgB,MAAM;AAC7B,0BAAG,kCAAkC,MAAM;AACzC,YAAM,OAAO,IAAI,qCAAiB;AAClC,gCAAO,KAAK,aAAa,CAAG,CAAC,EAAE,KAAK,CAAC;AAAA,IACvC,CAAC;AAED,0BAAG,oDAAoD,MAAM;AAC3D,YAAM,OAAO,IAAI,qCAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AACvB,gCAAO,KAAK,aAAa,GAAG,CAAC,EAAE,KAAK,CAAC;AAAA,IACvC,CAAC;AAED,0BAAG,6CAA6C,MAAM;AACpD,YAAM,OAAO,IAAI,qCAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AACvB,WAAK,UAAU,GAAK,CAAG;AAEvB,gCAAO,KAAK,aAAa,CAAG,CAAC,EAAE,KAAK,CAAG;AACvC,gCAAO,KAAK,aAAa,CAAG,CAAC,EAAE,KAAK,EAAI;AAAA,IAC1C,CAAC;AAED,0BAAG,yCAAyC,MAAM;AAChD,YAAM,OAAO,IAAI,qCAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AACvB,WAAK,UAAU,GAAK,CAAG;AAGvB,gCAAO,KAAK,aAAa,GAAG,CAAC,EAAE,KAAK,CAAG;AAAA,IACzC,CAAC;AAED,0BAAG,4CAA4C,MAAM;AACnD,YAAM,OAAO,IAAI,qCAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AACvB,WAAK,UAAU,GAAK,CAAG;AAGvB,gCAAO,KAAK,aAAa,CAAG,CAAC,EAAE,KAAK,EAAI;AAAA,IAC1C,CAAC;AAED,0BAAG,sDAAsD,MAAM;AAC7D,YAAM,OAAO,IAAI,qCAAiB;AAClC,WAAK,UAAU,GAAK,GAAK;AACzB,WAAK,UAAU,GAAK,CAAG;AAIvB,gCAAO,KAAK,aAAa,GAAG,CAAC,EAAE,KAAK,KAAK;AAAA,IAC3C,CAAC;AAAA,EACH,CAAC;AAED,8BAAS,kBAAkB,MAAM;AAC/B,0BAAG,8BAA8B,MAAM;AACrC,YAAM,OAAO,IAAI,qCAAiB;AAClC,gCAAO,KAAK,cAAc,EAAE,KAAK,CAAC;AAAA,IACpC,CAAC;AAED,0BAAG,gCAAgC,MAAM;AACvC,YAAM,OAAO,IAAI,qCAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AACvB,WAAK,UAAU,KAAK,CAAG;AACvB,WAAK,UAAU,GAAK,CAAG;AAEvB,gCAAO,KAAK,cAAc,EAAE,KAAK,CAAG;AAAA,IACtC,CAAC;AAAA,EACH,CAAC;AAED,8BAAS,yBAAyB,MAAM;AACtC,0BAAG,kDAAkD,MAAM;AACzD,YAAM,OAAO,IAAI,qCAAiB;AAGlC,WAAK,gBAAgB,UAAU,GAAK,GAAG;AACvC,WAAK,gBAAgB,SAAS,KAAK,GAAG;AAGtC,gCAAO,KAAK,cAAc,EAAE,KAAK,GAAG;AAGpC,YAAM,OAAO,KAAK,aAAa,IAAI;AACnC,gCAAO,IAAI,EAAE,gBAAgB,CAAC;AAC9B,gCAAO,IAAI,EAAE,aAAa,CAAC;AAG3B,YAAM,OAAO,KAAK,aAAa,IAAI;AACnC,gCAAO,IAAI,EAAE,gBAAgB,CAAC;AAAA,IAChC,CAAC;AAED,0BAAG,gDAAgD,MAAM;AACvD,YAAM,OAAO,IAAI,qCAAiB;AAGlC,WAAK,UAAU,KAAK,CAAG;AAGvB,WAAK,gBAAgB,QAAQ,QAAW,MAAS;AACjD,WAAK,gBAAgB,IAAI,GAAK,MAAS;AAEvC,gCAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,KAAK,CAAG,CAAC;AAC1C,gCAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,GAAK,CAAG,CAAC;AAAA,IAChD,CAAC;AAAA,EACH,CAAC;AACH,CAAC;","names":[]}
@@ -0,0 +1,150 @@
1
+ import { describe, expect, it } from "vitest";
2
+ import { SpeakingRateData } from "./synchronizer.js";
3
+ describe("SpeakingRateData", () => {
4
+ describe("constructor", () => {
5
+ it("should initialize with empty arrays", () => {
6
+ const data = new SpeakingRateData();
7
+ expect(data.timestamps).toEqual([]);
8
+ expect(data.speakingRate).toEqual([]);
9
+ expect(data.speakIntegrals).toEqual([]);
10
+ expect(data.pushedDuration).toBe(0);
11
+ });
12
+ });
13
+ describe("addByRate", () => {
14
+ it("should add a single rate entry", () => {
15
+ const data = new SpeakingRateData();
16
+ data.addByRate(1, 5);
17
+ expect(data.timestamps).toEqual([1]);
18
+ expect(data.speakingRate).toEqual([5]);
19
+ expect(data.speakIntegrals).toEqual([5]);
20
+ expect(data.pushedDuration).toBe(1);
21
+ });
22
+ it("should accumulate integrals across multiple entries", () => {
23
+ const data = new SpeakingRateData();
24
+ data.addByRate(1, 4);
25
+ data.addByRate(2, 6);
26
+ data.addByRate(3.5, 2);
27
+ expect(data.timestamps).toEqual([1, 2, 3.5]);
28
+ expect(data.speakingRate).toEqual([4, 6, 2]);
29
+ expect(data.speakIntegrals).toEqual([4, 10, 13]);
30
+ expect(data.pushedDuration).toBe(3.5);
31
+ });
32
+ it("should handle zero rate", () => {
33
+ const data = new SpeakingRateData();
34
+ data.addByRate(1, 0);
35
+ expect(data.timestamps).toEqual([1]);
36
+ expect(data.speakingRate).toEqual([0]);
37
+ expect(data.speakIntegrals).toEqual([0]);
38
+ });
39
+ });
40
+ describe("addByAnnotation", () => {
41
+ it("should buffer text without startTime", () => {
42
+ const data = new SpeakingRateData();
43
+ data.addByAnnotation("hello", void 0, void 0);
44
+ expect(data.timestamps).toEqual([]);
45
+ expect(data.pushedDuration).toBe(0);
46
+ });
47
+ it("should add entry when startTime is provided", () => {
48
+ const data = new SpeakingRateData();
49
+ data.addByAnnotation("hello", void 0, void 0);
50
+ data.addByAnnotation("world", 1, void 0);
51
+ expect(data.timestamps).toEqual([1]);
52
+ expect(data.speakingRate).toEqual([5]);
53
+ expect(data.speakIntegrals).toEqual([5]);
54
+ });
55
+ it("should handle startTime and endTime together", () => {
56
+ const data = new SpeakingRateData();
57
+ data.addByAnnotation("hello ", 0, 0.5);
58
+ data.addByAnnotation("world", 0.5, 1);
59
+ expect(data.timestamps.length).toBeGreaterThanOrEqual(2);
60
+ expect(data.pushedDuration).toBe(1);
61
+ });
62
+ it("should calculate rate based on buffered text length", () => {
63
+ const data = new SpeakingRateData();
64
+ data.addByAnnotation("ab", void 0, void 0);
65
+ data.addByAnnotation("cde", void 0, void 0);
66
+ data.addByAnnotation("", 2, void 0);
67
+ expect(data.timestamps).toEqual([2]);
68
+ expect(data.speakingRate).toEqual([2.5]);
69
+ expect(data.speakIntegrals).toEqual([5]);
70
+ });
71
+ it("should handle zero time delta gracefully", () => {
72
+ const data = new SpeakingRateData();
73
+ data.addByAnnotation("hello", 0, void 0);
74
+ expect(data.timestamps).toEqual([0]);
75
+ expect(data.speakingRate).toEqual([0]);
76
+ expect(data.speakIntegrals).toEqual([0]);
77
+ });
78
+ });
79
+ describe("accumulateTo", () => {
80
+ it("should return 0 for empty data", () => {
81
+ const data = new SpeakingRateData();
82
+ expect(data.accumulateTo(1)).toBe(0);
83
+ });
84
+ it("should return 0 for timestamp before first entry", () => {
85
+ const data = new SpeakingRateData();
86
+ data.addByRate(1, 5);
87
+ expect(data.accumulateTo(0.5)).toBe(0);
88
+ });
89
+ it("should return exact integral at timestamp", () => {
90
+ const data = new SpeakingRateData();
91
+ data.addByRate(1, 4);
92
+ data.addByRate(2, 6);
93
+ expect(data.accumulateTo(1)).toBe(4);
94
+ expect(data.accumulateTo(2)).toBe(10);
95
+ });
96
+ it("should interpolate between timestamps", () => {
97
+ const data = new SpeakingRateData();
98
+ data.addByRate(1, 4);
99
+ data.addByRate(2, 6);
100
+ expect(data.accumulateTo(1.5)).toBe(7);
101
+ });
102
+ it("should extrapolate beyond last timestamp", () => {
103
+ const data = new SpeakingRateData();
104
+ data.addByRate(1, 4);
105
+ data.addByRate(2, 6);
106
+ expect(data.accumulateTo(3)).toBe(16);
107
+ });
108
+ it("should not exceed next integral when interpolating", () => {
109
+ const data = new SpeakingRateData();
110
+ data.addByRate(1, 100);
111
+ data.addByRate(2, 1);
112
+ expect(data.accumulateTo(1.5)).toBe(100.5);
113
+ });
114
+ });
115
+ describe("pushedDuration", () => {
116
+ it("should return 0 when empty", () => {
117
+ const data = new SpeakingRateData();
118
+ expect(data.pushedDuration).toBe(0);
119
+ });
120
+ it("should return last timestamp", () => {
121
+ const data = new SpeakingRateData();
122
+ data.addByRate(1, 5);
123
+ data.addByRate(2.5, 3);
124
+ data.addByRate(4, 7);
125
+ expect(data.pushedDuration).toBe(4);
126
+ });
127
+ });
128
+ describe("integration scenarios", () => {
129
+ it("should handle typical TTS word timing scenario", () => {
130
+ const data = new SpeakingRateData();
131
+ data.addByAnnotation("Hello ", 0, 0.3);
132
+ data.addByAnnotation("world", 0.3, 0.6);
133
+ expect(data.pushedDuration).toBe(0.6);
134
+ const mid1 = data.accumulateTo(0.15);
135
+ expect(mid1).toBeGreaterThan(0);
136
+ expect(mid1).toBeLessThan(6);
137
+ const mid2 = data.accumulateTo(0.45);
138
+ expect(mid2).toBeGreaterThan(6);
139
+ });
140
+ it("should handle mixed rate and annotation data", () => {
141
+ const data = new SpeakingRateData();
142
+ data.addByRate(0.5, 4);
143
+ data.addByAnnotation("test", void 0, void 0);
144
+ data.addByAnnotation("", 1, void 0);
145
+ expect(data.timestamps).toEqual([0.5, 1]);
146
+ expect(data.speakIntegrals).toEqual([2, 6]);
147
+ });
148
+ });
149
+ });
150
+ //# sourceMappingURL=synchronizer.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../../src/voice/transcription/synchronizer.test.ts"],"sourcesContent":["// SPDX-FileCopyrightText: 2024 LiveKit, Inc.\n//\n// SPDX-License-Identifier: Apache-2.0\nimport { describe, expect, it } from 'vitest';\nimport { SpeakingRateData } from './synchronizer.js';\n\ndescribe('SpeakingRateData', () => {\n describe('constructor', () => {\n it('should initialize with empty arrays', () => {\n const data = new SpeakingRateData();\n expect(data.timestamps).toEqual([]);\n expect(data.speakingRate).toEqual([]);\n expect(data.speakIntegrals).toEqual([]);\n expect(data.pushedDuration).toBe(0);\n });\n });\n\n describe('addByRate', () => {\n it('should add a single rate entry', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 5.0);\n\n expect(data.timestamps).toEqual([1.0]);\n expect(data.speakingRate).toEqual([5.0]);\n // integral = 0 + 5.0 * (1.0 - 0) = 5.0\n expect(data.speakIntegrals).toEqual([5.0]);\n expect(data.pushedDuration).toBe(1.0);\n });\n\n it('should accumulate integrals across multiple entries', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 4.0); // integral = 0 + 4.0 * 1.0 = 4.0\n data.addByRate(2.0, 6.0); // integral = 4.0 + 6.0 * 1.0 = 10.0\n data.addByRate(3.5, 2.0); // integral = 10.0 + 2.0 * 1.5 = 13.0\n\n expect(data.timestamps).toEqual([1.0, 2.0, 3.5]);\n expect(data.speakingRate).toEqual([4.0, 6.0, 2.0]);\n expect(data.speakIntegrals).toEqual([4.0, 10.0, 13.0]);\n expect(data.pushedDuration).toBe(3.5);\n });\n\n it('should handle zero rate', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 0.0);\n\n expect(data.timestamps).toEqual([1.0]);\n expect(data.speakingRate).toEqual([0.0]);\n expect(data.speakIntegrals).toEqual([0.0]);\n });\n });\n\n describe('addByAnnotation', () => {\n it('should buffer text without startTime', () => {\n const data = new SpeakingRateData();\n data.addByAnnotation('hello', undefined, undefined);\n\n // Text is buffered, no timestamp entry yet\n expect(data.timestamps).toEqual([]);\n expect(data.pushedDuration).toBe(0);\n });\n\n it('should add entry when startTime is provided', () => {\n const data = new SpeakingRateData();\n data.addByAnnotation('hello', undefined, undefined); // buffer \"hello\"\n data.addByAnnotation('world', 1.0, undefined); // flush with startTime\n\n expect(data.timestamps).toEqual([1.0]);\n // textLen = 5 (hello), dt = 1.0, rate = 5/1 = 5.0\n expect(data.speakingRate).toEqual([5.0]);\n expect(data.speakIntegrals).toEqual([5.0]);\n });\n\n it('should handle startTime and endTime together', () => {\n const data = new SpeakingRateData();\n data.addByAnnotation('hello ', 0.0, 0.5);\n data.addByAnnotation('world', 0.5, 1.0);\n\n // First annotation: startTime=0.0, text=\"hello \", then recursively calls with endTime=0.5\n // Second annotation: startTime=0.5, text=\"world\", then recursively calls with endTime=1.0\n expect(data.timestamps.length).toBeGreaterThanOrEqual(2);\n expect(data.pushedDuration).toBe(1.0);\n });\n\n it('should calculate rate based on buffered text length', () => {\n const data = new SpeakingRateData();\n data.addByAnnotation('ab', undefined, undefined); // buffer 2 chars\n data.addByAnnotation('cde', undefined, undefined); // buffer 3 more chars\n data.addByAnnotation('', 2.0, undefined); // flush: textLen=5, dt=2.0, rate=2.5\n\n expect(data.timestamps).toEqual([2.0]);\n expect(data.speakingRate).toEqual([2.5]);\n expect(data.speakIntegrals).toEqual([5.0]);\n });\n\n it('should handle zero time delta gracefully', () => {\n const data = new SpeakingRateData();\n data.addByAnnotation('hello', 0.0, undefined); // dt=0, rate should be 0\n\n expect(data.timestamps).toEqual([0.0]);\n expect(data.speakingRate).toEqual([0.0]);\n expect(data.speakIntegrals).toEqual([0.0]);\n });\n });\n\n describe('accumulateTo', () => {\n it('should return 0 for empty data', () => {\n const data = new SpeakingRateData();\n expect(data.accumulateTo(1.0)).toBe(0);\n });\n\n it('should return 0 for timestamp before first entry', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 5.0);\n expect(data.accumulateTo(0.5)).toBe(0);\n });\n\n it('should return exact integral at timestamp', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 4.0); // integral = 4.0\n data.addByRate(2.0, 6.0); // integral = 10.0\n\n expect(data.accumulateTo(1.0)).toBe(4.0);\n expect(data.accumulateTo(2.0)).toBe(10.0);\n });\n\n it('should interpolate between timestamps', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 4.0); // integral = 4.0\n data.addByRate(2.0, 6.0); // integral = 10.0\n\n // At 1.5: integral = 4.0 + 6.0 * 0.5 = 7.0\n expect(data.accumulateTo(1.5)).toBe(7.0);\n });\n\n it('should extrapolate beyond last timestamp', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 4.0); // integral = 4.0\n data.addByRate(2.0, 6.0); // integral = 10.0\n\n // At 3.0: integral = 10.0 + 6.0 * 1.0 = 16.0\n expect(data.accumulateTo(3.0)).toBe(16.0);\n });\n\n it('should not exceed next integral when interpolating', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 100.0); // integral = 100.0 (very high rate)\n data.addByRate(2.0, 1.0); // integral = 101.0\n\n // At 1.5 with rate 1.0: would be 100.0 + 1.0 * 0.5 = 100.5\n // But capped at next integral 101.0, so result is min(100.5, 101.0) = 100.5\n expect(data.accumulateTo(1.5)).toBe(100.5);\n });\n });\n\n describe('pushedDuration', () => {\n it('should return 0 when empty', () => {\n const data = new SpeakingRateData();\n expect(data.pushedDuration).toBe(0);\n });\n\n it('should return last timestamp', () => {\n const data = new SpeakingRateData();\n data.addByRate(1.0, 5.0);\n data.addByRate(2.5, 3.0);\n data.addByRate(4.0, 7.0);\n\n expect(data.pushedDuration).toBe(4.0);\n });\n });\n\n describe('integration scenarios', () => {\n it('should handle typical TTS word timing scenario', () => {\n const data = new SpeakingRateData();\n\n // Simulating words with timing: \"Hello \" at 0-0.3s, \"world\" at 0.3-0.6s\n data.addByAnnotation('Hello ', 0.0, 0.3);\n data.addByAnnotation('world', 0.3, 0.6);\n\n // Should have accumulated text lengths at each timestamp\n expect(data.pushedDuration).toBe(0.6);\n\n // At 0.15s (middle of first word), should be partway through\n const mid1 = data.accumulateTo(0.15);\n expect(mid1).toBeGreaterThan(0);\n expect(mid1).toBeLessThan(6); // \"Hello \" is 6 chars\n\n // At 0.45s (middle of second word), should be past first word\n const mid2 = data.accumulateTo(0.45);\n expect(mid2).toBeGreaterThan(6);\n });\n\n it('should handle mixed rate and annotation data', () => {\n const data = new SpeakingRateData();\n\n // Start with rate-based data\n data.addByRate(0.5, 4.0); // integral = 2.0\n\n // Then add annotation\n data.addByAnnotation('test', undefined, undefined);\n data.addByAnnotation('', 1.0, undefined); // textLen=4, dt=0.5, rate=8.0, integral = 2.0 + 4.0 = 6.0\n\n expect(data.timestamps).toEqual([0.5, 1.0]);\n expect(data.speakIntegrals).toEqual([2.0, 6.0]);\n });\n });\n});\n"],"mappings":"AAGA,SAAS,UAAU,QAAQ,UAAU;AACrC,SAAS,wBAAwB;AAEjC,SAAS,oBAAoB,MAAM;AACjC,WAAS,eAAe,MAAM;AAC5B,OAAG,uCAAuC,MAAM;AAC9C,YAAM,OAAO,IAAI,iBAAiB;AAClC,aAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,CAAC;AAClC,aAAO,KAAK,YAAY,EAAE,QAAQ,CAAC,CAAC;AACpC,aAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,CAAC;AACtC,aAAO,KAAK,cAAc,EAAE,KAAK,CAAC;AAAA,IACpC,CAAC;AAAA,EACH,CAAC;AAED,WAAS,aAAa,MAAM;AAC1B,OAAG,kCAAkC,MAAM;AACzC,YAAM,OAAO,IAAI,iBAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AAEvB,aAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,CAAG,CAAC;AACrC,aAAO,KAAK,YAAY,EAAE,QAAQ,CAAC,CAAG,CAAC;AAEvC,aAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,CAAG,CAAC;AACzC,aAAO,KAAK,cAAc,EAAE,KAAK,CAAG;AAAA,IACtC,CAAC;AAED,OAAG,uDAAuD,MAAM;AAC9D,YAAM,OAAO,IAAI,iBAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AACvB,WAAK,UAAU,GAAK,CAAG;AACvB,WAAK,UAAU,KAAK,CAAG;AAEvB,aAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,GAAK,GAAK,GAAG,CAAC;AAC/C,aAAO,KAAK,YAAY,EAAE,QAAQ,CAAC,GAAK,GAAK,CAAG,CAAC;AACjD,aAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,GAAK,IAAM,EAAI,CAAC;AACrD,aAAO,KAAK,cAAc,EAAE,KAAK,GAAG;AAAA,IACtC,CAAC;AAED,OAAG,2BAA2B,MAAM;AAClC,YAAM,OAAO,IAAI,iBAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AAEvB,aAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,CAAG,CAAC;AACrC,aAAO,KAAK,YAAY,EAAE,QAAQ,CAAC,CAAG,CAAC;AACvC,aAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,CAAG,CAAC;AAAA,IAC3C,CAAC;AAAA,EACH,CAAC;AAED,WAAS,mBAAmB,MAAM;AAChC,OAAG,wCAAwC,MAAM;AAC/C,YAAM,OAAO,IAAI,iBAAiB;AAClC,WAAK,gBAAgB,SAAS,QAAW,MAAS;AAGlD,aAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,CAAC;AAClC,aAAO,KAAK,cAAc,EAAE,KAAK,CAAC;AAAA,IACpC,CAAC;AAED,OAAG,+CAA+C,MAAM;AACtD,YAAM,OAAO,IAAI,iBAAiB;AAClC,WAAK,gBAAgB,SAAS,QAAW,MAAS;AAClD,WAAK,gBAAgB,SAAS,GAAK,MAAS;AAE5C,aAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,CAAG,CAAC;AAErC,aAAO,KAAK,YAAY,EAAE,QAAQ,CAAC,CAAG,CAAC;AACvC,aAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,CAAG,CAAC;AAAA,IAC3C,CAAC;AAED,OAAG,gDAAgD,MAAM;AACvD,YAAM,OAAO,IAAI,iBAAiB;AAClC,WAAK,gBAAgB,UAAU,GAAK,GAAG;AACvC,WAAK,gBAAgB,SAAS,KAAK,CAAG;AAItC,aAAO,KAAK,WAAW,MAAM,EAAE,uBAAuB,CAAC;AACvD,aAAO,KAAK,cAAc,EAAE,KAAK,CAAG;AAAA,IACtC,CAAC;AAED,OAAG,uDAAuD,MAAM;AAC9D,YAAM,OAAO,IAAI,iBAAiB;AAClC,WAAK,gBAAgB,MAAM,QAAW,MAAS;AAC/C,WAAK,gBAAgB,OAAO,QAAW,MAAS;AAChD,WAAK,gBAAgB,IAAI,GAAK,MAAS;AAEvC,aAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,CAAG,CAAC;AACrC,aAAO,KAAK,YAAY,EAAE,QAAQ,CAAC,GAAG,CAAC;AACvC,aAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,CAAG,CAAC;AAAA,IAC3C,CAAC;AAED,OAAG,4CAA4C,MAAM;AACnD,YAAM,OAAO,IAAI,iBAAiB;AAClC,WAAK,gBAAgB,SAAS,GAAK,MAAS;AAE5C,aAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,CAAG,CAAC;AACrC,aAAO,KAAK,YAAY,EAAE,QAAQ,CAAC,CAAG,CAAC;AACvC,aAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,CAAG,CAAC;AAAA,IAC3C,CAAC;AAAA,EACH,CAAC;AAED,WAAS,gBAAgB,MAAM;AAC7B,OAAG,kCAAkC,MAAM;AACzC,YAAM,OAAO,IAAI,iBAAiB;AAClC,aAAO,KAAK,aAAa,CAAG,CAAC,EAAE,KAAK,CAAC;AAAA,IACvC,CAAC;AAED,OAAG,oDAAoD,MAAM;AAC3D,YAAM,OAAO,IAAI,iBAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AACvB,aAAO,KAAK,aAAa,GAAG,CAAC,EAAE,KAAK,CAAC;AAAA,IACvC,CAAC;AAED,OAAG,6CAA6C,MAAM;AACpD,YAAM,OAAO,IAAI,iBAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AACvB,WAAK,UAAU,GAAK,CAAG;AAEvB,aAAO,KAAK,aAAa,CAAG,CAAC,EAAE,KAAK,CAAG;AACvC,aAAO,KAAK,aAAa,CAAG,CAAC,EAAE,KAAK,EAAI;AAAA,IAC1C,CAAC;AAED,OAAG,yCAAyC,MAAM;AAChD,YAAM,OAAO,IAAI,iBAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AACvB,WAAK,UAAU,GAAK,CAAG;AAGvB,aAAO,KAAK,aAAa,GAAG,CAAC,EAAE,KAAK,CAAG;AAAA,IACzC,CAAC;AAED,OAAG,4CAA4C,MAAM;AACnD,YAAM,OAAO,IAAI,iBAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AACvB,WAAK,UAAU,GAAK,CAAG;AAGvB,aAAO,KAAK,aAAa,CAAG,CAAC,EAAE,KAAK,EAAI;AAAA,IAC1C,CAAC;AAED,OAAG,sDAAsD,MAAM;AAC7D,YAAM,OAAO,IAAI,iBAAiB;AAClC,WAAK,UAAU,GAAK,GAAK;AACzB,WAAK,UAAU,GAAK,CAAG;AAIvB,aAAO,KAAK,aAAa,GAAG,CAAC,EAAE,KAAK,KAAK;AAAA,IAC3C,CAAC;AAAA,EACH,CAAC;AAED,WAAS,kBAAkB,MAAM;AAC/B,OAAG,8BAA8B,MAAM;AACrC,YAAM,OAAO,IAAI,iBAAiB;AAClC,aAAO,KAAK,cAAc,EAAE,KAAK,CAAC;AAAA,IACpC,CAAC;AAED,OAAG,gCAAgC,MAAM;AACvC,YAAM,OAAO,IAAI,iBAAiB;AAClC,WAAK,UAAU,GAAK,CAAG;AACvB,WAAK,UAAU,KAAK,CAAG;AACvB,WAAK,UAAU,GAAK,CAAG;AAEvB,aAAO,KAAK,cAAc,EAAE,KAAK,CAAG;AAAA,IACtC,CAAC;AAAA,EACH,CAAC;AAED,WAAS,yBAAyB,MAAM;AACtC,OAAG,kDAAkD,MAAM;AACzD,YAAM,OAAO,IAAI,iBAAiB;AAGlC,WAAK,gBAAgB,UAAU,GAAK,GAAG;AACvC,WAAK,gBAAgB,SAAS,KAAK,GAAG;AAGtC,aAAO,KAAK,cAAc,EAAE,KAAK,GAAG;AAGpC,YAAM,OAAO,KAAK,aAAa,IAAI;AACnC,aAAO,IAAI,EAAE,gBAAgB,CAAC;AAC9B,aAAO,IAAI,EAAE,aAAa,CAAC;AAG3B,YAAM,OAAO,KAAK,aAAa,IAAI;AACnC,aAAO,IAAI,EAAE,gBAAgB,CAAC;AAAA,IAChC,CAAC;AAED,OAAG,gDAAgD,MAAM;AACvD,YAAM,OAAO,IAAI,iBAAiB;AAGlC,WAAK,UAAU,KAAK,CAAG;AAGvB,WAAK,gBAAgB,QAAQ,QAAW,MAAS;AACjD,WAAK,gBAAgB,IAAI,GAAK,MAAS;AAEvC,aAAO,KAAK,UAAU,EAAE,QAAQ,CAAC,KAAK,CAAG,CAAC;AAC1C,aAAO,KAAK,cAAc,EAAE,QAAQ,CAAC,GAAK,CAAG,CAAC;AAAA,IAChD,CAAC;AAAA,EACH,CAAC;AACH,CAAC;","names":[]}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@livekit/agents",
3
- "version": "1.0.40",
3
+ "version": "1.0.41",
4
4
  "description": "LiveKit Agents - Node.js",
5
5
  "main": "dist/index.js",
6
6
  "require": "dist/index.cjs",
package/src/cli.ts CHANGED
@@ -143,13 +143,14 @@ export const runApp = (opts: ServerOptions) => {
143
143
  .default('debug')
144
144
  .env('LOG_LEVEL'),
145
145
  )
146
- .action(() => {
147
- const options = program.optsWithGlobals();
148
- opts.wsURL = options.url || opts.wsURL;
149
- opts.apiKey = options.apiKey || opts.apiKey;
150
- opts.apiSecret = options.apiSecret || opts.apiSecret;
151
- opts.logLevel = options.logLevel || opts.logLevel;
152
- opts.workerToken = options.workerToken || opts.workerToken;
146
+ .action((...[, command]) => {
147
+ const globalOptions = program.optsWithGlobals();
148
+ const commandOptions = command.opts();
149
+ opts.wsURL = globalOptions.url || opts.wsURL;
150
+ opts.apiKey = globalOptions.apiKey || opts.apiKey;
151
+ opts.apiSecret = globalOptions.apiSecret || opts.apiSecret;
152
+ opts.logLevel = commandOptions.logLevel || globalOptions.logLevel || opts.logLevel;
153
+ opts.workerToken = globalOptions.workerToken || opts.workerToken;
153
154
  runServer({
154
155
  opts,
155
156
  production: false,
@@ -169,18 +170,19 @@ export const runApp = (opts: ServerOptions) => {
169
170
  .env('LOG_LEVEL'),
170
171
  )
171
172
  .action((...[, command]) => {
172
- const options = command.optsWithGlobals();
173
- opts.wsURL = options.url || opts.wsURL;
174
- opts.apiKey = options.apiKey || opts.apiKey;
175
- opts.apiSecret = options.apiSecret || opts.apiSecret;
176
- opts.logLevel = options.logLevel || opts.logLevel;
177
- opts.workerToken = options.workerToken || opts.workerToken;
173
+ const globalOptions = program.optsWithGlobals();
174
+ const commandOptions = command.opts();
175
+ opts.wsURL = globalOptions.url || opts.wsURL;
176
+ opts.apiKey = globalOptions.apiKey || opts.apiKey;
177
+ opts.apiSecret = globalOptions.apiSecret || opts.apiSecret;
178
+ opts.logLevel = commandOptions.logLevel || globalOptions.logLevel || opts.logLevel;
179
+ opts.workerToken = globalOptions.workerToken || opts.workerToken;
178
180
  runServer({
179
181
  opts,
180
182
  production: false,
181
183
  watch: false,
182
- room: options.room,
183
- participantIdentity: options.participantIdentity,
184
+ room: commandOptions.room,
185
+ participantIdentity: commandOptions.participantIdentity,
184
186
  });
185
187
  });
186
188
 
@@ -193,9 +195,9 @@ export const runApp = (opts: ServerOptions) => {
193
195
  .default('debug')
194
196
  .env('LOG_LEVEL'),
195
197
  )
196
- .action(() => {
197
- const options = program.optsWithGlobals();
198
- initializeLogger({ pretty: true, level: options.logLevel });
198
+ .action((...[, command]) => {
199
+ const commandOptions = command.opts();
200
+ initializeLogger({ pretty: true, level: commandOptions.logLevel });
199
201
  const logger = log();
200
202
 
201
203
  const downloadFiles = async () => {
package/src/index.ts CHANGED
@@ -34,6 +34,7 @@ export * from './types.js';
34
34
  export * from './utils.js';
35
35
  export * from './vad.js';
36
36
  export * from './version.js';
37
+ export { createTimedString, isTimedString, type TimedString } from './voice/io.js';
37
38
  export * from './worker.js';
38
39
 
39
40
  export { cli, inference, ipc, llm, metrics, stream, stt, telemetry, tokenize, tts, voice };
@@ -16,7 +16,7 @@ import {
16
16
  } from '../stt/index.js';
17
17
  import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
18
18
  import { type AudioBuffer, Event, Task, cancelAndWait, shortuuid, waitForAbort } from '../utils.js';
19
- import type { TimedString } from '../voice/io.js';
19
+ import { type TimedString, createTimedString } from '../voice/io.js';
20
20
  import {
21
21
  type SttServerEvent,
22
22
  type SttTranscriptEvent,
@@ -491,13 +491,14 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
491
491
  confidence: data.confidence,
492
492
  text,
493
493
  words: data.words.map(
494
- (word): TimedString => ({
495
- text: word.word,
496
- startTime: word.start + this.startTimeOffset,
497
- endTime: word.end + this.startTimeOffset,
498
- startTimeOffset: this.startTimeOffset,
499
- confidence: word.confidence,
500
- }),
494
+ (word): TimedString =>
495
+ createTimedString({
496
+ text: word.word,
497
+ startTime: word.start + this.startTimeOffset,
498
+ endTime: word.end + this.startTimeOffset,
499
+ startTimeOffset: this.startTimeOffset,
500
+ confidence: word.confidence,
501
+ }),
501
502
  ),
502
503
  };
503
504
 
@@ -6,6 +6,7 @@ import { EventEmitter } from 'events';
6
6
  import type { ReadableStream } from 'node:stream/web';
7
7
  import { DeferredReadableStream } from '../stream/deferred_stream.js';
8
8
  import { Task } from '../utils.js';
9
+ import type { TimedString } from '../voice/io.js';
9
10
  import type { ChatContext, FunctionCall } from './chat_context.js';
10
11
  import type { ToolChoice, ToolContext } from './tool_context.js';
11
12
 
@@ -17,7 +18,10 @@ export interface InputSpeechStoppedEvent {
17
18
 
18
19
  export interface MessageGeneration {
19
20
  messageId: string;
20
- textStream: ReadableStream<string>;
21
+ /**
22
+ * Text stream that may contain plain strings or TimedString objects with timestamps.
23
+ */
24
+ textStream: ReadableStream<string | TimedString>;
21
25
  audioStream: ReadableStream<AudioFrame>;
22
26
  modalities?: Promise<('text' | 'audio')[]>;
23
27
  }
@@ -3,7 +3,9 @@
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
  import type { SentenceStream, SentenceTokenizer } from '../tokenize/index.js';
5
5
  import type { APIConnectOptions } from '../types.js';
6
+ import { USERDATA_TIMED_TRANSCRIPT } from '../types.js';
6
7
  import { Task } from '../utils.js';
8
+ import { createTimedString } from '../voice/io.js';
7
9
  import type { ChunkedStream } from './tts.js';
8
10
  import { SynthesizeStream, TTS } from './tts.js';
9
11
 
@@ -13,7 +15,7 @@ export class StreamAdapter extends TTS {
13
15
  label: string;
14
16
 
15
17
  constructor(tts: TTS, sentenceTokenizer: SentenceTokenizer) {
16
- super(tts.sampleRate, tts.numChannels, { streaming: true });
18
+ super(tts.sampleRate, tts.numChannels, { streaming: true, alignedTranscript: true });
17
19
  this.#tts = tts;
18
20
  this.#sentenceTokenizer = sentenceTokenizer;
19
21
  this.label = this.#tts.label;
@@ -53,6 +55,8 @@ export class StreamAdapterWrapper extends SynthesizeStream {
53
55
  }
54
56
 
55
57
  protected async run() {
58
+ let cumulativeDuration = 0;
59
+
56
60
  const forwardInput = async () => {
57
61
  for await (const input of this.input) {
58
62
  if (this.abortController.signal.aborted) break;
@@ -99,8 +103,26 @@ export class StreamAdapterWrapper extends SynthesizeStream {
99
103
  await prevTask?.result;
100
104
  if (controller.signal.aborted) return;
101
105
 
106
+ // Create a TimedString with the sentence text and current cumulative duration
107
+ const timedString = createTimedString({
108
+ text: token,
109
+ startTime: cumulativeDuration,
110
+ });
111
+
112
+ let isFirstFrame = true;
102
113
  for await (const audio of audioStream) {
103
114
  if (controller.signal.aborted) break;
115
+
116
+ // Attach the TimedString to the first frame of this sentence
117
+ if (isFirstFrame) {
118
+ audio.frame.userdata[USERDATA_TIMED_TRANSCRIPT] = [timedString];
119
+ isFirstFrame = false;
120
+ }
121
+
122
+ // Track cumulative duration
123
+ const frameDuration = audio.frame.samplesPerChannel / audio.frame.sampleRate;
124
+ cumulativeDuration += frameDuration;
125
+
104
126
  this.queue.put(audio);
105
127
  }
106
128
  };
package/src/tts/tts.ts CHANGED
@@ -13,8 +13,11 @@ import { DeferredReadableStream } from '../stream/deferred_stream.js';
13
13
  import { recordException, traceTypes, tracer } from '../telemetry/index.js';
14
14
  import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS, intervalForRetry } from '../types.js';
15
15
  import { AsyncIterableQueue, delay, mergeFrames, startSoon, toError } from '../utils.js';
16
+ import type { TimedString } from '../voice/io.js';
16
17
 
17
- /** SynthesizedAudio is a packet of speech synthesis as returned by the TTS. */
18
+ /**
19
+ * SynthesizedAudio is a packet of speech synthesis as returned by the TTS.
20
+ */
18
21
  export interface SynthesizedAudio {
19
22
  /** Request ID (one segment could be made up of multiple requests) */
20
23
  requestId: string;
@@ -26,6 +29,10 @@ export interface SynthesizedAudio {
26
29
  deltaText?: string;
27
30
  /** Whether this is the last frame of the segment (streaming only) */
28
31
  final: boolean;
32
+ /**
33
+ * Timed transcripts associated with this audio packet (word-level timestamps).
34
+ */
35
+ timedTranscripts?: TimedString[];
29
36
  }
30
37
 
31
38
  /**
@@ -37,6 +44,8 @@ export interface SynthesizedAudio {
37
44
  */
38
45
  export interface TTSCapabilities {
39
46
  streaming: boolean;
47
+ // Whether this TTS supports aligned transcripts (word-level timestamps).
48
+ alignedTranscript?: boolean;
40
49
  }
41
50
 
42
51
  export interface TTSError {
package/src/types.ts CHANGED
@@ -2,6 +2,11 @@
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ /**
6
+ * Key used to store timed transcripts in AudioFrame.userdata.
7
+ */
8
+ export const USERDATA_TIMED_TRANSCRIPT = 'lk.timed_transcripts';
9
+
5
10
  /**
6
11
  * Connection options for API calls, controlling retry and timeout behavior.
7
12
  */