@contractspec/lib.voice 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/dist/audio/audio-concatenator.d.ts +15 -0
  2. package/dist/audio/audio-concatenator.js +57 -0
  3. package/dist/audio/duration-estimator.d.ts +31 -0
  4. package/dist/audio/duration-estimator.js +22 -0
  5. package/dist/audio/format-converter.d.ts +17 -0
  6. package/dist/audio/format-converter.js +28 -0
  7. package/dist/audio/index.d.ts +4 -0
  8. package/dist/audio/index.js +121 -0
  9. package/dist/audio/silence-generator.d.ts +16 -0
  10. package/dist/audio/silence-generator.js +20 -0
  11. package/dist/browser/audio/audio-concatenator.js +56 -0
  12. package/dist/browser/audio/duration-estimator.js +21 -0
  13. package/dist/browser/audio/format-converter.js +27 -0
  14. package/dist/browser/audio/index.js +120 -0
  15. package/dist/browser/audio/silence-generator.js +19 -0
  16. package/dist/browser/conversational/index.js +241 -0
  17. package/dist/browser/conversational/response-orchestrator.js +62 -0
  18. package/dist/browser/conversational/transcript-builder.js +63 -0
  19. package/dist/browser/conversational/turn-detector.js +43 -0
  20. package/dist/browser/conversational/types.js +0 -0
  21. package/dist/browser/conversational/voice-session-manager.js +137 -0
  22. package/dist/browser/docs/conversational.docblock.js +5 -0
  23. package/dist/browser/docs/stt.docblock.js +5 -0
  24. package/dist/browser/docs/sync.docblock.js +5 -0
  25. package/dist/browser/docs/tts.docblock.js +5 -0
  26. package/dist/browser/docs/voice.docblock.js +5 -0
  27. package/dist/browser/i18n/catalogs/en.js +91 -0
  28. package/dist/browser/i18n/catalogs/es.js +91 -0
  29. package/dist/browser/i18n/catalogs/fr.js +91 -0
  30. package/dist/browser/i18n/catalogs/index.js +271 -0
  31. package/dist/browser/i18n/index.js +335 -0
  32. package/dist/browser/i18n/keys.js +38 -0
  33. package/dist/browser/i18n/locale.js +13 -0
  34. package/dist/browser/i18n/messages.js +283 -0
  35. package/dist/browser/index.js +1070 -0
  36. package/dist/browser/stt/diarization-mapper.js +42 -0
  37. package/dist/browser/stt/index.js +222 -0
  38. package/dist/browser/stt/segment-splitter.js +36 -0
  39. package/dist/browser/stt/subtitle-formatter.js +51 -0
  40. package/dist/browser/stt/transcriber.js +219 -0
  41. package/dist/browser/stt/types.js +0 -0
  42. package/dist/browser/sync/duration-negotiator.js +69 -0
  43. package/dist/browser/sync/index.js +165 -0
  44. package/dist/browser/sync/scene-adapter.js +52 -0
  45. package/dist/browser/sync/timing-calculator.js +46 -0
  46. package/dist/browser/tts/audio-assembler.js +120 -0
  47. package/dist/browser/tts/emphasis-planner.js +134 -0
  48. package/dist/browser/tts/index.js +439 -0
  49. package/dist/browser/tts/pace-analyzer.js +67 -0
  50. package/dist/browser/tts/segment-synthesizer.js +36 -0
  51. package/dist/browser/tts/types.js +0 -0
  52. package/dist/browser/tts/voice-synthesizer.js +435 -0
  53. package/dist/browser/types.js +0 -0
  54. package/dist/conversational/index.d.ts +5 -0
  55. package/dist/conversational/index.js +242 -0
  56. package/dist/conversational/response-orchestrator.d.ts +26 -0
  57. package/dist/conversational/response-orchestrator.js +63 -0
  58. package/dist/conversational/transcript-builder.d.ts +25 -0
  59. package/dist/conversational/transcript-builder.js +64 -0
  60. package/dist/conversational/turn-detector.d.ts +31 -0
  61. package/dist/conversational/turn-detector.js +44 -0
  62. package/dist/conversational/types.d.ts +55 -0
  63. package/dist/conversational/types.js +1 -0
  64. package/dist/conversational/voice-session-manager.d.ts +17 -0
  65. package/dist/conversational/voice-session-manager.js +138 -0
  66. package/dist/docs/conversational.docblock.d.ts +14 -0
  67. package/dist/docs/conversational.docblock.js +6 -0
  68. package/dist/docs/stt.docblock.d.ts +12 -0
  69. package/dist/docs/stt.docblock.js +6 -0
  70. package/dist/docs/sync.docblock.d.ts +12 -0
  71. package/dist/docs/sync.docblock.js +6 -0
  72. package/dist/docs/tts.docblock.d.ts +12 -0
  73. package/dist/docs/tts.docblock.js +6 -0
  74. package/dist/docs/voice.docblock.d.ts +22 -0
  75. package/dist/docs/voice.docblock.js +6 -0
  76. package/dist/i18n/catalogs/en.d.ts +6 -0
  77. package/dist/i18n/catalogs/en.js +92 -0
  78. package/dist/i18n/catalogs/es.d.ts +4 -0
  79. package/dist/i18n/catalogs/es.js +92 -0
  80. package/dist/i18n/catalogs/fr.d.ts +4 -0
  81. package/dist/i18n/catalogs/fr.js +92 -0
  82. package/dist/i18n/catalogs/index.d.ts +3 -0
  83. package/dist/i18n/catalogs/index.js +272 -0
  84. package/dist/i18n/index.d.ts +20 -0
  85. package/dist/i18n/index.js +336 -0
  86. package/dist/i18n/keys.d.ts +50 -0
  87. package/dist/i18n/keys.js +39 -0
  88. package/dist/i18n/locale.d.ts +6 -0
  89. package/dist/i18n/locale.js +14 -0
  90. package/dist/i18n/messages.d.ts +13 -0
  91. package/dist/i18n/messages.js +284 -0
  92. package/dist/index.d.ts +6 -0
  93. package/dist/index.js +1071 -0
  94. package/dist/node/audio/audio-concatenator.js +56 -0
  95. package/dist/node/audio/duration-estimator.js +21 -0
  96. package/dist/node/audio/format-converter.js +27 -0
  97. package/dist/node/audio/index.js +120 -0
  98. package/dist/node/audio/silence-generator.js +19 -0
  99. package/dist/node/conversational/index.js +241 -0
  100. package/dist/node/conversational/response-orchestrator.js +62 -0
  101. package/dist/node/conversational/transcript-builder.js +63 -0
  102. package/dist/node/conversational/turn-detector.js +43 -0
  103. package/dist/node/conversational/types.js +0 -0
  104. package/dist/node/conversational/voice-session-manager.js +137 -0
  105. package/dist/node/docs/conversational.docblock.js +5 -0
  106. package/dist/node/docs/stt.docblock.js +5 -0
  107. package/dist/node/docs/sync.docblock.js +5 -0
  108. package/dist/node/docs/tts.docblock.js +5 -0
  109. package/dist/node/docs/voice.docblock.js +5 -0
  110. package/dist/node/i18n/catalogs/en.js +91 -0
  111. package/dist/node/i18n/catalogs/es.js +91 -0
  112. package/dist/node/i18n/catalogs/fr.js +91 -0
  113. package/dist/node/i18n/catalogs/index.js +271 -0
  114. package/dist/node/i18n/index.js +335 -0
  115. package/dist/node/i18n/keys.js +38 -0
  116. package/dist/node/i18n/locale.js +13 -0
  117. package/dist/node/i18n/messages.js +283 -0
  118. package/dist/node/index.js +1070 -0
  119. package/dist/node/stt/diarization-mapper.js +42 -0
  120. package/dist/node/stt/index.js +222 -0
  121. package/dist/node/stt/segment-splitter.js +36 -0
  122. package/dist/node/stt/subtitle-formatter.js +51 -0
  123. package/dist/node/stt/transcriber.js +219 -0
  124. package/dist/node/stt/types.js +0 -0
  125. package/dist/node/sync/duration-negotiator.js +69 -0
  126. package/dist/node/sync/index.js +165 -0
  127. package/dist/node/sync/scene-adapter.js +52 -0
  128. package/dist/node/sync/timing-calculator.js +46 -0
  129. package/dist/node/tts/audio-assembler.js +120 -0
  130. package/dist/node/tts/emphasis-planner.js +134 -0
  131. package/dist/node/tts/index.js +439 -0
  132. package/dist/node/tts/pace-analyzer.js +67 -0
  133. package/dist/node/tts/segment-synthesizer.js +36 -0
  134. package/dist/node/tts/types.js +0 -0
  135. package/dist/node/tts/voice-synthesizer.js +435 -0
  136. package/dist/node/types.js +0 -0
  137. package/dist/stt/diarization-mapper.d.ts +19 -0
  138. package/dist/stt/diarization-mapper.js +43 -0
  139. package/dist/stt/index.d.ts +5 -0
  140. package/dist/stt/index.js +223 -0
  141. package/dist/stt/segment-splitter.d.ts +19 -0
  142. package/dist/stt/segment-splitter.js +37 -0
  143. package/dist/stt/subtitle-formatter.d.ts +19 -0
  144. package/dist/stt/subtitle-formatter.js +52 -0
  145. package/dist/stt/transcriber.d.ts +21 -0
  146. package/dist/stt/transcriber.js +220 -0
  147. package/dist/stt/types.d.ts +44 -0
  148. package/dist/stt/types.js +1 -0
  149. package/dist/sync/duration-negotiator.d.ts +37 -0
  150. package/dist/sync/duration-negotiator.js +70 -0
  151. package/dist/sync/index.d.ts +3 -0
  152. package/dist/sync/index.js +166 -0
  153. package/dist/sync/scene-adapter.d.ts +29 -0
  154. package/dist/sync/scene-adapter.js +53 -0
  155. package/dist/sync/timing-calculator.d.ts +21 -0
  156. package/dist/sync/timing-calculator.js +47 -0
  157. package/dist/tts/audio-assembler.d.ts +19 -0
  158. package/dist/tts/audio-assembler.js +121 -0
  159. package/dist/tts/emphasis-planner.d.ts +24 -0
  160. package/dist/tts/emphasis-planner.js +135 -0
  161. package/dist/tts/index.d.ts +6 -0
  162. package/dist/tts/index.js +440 -0
  163. package/dist/tts/pace-analyzer.d.ts +30 -0
  164. package/dist/tts/pace-analyzer.js +68 -0
  165. package/dist/tts/segment-synthesizer.d.ts +21 -0
  166. package/dist/tts/segment-synthesizer.js +37 -0
  167. package/dist/tts/types.d.ts +76 -0
  168. package/dist/tts/types.js +1 -0
  169. package/dist/tts/voice-synthesizer.d.ts +28 -0
  170. package/dist/tts/voice-synthesizer.js +436 -0
  171. package/dist/types.d.ts +12 -0
  172. package/dist/types.js +1 -0
  173. package/package.json +760 -0
@@ -0,0 +1,56 @@
1
+ // src/audio/audio-concatenator.ts
2
+ class AudioConcatenator {
3
+ concatenate(segments) {
4
+ if (segments.length === 0) {
5
+ return {
6
+ data: new Uint8Array(0),
7
+ format: "wav",
8
+ sampleRateHz: 44100,
9
+ durationMs: 0,
10
+ channels: 1
11
+ };
12
+ }
13
+ const [firstSegment] = segments;
14
+ if (!firstSegment) {
15
+ return {
16
+ data: new Uint8Array(0),
17
+ format: "wav",
18
+ sampleRateHz: 44100,
19
+ durationMs: 0,
20
+ channels: 1
21
+ };
22
+ }
23
+ if (segments.length === 1) {
24
+ return { ...firstSegment };
25
+ }
26
+ const referenceFormat = firstSegment.format;
27
+ const referenceSampleRate = firstSegment.sampleRateHz;
28
+ const referenceChannels = firstSegment.channels ?? 1;
29
+ for (const seg of segments) {
30
+ if (seg.format !== referenceFormat) {
31
+ throw new Error(`Format mismatch: expected ${referenceFormat}, got ${seg.format}`);
32
+ }
33
+ if (seg.sampleRateHz !== referenceSampleRate) {
34
+ throw new Error(`Sample rate mismatch: expected ${referenceSampleRate}, got ${seg.sampleRateHz}`);
35
+ }
36
+ }
37
+ const totalBytes = segments.reduce((sum, s) => sum + s.data.length, 0);
38
+ const combined = new Uint8Array(totalBytes);
39
+ let offset = 0;
40
+ for (const seg of segments) {
41
+ combined.set(seg.data, offset);
42
+ offset += seg.data.length;
43
+ }
44
+ const totalDurationMs = segments.reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
45
+ return {
46
+ data: combined,
47
+ format: referenceFormat,
48
+ sampleRateHz: referenceSampleRate,
49
+ durationMs: totalDurationMs,
50
+ channels: referenceChannels
51
+ };
52
+ }
53
+ }
54
+ export {
55
+ AudioConcatenator
56
+ };
@@ -0,0 +1,21 @@
1
+ // src/audio/duration-estimator.ts
2
/**
 * Converts between text length and speaking time using a words-per-minute rate.
 */
class DurationEstimator {
  /** Speaking rate used when the caller supplies no usable rate. */
  static DEFAULT_WPM = 150;

  /**
   * Internal: picks the rate to use. Falls back to DEFAULT_WPM when the
   * supplied rate is missing, non-finite, zero or negative, so the estimates
   * never come out as Infinity, NaN or a negative number.
   */
  static resolveWpm(wordsPerMinute) {
    const wpm = wordsPerMinute ?? DurationEstimator.DEFAULT_WPM;
    return Number.isFinite(Number(wpm)) && wpm > 0 ? wpm : DurationEstimator.DEFAULT_WPM;
  }

  /** Internal: counts whitespace-separated, non-empty tokens in `text`. */
  static countWords(text) {
    return text.split(/\s+/).filter(Boolean).length;
  }

  /**
   * Estimates the time needed to speak `text`, rounded up to whole seconds.
   * @param {string} text
   * @param {number} [wordsPerMinute] Defaults to DEFAULT_WPM.
   * @returns {number}
   */
  estimateSeconds(text, wordsPerMinute) {
    const wpm = DurationEstimator.resolveWpm(wordsPerMinute);
    return Math.ceil(DurationEstimator.countWords(text) / wpm * 60);
  }

  /**
   * Estimates the time needed to speak `text`, rounded up to whole milliseconds.
   * @param {string} text
   * @param {number} [wordsPerMinute] Defaults to DEFAULT_WPM.
   * @returns {number}
   */
  estimateMs(text, wordsPerMinute) {
    const wpm = DurationEstimator.resolveWpm(wordsPerMinute);
    return Math.ceil(DurationEstimator.countWords(text) / wpm * 60 * 1000);
  }

  /**
   * Estimates how many words fit in `durationSeconds`, rounded to the nearest word.
   * @param {number} durationSeconds
   * @param {number} [wordsPerMinute] Defaults to DEFAULT_WPM.
   * @returns {number}
   */
  estimateWordCount(durationSeconds, wordsPerMinute) {
    const wpm = DurationEstimator.resolveWpm(wordsPerMinute);
    return Math.round(durationSeconds / 60 * wpm);
  }
}
19
+ export {
20
+ DurationEstimator
21
+ };
@@ -0,0 +1,27 @@
1
+ // src/audio/format-converter.ts
2
+ class FormatConverter {
3
+ convert(audio, targetFormat) {
4
+ if (audio.format === targetFormat) {
5
+ return audio;
6
+ }
7
+ return {
8
+ ...audio,
9
+ format: targetFormat
10
+ };
11
+ }
12
+ isSupported(from, to) {
13
+ if (from === to)
14
+ return true;
15
+ const supportedPaths = {
16
+ wav: ["mp3", "ogg", "pcm", "opus"],
17
+ mp3: ["wav"],
18
+ ogg: ["wav"],
19
+ pcm: ["wav"],
20
+ opus: ["wav"]
21
+ };
22
+ return supportedPaths[from]?.includes(to) ?? false;
23
+ }
24
+ }
25
+ export {
26
+ FormatConverter
27
+ };
@@ -0,0 +1,120 @@
1
+ // src/audio/audio-concatenator.ts
2
+ class AudioConcatenator {
3
+ concatenate(segments) {
4
+ if (segments.length === 0) {
5
+ return {
6
+ data: new Uint8Array(0),
7
+ format: "wav",
8
+ sampleRateHz: 44100,
9
+ durationMs: 0,
10
+ channels: 1
11
+ };
12
+ }
13
+ const [firstSegment] = segments;
14
+ if (!firstSegment) {
15
+ return {
16
+ data: new Uint8Array(0),
17
+ format: "wav",
18
+ sampleRateHz: 44100,
19
+ durationMs: 0,
20
+ channels: 1
21
+ };
22
+ }
23
+ if (segments.length === 1) {
24
+ return { ...firstSegment };
25
+ }
26
+ const referenceFormat = firstSegment.format;
27
+ const referenceSampleRate = firstSegment.sampleRateHz;
28
+ const referenceChannels = firstSegment.channels ?? 1;
29
+ for (const seg of segments) {
30
+ if (seg.format !== referenceFormat) {
31
+ throw new Error(`Format mismatch: expected ${referenceFormat}, got ${seg.format}`);
32
+ }
33
+ if (seg.sampleRateHz !== referenceSampleRate) {
34
+ throw new Error(`Sample rate mismatch: expected ${referenceSampleRate}, got ${seg.sampleRateHz}`);
35
+ }
36
+ }
37
+ const totalBytes = segments.reduce((sum, s) => sum + s.data.length, 0);
38
+ const combined = new Uint8Array(totalBytes);
39
+ let offset = 0;
40
+ for (const seg of segments) {
41
+ combined.set(seg.data, offset);
42
+ offset += seg.data.length;
43
+ }
44
+ const totalDurationMs = segments.reduce((sum, s) => sum + (s.durationMs ?? 0), 0);
45
+ return {
46
+ data: combined,
47
+ format: referenceFormat,
48
+ sampleRateHz: referenceSampleRate,
49
+ durationMs: totalDurationMs,
50
+ channels: referenceChannels
51
+ };
52
+ }
53
+ }
54
+
55
+ // src/audio/duration-estimator.ts
56
/**
 * Converts between text length and speaking time using a words-per-minute rate.
 */
class DurationEstimator {
  /** Speaking rate used when the caller supplies no usable rate. */
  static DEFAULT_WPM = 150;

  /**
   * Internal: picks the rate to use. Falls back to DEFAULT_WPM when the
   * supplied rate is missing, non-finite, zero or negative, so the estimates
   * never come out as Infinity, NaN or a negative number.
   */
  static resolveWpm(wordsPerMinute) {
    const wpm = wordsPerMinute ?? DurationEstimator.DEFAULT_WPM;
    return Number.isFinite(Number(wpm)) && wpm > 0 ? wpm : DurationEstimator.DEFAULT_WPM;
  }

  /** Internal: counts whitespace-separated, non-empty tokens in `text`. */
  static countWords(text) {
    return text.split(/\s+/).filter(Boolean).length;
  }

  /**
   * Estimates the time needed to speak `text`, rounded up to whole seconds.
   * @param {string} text
   * @param {number} [wordsPerMinute] Defaults to DEFAULT_WPM.
   * @returns {number}
   */
  estimateSeconds(text, wordsPerMinute) {
    const wpm = DurationEstimator.resolveWpm(wordsPerMinute);
    return Math.ceil(DurationEstimator.countWords(text) / wpm * 60);
  }

  /**
   * Estimates the time needed to speak `text`, rounded up to whole milliseconds.
   * @param {string} text
   * @param {number} [wordsPerMinute] Defaults to DEFAULT_WPM.
   * @returns {number}
   */
  estimateMs(text, wordsPerMinute) {
    const wpm = DurationEstimator.resolveWpm(wordsPerMinute);
    return Math.ceil(DurationEstimator.countWords(text) / wpm * 60 * 1000);
  }

  /**
   * Estimates how many words fit in `durationSeconds`, rounded to the nearest word.
   * @param {number} durationSeconds
   * @param {number} [wordsPerMinute] Defaults to DEFAULT_WPM.
   * @returns {number}
   */
  estimateWordCount(durationSeconds, wordsPerMinute) {
    const wpm = DurationEstimator.resolveWpm(wordsPerMinute);
    return Math.round(durationSeconds / 60 * wpm);
  }
}
73
+
74
+ // src/audio/format-converter.ts
75
+ class FormatConverter {
76
+ convert(audio, targetFormat) {
77
+ if (audio.format === targetFormat) {
78
+ return audio;
79
+ }
80
+ return {
81
+ ...audio,
82
+ format: targetFormat
83
+ };
84
+ }
85
+ isSupported(from, to) {
86
+ if (from === to)
87
+ return true;
88
+ const supportedPaths = {
89
+ wav: ["mp3", "ogg", "pcm", "opus"],
90
+ mp3: ["wav"],
91
+ ogg: ["wav"],
92
+ pcm: ["wav"],
93
+ opus: ["wav"]
94
+ };
95
+ return supportedPaths[from]?.includes(to) ?? false;
96
+ }
97
+ }
98
+
99
+ // src/audio/silence-generator.ts
100
+ class SilenceGenerator {
101
+ generate(durationMs, format = "wav", sampleRateHz = 44100, channels = 1) {
102
+ const totalSamples = Math.ceil(sampleRateHz * durationMs / 1000);
103
+ const bytesPerSample = 2;
104
+ const dataSize = totalSamples * bytesPerSample * channels;
105
+ const data = new Uint8Array(dataSize);
106
+ return {
107
+ data,
108
+ format,
109
+ sampleRateHz,
110
+ durationMs,
111
+ channels
112
+ };
113
+ }
114
+ }
115
+ export {
116
+ SilenceGenerator,
117
+ FormatConverter,
118
+ DurationEstimator,
119
+ AudioConcatenator
120
+ };
@@ -0,0 +1,19 @@
1
+ // src/audio/silence-generator.ts
2
+ class SilenceGenerator {
3
+ generate(durationMs, format = "wav", sampleRateHz = 44100, channels = 1) {
4
+ const totalSamples = Math.ceil(sampleRateHz * durationMs / 1000);
5
+ const bytesPerSample = 2;
6
+ const dataSize = totalSamples * bytesPerSample * channels;
7
+ const data = new Uint8Array(dataSize);
8
+ return {
9
+ data,
10
+ format,
11
+ sampleRateHz,
12
+ durationMs,
13
+ channels
14
+ };
15
+ }
16
+ }
17
+ export {
18
+ SilenceGenerator
19
+ };
@@ -0,0 +1,241 @@
1
+ // src/conversational/transcript-builder.ts
2
+ class TranscriptBuilder {
3
+ turns = [];
4
+ currentTurn = null;
5
+ sessionStartMs = Date.now();
6
+ getTranscript() {
7
+ return [...this.turns];
8
+ }
9
+ toText() {
10
+ return this.turns.map((t) => `[${t.role}] ${t.text}`).join(`
11
+ `);
12
+ }
13
+ getTurnCount() {
14
+ return this.turns.length;
15
+ }
16
+ processEvent(event) {
17
+ switch (event.type) {
18
+ case "session_started":
19
+ this.sessionStartMs = Date.now();
20
+ break;
21
+ case "user_speech_started":
22
+ this.currentTurn = {
23
+ role: "user",
24
+ startMs: Date.now() - this.sessionStartMs
25
+ };
26
+ break;
27
+ case "user_speech_ended":
28
+ if (this.currentTurn && this.currentTurn.role === "user") {
29
+ this.currentTurn.text = event.transcript;
30
+ this.currentTurn.endMs = Date.now() - this.sessionStartMs;
31
+ this.turns.push(this.currentTurn);
32
+ this.currentTurn = null;
33
+ }
34
+ break;
35
+ case "agent_speech_started":
36
+ this.currentTurn = {
37
+ role: "agent",
38
+ text: event.text,
39
+ startMs: Date.now() - this.sessionStartMs
40
+ };
41
+ break;
42
+ case "agent_speech_ended":
43
+ if (this.currentTurn && this.currentTurn.role === "agent") {
44
+ this.currentTurn.endMs = Date.now() - this.sessionStartMs;
45
+ this.turns.push(this.currentTurn);
46
+ this.currentTurn = null;
47
+ }
48
+ break;
49
+ case "transcript":
50
+ break;
51
+ default:
52
+ break;
53
+ }
54
+ }
55
+ reset() {
56
+ this.turns.length = 0;
57
+ this.currentTurn = null;
58
+ this.sessionStartMs = Date.now();
59
+ }
60
+ }
61
+
62
+ // src/conversational/voice-session-manager.ts
63
/**
 * Opens conversational voice sessions on a provider and keeps a live state
 * object (status, current speaker, turn count, transcript) in sync with the
 * events the provider emits.
 */
class VoiceSessionManager {
  provider;

  /**
   * @param {{conversational: object}} options `conversational` is the provider
   *   whose `startSession` is used to open sessions.
   */
  constructor(options) {
    this.provider = options.conversational;
  }

  /**
   * Starts a provider session and returns a handle exposing the tracked
   * `state`, pass-through controls and the wrapped event stream.
   *
   * `state` is only updated while the returned `events` iterable is being
   * consumed (and by `close`, which marks it "ended").
   *
   * @param {object} config Session settings forwarded to the provider.
   */
  async startSession(config) {
    const transcriptBuilder = new TranscriptBuilder();
    const providerConfig = {
      voiceId: config.voiceId,
      language: config.language,
      systemPrompt: config.systemPrompt,
      llmModel: config.llmModel,
      inputFormat: config.inputFormat,
      outputFormat: config.outputFormat,
      turnDetection: config.turnDetection,
      silenceThresholdMs: config.silenceThresholdMs,
      maxDurationSeconds: config.maxDurationSeconds
    };
    const session = await this.provider.startSession(providerConfig);

    // Placeholder values until the provider reports "session_started".
    const state = {
      sessionId: "",
      status: "connecting",
      currentTurn: "idle",
      turnCount: 0,
      durationMs: 0,
      transcript: []
    };
    const events = this.wrapEvents(session.events, state, transcriptBuilder);

    const close = async () => {
      const summary = await session.close();
      state.status = "ended";
      return summary;
    };

    return {
      state,
      sendAudio: (chunk) => session.sendAudio(chunk),
      sendText: (text) => session.sendText(text),
      interrupt: () => session.interrupt(),
      close,
      events
    };
  }

  /**
   * Re-yields every provider event after feeding it to the transcript builder
   * and applying its effect to `state`.
   */
  async* wrapEvents(events, state, transcriptBuilder) {
    for await (const event of events) {
      transcriptBuilder.processEvent(event);

      const kind = event.type;
      if (kind === "session_started") {
        state.sessionId = event.sessionId;
        state.status = "active";
      } else if (kind === "user_speech_started") {
        state.currentTurn = "user";
      } else if (kind === "agent_speech_started") {
        state.currentTurn = "agent";
      } else if (kind === "user_speech_ended" || kind === "agent_speech_ended") {
        // Either speaker finishing closes a turn.
        state.currentTurn = "idle";
        state.turnCount += 1;
      } else if (kind === "session_ended") {
        state.status = "ended";
        state.durationMs = event.durationMs;
      }

      // Refresh the snapshot before handing the event to the consumer.
      state.transcript = transcriptBuilder.getTranscript();
      yield event;
    }
  }
}
135
+
136
+ // src/conversational/turn-detector.ts
137
/**
 * Detects the end of a speaker's turn: a turn is over once the audio energy
 * has stayed at or below a threshold for a minimum stretch of time.
 */
class TurnDetector {
  silenceThresholdMs;
  energyThreshold;
  /** Timestamp of the first chunk of the current silent run, or null. */
  silenceStartMs = null;

  /**
   * @param {number} [silenceThresholdMs=800] Silence length that ends a turn.
   * @param {number} [energyThreshold=0.01] Energy above this counts as speech.
   */
  constructor(silenceThresholdMs = 800, energyThreshold = 0.01) {
    this.silenceThresholdMs = silenceThresholdMs;
    this.energyThreshold = energyThreshold;
  }

  /**
   * Feeds one audio chunk to the detector.
   * @param {Uint8Array} chunk Audio bytes, read as 16-bit little-endian samples.
   * @param {number} timestampMs Time of this chunk.
   * @returns {boolean} true once silence has lasted at least `silenceThresholdMs`.
   */
  processChunk(chunk, timestampMs) {
    const loudEnough = this.calculateEnergy(chunk) > this.energyThreshold;
    if (loudEnough) {
      // Speech interrupts any silent run in progress.
      this.silenceStartMs = null;
      return false;
    }
    if (this.silenceStartMs === null) {
      this.silenceStartMs = timestampMs;
    }
    return timestampMs - this.silenceStartMs >= this.silenceThresholdMs;
  }

  /** Forgets any silent run in progress. */
  reset() {
    this.silenceStartMs = null;
  }

  /**
   * Root-mean-square of the chunk's samples, each scaled by 1/32768.
   * A trailing odd byte is ignored; fewer than two bytes yields 0.
   * @param {Uint8Array} chunk
   * @returns {number}
   */
  calculateEnergy(chunk) {
    const pairCount = Math.floor(chunk.length / 2);
    if (pairCount === 0)
      return 0;
    let squares = 0;
    for (let pair = 0; pair < pairCount; pair += 1) {
      const lowByte = chunk[2 * pair] ?? 0;
      const highByte = chunk[2 * pair + 1] ?? 0;
      // Assemble the little-endian word, then sign-extend from 16 bits.
      const signed = ((lowByte | (highByte << 8)) << 16) >> 16;
      const scaled = signed / 32768;
      squares += scaled * scaled;
    }
    return Math.sqrt(squares / pairCount);
  }
}
176
+
177
+ // src/conversational/response-orchestrator.ts
178
+ class ResponseOrchestrator {
179
+ stt;
180
+ llm;
181
+ tts;
182
+ conversationHistory = [];
183
+ constructor(stt, llm, tts) {
184
+ this.stt = stt;
185
+ this.llm = llm;
186
+ this.tts = tts;
187
+ }
188
+ async* processUserTurn(userAudio, config) {
189
+ const transcription = await this.stt.transcribe({
190
+ audio: userAudio,
191
+ language: config.language,
192
+ wordTimestamps: false
193
+ });
194
+ const userText = transcription.text;
195
+ yield { type: "user_speech_ended", transcript: userText };
196
+ yield {
197
+ type: "transcript",
198
+ role: "user",
199
+ text: userText,
200
+ timestamp: Date.now()
201
+ };
202
+ this.conversationHistory.push({ role: "user", content: userText });
203
+ const llmResponse = await this.llm.chat([
204
+ {
205
+ role: "system",
206
+ content: [{ type: "text", text: config.systemPrompt }]
207
+ },
208
+ ...this.conversationHistory.map((msg) => ({
209
+ role: msg.role,
210
+ content: [{ type: "text", text: msg.content }]
211
+ }))
212
+ ], { model: config.llmModel });
213
+ const responseText = llmResponse.message.content.find((p) => p.type === "text");
214
+ const agentText = responseText && responseText.type === "text" ? responseText.text : "I apologize, I could not generate a response.";
215
+ this.conversationHistory.push({ role: "assistant", content: agentText });
216
+ yield { type: "agent_speech_started", text: agentText };
217
+ const synthesis = await this.tts.synthesize({
218
+ text: agentText,
219
+ voiceId: config.voiceId,
220
+ language: config.language,
221
+ format: config.outputFormat
222
+ });
223
+ yield { type: "agent_audio", audio: synthesis.audio.data };
224
+ yield { type: "agent_speech_ended" };
225
+ yield {
226
+ type: "transcript",
227
+ role: "agent",
228
+ text: agentText,
229
+ timestamp: Date.now()
230
+ };
231
+ }
232
+ reset() {
233
+ this.conversationHistory.length = 0;
234
+ }
235
+ }
236
+ export {
237
+ VoiceSessionManager,
238
+ TurnDetector,
239
+ TranscriptBuilder,
240
+ ResponseOrchestrator
241
+ };
@@ -0,0 +1,62 @@
1
+ // src/conversational/response-orchestrator.ts
2
+ class ResponseOrchestrator {
3
+ stt;
4
+ llm;
5
+ tts;
6
+ conversationHistory = [];
7
+ constructor(stt, llm, tts) {
8
+ this.stt = stt;
9
+ this.llm = llm;
10
+ this.tts = tts;
11
+ }
12
+ async* processUserTurn(userAudio, config) {
13
+ const transcription = await this.stt.transcribe({
14
+ audio: userAudio,
15
+ language: config.language,
16
+ wordTimestamps: false
17
+ });
18
+ const userText = transcription.text;
19
+ yield { type: "user_speech_ended", transcript: userText };
20
+ yield {
21
+ type: "transcript",
22
+ role: "user",
23
+ text: userText,
24
+ timestamp: Date.now()
25
+ };
26
+ this.conversationHistory.push({ role: "user", content: userText });
27
+ const llmResponse = await this.llm.chat([
28
+ {
29
+ role: "system",
30
+ content: [{ type: "text", text: config.systemPrompt }]
31
+ },
32
+ ...this.conversationHistory.map((msg) => ({
33
+ role: msg.role,
34
+ content: [{ type: "text", text: msg.content }]
35
+ }))
36
+ ], { model: config.llmModel });
37
+ const responseText = llmResponse.message.content.find((p) => p.type === "text");
38
+ const agentText = responseText && responseText.type === "text" ? responseText.text : "I apologize, I could not generate a response.";
39
+ this.conversationHistory.push({ role: "assistant", content: agentText });
40
+ yield { type: "agent_speech_started", text: agentText };
41
+ const synthesis = await this.tts.synthesize({
42
+ text: agentText,
43
+ voiceId: config.voiceId,
44
+ language: config.language,
45
+ format: config.outputFormat
46
+ });
47
+ yield { type: "agent_audio", audio: synthesis.audio.data };
48
+ yield { type: "agent_speech_ended" };
49
+ yield {
50
+ type: "transcript",
51
+ role: "agent",
52
+ text: agentText,
53
+ timestamp: Date.now()
54
+ };
55
+ }
56
+ reset() {
57
+ this.conversationHistory.length = 0;
58
+ }
59
+ }
60
+ export {
61
+ ResponseOrchestrator
62
+ };
@@ -0,0 +1,63 @@
1
+ // src/conversational/transcript-builder.ts
2
+ class TranscriptBuilder {
3
+ turns = [];
4
+ currentTurn = null;
5
+ sessionStartMs = Date.now();
6
+ getTranscript() {
7
+ return [...this.turns];
8
+ }
9
+ toText() {
10
+ return this.turns.map((t) => `[${t.role}] ${t.text}`).join(`
11
+ `);
12
+ }
13
+ getTurnCount() {
14
+ return this.turns.length;
15
+ }
16
+ processEvent(event) {
17
+ switch (event.type) {
18
+ case "session_started":
19
+ this.sessionStartMs = Date.now();
20
+ break;
21
+ case "user_speech_started":
22
+ this.currentTurn = {
23
+ role: "user",
24
+ startMs: Date.now() - this.sessionStartMs
25
+ };
26
+ break;
27
+ case "user_speech_ended":
28
+ if (this.currentTurn && this.currentTurn.role === "user") {
29
+ this.currentTurn.text = event.transcript;
30
+ this.currentTurn.endMs = Date.now() - this.sessionStartMs;
31
+ this.turns.push(this.currentTurn);
32
+ this.currentTurn = null;
33
+ }
34
+ break;
35
+ case "agent_speech_started":
36
+ this.currentTurn = {
37
+ role: "agent",
38
+ text: event.text,
39
+ startMs: Date.now() - this.sessionStartMs
40
+ };
41
+ break;
42
+ case "agent_speech_ended":
43
+ if (this.currentTurn && this.currentTurn.role === "agent") {
44
+ this.currentTurn.endMs = Date.now() - this.sessionStartMs;
45
+ this.turns.push(this.currentTurn);
46
+ this.currentTurn = null;
47
+ }
48
+ break;
49
+ case "transcript":
50
+ break;
51
+ default:
52
+ break;
53
+ }
54
+ }
55
+ reset() {
56
+ this.turns.length = 0;
57
+ this.currentTurn = null;
58
+ this.sessionStartMs = Date.now();
59
+ }
60
+ }
61
+ export {
62
+ TranscriptBuilder
63
+ };
@@ -0,0 +1,43 @@
1
+ // src/conversational/turn-detector.ts
2
/**
 * Detects the end of a speaker's turn: a turn is over once the audio energy
 * has stayed at or below a threshold for a minimum stretch of time.
 */
class TurnDetector {
  silenceThresholdMs;
  energyThreshold;
  /** Timestamp of the first chunk of the current silent run, or null. */
  silenceStartMs = null;

  /**
   * @param {number} [silenceThresholdMs=800] Silence length that ends a turn.
   * @param {number} [energyThreshold=0.01] Energy above this counts as speech.
   */
  constructor(silenceThresholdMs = 800, energyThreshold = 0.01) {
    this.silenceThresholdMs = silenceThresholdMs;
    this.energyThreshold = energyThreshold;
  }

  /**
   * Feeds one audio chunk to the detector.
   * @param {Uint8Array} chunk Audio bytes, read as 16-bit little-endian samples.
   * @param {number} timestampMs Time of this chunk.
   * @returns {boolean} true once silence has lasted at least `silenceThresholdMs`.
   */
  processChunk(chunk, timestampMs) {
    const loudEnough = this.calculateEnergy(chunk) > this.energyThreshold;
    if (loudEnough) {
      // Speech interrupts any silent run in progress.
      this.silenceStartMs = null;
      return false;
    }
    if (this.silenceStartMs === null) {
      this.silenceStartMs = timestampMs;
    }
    return timestampMs - this.silenceStartMs >= this.silenceThresholdMs;
  }

  /** Forgets any silent run in progress. */
  reset() {
    this.silenceStartMs = null;
  }

  /**
   * Root-mean-square of the chunk's samples, each scaled by 1/32768.
   * A trailing odd byte is ignored; fewer than two bytes yields 0.
   * @param {Uint8Array} chunk
   * @returns {number}
   */
  calculateEnergy(chunk) {
    const pairCount = Math.floor(chunk.length / 2);
    if (pairCount === 0)
      return 0;
    let squares = 0;
    for (let pair = 0; pair < pairCount; pair += 1) {
      const lowByte = chunk[2 * pair] ?? 0;
      const highByte = chunk[2 * pair + 1] ?? 0;
      // Assemble the little-endian word, then sign-extend from 16 bits.
      const signed = ((lowByte | (highByte << 8)) << 16) >> 16;
      const scaled = signed / 32768;
      squares += scaled * scaled;
    }
    return Math.sqrt(squares / pairCount);
  }
}
41
+ export {
42
+ TurnDetector
43
+ };
File without changes