@contractspec/lib.voice 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/audio/audio-concatenator.d.ts +15 -0
- package/dist/audio/audio-concatenator.js +57 -0
- package/dist/audio/duration-estimator.d.ts +31 -0
- package/dist/audio/duration-estimator.js +22 -0
- package/dist/audio/format-converter.d.ts +17 -0
- package/dist/audio/format-converter.js +28 -0
- package/dist/audio/index.d.ts +4 -0
- package/dist/audio/index.js +121 -0
- package/dist/audio/silence-generator.d.ts +16 -0
- package/dist/audio/silence-generator.js +20 -0
- package/dist/browser/audio/audio-concatenator.js +56 -0
- package/dist/browser/audio/duration-estimator.js +21 -0
- package/dist/browser/audio/format-converter.js +27 -0
- package/dist/browser/audio/index.js +120 -0
- package/dist/browser/audio/silence-generator.js +19 -0
- package/dist/browser/conversational/index.js +241 -0
- package/dist/browser/conversational/response-orchestrator.js +62 -0
- package/dist/browser/conversational/transcript-builder.js +63 -0
- package/dist/browser/conversational/turn-detector.js +43 -0
- package/dist/browser/conversational/types.js +0 -0
- package/dist/browser/conversational/voice-session-manager.js +137 -0
- package/dist/browser/docs/conversational.docblock.js +5 -0
- package/dist/browser/docs/stt.docblock.js +5 -0
- package/dist/browser/docs/sync.docblock.js +5 -0
- package/dist/browser/docs/tts.docblock.js +5 -0
- package/dist/browser/docs/voice.docblock.js +5 -0
- package/dist/browser/i18n/catalogs/en.js +91 -0
- package/dist/browser/i18n/catalogs/es.js +91 -0
- package/dist/browser/i18n/catalogs/fr.js +91 -0
- package/dist/browser/i18n/catalogs/index.js +271 -0
- package/dist/browser/i18n/index.js +335 -0
- package/dist/browser/i18n/keys.js +38 -0
- package/dist/browser/i18n/locale.js +13 -0
- package/dist/browser/i18n/messages.js +283 -0
- package/dist/browser/index.js +1070 -0
- package/dist/browser/stt/diarization-mapper.js +42 -0
- package/dist/browser/stt/index.js +222 -0
- package/dist/browser/stt/segment-splitter.js +36 -0
- package/dist/browser/stt/subtitle-formatter.js +51 -0
- package/dist/browser/stt/transcriber.js +219 -0
- package/dist/browser/stt/types.js +0 -0
- package/dist/browser/sync/duration-negotiator.js +69 -0
- package/dist/browser/sync/index.js +165 -0
- package/dist/browser/sync/scene-adapter.js +52 -0
- package/dist/browser/sync/timing-calculator.js +46 -0
- package/dist/browser/tts/audio-assembler.js +120 -0
- package/dist/browser/tts/emphasis-planner.js +134 -0
- package/dist/browser/tts/index.js +439 -0
- package/dist/browser/tts/pace-analyzer.js +67 -0
- package/dist/browser/tts/segment-synthesizer.js +36 -0
- package/dist/browser/tts/types.js +0 -0
- package/dist/browser/tts/voice-synthesizer.js +435 -0
- package/dist/browser/types.js +0 -0
- package/dist/conversational/index.d.ts +5 -0
- package/dist/conversational/index.js +242 -0
- package/dist/conversational/response-orchestrator.d.ts +26 -0
- package/dist/conversational/response-orchestrator.js +63 -0
- package/dist/conversational/transcript-builder.d.ts +25 -0
- package/dist/conversational/transcript-builder.js +64 -0
- package/dist/conversational/turn-detector.d.ts +31 -0
- package/dist/conversational/turn-detector.js +44 -0
- package/dist/conversational/types.d.ts +55 -0
- package/dist/conversational/types.js +1 -0
- package/dist/conversational/voice-session-manager.d.ts +17 -0
- package/dist/conversational/voice-session-manager.js +138 -0
- package/dist/docs/conversational.docblock.d.ts +14 -0
- package/dist/docs/conversational.docblock.js +6 -0
- package/dist/docs/stt.docblock.d.ts +12 -0
- package/dist/docs/stt.docblock.js +6 -0
- package/dist/docs/sync.docblock.d.ts +12 -0
- package/dist/docs/sync.docblock.js +6 -0
- package/dist/docs/tts.docblock.d.ts +12 -0
- package/dist/docs/tts.docblock.js +6 -0
- package/dist/docs/voice.docblock.d.ts +22 -0
- package/dist/docs/voice.docblock.js +6 -0
- package/dist/i18n/catalogs/en.d.ts +6 -0
- package/dist/i18n/catalogs/en.js +92 -0
- package/dist/i18n/catalogs/es.d.ts +4 -0
- package/dist/i18n/catalogs/es.js +92 -0
- package/dist/i18n/catalogs/fr.d.ts +4 -0
- package/dist/i18n/catalogs/fr.js +92 -0
- package/dist/i18n/catalogs/index.d.ts +3 -0
- package/dist/i18n/catalogs/index.js +272 -0
- package/dist/i18n/index.d.ts +20 -0
- package/dist/i18n/index.js +336 -0
- package/dist/i18n/keys.d.ts +50 -0
- package/dist/i18n/keys.js +39 -0
- package/dist/i18n/locale.d.ts +6 -0
- package/dist/i18n/locale.js +14 -0
- package/dist/i18n/messages.d.ts +13 -0
- package/dist/i18n/messages.js +284 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +1071 -0
- package/dist/node/audio/audio-concatenator.js +56 -0
- package/dist/node/audio/duration-estimator.js +21 -0
- package/dist/node/audio/format-converter.js +27 -0
- package/dist/node/audio/index.js +120 -0
- package/dist/node/audio/silence-generator.js +19 -0
- package/dist/node/conversational/index.js +241 -0
- package/dist/node/conversational/response-orchestrator.js +62 -0
- package/dist/node/conversational/transcript-builder.js +63 -0
- package/dist/node/conversational/turn-detector.js +43 -0
- package/dist/node/conversational/types.js +0 -0
- package/dist/node/conversational/voice-session-manager.js +137 -0
- package/dist/node/docs/conversational.docblock.js +5 -0
- package/dist/node/docs/stt.docblock.js +5 -0
- package/dist/node/docs/sync.docblock.js +5 -0
- package/dist/node/docs/tts.docblock.js +5 -0
- package/dist/node/docs/voice.docblock.js +5 -0
- package/dist/node/i18n/catalogs/en.js +91 -0
- package/dist/node/i18n/catalogs/es.js +91 -0
- package/dist/node/i18n/catalogs/fr.js +91 -0
- package/dist/node/i18n/catalogs/index.js +271 -0
- package/dist/node/i18n/index.js +335 -0
- package/dist/node/i18n/keys.js +38 -0
- package/dist/node/i18n/locale.js +13 -0
- package/dist/node/i18n/messages.js +283 -0
- package/dist/node/index.js +1070 -0
- package/dist/node/stt/diarization-mapper.js +42 -0
- package/dist/node/stt/index.js +222 -0
- package/dist/node/stt/segment-splitter.js +36 -0
- package/dist/node/stt/subtitle-formatter.js +51 -0
- package/dist/node/stt/transcriber.js +219 -0
- package/dist/node/stt/types.js +0 -0
- package/dist/node/sync/duration-negotiator.js +69 -0
- package/dist/node/sync/index.js +165 -0
- package/dist/node/sync/scene-adapter.js +52 -0
- package/dist/node/sync/timing-calculator.js +46 -0
- package/dist/node/tts/audio-assembler.js +120 -0
- package/dist/node/tts/emphasis-planner.js +134 -0
- package/dist/node/tts/index.js +439 -0
- package/dist/node/tts/pace-analyzer.js +67 -0
- package/dist/node/tts/segment-synthesizer.js +36 -0
- package/dist/node/tts/types.js +0 -0
- package/dist/node/tts/voice-synthesizer.js +435 -0
- package/dist/node/types.js +0 -0
- package/dist/stt/diarization-mapper.d.ts +19 -0
- package/dist/stt/diarization-mapper.js +43 -0
- package/dist/stt/index.d.ts +5 -0
- package/dist/stt/index.js +223 -0
- package/dist/stt/segment-splitter.d.ts +19 -0
- package/dist/stt/segment-splitter.js +37 -0
- package/dist/stt/subtitle-formatter.d.ts +19 -0
- package/dist/stt/subtitle-formatter.js +52 -0
- package/dist/stt/transcriber.d.ts +21 -0
- package/dist/stt/transcriber.js +220 -0
- package/dist/stt/types.d.ts +44 -0
- package/dist/stt/types.js +1 -0
- package/dist/sync/duration-negotiator.d.ts +37 -0
- package/dist/sync/duration-negotiator.js +70 -0
- package/dist/sync/index.d.ts +3 -0
- package/dist/sync/index.js +166 -0
- package/dist/sync/scene-adapter.d.ts +29 -0
- package/dist/sync/scene-adapter.js +53 -0
- package/dist/sync/timing-calculator.d.ts +21 -0
- package/dist/sync/timing-calculator.js +47 -0
- package/dist/tts/audio-assembler.d.ts +19 -0
- package/dist/tts/audio-assembler.js +121 -0
- package/dist/tts/emphasis-planner.d.ts +24 -0
- package/dist/tts/emphasis-planner.js +135 -0
- package/dist/tts/index.d.ts +6 -0
- package/dist/tts/index.js +440 -0
- package/dist/tts/pace-analyzer.d.ts +30 -0
- package/dist/tts/pace-analyzer.js +68 -0
- package/dist/tts/segment-synthesizer.d.ts +21 -0
- package/dist/tts/segment-synthesizer.js +37 -0
- package/dist/tts/types.d.ts +76 -0
- package/dist/tts/types.js +1 -0
- package/dist/tts/voice-synthesizer.d.ts +28 -0
- package/dist/tts/voice-synthesizer.js +436 -0
- package/dist/types.d.ts +12 -0
- package/dist/types.js +1 -0
- package/package.json +760 -0
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
// src/conversational/transcript-builder.ts
|
|
2
|
+
/**
 * Builds a conversation transcript from a stream of voice-session events.
 *
 * Each completed turn is stored as `{ role, startMs, text, endMs }`. The
 * timestamps are millisecond offsets from the session start, taken from
 * `Date.now()` at the moment the corresponding event is processed.
 */
class TranscriptBuilder {
  /** Completed turns, in completion order. */
  turns = [];
  /** Turn that has started but not yet ended, or `null` when none is open. */
  currentTurn = null;
  /** Epoch milliseconds used as the zero point for turn offsets. */
  sessionStartMs = Date.now();

  /**
   * @returns {object[]} A new array holding the completed turns. The array is
   *   a copy; the turn objects themselves are shared with the builder.
   */
  getTranscript() {
    return [...this.turns];
  }

  /**
   * @returns {string} One `[role] text` line per completed turn, joined by
   *   newline characters.
   */
  toText() {
    return this.turns.map((t) => `[${t.role}] ${t.text}`).join("\n");
  }

  /** @returns {number} Number of completed turns. */
  getTurnCount() {
    return this.turns.length;
  }

  /**
   * Updates the transcript from a single session event.
   *
   * - `session_started` resets the time origin.
   * - `user_speech_started` / `agent_speech_started` open a turn, replacing
   *   any turn that was still open.
   * - `user_speech_ended` closes the open user turn with `event.transcript`.
   *   If no user turn is open but the event carries a string transcript, a
   *   zero-length user turn is recorded instead of dropping the text; this
   *   covers producers that emit the end event without a start event.
   * - `agent_speech_ended` closes the open agent turn; without one it is
   *   ignored, because the event itself carries no text to record.
   * - Every other event type (including `transcript`) is ignored.
   *
   * @param {{ type: string, transcript?: string, text?: string }} event
   */
  processEvent(event) {
    switch (event.type) {
      case "session_started":
        this.sessionStartMs = Date.now();
        break;
      case "user_speech_started":
        this.currentTurn = {
          role: "user",
          startMs: Date.now() - this.sessionStartMs
        };
        break;
      case "user_speech_ended": {
        const endMs = Date.now() - this.sessionStartMs;
        if (this.currentTurn && this.currentTurn.role === "user") {
          this.currentTurn.text = event.transcript;
          this.currentTurn.endMs = endMs;
          this.turns.push(this.currentTurn);
          this.currentTurn = null;
        } else if (typeof event.transcript === "string") {
          // No matching start event: keep the transcript rather than lose it.
          // An open agent turn (if any) is deliberately left untouched.
          this.turns.push({
            role: "user",
            startMs: endMs,
            text: event.transcript,
            endMs
          });
        }
        break;
      }
      case "agent_speech_started":
        this.currentTurn = {
          role: "agent",
          text: event.text,
          startMs: Date.now() - this.sessionStartMs
        };
        break;
      case "agent_speech_ended":
        if (this.currentTurn && this.currentTurn.role === "agent") {
          this.currentTurn.endMs = Date.now() - this.sessionStartMs;
          this.turns.push(this.currentTurn);
          this.currentTurn = null;
        }
        break;
      default:
        break;
    }
  }

  /** Clears all turns in place and restarts the time origin at "now". */
  reset() {
    this.turns.length = 0;
    this.currentTurn = null;
    this.sessionStartMs = Date.now();
  }
}
|
|
61
|
+
|
|
62
|
+
// src/conversational/voice-session-manager.ts
|
|
63
|
+
/**
 * Opens sessions on a conversational provider and mirrors the session's
 * progress into a plain `state` object (session id, status, current turn,
 * turn count, duration and transcript).
 */
class VoiceSessionManager {
  provider;

  /**
   * @param {{ conversational: object }} options - `conversational` is the
   *   provider whose `startSession` is called for every new session.
   */
  constructor(options) {
    this.provider = options.conversational;
  }

  /**
   * Starts a provider session and returns a handle around it.
   *
   * Only the configuration keys listed below are forwarded to the provider.
   * The returned `state` object is updated while `events` is being consumed.
   *
   * @param {object} config - session configuration.
   * @returns {Promise<object>} `{ state, sendAudio, sendText, interrupt, close, events }`.
   */
  async startSession(config) {
    const transcriptBuilder = new TranscriptBuilder();

    const forwardedKeys = [
      "voiceId",
      "language",
      "systemPrompt",
      "llmModel",
      "inputFormat",
      "outputFormat",
      "turnDetection",
      "silenceThresholdMs",
      "maxDurationSeconds"
    ];
    const providerConfig = Object.fromEntries(
      forwardedKeys.map((key) => [key, config[key]])
    );
    const session = await this.provider.startSession(providerConfig);

    const state = {
      sessionId: "",
      status: "connecting",
      currentTurn: "idle",
      turnCount: 0,
      durationMs: 0,
      transcript: []
    };

    return {
      state,
      sendAudio(chunk) {
        return session.sendAudio(chunk);
      },
      sendText(text) {
        return session.sendText(text);
      },
      interrupt() {
        return session.interrupt();
      },
      async close() {
        const summary = await session.close();
        // Status flips only after the provider has finished closing.
        state.status = "ended";
        return summary;
      },
      events: this.wrapEvents(session.events, state, transcriptBuilder)
    };
  }

  /**
   * Re-yields every provider event unchanged, after first feeding it to the
   * transcript builder and applying it to `state`.
   *
   * @param {AsyncIterable<object>} events - provider event stream.
   * @param {object} state - mutable session state to keep in sync.
   * @param {object} transcriptBuilder - receives each event via `processEvent`.
   */
  async* wrapEvents(events, state, transcriptBuilder) {
    for await (const event of events) {
      transcriptBuilder.processEvent(event);

      const { type } = event;
      if (type === "session_started") {
        state.sessionId = event.sessionId;
        state.status = "active";
      } else if (type === "user_speech_started") {
        state.currentTurn = "user";
      } else if (type === "agent_speech_started") {
        state.currentTurn = "agent";
      } else if (type === "user_speech_ended" || type === "agent_speech_ended") {
        // Either speaker finishing returns the session to idle and counts as a turn.
        state.currentTurn = "idle";
        state.turnCount += 1;
      } else if (type === "session_ended") {
        state.status = "ended";
        state.durationMs = event.durationMs;
      }

      // Refresh the transcript snapshot before handing the event to the consumer.
      state.transcript = transcriptBuilder.getTranscript();
      yield event;
    }
  }
}
|
|
135
|
+
|
|
136
|
+
// src/conversational/turn-detector.ts
|
|
137
|
+
/**
 * Signals the end of a speaking turn once the incoming audio has stayed at or
 * below an energy threshold for a configured length of time.
 */
class TurnDetector {
  silenceThresholdMs;
  energyThreshold;
  /** Timestamp of the first chunk of the current silent run, or `null`. */
  silenceStartMs = null;

  /**
   * @param {number} [silenceThresholdMs=800] - silence duration that ends a turn.
   * @param {number} [energyThreshold=0.01] - energy above this value counts as speech.
   */
  constructor(silenceThresholdMs = 800, energyThreshold = 0.01) {
    this.silenceThresholdMs = silenceThresholdMs;
    this.energyThreshold = energyThreshold;
  }

  /**
   * Feeds one audio chunk to the detector.
   *
   * @param {ArrayLike<number>} chunk - audio bytes (see `calculateEnergy`).
   * @param {number} timestampMs - caller-supplied time of this chunk.
   * @returns {boolean} `true` when the current silent run has lasted at least
   *   `silenceThresholdMs`; `false` for speech or shorter silence.
   */
  processChunk(chunk, timestampMs) {
    if (this.calculateEnergy(chunk) > this.energyThreshold) {
      // Speech cancels any silent run in progress.
      this.silenceStartMs = null;
      return false;
    }

    if (this.silenceStartMs === null) {
      this.silenceStartMs = timestampMs;
    }
    const elapsedSilence = timestampMs - this.silenceStartMs;
    return elapsedSilence >= this.silenceThresholdMs;
  }

  /** Forgets any silent run in progress. */
  reset() {
    this.silenceStartMs = null;
  }

  /**
   * Computes the root-mean-square of the chunk's samples.
   *
   * Consecutive byte pairs are combined low byte first into a signed 16-bit
   * value and divided by 32768. A trailing unpaired byte is not read.
   *
   * @param {ArrayLike<number>} chunk - audio bytes.
   * @returns {number} RMS of the normalized samples, or 0 for fewer than 2 bytes.
   */
  calculateEnergy(chunk) {
    if (chunk.length < 2) {
      return 0;
    }

    const sampleCount = Math.floor(chunk.length / 2);
    let squares = 0;
    for (let s = 0; s < sampleCount; s += 1) {
      const offset = s * 2;
      const low = chunk[offset] ?? 0;
      const high = chunk[offset + 1] ?? 0;
      const unsigned = low | high << 8;
      // Shift up and back down to sign-extend the 16-bit value.
      const sample = unsigned << 16 >> 16;
      const normalized = sample / 32768;
      squares += normalized * normalized;
    }
    return Math.sqrt(squares / sampleCount);
  }
}
|
|
176
|
+
|
|
177
|
+
// src/conversational/response-orchestrator.ts
|
|
178
|
+
/**
 * Runs one user turn through speech-to-text, an LLM and text-to-speech,
 * yielding conversation events as each stage completes. The text of every
 * turn is kept in `conversationHistory` and replayed to the LLM on later turns.
 */
class ResponseOrchestrator {
  stt;
  llm;
  tts;
  /** `{ role, content }` entries for every user and assistant message so far. */
  conversationHistory = [];

  /**
   * @param {object} stt - provides `transcribe(request)`.
   * @param {object} llm - provides `chat(messages, options)`.
   * @param {object} tts - provides `synthesize(request)`.
   */
  constructor(stt, llm, tts) {
    this.stt = stt;
    this.llm = llm;
    this.tts = tts;
  }

  /**
   * Processes a single user utterance.
   *
   * Yields, in order: `user_speech_ended`, a user `transcript`,
   * `agent_speech_started`, `agent_audio`, `agent_speech_ended`, and an agent
   * `transcript`.
   *
   * @param {*} userAudio - audio handed to the STT provider as-is.
   * @param {object} config - reads `language`, `systemPrompt`, `llmModel`,
   *   `voiceId` and `outputFormat`.
   */
  async* processUserTurn(userAudio, config) {
    const asTextMessage = (role, text) => ({
      role,
      content: [{ type: "text", text }]
    });

    // Stage 1: speech to text.
    const { text: userText } = await this.stt.transcribe({
      audio: userAudio,
      language: config.language,
      wordTimestamps: false
    });
    yield { type: "user_speech_ended", transcript: userText };
    yield {
      type: "transcript",
      role: "user",
      text: userText,
      timestamp: Date.now()
    };
    this.conversationHistory.push({ role: "user", content: userText });

    // Stage 2: system prompt plus the full history go to the LLM.
    const messages = [asTextMessage("system", config.systemPrompt)];
    for (const entry of this.conversationHistory) {
      messages.push(asTextMessage(entry.role, entry.content));
    }
    const llmResponse = await this.llm.chat(messages, { model: config.llmModel });

    // The first text part is the reply; a fixed apology is used when there is none.
    const textPart = llmResponse.message.content.find((part) => part.type === "text");
    const agentText = textPart
      ? textPart.text
      : "I apologize, I could not generate a response.";
    this.conversationHistory.push({ role: "assistant", content: agentText });
    yield { type: "agent_speech_started", text: agentText };

    // Stage 3: text to speech.
    const synthesis = await this.tts.synthesize({
      text: agentText,
      voiceId: config.voiceId,
      language: config.language,
      format: config.outputFormat
    });
    yield { type: "agent_audio", audio: synthesis.audio.data };
    yield { type: "agent_speech_ended" };
    yield {
      type: "transcript",
      role: "agent",
      text: agentText,
      timestamp: Date.now()
    };
  }

  /** Empties the conversation history in place. */
  reset() {
    this.conversationHistory.length = 0;
  }
}
|
|
236
|
+
export {
|
|
237
|
+
VoiceSessionManager,
|
|
238
|
+
TurnDetector,
|
|
239
|
+
TranscriptBuilder,
|
|
240
|
+
ResponseOrchestrator
|
|
241
|
+
};
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
// src/conversational/response-orchestrator.ts
|
|
2
|
+
/**
 * Runs one user turn through speech-to-text, an LLM and text-to-speech,
 * yielding conversation events as each stage completes. The text of every
 * turn is kept in `conversationHistory` and replayed to the LLM on later turns.
 */
class ResponseOrchestrator {
  stt;
  llm;
  tts;
  /** `{ role, content }` entries for every user and assistant message so far. */
  conversationHistory = [];

  /**
   * @param {object} stt - provides `transcribe(request)`.
   * @param {object} llm - provides `chat(messages, options)`.
   * @param {object} tts - provides `synthesize(request)`.
   */
  constructor(stt, llm, tts) {
    this.stt = stt;
    this.llm = llm;
    this.tts = tts;
  }

  /**
   * Processes a single user utterance.
   *
   * Yields, in order: `user_speech_ended`, a user `transcript`,
   * `agent_speech_started`, `agent_audio`, `agent_speech_ended`, and an agent
   * `transcript`.
   *
   * @param {*} userAudio - audio handed to the STT provider as-is.
   * @param {object} config - reads `language`, `systemPrompt`, `llmModel`,
   *   `voiceId` and `outputFormat`.
   */
  async* processUserTurn(userAudio, config) {
    const asTextMessage = (role, text) => ({
      role,
      content: [{ type: "text", text }]
    });

    // Stage 1: speech to text.
    const { text: userText } = await this.stt.transcribe({
      audio: userAudio,
      language: config.language,
      wordTimestamps: false
    });
    yield { type: "user_speech_ended", transcript: userText };
    yield {
      type: "transcript",
      role: "user",
      text: userText,
      timestamp: Date.now()
    };
    this.conversationHistory.push({ role: "user", content: userText });

    // Stage 2: system prompt plus the full history go to the LLM.
    const messages = [asTextMessage("system", config.systemPrompt)];
    for (const entry of this.conversationHistory) {
      messages.push(asTextMessage(entry.role, entry.content));
    }
    const llmResponse = await this.llm.chat(messages, { model: config.llmModel });

    // The first text part is the reply; a fixed apology is used when there is none.
    const textPart = llmResponse.message.content.find((part) => part.type === "text");
    const agentText = textPart
      ? textPart.text
      : "I apologize, I could not generate a response.";
    this.conversationHistory.push({ role: "assistant", content: agentText });
    yield { type: "agent_speech_started", text: agentText };

    // Stage 3: text to speech.
    const synthesis = await this.tts.synthesize({
      text: agentText,
      voiceId: config.voiceId,
      language: config.language,
      format: config.outputFormat
    });
    yield { type: "agent_audio", audio: synthesis.audio.data };
    yield { type: "agent_speech_ended" };
    yield {
      type: "transcript",
      role: "agent",
      text: agentText,
      timestamp: Date.now()
    };
  }

  /** Empties the conversation history in place. */
  reset() {
    this.conversationHistory.length = 0;
  }
}
|
|
60
|
+
export {
|
|
61
|
+
ResponseOrchestrator
|
|
62
|
+
};
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
// src/conversational/transcript-builder.ts
|
|
2
|
+
/**
 * Builds a conversation transcript from a stream of voice-session events.
 *
 * Each completed turn is stored as `{ role, startMs, text, endMs }`. The
 * timestamps are millisecond offsets from the session start, taken from
 * `Date.now()` at the moment the corresponding event is processed.
 */
class TranscriptBuilder {
  /** Completed turns, in completion order. */
  turns = [];
  /** Turn that has started but not yet ended, or `null` when none is open. */
  currentTurn = null;
  /** Epoch milliseconds used as the zero point for turn offsets. */
  sessionStartMs = Date.now();

  /**
   * @returns {object[]} A new array holding the completed turns. The array is
   *   a copy; the turn objects themselves are shared with the builder.
   */
  getTranscript() {
    return [...this.turns];
  }

  /**
   * @returns {string} One `[role] text` line per completed turn, joined by
   *   newline characters.
   */
  toText() {
    return this.turns.map((t) => `[${t.role}] ${t.text}`).join("\n");
  }

  /** @returns {number} Number of completed turns. */
  getTurnCount() {
    return this.turns.length;
  }

  /**
   * Updates the transcript from a single session event.
   *
   * - `session_started` resets the time origin.
   * - `user_speech_started` / `agent_speech_started` open a turn, replacing
   *   any turn that was still open.
   * - `user_speech_ended` closes the open user turn with `event.transcript`.
   *   If no user turn is open but the event carries a string transcript, a
   *   zero-length user turn is recorded instead of dropping the text; this
   *   covers producers that emit the end event without a start event.
   * - `agent_speech_ended` closes the open agent turn; without one it is
   *   ignored, because the event itself carries no text to record.
   * - Every other event type (including `transcript`) is ignored.
   *
   * @param {{ type: string, transcript?: string, text?: string }} event
   */
  processEvent(event) {
    switch (event.type) {
      case "session_started":
        this.sessionStartMs = Date.now();
        break;
      case "user_speech_started":
        this.currentTurn = {
          role: "user",
          startMs: Date.now() - this.sessionStartMs
        };
        break;
      case "user_speech_ended": {
        const endMs = Date.now() - this.sessionStartMs;
        if (this.currentTurn && this.currentTurn.role === "user") {
          this.currentTurn.text = event.transcript;
          this.currentTurn.endMs = endMs;
          this.turns.push(this.currentTurn);
          this.currentTurn = null;
        } else if (typeof event.transcript === "string") {
          // No matching start event: keep the transcript rather than lose it.
          // An open agent turn (if any) is deliberately left untouched.
          this.turns.push({
            role: "user",
            startMs: endMs,
            text: event.transcript,
            endMs
          });
        }
        break;
      }
      case "agent_speech_started":
        this.currentTurn = {
          role: "agent",
          text: event.text,
          startMs: Date.now() - this.sessionStartMs
        };
        break;
      case "agent_speech_ended":
        if (this.currentTurn && this.currentTurn.role === "agent") {
          this.currentTurn.endMs = Date.now() - this.sessionStartMs;
          this.turns.push(this.currentTurn);
          this.currentTurn = null;
        }
        break;
      default:
        break;
    }
  }

  /** Clears all turns in place and restarts the time origin at "now". */
  reset() {
    this.turns.length = 0;
    this.currentTurn = null;
    this.sessionStartMs = Date.now();
  }
}
|
|
61
|
+
export {
|
|
62
|
+
TranscriptBuilder
|
|
63
|
+
};
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
// src/conversational/turn-detector.ts
|
|
2
|
+
/**
 * Signals the end of a speaking turn once the incoming audio has stayed at or
 * below an energy threshold for a configured length of time.
 */
class TurnDetector {
  silenceThresholdMs;
  energyThreshold;
  /** Timestamp of the first chunk of the current silent run, or `null`. */
  silenceStartMs = null;

  /**
   * @param {number} [silenceThresholdMs=800] - silence duration that ends a turn.
   * @param {number} [energyThreshold=0.01] - energy above this value counts as speech.
   */
  constructor(silenceThresholdMs = 800, energyThreshold = 0.01) {
    this.silenceThresholdMs = silenceThresholdMs;
    this.energyThreshold = energyThreshold;
  }

  /**
   * Feeds one audio chunk to the detector.
   *
   * @param {ArrayLike<number>} chunk - audio bytes (see `calculateEnergy`).
   * @param {number} timestampMs - caller-supplied time of this chunk.
   * @returns {boolean} `true` when the current silent run has lasted at least
   *   `silenceThresholdMs`; `false` for speech or shorter silence.
   */
  processChunk(chunk, timestampMs) {
    if (this.calculateEnergy(chunk) > this.energyThreshold) {
      // Speech cancels any silent run in progress.
      this.silenceStartMs = null;
      return false;
    }

    if (this.silenceStartMs === null) {
      this.silenceStartMs = timestampMs;
    }
    const elapsedSilence = timestampMs - this.silenceStartMs;
    return elapsedSilence >= this.silenceThresholdMs;
  }

  /** Forgets any silent run in progress. */
  reset() {
    this.silenceStartMs = null;
  }

  /**
   * Computes the root-mean-square of the chunk's samples.
   *
   * Consecutive byte pairs are combined low byte first into a signed 16-bit
   * value and divided by 32768. A trailing unpaired byte is not read.
   *
   * @param {ArrayLike<number>} chunk - audio bytes.
   * @returns {number} RMS of the normalized samples, or 0 for fewer than 2 bytes.
   */
  calculateEnergy(chunk) {
    if (chunk.length < 2) {
      return 0;
    }

    const sampleCount = Math.floor(chunk.length / 2);
    let squares = 0;
    for (let s = 0; s < sampleCount; s += 1) {
      const offset = s * 2;
      const low = chunk[offset] ?? 0;
      const high = chunk[offset + 1] ?? 0;
      const unsigned = low | high << 8;
      // Shift up and back down to sign-extend the 16-bit value.
      const sample = unsigned << 16 >> 16;
      const normalized = sample / 32768;
      squares += normalized * normalized;
    }
    return Math.sqrt(squares / sampleCount);
  }
}
|
|
41
|
+
export {
|
|
42
|
+
TurnDetector
|
|
43
|
+
};
|
|
File without changes
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
// src/conversational/transcript-builder.ts
|
|
2
|
+
/**
 * Builds a conversation transcript from a stream of voice-session events.
 *
 * Each completed turn is stored as `{ role, startMs, text, endMs }`. The
 * timestamps are millisecond offsets from the session start, taken from
 * `Date.now()` at the moment the corresponding event is processed.
 */
class TranscriptBuilder {
  /** Completed turns, in completion order. */
  turns = [];
  /** Turn that has started but not yet ended, or `null` when none is open. */
  currentTurn = null;
  /** Epoch milliseconds used as the zero point for turn offsets. */
  sessionStartMs = Date.now();

  /**
   * @returns {object[]} A new array holding the completed turns. The array is
   *   a copy; the turn objects themselves are shared with the builder.
   */
  getTranscript() {
    return [...this.turns];
  }

  /**
   * @returns {string} One `[role] text` line per completed turn, joined by
   *   newline characters.
   */
  toText() {
    return this.turns.map((t) => `[${t.role}] ${t.text}`).join("\n");
  }

  /** @returns {number} Number of completed turns. */
  getTurnCount() {
    return this.turns.length;
  }

  /**
   * Updates the transcript from a single session event.
   *
   * - `session_started` resets the time origin.
   * - `user_speech_started` / `agent_speech_started` open a turn, replacing
   *   any turn that was still open.
   * - `user_speech_ended` closes the open user turn with `event.transcript`.
   *   If no user turn is open but the event carries a string transcript, a
   *   zero-length user turn is recorded instead of dropping the text; this
   *   covers producers that emit the end event without a start event.
   * - `agent_speech_ended` closes the open agent turn; without one it is
   *   ignored, because the event itself carries no text to record.
   * - Every other event type (including `transcript`) is ignored.
   *
   * @param {{ type: string, transcript?: string, text?: string }} event
   */
  processEvent(event) {
    switch (event.type) {
      case "session_started":
        this.sessionStartMs = Date.now();
        break;
      case "user_speech_started":
        this.currentTurn = {
          role: "user",
          startMs: Date.now() - this.sessionStartMs
        };
        break;
      case "user_speech_ended": {
        const endMs = Date.now() - this.sessionStartMs;
        if (this.currentTurn && this.currentTurn.role === "user") {
          this.currentTurn.text = event.transcript;
          this.currentTurn.endMs = endMs;
          this.turns.push(this.currentTurn);
          this.currentTurn = null;
        } else if (typeof event.transcript === "string") {
          // No matching start event: keep the transcript rather than lose it.
          // An open agent turn (if any) is deliberately left untouched.
          this.turns.push({
            role: "user",
            startMs: endMs,
            text: event.transcript,
            endMs
          });
        }
        break;
      }
      case "agent_speech_started":
        this.currentTurn = {
          role: "agent",
          text: event.text,
          startMs: Date.now() - this.sessionStartMs
        };
        break;
      case "agent_speech_ended":
        if (this.currentTurn && this.currentTurn.role === "agent") {
          this.currentTurn.endMs = Date.now() - this.sessionStartMs;
          this.turns.push(this.currentTurn);
          this.currentTurn = null;
        }
        break;
      default:
        break;
    }
  }

  /** Clears all turns in place and restarts the time origin at "now". */
  reset() {
    this.turns.length = 0;
    this.currentTurn = null;
    this.sessionStartMs = Date.now();
  }
}
|
|
61
|
+
|
|
62
|
+
// src/conversational/voice-session-manager.ts
|
|
63
|
+
/**
 * Opens sessions on a conversational provider and mirrors the session's
 * progress into a plain `state` object (session id, status, current turn,
 * turn count, duration and transcript).
 */
class VoiceSessionManager {
  provider;

  /**
   * @param {{ conversational: object }} options - `conversational` is the
   *   provider whose `startSession` is called for every new session.
   */
  constructor(options) {
    this.provider = options.conversational;
  }

  /**
   * Starts a provider session and returns a handle around it.
   *
   * Only the configuration keys listed below are forwarded to the provider.
   * The returned `state` object is updated while `events` is being consumed.
   *
   * @param {object} config - session configuration.
   * @returns {Promise<object>} `{ state, sendAudio, sendText, interrupt, close, events }`.
   */
  async startSession(config) {
    const transcriptBuilder = new TranscriptBuilder();

    const forwardedKeys = [
      "voiceId",
      "language",
      "systemPrompt",
      "llmModel",
      "inputFormat",
      "outputFormat",
      "turnDetection",
      "silenceThresholdMs",
      "maxDurationSeconds"
    ];
    const providerConfig = Object.fromEntries(
      forwardedKeys.map((key) => [key, config[key]])
    );
    const session = await this.provider.startSession(providerConfig);

    const state = {
      sessionId: "",
      status: "connecting",
      currentTurn: "idle",
      turnCount: 0,
      durationMs: 0,
      transcript: []
    };

    return {
      state,
      sendAudio(chunk) {
        return session.sendAudio(chunk);
      },
      sendText(text) {
        return session.sendText(text);
      },
      interrupt() {
        return session.interrupt();
      },
      async close() {
        const summary = await session.close();
        // Status flips only after the provider has finished closing.
        state.status = "ended";
        return summary;
      },
      events: this.wrapEvents(session.events, state, transcriptBuilder)
    };
  }

  /**
   * Re-yields every provider event unchanged, after first feeding it to the
   * transcript builder and applying it to `state`.
   *
   * @param {AsyncIterable<object>} events - provider event stream.
   * @param {object} state - mutable session state to keep in sync.
   * @param {object} transcriptBuilder - receives each event via `processEvent`.
   */
  async* wrapEvents(events, state, transcriptBuilder) {
    for await (const event of events) {
      transcriptBuilder.processEvent(event);

      const { type } = event;
      if (type === "session_started") {
        state.sessionId = event.sessionId;
        state.status = "active";
      } else if (type === "user_speech_started") {
        state.currentTurn = "user";
      } else if (type === "agent_speech_started") {
        state.currentTurn = "agent";
      } else if (type === "user_speech_ended" || type === "agent_speech_ended") {
        // Either speaker finishing returns the session to idle and counts as a turn.
        state.currentTurn = "idle";
        state.turnCount += 1;
      } else if (type === "session_ended") {
        state.status = "ended";
        state.durationMs = event.durationMs;
      }

      // Refresh the transcript snapshot before handing the event to the consumer.
      state.transcript = transcriptBuilder.getTranscript();
      yield event;
    }
  }
}
|
|
135
|
+
export {
|
|
136
|
+
VoiceSessionManager
|
|
137
|
+
};
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
// src/i18n/catalogs/en.ts
|
|
2
|
+
import { defineTranslation } from "@contractspec/lib.contracts-spec/translations";
|
|
3
|
+
/**
 * English ("en") message catalog for the voice package, registered under the
 * translation key "voice.messages". Entries cover LLM prompts (`prompt.*`),
 * pacing descriptions (`pace.*`), speech-to-text strings (`stt.*`) and
 * conversation status strings (`conv.*`). Values containing `{name}` tokens
 * list those names under `placeholders`.
 */
const enMessages = defineTranslation({
  meta: {
    key: "voice.messages",
    version: "1.0.0",
    domain: "voice",
    description: "All user-facing, LLM-facing, and developer-facing strings for the voice package",
    owners: ["platform"],
    stability: "experimental"
  },
  locale: "en",
  fallback: "en",
  messages: {
    "prompt.tts.system": {
      value: `You are a voice narration script writer.
Analyze the content and produce a narration script with pacing directives.
Return JSON with segments, each having sceneId, text, and contentType.`,
      description: "TTS system prompt for LLM-enhanced script generation"
    },
    "prompt.pace.sceneMatched": {
      value: "Match voice pacing to scene durations. Adjust rate and emphasis per segment to fit the video timeline.",
      description: "Prompt for scene-matched pacing strategy"
    },
    "prompt.emphasis.system": {
      value: "You are a voice director. For each segment, determine the optimal emphasis, tone, and speaking rate.",
      description: "Emphasis planner LLM system prompt"
    },
    "pace.intro.description": {
      value: "Authoritative opening at a measured pace",
      description: "Description for intro pacing"
    },
    "pace.problem.description": {
      value: "Urgent emphasis on the challenge",
      description: "Description for problem pacing"
    },
    "pace.solution.description": {
      value: "Calm, clear delivery of the solution",
      description: "Description for solution pacing"
    },
    "pace.metric.description": {
      value: "Excited emphasis on key results",
      description: "Description for metric pacing"
    },
    "pace.cta.description": {
      value: "Authoritative call to action",
      description: "Description for CTA pacing"
    },
    "pace.transition.description": {
      value: "Quick neutral transition",
      description: "Description for transition pacing"
    },
    "stt.transcribing": {
      value: "Transcribing audio...",
      description: "Status message during transcription"
    },
    "stt.diarization.speaker": {
      value: "Speaker {index}",
      description: "Default speaker label",
      placeholders: [{ name: "index", type: "number" }]
    },
    "stt.subtitle.timestamp": {
      value: "{start} --> {end}",
      description: "Subtitle timestamp format",
      placeholders: [
        { name: "start", type: "string" },
        { name: "end", type: "string" }
      ]
    },
    "conv.session.started": {
      value: "Voice session started",
      description: "Session start notification"
    },
    "conv.turn.user": {
      value: "User is speaking",
      description: "User turn indicator"
    },
    "conv.turn.agent": {
      value: "Agent is responding",
      description: "Agent turn indicator"
    },
    "conv.session.ended": {
      value: "Voice session ended. Duration: {durationMs}ms",
      description: "Session end notification",
      placeholders: [{ name: "durationMs", type: "number" }]
    }
  }
});
|
|
89
|
+
export {
|
|
90
|
+
enMessages
|
|
91
|
+
};
|