@contractspec/lib.voice 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/audio/audio-concatenator.d.ts +15 -0
- package/dist/audio/audio-concatenator.js +57 -0
- package/dist/audio/duration-estimator.d.ts +31 -0
- package/dist/audio/duration-estimator.js +22 -0
- package/dist/audio/format-converter.d.ts +17 -0
- package/dist/audio/format-converter.js +28 -0
- package/dist/audio/index.d.ts +4 -0
- package/dist/audio/index.js +121 -0
- package/dist/audio/silence-generator.d.ts +16 -0
- package/dist/audio/silence-generator.js +20 -0
- package/dist/browser/audio/audio-concatenator.js +56 -0
- package/dist/browser/audio/duration-estimator.js +21 -0
- package/dist/browser/audio/format-converter.js +27 -0
- package/dist/browser/audio/index.js +120 -0
- package/dist/browser/audio/silence-generator.js +19 -0
- package/dist/browser/conversational/index.js +241 -0
- package/dist/browser/conversational/response-orchestrator.js +62 -0
- package/dist/browser/conversational/transcript-builder.js +63 -0
- package/dist/browser/conversational/turn-detector.js +43 -0
- package/dist/browser/conversational/types.js +0 -0
- package/dist/browser/conversational/voice-session-manager.js +137 -0
- package/dist/browser/docs/conversational.docblock.js +5 -0
- package/dist/browser/docs/stt.docblock.js +5 -0
- package/dist/browser/docs/sync.docblock.js +5 -0
- package/dist/browser/docs/tts.docblock.js +5 -0
- package/dist/browser/docs/voice.docblock.js +5 -0
- package/dist/browser/i18n/catalogs/en.js +91 -0
- package/dist/browser/i18n/catalogs/es.js +91 -0
- package/dist/browser/i18n/catalogs/fr.js +91 -0
- package/dist/browser/i18n/catalogs/index.js +271 -0
- package/dist/browser/i18n/index.js +335 -0
- package/dist/browser/i18n/keys.js +38 -0
- package/dist/browser/i18n/locale.js +13 -0
- package/dist/browser/i18n/messages.js +283 -0
- package/dist/browser/index.js +1070 -0
- package/dist/browser/stt/diarization-mapper.js +42 -0
- package/dist/browser/stt/index.js +222 -0
- package/dist/browser/stt/segment-splitter.js +36 -0
- package/dist/browser/stt/subtitle-formatter.js +51 -0
- package/dist/browser/stt/transcriber.js +219 -0
- package/dist/browser/stt/types.js +0 -0
- package/dist/browser/sync/duration-negotiator.js +69 -0
- package/dist/browser/sync/index.js +165 -0
- package/dist/browser/sync/scene-adapter.js +52 -0
- package/dist/browser/sync/timing-calculator.js +46 -0
- package/dist/browser/tts/audio-assembler.js +120 -0
- package/dist/browser/tts/emphasis-planner.js +134 -0
- package/dist/browser/tts/index.js +439 -0
- package/dist/browser/tts/pace-analyzer.js +67 -0
- package/dist/browser/tts/segment-synthesizer.js +36 -0
- package/dist/browser/tts/types.js +0 -0
- package/dist/browser/tts/voice-synthesizer.js +435 -0
- package/dist/browser/types.js +0 -0
- package/dist/conversational/index.d.ts +5 -0
- package/dist/conversational/index.js +242 -0
- package/dist/conversational/response-orchestrator.d.ts +26 -0
- package/dist/conversational/response-orchestrator.js +63 -0
- package/dist/conversational/transcript-builder.d.ts +25 -0
- package/dist/conversational/transcript-builder.js +64 -0
- package/dist/conversational/turn-detector.d.ts +31 -0
- package/dist/conversational/turn-detector.js +44 -0
- package/dist/conversational/types.d.ts +55 -0
- package/dist/conversational/types.js +1 -0
- package/dist/conversational/voice-session-manager.d.ts +17 -0
- package/dist/conversational/voice-session-manager.js +138 -0
- package/dist/docs/conversational.docblock.d.ts +14 -0
- package/dist/docs/conversational.docblock.js +6 -0
- package/dist/docs/stt.docblock.d.ts +12 -0
- package/dist/docs/stt.docblock.js +6 -0
- package/dist/docs/sync.docblock.d.ts +12 -0
- package/dist/docs/sync.docblock.js +6 -0
- package/dist/docs/tts.docblock.d.ts +12 -0
- package/dist/docs/tts.docblock.js +6 -0
- package/dist/docs/voice.docblock.d.ts +22 -0
- package/dist/docs/voice.docblock.js +6 -0
- package/dist/i18n/catalogs/en.d.ts +6 -0
- package/dist/i18n/catalogs/en.js +92 -0
- package/dist/i18n/catalogs/es.d.ts +4 -0
- package/dist/i18n/catalogs/es.js +92 -0
- package/dist/i18n/catalogs/fr.d.ts +4 -0
- package/dist/i18n/catalogs/fr.js +92 -0
- package/dist/i18n/catalogs/index.d.ts +3 -0
- package/dist/i18n/catalogs/index.js +272 -0
- package/dist/i18n/index.d.ts +20 -0
- package/dist/i18n/index.js +336 -0
- package/dist/i18n/keys.d.ts +50 -0
- package/dist/i18n/keys.js +39 -0
- package/dist/i18n/locale.d.ts +6 -0
- package/dist/i18n/locale.js +14 -0
- package/dist/i18n/messages.d.ts +13 -0
- package/dist/i18n/messages.js +284 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +1071 -0
- package/dist/node/audio/audio-concatenator.js +56 -0
- package/dist/node/audio/duration-estimator.js +21 -0
- package/dist/node/audio/format-converter.js +27 -0
- package/dist/node/audio/index.js +120 -0
- package/dist/node/audio/silence-generator.js +19 -0
- package/dist/node/conversational/index.js +241 -0
- package/dist/node/conversational/response-orchestrator.js +62 -0
- package/dist/node/conversational/transcript-builder.js +63 -0
- package/dist/node/conversational/turn-detector.js +43 -0
- package/dist/node/conversational/types.js +0 -0
- package/dist/node/conversational/voice-session-manager.js +137 -0
- package/dist/node/docs/conversational.docblock.js +5 -0
- package/dist/node/docs/stt.docblock.js +5 -0
- package/dist/node/docs/sync.docblock.js +5 -0
- package/dist/node/docs/tts.docblock.js +5 -0
- package/dist/node/docs/voice.docblock.js +5 -0
- package/dist/node/i18n/catalogs/en.js +91 -0
- package/dist/node/i18n/catalogs/es.js +91 -0
- package/dist/node/i18n/catalogs/fr.js +91 -0
- package/dist/node/i18n/catalogs/index.js +271 -0
- package/dist/node/i18n/index.js +335 -0
- package/dist/node/i18n/keys.js +38 -0
- package/dist/node/i18n/locale.js +13 -0
- package/dist/node/i18n/messages.js +283 -0
- package/dist/node/index.js +1070 -0
- package/dist/node/stt/diarization-mapper.js +42 -0
- package/dist/node/stt/index.js +222 -0
- package/dist/node/stt/segment-splitter.js +36 -0
- package/dist/node/stt/subtitle-formatter.js +51 -0
- package/dist/node/stt/transcriber.js +219 -0
- package/dist/node/stt/types.js +0 -0
- package/dist/node/sync/duration-negotiator.js +69 -0
- package/dist/node/sync/index.js +165 -0
- package/dist/node/sync/scene-adapter.js +52 -0
- package/dist/node/sync/timing-calculator.js +46 -0
- package/dist/node/tts/audio-assembler.js +120 -0
- package/dist/node/tts/emphasis-planner.js +134 -0
- package/dist/node/tts/index.js +439 -0
- package/dist/node/tts/pace-analyzer.js +67 -0
- package/dist/node/tts/segment-synthesizer.js +36 -0
- package/dist/node/tts/types.js +0 -0
- package/dist/node/tts/voice-synthesizer.js +435 -0
- package/dist/node/types.js +0 -0
- package/dist/stt/diarization-mapper.d.ts +19 -0
- package/dist/stt/diarization-mapper.js +43 -0
- package/dist/stt/index.d.ts +5 -0
- package/dist/stt/index.js +223 -0
- package/dist/stt/segment-splitter.d.ts +19 -0
- package/dist/stt/segment-splitter.js +37 -0
- package/dist/stt/subtitle-formatter.d.ts +19 -0
- package/dist/stt/subtitle-formatter.js +52 -0
- package/dist/stt/transcriber.d.ts +21 -0
- package/dist/stt/transcriber.js +220 -0
- package/dist/stt/types.d.ts +44 -0
- package/dist/stt/types.js +1 -0
- package/dist/sync/duration-negotiator.d.ts +37 -0
- package/dist/sync/duration-negotiator.js +70 -0
- package/dist/sync/index.d.ts +3 -0
- package/dist/sync/index.js +166 -0
- package/dist/sync/scene-adapter.d.ts +29 -0
- package/dist/sync/scene-adapter.js +53 -0
- package/dist/sync/timing-calculator.d.ts +21 -0
- package/dist/sync/timing-calculator.js +47 -0
- package/dist/tts/audio-assembler.d.ts +19 -0
- package/dist/tts/audio-assembler.js +121 -0
- package/dist/tts/emphasis-planner.d.ts +24 -0
- package/dist/tts/emphasis-planner.js +135 -0
- package/dist/tts/index.d.ts +6 -0
- package/dist/tts/index.js +440 -0
- package/dist/tts/pace-analyzer.d.ts +30 -0
- package/dist/tts/pace-analyzer.js +68 -0
- package/dist/tts/segment-synthesizer.d.ts +21 -0
- package/dist/tts/segment-synthesizer.js +37 -0
- package/dist/tts/types.d.ts +76 -0
- package/dist/tts/types.js +1 -0
- package/dist/tts/voice-synthesizer.d.ts +28 -0
- package/dist/tts/voice-synthesizer.js +436 -0
- package/dist/types.d.ts +12 -0
- package/dist/types.js +1 -0
- package/package.json +760 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
// @bun
// src/conversational/response-orchestrator.ts

/**
 * Orchestrates a single conversational exchange by chaining the
 * STT -> LLM -> TTS providers (the "composed" fallback strategy).
 *
 * Keeps an in-memory conversation history so every LLM call receives
 * the full dialogue context accumulated so far.
 */
class ResponseOrchestrator {
  stt;
  llm;
  tts;
  conversationHistory = [];
  constructor(stt, llm, tts) {
    this.stt = stt;
    this.llm = llm;
    this.tts = tts;
  }
  /**
   * Process one user turn: transcribe the audio, ask the LLM for a
   * reply, synthesize that reply, and yield conversational events
   * describing each stage.
   *
   * @param userAudio - Raw user audio passed to the STT provider
   * @param config - Conversation config (voice, language, prompt, formats)
   * @yields Events: user_speech_ended, transcript (user), agent_speech_started,
   *         agent_audio, agent_speech_ended, transcript (agent)
   */
  async* processUserTurn(userAudio, config) {
    const userText = await this.#transcribe(userAudio, config);
    yield { type: "user_speech_ended", transcript: userText };
    yield {
      type: "transcript",
      role: "user",
      text: userText,
      timestamp: Date.now()
    };
    this.conversationHistory.push({ role: "user", content: userText });
    const agentText = await this.#generateReply(config);
    this.conversationHistory.push({ role: "assistant", content: agentText });
    yield { type: "agent_speech_started", text: agentText };
    const synthesis = await this.tts.synthesize({
      text: agentText,
      voiceId: config.voiceId,
      language: config.language,
      format: config.outputFormat
    });
    yield { type: "agent_audio", audio: synthesis.audio.data };
    yield { type: "agent_speech_ended" };
    yield {
      type: "transcript",
      role: "agent",
      text: agentText,
      timestamp: Date.now()
    };
  }
  /** Transcribe user audio to text via the STT provider (no word timestamps). */
  async #transcribe(userAudio, config) {
    const transcription = await this.stt.transcribe({
      audio: userAudio,
      language: config.language,
      wordTimestamps: false
    });
    return transcription.text;
  }
  /**
   * Ask the LLM for the next agent reply, given the system prompt plus
   * the entire conversation history. Falls back to a fixed apology when
   * the response contains no text part.
   */
  async #generateReply(config) {
    const messages = [
      {
        role: "system",
        content: [{ type: "text", text: config.systemPrompt }]
      }
    ];
    for (const msg of this.conversationHistory) {
      messages.push({
        role: msg.role,
        content: [{ type: "text", text: msg.content }]
      });
    }
    const llmResponse = await this.llm.chat(messages, { model: config.llmModel });
    const responseText = llmResponse.message.content.find((p) => p.type === "text");
    return responseText && responseText.type === "text" ? responseText.text : "I apologize, I could not generate a response.";
  }
  /** Clear the accumulated conversation history. */
  reset() {
    this.conversationHistory.length = 0;
  }
}
export {
  ResponseOrchestrator
};
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import type { ConversationalEvent } from '../types';
import type { ConversationTurn } from './types';
/**
 * Build a conversation transcript in real-time from event stream.
 *
 * Accumulates turns as events arrive and provides access to the
 * full transcript at any point during the conversation.
 *
 * Feed every ConversationalEvent to processEvent; a turn is completed
 * when its matching speech-ended event arrives, and completed turns
 * are collected in order of completion.
 */
export declare class TranscriptBuilder {
    private readonly turns;
    private currentTurn;
    private sessionStartMs;
    /** Get the current transcript (a snapshot copy of completed turns) */
    getTranscript(): ConversationTurn[];
    /** Get the full transcript as plain text, one "[role] text" line per turn */
    toText(): string;
    /** Get the total number of turns */
    getTurnCount(): number;
    /**
     * Process a conversational event and update the transcript.
     * Turn timestamps are recorded relative to the session start.
     */
    processEvent(event: ConversationalEvent): void;
    /** Reset the transcript */
    reset(): void;
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
// @bun
// src/conversational/transcript-builder.ts

/**
 * Builds a conversation transcript incrementally from a stream of
 * conversational events. Completed turns are appended in the order
 * their speech-ended events arrive; timestamps are recorded relative
 * to the session start.
 */
class TranscriptBuilder {
  turns = [];
  currentTurn = null;
  sessionStartMs = Date.now();
  /** Snapshot copy of all completed turns. */
  getTranscript() {
    return this.turns.slice();
  }
  /** Render the transcript as "[role] text" lines joined by newlines. */
  toText() {
    const lines = this.turns.map((turn) => `[${turn.role}] ${turn.text}`);
    return lines.join("\n");
  }
  /** Number of completed turns so far. */
  getTurnCount() {
    return this.turns.length;
  }
  /** Milliseconds elapsed since the session started. */
  #elapsed() {
    return Date.now() - this.sessionStartMs;
  }
  /** Finalize the in-progress turn if it matches the given role. */
  #closeTurn(role) {
    if (!this.currentTurn || this.currentTurn.role !== role) return;
    this.currentTurn.endMs = this.#elapsed();
    this.turns.push(this.currentTurn);
    this.currentTurn = null;
  }
  /**
   * Consume one conversational event and update internal state.
   * Bare "transcript" events and unknown event types are ignored.
   */
  processEvent(event) {
    if (event.type === "session_started") {
      this.sessionStartMs = Date.now();
    } else if (event.type === "user_speech_started") {
      this.currentTurn = { role: "user", startMs: this.#elapsed() };
    } else if (event.type === "user_speech_ended") {
      if (this.currentTurn && this.currentTurn.role === "user") {
        this.currentTurn.text = event.transcript;
      }
      this.#closeTurn("user");
    } else if (event.type === "agent_speech_started") {
      this.currentTurn = { role: "agent", text: event.text, startMs: this.#elapsed() };
    } else if (event.type === "agent_speech_ended") {
      this.#closeTurn("agent");
    }
  }
  /** Drop all turns and restart the session clock. */
  reset() {
    this.turns = [];
    this.currentTurn = null;
    this.sessionStartMs = Date.now();
  }
}
export {
  TranscriptBuilder
};
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
 * Voice Activity Detection (VAD) and silence detection for turn-taking.
 *
 * Determines when the user has finished speaking based on audio energy
 * and silence duration thresholds.
 */
export declare class TurnDetector {
    private readonly silenceThresholdMs;
    private readonly energyThreshold;
    private silenceStartMs;
    /**
     * @param silenceThresholdMs - Silence duration to trigger end of turn (default 800ms)
     * @param energyThreshold - Minimum audio energy to consider as speech (default 0.01)
     */
    constructor(silenceThresholdMs?: number, energyThreshold?: number);
    /**
     * Process an audio chunk and determine if it contains speech.
     *
     * Silence is accumulated across consecutive calls using the supplied
     * timestamps; any chunk classified as speech resets the accumulator.
     *
     * @param chunk - Raw audio data
     * @param timestampMs - Current timestamp in ms
     * @returns Whether end-of-turn was detected
     */
    processChunk(chunk: Uint8Array, timestampMs: number): boolean;
    /** Reset the detector state */
    reset(): void;
    /**
     * Calculate RMS energy of an audio chunk.
     * Assumes 16-bit PCM audio.
     */
    private calculateEnergy;
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
// @bun
// src/conversational/turn-detector.ts

/**
 * Energy-based voice activity detection for turn-taking.
 *
 * A chunk counts as speech when its RMS energy (16-bit PCM assumed)
 * exceeds energyThreshold; end-of-turn is reported once silence has
 * persisted for at least silenceThresholdMs across successive chunks.
 */
class TurnDetector {
  silenceThresholdMs;
  energyThreshold;
  silenceStartMs = null;
  constructor(silenceThresholdMs = 800, energyThreshold = 0.01) {
    this.silenceThresholdMs = silenceThresholdMs;
    this.energyThreshold = energyThreshold;
  }
  /**
   * Classify one audio chunk.
   * @param chunk - Raw 16-bit little-endian PCM bytes
   * @param timestampMs - Timestamp of this chunk in ms
   * @returns true once accumulated silence reaches the threshold
   */
  processChunk(chunk, timestampMs) {
    const speaking = this.calculateEnergy(chunk) > this.energyThreshold;
    if (speaking) {
      this.silenceStartMs = null;
      return false;
    }
    // Start the silence clock on the first silent chunk (?? = keeps an
    // existing start, including a legitimate timestamp of 0).
    this.silenceStartMs ??= timestampMs;
    return timestampMs - this.silenceStartMs >= this.silenceThresholdMs;
  }
  /** Forget any accumulated silence. */
  reset() {
    this.silenceStartMs = null;
  }
  /**
   * RMS energy of a chunk of 16-bit little-endian PCM, normalized to [0, 1].
   * Chunks shorter than one full sample yield 0.
   */
  calculateEnergy(chunk) {
    if (chunk.length < 2)
      return 0;
    const sampleCount = Math.floor(chunk.length / 2);
    let sumOfSquares = 0;
    for (let sampleIndex = 0; sampleIndex < sampleCount; sampleIndex += 1) {
      const byteIndex = sampleIndex * 2;
      const raw = (chunk[byteIndex] ?? 0) | ((chunk[byteIndex + 1] ?? 0) << 8);
      // Sign-extend the 16-bit value, then normalize to [-1, 1).
      const signed = (raw << 16) >> 16;
      const normalized = signed / 32768;
      sumOfSquares += normalized * normalized;
    }
    return Math.sqrt(sumOfSquares / sampleCount);
  }
}
export {
  TurnDetector
};
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import type { ConversationalProvider, ConversationalEvent, ConversationalSessionSummary, AudioFormat, STTProvider, TTSProvider } from '../types';
import type { LLMProvider } from '@contractspec/lib.contracts-integrations/integrations/providers/llm';
/** Configuration for starting a voice conversation session. */
export interface ConversationConfig {
    /** Provider voice identifier used for the agent's synthesized speech */
    voiceId: string;
    /** Conversation language; provider default applies when omitted — TODO confirm expected tag format */
    language?: string;
    /** System prompt steering the agent's LLM responses */
    systemPrompt: string;
    /** LLM model identifier; provider default applies when omitted — TODO confirm */
    llmModel?: string;
    /** Audio format of user input sent into the session */
    inputFormat?: AudioFormat;
    /** Audio format of agent audio emitted by the session */
    outputFormat?: AudioFormat;
    /** How end-of-turn is decided: provider-side VAD or explicit push-to-talk */
    turnDetection?: 'server_vad' | 'push_to_talk';
    /** Silence duration (ms) that ends a user turn when using VAD */
    silenceThresholdMs?: number;
    /** Upper bound on total session duration, in seconds */
    maxDurationSeconds?: number;
    /** Tools the agent can invoke mid-conversation */
    tools?: ConversationalTool[];
}
/** A tool the agent may call during the conversation. */
export interface ConversationalTool {
    /** Name the agent uses to invoke the tool */
    name: string;
    /** Description of what the tool does */
    description: string;
    /** Schema of the tool's arguments — presumably JSON-schema-like; TODO confirm dialect */
    inputSchema: Record<string, unknown>;
    /** Executes the tool; resolves to a string result returned to the agent */
    handler: (args: Record<string, unknown>) => Promise<string>;
}
/** Live, mutable snapshot of a conversation session's progress. */
export interface ConversationState {
    /** Provider-assigned session identifier (empty until session_started arrives) */
    sessionId: string;
    status: 'connecting' | 'active' | 'paused' | 'ended';
    /** Who is currently speaking, or 'idle' between turns */
    currentTurn: 'user' | 'agent' | 'idle';
    /** Number of completed turns (user and agent turns both counted) */
    turnCount: number;
    /** Total session duration in ms (set when the session ends) */
    durationMs: number;
    /** Completed turns so far, in completion order */
    transcript: ConversationTurn[];
}
/** One completed turn in the conversation transcript. */
export interface ConversationTurn {
    role: 'user' | 'agent';
    text: string;
    /** Turn start, in ms relative to session start */
    startMs: number;
    /** Turn end, in ms relative to session start */
    endMs: number;
    /** Tool invocations made during this turn, if any */
    toolCalls?: {
        name: string;
        result: string;
    }[];
}
/** Providers used to run conversations. */
export interface ConversationalOptions {
    /** Primary provider with native conversational support */
    conversational: ConversationalProvider;
    /** Optional fallback: use separate STT + LLM + TTS if provider doesn't support native conversational */
    fallbackSTT?: STTProvider;
    fallbackTTS?: TTSProvider;
    fallbackLLM?: LLMProvider;
}
/** Handle to an active session returned by VoiceSessionManager.startSession. */
export interface ManagedSession {
    /** Live state; mutated in place as session events arrive */
    state: ConversationState;
    /** Stream a chunk of user audio into the session */
    sendAudio(chunk: Uint8Array): void;
    /** Send a user text message into the session */
    sendText(text: string): void;
    /** Interrupt the current agent response (delegates to the provider) */
    interrupt(): void;
    /** End the session and resolve with its summary */
    close(): Promise<ConversationalSessionSummary>;
    /** Async stream of session events; consuming it drives state updates */
    events: AsyncIterable<ConversationalEvent>;
}
export type { ConversationalEvent, ConversationalSessionSummary };
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// @bun
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { ConversationConfig, ConversationalOptions, ManagedSession } from './types';
/**
 * Manage voice conversation sessions.
 *
 * Two strategies:
 * 1. Native: Delegate to ConversationalProvider if it supports full bidirectional.
 * 2. Composed: Chain STT + LLM + TTS via ResponseOrchestrator (fallback).
 */
export declare class VoiceSessionManager {
    private readonly provider;
    constructor(options: ConversationalOptions);
    /**
     * Start a new voice conversation session.
     *
     * Returns a ManagedSession whose `state` is updated in place as
     * events from the provider's stream are consumed.
     */
    startSession(config: ConversationConfig): Promise<ManagedSession>;
    /** Wraps the provider event stream, mirroring events into session state. */
    private wrapEvents;
}
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
// @bun
// src/conversational/transcript-builder.ts

/**
 * Accumulates completed conversation turns from the session event
 * stream; timestamps are recorded relative to the session start.
 */
class TranscriptBuilder {
  turns = [];
  currentTurn = null;
  sessionStartMs = Date.now();
  /** Snapshot copy of the completed turns. */
  getTranscript() {
    return this.turns.slice();
  }
  /** "[role] text" lines joined by newlines. */
  toText() {
    return this.turns.map((turn) => `[${turn.role}] ${turn.text}`).join("\n");
  }
  /** Count of completed turns. */
  getTurnCount() {
    return this.turns.length;
  }
  /** Update transcript state from one conversational event. */
  processEvent(event) {
    const elapsed = () => Date.now() - this.sessionStartMs;
    switch (event.type) {
      case "session_started": {
        this.sessionStartMs = Date.now();
        break;
      }
      case "user_speech_started": {
        this.currentTurn = { role: "user", startMs: elapsed() };
        break;
      }
      case "user_speech_ended": {
        const turn = this.currentTurn;
        if (turn && turn.role === "user") {
          turn.text = event.transcript;
          turn.endMs = elapsed();
          this.turns.push(turn);
          this.currentTurn = null;
        }
        break;
      }
      case "agent_speech_started": {
        this.currentTurn = { role: "agent", text: event.text, startMs: elapsed() };
        break;
      }
      case "agent_speech_ended": {
        const turn = this.currentTurn;
        if (turn && turn.role === "agent") {
          turn.endMs = elapsed();
          this.turns.push(turn);
          this.currentTurn = null;
        }
        break;
      }
      default:
        break;
    }
  }
  /** Clear turns and restart the session clock. */
  reset() {
    this.turns = [];
    this.currentTurn = null;
    this.sessionStartMs = Date.now();
  }
}

// src/conversational/voice-session-manager.ts

/**
 * Manages the lifecycle of a provider-backed voice session: starts it,
 * mirrors its event stream into a ConversationState, and keeps a live
 * transcript via TranscriptBuilder.
 */
class VoiceSessionManager {
  provider;
  constructor(options) {
    this.provider = options.conversational;
  }
  /**
   * Start a session on the underlying conversational provider.
   *
   * @param config - Conversation configuration forwarded to the provider
   * @returns A ManagedSession whose `state` is updated in place as its
   *          `events` stream is consumed.
   */
  async startSession(config) {
    const transcriptBuilder = new TranscriptBuilder();
    const session = await this.provider.startSession({
      voiceId: config.voiceId,
      language: config.language,
      systemPrompt: config.systemPrompt,
      llmModel: config.llmModel,
      inputFormat: config.inputFormat,
      outputFormat: config.outputFormat,
      turnDetection: config.turnDetection,
      silenceThresholdMs: config.silenceThresholdMs,
      maxDurationSeconds: config.maxDurationSeconds
    });
    const state = {
      sessionId: "",
      status: "connecting",
      currentTurn: "idle",
      turnCount: 0,
      durationMs: 0,
      transcript: []
    };
    return {
      state,
      sendAudio: (chunk) => session.sendAudio(chunk),
      sendText: (text) => session.sendText(text),
      interrupt: () => session.interrupt(),
      close: async () => {
        const summary = await session.close();
        state.status = "ended";
        return summary;
      },
      events: this.wrapEvents(session.events, state, transcriptBuilder)
    };
  }
  /** Mirror one event into the mutable session state. */
  #applyToState(event, state) {
    if (event.type === "session_started") {
      state.sessionId = event.sessionId;
      state.status = "active";
    } else if (event.type === "user_speech_started") {
      state.currentTurn = "user";
    } else if (event.type === "user_speech_ended" || event.type === "agent_speech_ended") {
      state.currentTurn = "idle";
      state.turnCount += 1;
    } else if (event.type === "agent_speech_started") {
      state.currentTurn = "agent";
    } else if (event.type === "session_ended") {
      state.status = "ended";
      state.durationMs = event.durationMs;
    }
  }
  /**
   * Pass provider events through unchanged while updating both the
   * transcript and the session state after each one.
   */
  async* wrapEvents(events, state, transcriptBuilder) {
    for await (const event of events) {
      transcriptBuilder.processEvent(event);
      this.#applyToState(event, state);
      state.transcript = transcriptBuilder.getTranscript();
      yield event;
    }
  }
}
export {
  VoiceSessionManager
};
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
 * @docblock
 * @title Conversational Sub-domain
 * @domain voice.conversational
 * @description
 * Real-time bidirectional voice conversations.
 *
 * Supports two strategies:
 * 1. Native: Delegate to ConversationalProvider
 * 2. Composed: Chain STT + LLM + TTS via ResponseOrchestrator
 *
 * The VoiceSessionManager class manages session lifecycle.
 */
/** Marker constant for this docblock (always `true`). */
export declare const conversationalDocblock = true;
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
 * @docblock
 * @title STT Sub-domain
 * @domain voice.stt
 * @description
 * Speech-to-text transcription with diarization and subtitle generation.
 *
 * Pipeline: Audio -> SegmentSplitter -> STTProvider -> DiarizationMapper -> SubtitleFormatter -> TranscriptionProject
 *
 * The Transcriber class orchestrates the full pipeline.
 */
/** Marker constant for this docblock (always `true`). */
export declare const sttDocblock = true;
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
 * @docblock
 * @title Sync Layer
 * @domain voice.sync
 * @description
 * Video-gen integration layer for timing synchronization.
 *
 * - TimingCalculator: SynthesizedSegments -> VoiceTimingMap
 * - SceneAdapter: video-gen ScenePlan -> TTSScript
 * - DurationNegotiator: Balance voice vs scene durations
 */
/** Marker constant for this docblock (always `true`). */
export declare const syncDocblock = true;
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
 * @docblock
 * @title TTS Sub-domain
 * @domain voice.tts
 * @description
 * Text-to-speech pipeline with scene-aware narration.
 *
 * Pipeline: ContentBrief -> PaceAnalyzer -> EmphasisPlanner -> SegmentSynthesizer -> AudioAssembler -> TTSProject
 *
 * The VoiceSynthesizer class orchestrates the full pipeline.
 */
/** Marker constant for this docblock (always `true`). */
export declare const ttsDocblock = true;
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
 * @docblock
 * @title Voice Library
 * @domain voice
 * @description
 * Umbrella library for voice capabilities: TTS, STT, and Conversational AI.
 *
 * ## Sub-domains
 * - **TTS**: Text-to-speech synthesis with pacing, emphasis, and video-gen integration
 * - **STT**: Speech-to-text transcription with diarization and subtitle generation
 * - **Conversational**: Real-time bidirectional voice conversations
 * - **Sync**: Video-gen timing integration layer
 *
 * ## Import paths
 * ```ts
 * import { VoiceSynthesizer } from "@contractspec/lib.voice/tts";
 * import { Transcriber } from "@contractspec/lib.voice/stt";
 * import { VoiceSessionManager } from "@contractspec/lib.voice/conversational";
 * import { TimingCalculator } from "@contractspec/lib.voice/sync";
 * ```
 */
/** Marker constant for this docblock (always `true`). */
export declare const voiceDocblock = true;
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
// @bun
// src/i18n/catalogs/en.ts
import { defineTranslation } from "@contractspec/lib.contracts-spec/translations";
// English message catalog for the voice package.
// Declared as its own fallback ("fallback: en"), so it is the base
// catalog the other locale catalogs (es, fr) fall back to.
var enMessages = defineTranslation({
  meta: {
    key: "voice.messages",
    version: "1.0.0",
    domain: "voice",
    description: "All user-facing, LLM-facing, and developer-facing strings for the voice package",
    owners: ["platform"],
    stability: "experimental"
  },
  locale: "en",
  fallback: "en",
  messages: {
    // --- LLM-facing prompts (script generation, pacing, emphasis) ---
    "prompt.tts.system": {
      value: `You are a voice narration script writer.
Analyze the content and produce a narration script with pacing directives.
Return JSON with segments, each having sceneId, text, and contentType.`,
      description: "TTS system prompt for LLM-enhanced script generation"
    },
    "prompt.pace.sceneMatched": {
      value: "Match voice pacing to scene durations. Adjust rate and emphasis per segment to fit the video timeline.",
      description: "Prompt for scene-matched pacing strategy"
    },
    "prompt.emphasis.system": {
      value: "You are a voice director. For each segment, determine the optimal emphasis, tone, and speaking rate.",
      description: "Emphasis planner LLM system prompt"
    },
    // --- Per-content-type pacing descriptions (pace.<contentType>.description) ---
    "pace.intro.description": {
      value: "Authoritative opening at a measured pace",
      description: "Description for intro pacing"
    },
    "pace.problem.description": {
      value: "Urgent emphasis on the challenge",
      description: "Description for problem pacing"
    },
    "pace.solution.description": {
      value: "Calm, clear delivery of the solution",
      description: "Description for solution pacing"
    },
    "pace.metric.description": {
      value: "Excited emphasis on key results",
      description: "Description for metric pacing"
    },
    "pace.cta.description": {
      value: "Authoritative call to action",
      description: "Description for CTA pacing"
    },
    "pace.transition.description": {
      value: "Quick neutral transition",
      description: "Description for transition pacing"
    },
    // --- STT status and formatting strings ---
    "stt.transcribing": {
      value: "Transcribing audio...",
      description: "Status message during transcription"
    },
    "stt.diarization.speaker": {
      value: "Speaker {index}",
      description: "Default speaker label",
      placeholders: [{ name: "index", type: "number" }]
    },
    "stt.subtitle.timestamp": {
      value: "{start} --> {end}",
      description: "Subtitle timestamp format",
      placeholders: [
        { name: "start", type: "string" },
        { name: "end", type: "string" }
      ]
    },
    // --- Conversational session notifications ---
    "conv.session.started": {
      value: "Voice session started",
      description: "Session start notification"
    },
    "conv.turn.user": {
      value: "User is speaking",
      description: "User turn indicator"
    },
    "conv.turn.agent": {
      value: "Agent is responding",
      description: "Agent turn indicator"
    },
    "conv.session.ended": {
      value: "Voice session ended. Duration: {durationMs}ms",
      description: "Session end notification",
      placeholders: [{ name: "durationMs", type: "number" }]
    }
  }
});
export {
  enMessages
};
|