@speech-sdk/core 0.7.0 → 0.8.1-alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -108
- package/dist/__tests__/e2e/_save-audio.d.ts +0 -42
- package/dist/__tests__/e2e/_save-audio.d.ts.map +1 -1
- package/dist/__tests__/e2e/_save-audio.js +0 -59
- package/dist/__tests__/e2e/_save-audio.js.map +1 -1
- package/dist/audio-duration.d.ts +0 -5
- package/dist/audio-duration.d.ts.map +1 -1
- package/dist/audio-duration.js +3 -10
- package/dist/audio-duration.js.map +1 -1
- package/dist/audio-utils.d.ts +0 -10
- package/dist/audio-utils.d.ts.map +1 -1
- package/dist/audio-utils.js +2 -14
- package/dist/audio-utils.js.map +1 -1
- package/dist/captions.d.ts +0 -108
- package/dist/captions.d.ts.map +1 -1
- package/dist/captions.js +8 -98
- package/dist/captions.js.map +1 -1
- package/dist/conversation/attribute-timestamps.d.ts +26 -0
- package/dist/conversation/attribute-timestamps.d.ts.map +1 -0
- package/dist/conversation/attribute-timestamps.js +276 -0
- package/dist/conversation/attribute-timestamps.js.map +1 -0
- package/dist/conversation/dispatch.d.ts +5 -5
- package/dist/conversation/dispatch.d.ts.map +1 -1
- package/dist/conversation/dispatch.js +18 -8
- package/dist/conversation/dispatch.js.map +1 -1
- package/dist/conversation/errors.d.ts +3 -0
- package/dist/conversation/errors.d.ts.map +1 -1
- package/dist/conversation/errors.js +6 -0
- package/dist/conversation/errors.js.map +1 -1
- package/dist/conversation/pcm-concat.d.ts +0 -23
- package/dist/conversation/pcm-concat.d.ts.map +1 -1
- package/dist/conversation/pcm-concat.js +5 -43
- package/dist/conversation/pcm-concat.js.map +1 -1
- package/dist/conversation/proportional-fill.d.ts +10 -0
- package/dist/conversation/proportional-fill.d.ts.map +1 -0
- package/dist/conversation/proportional-fill.js +64 -0
- package/dist/conversation/proportional-fill.js.map +1 -0
- package/dist/conversation/silence-detection.d.ts +14 -0
- package/dist/conversation/silence-detection.d.ts.map +1 -0
- package/dist/conversation/silence-detection.js +52 -0
- package/dist/conversation/silence-detection.js.map +1 -0
- package/dist/conversation/stitch.d.ts +5 -6
- package/dist/conversation/stitch.d.ts.map +1 -1
- package/dist/conversation/stitch.js +42 -36
- package/dist/conversation/stitch.js.map +1 -1
- package/dist/conversation/types.d.ts +1 -35
- package/dist/conversation/types.d.ts.map +1 -1
- package/dist/conversation/validate.d.ts +1 -16
- package/dist/conversation/validate.d.ts.map +1 -1
- package/dist/conversation/validate.js +29 -29
- package/dist/conversation/validate.js.map +1 -1
- package/dist/default-stt-fallback.d.ts +3 -0
- package/dist/default-stt-fallback.d.ts.map +1 -0
- package/dist/default-stt-fallback.js +11 -0
- package/dist/default-stt-fallback.js.map +1 -0
- package/dist/derive-timestamps.d.ts +1 -5
- package/dist/derive-timestamps.d.ts.map +1 -1
- package/dist/derive-timestamps.js +1 -15
- package/dist/derive-timestamps.js.map +1 -1
- package/dist/errors.d.ts +5 -12
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +12 -14
- package/dist/errors.js.map +1 -1
- package/dist/generate-conversation.d.ts +4 -3
- package/dist/generate-conversation.d.ts.map +1 -1
- package/dist/generate-conversation.js +162 -67
- package/dist/generate-conversation.js.map +1 -1
- package/dist/generate-speech.d.ts +1 -26
- package/dist/generate-speech.d.ts.map +1 -1
- package/dist/generate-speech.js +85 -64
- package/dist/generate-speech.js.map +1 -1
- package/dist/index.d.ts +4 -11
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +5 -4
- package/dist/index.js.map +1 -1
- package/dist/logger.d.ts.map +1 -1
- package/dist/logger.js +2 -13
- package/dist/logger.js.map +1 -1
- package/dist/metadata.d.ts +0 -22
- package/dist/metadata.d.ts.map +1 -1
- package/dist/provider-utils.d.ts +3 -9
- package/dist/provider-utils.d.ts.map +1 -1
- package/dist/provider-utils.js +34 -51
- package/dist/provider-utils.js.map +1 -1
- package/dist/providers/cartesia/alignment.d.ts +0 -16
- package/dist/providers/cartesia/alignment.d.ts.map +1 -1
- package/dist/providers/cartesia/alignment.js +1 -6
- package/dist/providers/cartesia/alignment.js.map +1 -1
- package/dist/providers/cartesia/index.d.ts +7 -19
- package/dist/providers/cartesia/index.d.ts.map +1 -1
- package/dist/providers/cartesia/index.js +68 -80
- package/dist/providers/cartesia/index.js.map +1 -1
- package/dist/providers/deepgram/index.d.ts +7 -8
- package/dist/providers/deepgram/index.d.ts.map +1 -1
- package/dist/providers/deepgram/index.js +17 -18
- package/dist/providers/deepgram/index.js.map +1 -1
- package/dist/providers/elevenlabs/alignment.d.ts +7 -21
- package/dist/providers/elevenlabs/alignment.d.ts.map +1 -1
- package/dist/providers/elevenlabs/alignment.js +8 -9
- package/dist/providers/elevenlabs/alignment.js.map +1 -1
- package/dist/providers/elevenlabs/index.d.ts +7 -38
- package/dist/providers/elevenlabs/index.d.ts.map +1 -1
- package/dist/providers/elevenlabs/index.js +161 -169
- package/dist/providers/elevenlabs/index.js.map +1 -1
- package/dist/providers/fal/index.d.ts +7 -18
- package/dist/providers/fal/index.d.ts.map +1 -1
- package/dist/providers/fal/index.js +37 -31
- package/dist/providers/fal/index.js.map +1 -1
- package/dist/providers/fish-audio/index.d.ts +7 -8
- package/dist/providers/fish-audio/index.d.ts.map +1 -1
- package/dist/providers/fish-audio/index.js +23 -19
- package/dist/providers/fish-audio/index.js.map +1 -1
- package/dist/providers/gateway/index.d.ts +68 -0
- package/dist/providers/gateway/index.d.ts.map +1 -0
- package/dist/providers/gateway/index.js +236 -0
- package/dist/providers/gateway/index.js.map +1 -0
- package/dist/providers/google/index.d.ts +7 -20
- package/dist/providers/google/index.d.ts.map +1 -1
- package/dist/providers/google/index.js +161 -151
- package/dist/providers/google/index.js.map +1 -1
- package/dist/providers/hume/alignment.d.ts +30 -35
- package/dist/providers/hume/alignment.d.ts.map +1 -1
- package/dist/providers/hume/alignment.js +14 -8
- package/dist/providers/hume/alignment.js.map +1 -1
- package/dist/providers/hume/index.d.ts +7 -16
- package/dist/providers/hume/index.d.ts.map +1 -1
- package/dist/providers/hume/index.js +55 -65
- package/dist/providers/hume/index.js.map +1 -1
- package/dist/providers/inworld/alignment.d.ts +8 -22
- package/dist/providers/inworld/alignment.d.ts.map +1 -1
- package/dist/providers/inworld/alignment.js +9 -8
- package/dist/providers/inworld/alignment.js.map +1 -1
- package/dist/providers/inworld/index.d.ts +7 -20
- package/dist/providers/inworld/index.d.ts.map +1 -1
- package/dist/providers/inworld/index.js +47 -39
- package/dist/providers/inworld/index.js.map +1 -1
- package/dist/providers/mistral/index.d.ts +7 -8
- package/dist/providers/mistral/index.d.ts.map +1 -1
- package/dist/providers/mistral/index.js +39 -38
- package/dist/providers/mistral/index.js.map +1 -1
- package/dist/providers/murf/alignment.d.ts +10 -19
- package/dist/providers/murf/alignment.d.ts.map +1 -1
- package/dist/providers/murf/alignment.js +10 -5
- package/dist/providers/murf/alignment.js.map +1 -1
- package/dist/providers/murf/index.d.ts +7 -16
- package/dist/providers/murf/index.d.ts.map +1 -1
- package/dist/providers/murf/index.js +65 -57
- package/dist/providers/murf/index.js.map +1 -1
- package/dist/providers/openai/index.d.ts +36 -29
- package/dist/providers/openai/index.d.ts.map +1 -1
- package/dist/providers/openai/index.js +270 -106
- package/dist/providers/openai/index.js.map +1 -1
- package/dist/providers/resemble/alignment.d.ts +8 -29
- package/dist/providers/resemble/alignment.d.ts.map +1 -1
- package/dist/providers/resemble/alignment.js +9 -12
- package/dist/providers/resemble/alignment.js.map +1 -1
- package/dist/providers/resemble/index.d.ts +7 -11
- package/dist/providers/resemble/index.d.ts.map +1 -1
- package/dist/providers/resemble/index.js +54 -48
- package/dist/providers/resemble/index.js.map +1 -1
- package/dist/providers/xai/index.d.ts +7 -9
- package/dist/providers/xai/index.d.ts.map +1 -1
- package/dist/providers/xai/index.js +37 -40
- package/dist/providers/xai/index.js.map +1 -1
- package/dist/providers.d.ts +29 -0
- package/dist/providers.d.ts.map +1 -0
- package/dist/providers.js +15 -0
- package/dist/providers.js.map +1 -0
- package/dist/resolve-provider.d.ts.map +1 -1
- package/dist/resolve-provider.js +8 -51
- package/dist/resolve-provider.js.map +1 -1
- package/dist/speech-provider.d.ts +13 -53
- package/dist/speech-provider.d.ts.map +1 -1
- package/dist/speech-provider.js +5 -26
- package/dist/speech-provider.js.map +1 -1
- package/dist/speech-result.d.ts +8 -9
- package/dist/speech-result.d.ts.map +1 -1
- package/dist/speech-result.js.map +1 -1
- package/dist/speech-to-text-provider.d.ts +0 -12
- package/dist/speech-to-text-provider.d.ts.map +1 -1
- package/dist/stream-speech.d.ts.map +1 -1
- package/dist/stream-speech.js +2 -3
- package/dist/stream-speech.js.map +1 -1
- package/dist/timestamps.d.ts +3 -17
- package/dist/timestamps.d.ts.map +1 -1
- package/dist/turns.d.ts +9 -0
- package/dist/turns.d.ts.map +1 -0
- package/dist/turns.js +21 -0
- package/dist/turns.js.map +1 -0
- package/dist/types.d.ts +25 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/volume-adjust.d.ts +0 -6
- package/dist/volume-adjust.d.ts.map +1 -1
- package/dist/volume-adjust.js +0 -6
- package/dist/volume-adjust.js.map +1 -1
- package/package.json +11 -66
- package/dist/stt-providers/openai/index.d.ts +0 -42
- package/dist/stt-providers/openai/index.d.ts.map +0 -1
- package/dist/stt-providers/openai/index.js +0 -184
- package/dist/stt-providers/openai/index.js.map +0 -1
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
 * Tag every word with the conversation turn it most plausibly belongs to.
 *
 * Words keep their input order and are handed out to turns front-to-back.
 * Each turn's share is proportional to its entry in `expectedTokensPerTurn`;
 * shares are rounded with the largest-remainder method (floor every ideal
 * share, then give the spare words to the turns with the biggest fractional
 * part, lowest turn index winning ties).
 *
 * @param {ReadonlyArray<object>} words - Word records; each is shallow-copied.
 * @param {ReadonlyArray<number>} expectedTokensPerTurn - Relative weight of each turn.
 * @returns {object[]} Copies of `words`, each with an added `turnIndex`.
 */
export function distributeWordsAcrossTurns(words, expectedTokensPerTurn) {
  const wordTotal = words.length;
  if (wordTotal === 0) {
    return [];
  }

  // With zero/one turn, or no weight at all, there is nothing to apportion.
  const allInFirstTurn = () => words.map((word) => ({ ...word, turnIndex: 0 }));
  if (expectedTokensPerTurn.length <= 1) {
    return allInFirstTurn();
  }
  const tokenTotal = expectedTokensPerTurn.reduce((sum, tokens) => sum + tokens, 0);
  if (tokenTotal === 0) {
    return allInFirstTurn();
  }

  // Ideal (fractional) share per turn, split into whole part and remainder.
  const shares = expectedTokensPerTurn.map((tokens, turnIndex) => {
    const ideal = (tokens / tokenTotal) * wordTotal;
    const whole = Math.floor(ideal);
    return { turnIndex, whole, remainder: ideal - whole };
  });
  const quotas = shares.map((share) => share.whole);
  let placed = quotas.reduce((sum, quota) => sum + quota, 0);

  // Hand the words lost to flooring to the largest remainders first.
  shares.sort((a, b) => {
    const byRemainder = b.remainder - a.remainder;
    return byRemainder || a.turnIndex - b.turnIndex;
  });
  for (const { turnIndex } of shares) {
    if (placed >= wordTotal) {
      break;
    }
    quotas[turnIndex] = (quotas[turnIndex] ?? 0) + 1;
    placed += 1;
  }

  // Walk the turns in order, consuming each turn's quota from the word list.
  const assignedWords = [];
  let cursor = 0;
  quotas.forEach((quota, turnIndex) => {
    for (let taken = 0; taken < quota && cursor < wordTotal; taken += 1) {
      assignedWords.push({ ...words[cursor], turnIndex });
      cursor += 1;
    }
  });

  // Anything still unplaced is attached to the final turn.
  const lastTurn = expectedTokensPerTurn.length - 1;
  for (; cursor < wordTotal; cursor += 1) {
    assignedWords.push({ ...words[cursor], turnIndex: lastTurn });
  }
  return assignedWords;
}
|
|
46
|
+
/**
 * Synthesize word timestamps for one turn by slicing its time span evenly.
 *
 * The interval `[startSec, endSec]` (treated as zero-length when `endSec` is
 * before `startSec`) is cut into `tokenCount` equal slots, one per token.
 *
 * @param {object} args
 * @param {number} args.turnIndex - Turn index copied onto every entry.
 * @param {number} args.tokenCount - Number of entries to produce.
 * @param {number} args.startSec - Start of the turn, in seconds.
 * @param {number} args.endSec - End of the turn, in seconds.
 * @param {ReadonlyArray<string>} args.texts - Token texts; a missing entry becomes "".
 * @returns {Array<{text: string, start: number, end: number, turnIndex: number}>}
 */
export function fillTurnTimestampsProportional(args) {
  const { turnIndex, tokenCount, startSec, endSec, texts } = args;
  const stamped = [];
  if (tokenCount === 0) {
    return stamped;
  }

  const slotSec = Math.max(0, endSec - startSec) / tokenCount;
  // Boundaries are derived from the slot number (not accumulated) so that one
  // token's end is exactly the next token's start.
  const boundaryAt = (slot) => startSec + slot * slotSec;

  let slot = 0;
  while (slot < tokenCount) {
    stamped.push({
      text: texts[slot] ?? "",
      start: boundaryAt(slot),
      end: boundaryAt(slot + 1),
      turnIndex,
    });
    slot += 1;
  }
  return stamped;
}
|
|
64
|
+
//# sourceMappingURL=proportional-fill.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"proportional-fill.js","sourceRoot":"","sources":["../../src/conversation/proportional-fill.ts"],"names":[],"mappings":"AAKA,MAAM,UAAU,0BAA0B,CACxC,KAA+B,EAC/B,qBAAwC;IAExC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,EAAE,CAAC;IACZ,CAAC;IACD,IAAI,qBAAqB,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACtC,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;IACpD,CAAC;IAED,MAAM,aAAa,GAAG,qBAAqB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IACvE,IAAI,aAAa,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;IACpD,CAAC;IAED,MAAM,WAAW,GAAG,qBAAqB,CAAC,GAAG,CAC3C,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,aAAa,CAAC,GAAG,KAAK,CAAC,MAAM,CAC1C,CAAC;IACF,MAAM,MAAM,GAAG,WAAW,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAC3C,IAAI,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IACjD,MAAM,UAAU,GAAG,WAAW;SAC3B,GAAG,CAAC,CAAC,KAAK,EAAE,SAAS,EAAE,EAAE,CAAC,CAAC;QAC1B,SAAS;QACT,SAAS,EAAE,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC;KACrC,CAAC,CAAC;SACF,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS,IAAI,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS,CAAC,CAAC;IAE1E,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;QAC3B,IAAI,QAAQ,IAAI,KAAK,CAAC,MAAM,EAAE,CAAC;YAC7B,MAAM;QACR,CAAC;QACD,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;QACrD,QAAQ,EAAE,CAAC;IACb,CAAC;IAED,MAAM,GAAG,GAAgC,EAAE,CAAC;IAC5C,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,KAAK,IAAI,SAAS,GAAG,CAAC,EAAE,SAAS,GAAG,MAAM,CAAC,MAAM,EAAE,SAAS,EAAE,EAAE,CAAC;QAC/D,MAAM,KAAK,GAAG,MAAM,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;QACrC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,IAAI,SAAS,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3D,GAAG,CAAC,IAAI,CAAC,EAAE,GAAG,KAAK,CAAC,SAAS,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC;YAC7C,SAAS,EAAE,CAAC;QACd,CAAC;IACH,CAAC;IACD,OAAO,SA
AS,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;QAChC,GAAG,CAAC,IAAI,CAAC;YACP,GAAG,KAAK,CAAC,SAAS,CAAC;YACnB,SAAS,EAAE,qBAAqB,CAAC,MAAM,GAAG,CAAC;SAC5C,CAAC,CAAC;QACH,SAAS,EAAE,CAAC;IACd,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,MAAM,UAAU,8BAA8B,CAAC,IAM9C;IACC,MAAM,EAAE,SAAS,EAAE,UAAU,EAAE,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,GAAG,IAAI,CAAC;IAChE,IAAI,UAAU,KAAK,CAAC,EAAE,CAAC;QACrB,OAAO,EAAE,CAAC;IACZ,CAAC;IACD,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,MAAM,GAAG,QAAQ,CAAC,CAAC;IAC5C,MAAM,GAAG,GAAG,IAAI,GAAG,UAAU,CAAC;IAC9B,MAAM,GAAG,GAAgC,EAAE,CAAC;IAC5C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,GAAG,CAAC,IAAI,CAAC;YACP,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE;YACpB,KAAK,EAAE,QAAQ,GAAG,CAAC,GAAG,GAAG;YACzB,GAAG,EAAE,QAAQ,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG;YAC7B,SAAS;SACV,CAAC,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/** One stretch of silence reported by `detectSilenceGaps`. */
export interface SilenceGap {
|
|
2
|
+
/** Length of the gap in milliseconds (`endMs - startMs`). */
readonly durationMs: number;
|
|
3
|
+
/** Offset in milliseconds, from the first PCM sample, at which the gap ends. */
readonly endMs: number;
|
|
4
|
+
/** Offset in milliseconds, from the first PCM sample, at which the gap starts. */
readonly startMs: number;
|
|
5
|
+
}
|
|
6
|
+
/** Tuning parameters accepted by `detectSilenceGaps`. */
export interface DetectSilenceOptions {
|
|
7
|
+
/** Shortest gap, in milliseconds, that is reported; a value <= 0 yields no gaps. */
readonly minDurationMs: number;
|
|
8
|
+
/** Samples per second of the PCM input; a value <= 0 yields no gaps. */
readonly sampleRate: number;
|
|
9
|
+
/** A window whose RMS amplitude is below this value counts as silent (implementation default: 200). */
readonly silenceRmsThreshold?: number;
|
|
10
|
+
/** Length of each analysis window in milliseconds (implementation default: 20). */
readonly windowMs?: number;
|
|
11
|
+
}
|
|
12
|
+
/**
 * Scans `pcm` in fixed-size windows and returns, in chronological order, every
 * run of silent windows lasting at least `options.minDurationMs`.
 */
export declare function detectSilenceGaps(pcm: Int16Array, options: DetectSilenceOptions): readonly SilenceGap[];
|
|
13
|
+
/**
 * Returns the `n` longest gaps (fewer if `gaps` is shorter), ordered by
 * `startMs`; the input array is not modified.
 */
export declare function pickTopGaps(gaps: readonly SilenceGap[], n: number): readonly SilenceGap[];
|
|
14
|
+
//# sourceMappingURL=silence-detection.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"silence-detection.d.ts","sourceRoot":"","sources":["../../src/conversation/silence-detection.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,UAAU;IACzB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B;AAED,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAE5B,QAAQ,CAAC,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAEtC,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;CAC5B;AAKD,wBAAgB,iBAAiB,CAC/B,GAAG,EAAE,UAAU,EACf,OAAO,EAAE,oBAAoB,GAC5B,SAAS,UAAU,EAAE,CAiDvB;AAED,wBAAgB,WAAW,CACzB,IAAI,EAAE,SAAS,UAAU,EAAE,EAC3B,CAAC,EAAE,MAAM,GACR,SAAS,UAAU,EAAE,CASvB"}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
// RMS amplitude (in Int16 sample units) below which a window counts as silent.
const DEFAULT_RMS_THRESHOLD = 200;
// Length of each analysis window, in milliseconds.
const DEFAULT_WINDOW_MS = 20;

/**
 * Locate stretches of silence in 16-bit PCM audio.
 *
 * The samples are cut into consecutive, non-overlapping windows (trailing
 * samples that do not fill a whole window are ignored). A window is silent
 * when its RMS amplitude is below `silenceRmsThreshold`. Every maximal run of
 * silent windows whose length is at least `minDurationMs` becomes one gap.
 *
 * @param {Int16Array} pcm - Audio samples.
 * @param {object} options
 * @param {number} options.sampleRate - Samples per second; must be > 0.
 * @param {number} options.minDurationMs - Shortest gap to report; must be > 0.
 * @param {number} [options.silenceRmsThreshold=200] - Silence RMS cut-off.
 * @param {number} [options.windowMs=20] - Window length in milliseconds.
 * @returns {Array<{startMs: number, endMs: number, durationMs: number}>}
 *   Gaps in chronological order; empty for empty or invalid input.
 */
export function detectSilenceGaps(pcm, options) {
  const {
    sampleRate,
    minDurationMs,
    silenceRmsThreshold = DEFAULT_RMS_THRESHOLD,
    windowMs = DEFAULT_WINDOW_MS,
  } = options;

  // `!(x > 0)` instead of `x <= 0`: NaN must be rejected here, otherwise it
  // reaches the array allocation below and throws "Invalid array length".
  if (pcm.length === 0 || !(sampleRate > 0) || !(minDurationMs > 0)) {
    return [];
  }

  // An unusable (NaN) window length falls back to the default window.
  const requestedSamples = (windowMs / 1000) * sampleRate;
  const usableSamples = Number.isNaN(requestedSamples)
    ? (DEFAULT_WINDOW_MS / 1000) * sampleRate
    : requestedSamples;
  const windowSamples = Math.max(1, Math.round(usableSamples));
  const windowCount = Math.floor(pcm.length / windowSamples);
  if (windowCount === 0) {
    return [];
  }

  // Pass 1: classify each window as silent or not by its RMS amplitude.
  const silent = Array.from({ length: windowCount }, (_, w) => {
    const first = w * windowSamples;
    let sumSq = 0;
    for (let i = 0; i < windowSamples; i++) {
      const sample = pcm[first + i] ?? 0;
      sumSq += sample * sample;
    }
    return Math.sqrt(sumSq / windowSamples) < silenceRmsThreshold;
  });

  const windowStartMs = (w) => (w * windowSamples * 1000) / sampleRate;

  // Pass 2: merge consecutive silent windows into gaps. The loop runs one
  // index past the end so a run that reaches the end of the audio is closed.
  const gaps = [];
  let runStart = -1;
  for (let w = 0; w <= windowCount; w++) {
    const isSilent = w < windowCount && silent[w];
    if (isSilent) {
      if (runStart === -1) {
        runStart = w;
      }
      continue;
    }
    if (runStart === -1) {
      continue;
    }
    const startMs = windowStartMs(runStart);
    const endMs = windowStartMs(w);
    const durationMs = endMs - startMs;
    if (durationMs >= minDurationMs) {
      gaps.push({ startMs, endMs, durationMs });
    }
    runStart = -1;
  }
  return gaps;
}
|
|
44
|
+
/**
 * Keep the `n` longest silence gaps and return them in playback order.
 *
 * @param {ReadonlyArray<{startMs: number, endMs: number, durationMs: number}>} gaps
 *   Candidate gaps; the array is left untouched.
 * @param {number} n - Maximum number of gaps to keep.
 * @returns {Array<{startMs: number, endMs: number, durationMs: number}>}
 *   At most `n` gaps sorted by ascending `startMs`; empty when `n <= 0` or
 *   there are no gaps.
 */
export function pickTopGaps(gaps, n) {
  if (n <= 0 || gaps.length === 0) {
    return [];
  }
  const longestFirst = (left, right) => right.durationMs - left.durationMs;
  const earliestFirst = (left, right) => left.startMs - right.startMs;
  // Sort a copy so the caller's array keeps its order.
  return [...gaps].sort(longestFirst).slice(0, n).sort(earliestFirst);
}
|
|
52
|
+
//# sourceMappingURL=silence-detection.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"silence-detection.js","sourceRoot":"","sources":["../../src/conversation/silence-detection.ts"],"names":[],"mappings":"AAeA,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAClC,MAAM,iBAAiB,GAAG,EAAE,CAAC;AAE7B,MAAM,UAAU,iBAAiB,CAC/B,GAAe,EACf,OAA6B;IAE7B,MAAM,EACJ,UAAU,EACV,aAAa,EACb,mBAAmB,GAAG,qBAAqB,EAC3C,QAAQ,GAAG,iBAAiB,GAC7B,GAAG,OAAO,CAAC;IAEZ,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC,IAAI,UAAU,IAAI,CAAC,IAAI,aAAa,IAAI,CAAC,EAAE,CAAC;QAC9D,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,QAAQ,GAAG,IAAI,CAAC,GAAG,UAAU,CAAC,CAAC,CAAC;IAC9E,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,GAAG,aAAa,CAAC,CAAC;IAC3D,IAAI,WAAW,KAAK,CAAC,EAAE,CAAC;QACtB,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,qCAAqC;IACrC,MAAM,MAAM,GAAc,IAAI,KAAK,CAAC,WAAW,CAAC,CAAC;IACjD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,KAAK,GAAG,CAAC,GAAG,aAAa,CAAC;QAChC,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,aAAa,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,MAAM,CAAC,GAAG,GAAG,CAAC,KAAK,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC;YAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,CAAC;QACjB,CAAC;QACD,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,GAAG,aAAa,CAAC,CAAC;QAC7C,MAAM,CAAC,CAAC,CAAC,GAAG,GAAG,GAAG,mBAAmB,CAAC;IACxC,CAAC;IAED,MAAM,IAAI,GAAiB,EAAE,CAAC;IAC9B,IAAI,QAAQ,GAAG,CAAC,CAAC,CAAC;IAClB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,WAAW,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,QAAQ,GAAG,CAAC,GAAG,WAAW,IAAI,MAAM,CAAC,CAAC,CAAC,CAAC;QAC9C,IAAI,QAAQ,IAAI,QAAQ,KAAK,CAAC,CAAC,EAAE,CAAC;YAChC,QAAQ,GAAG,CAAC,CAAC;QACf,CAAC;aAAM,IAAI,CAAC,QAAQ,IAAI,QAAQ,KAAK,CAAC,CAAC,EAAE,CAAC;YACxC,MAAM,OAAO,GAAG,CAAC,QAAQ,GAAG,aAAa,GAAG,IAAI,CAAC,GAAG,UAAU,CAAC;YAC/D,MAAM,KAAK,GAAG,CAAC,CAAC,GAAG,aAAa,GAAG,IAAI,CAAC,GAAG,UAAU,CAAC;YACtD,MAAM,UAAU,GAAG,KAAK,GAAG,OAAO,CAAC;YACnC,IAAI,UAAU,IAAI,aAAa,EAAE,CAAC;gBAChC,IAAI,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,CAAC,CAAC;YAC5C,CAAC;YACD,QAAQ,GAAG,CAAC,CAAC,CAAC;QAChB,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,MAAM,
UAAU,WAAW,CACzB,IAA2B,EAC3B,CAAS;IAET,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAChC,OAAO,EAAE,CAAC;IACZ,CAAC;IACD,MAAM,gBAAgB,GAAG,CAAC,GAAG,IAAI,CAAC,CAAC,IAAI,CACrC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,UAAU,CACtC,CAAC;IACF,MAAM,GAAG,GAAG,gBAAgB,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACzC,OAAO,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;AACnD,CAAC"}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
+
import type { SpeechMetadata } from "../metadata.js";
|
|
1
2
|
import type { ResolvedModel, Voice } from "../speech-provider.js";
|
|
2
|
-
import type {
|
|
3
|
-
import type { TimestampMode, WordTimestamp } from "../timestamps.js";
|
|
3
|
+
import type { ConversationWordTimestamp } from "../timestamps.js";
|
|
4
4
|
import type { ConversationTurn } from "./types.js";
|
|
5
5
|
interface StitchInput<V extends Voice = Voice> {
|
|
6
6
|
readonly abortSignal?: AbortSignal;
|
|
@@ -9,14 +9,12 @@ interface StitchInput<V extends Voice = Voice> {
|
|
|
9
9
|
readonly headers?: Record<string, string>;
|
|
10
10
|
readonly maxConcurrency: number;
|
|
11
11
|
readonly maxRetries: number;
|
|
12
|
-
readonly normalizeVolume: boolean;
|
|
13
12
|
readonly resolvedPerTurn: readonly ResolvedModel<V>[];
|
|
14
13
|
readonly stitchOptionsPerTurn: readonly {
|
|
15
14
|
providerOptions: Record<string, unknown>;
|
|
16
15
|
mediaType: string;
|
|
17
16
|
}[];
|
|
18
|
-
readonly
|
|
19
|
-
readonly timestamps: TimestampMode;
|
|
17
|
+
readonly timestamps: boolean;
|
|
20
18
|
readonly topLevelProviderOptions?: Record<string, unknown>;
|
|
21
19
|
readonly turns: readonly ConversationTurn<V>[];
|
|
22
20
|
readonly volumeDbfs?: number;
|
|
@@ -29,8 +27,9 @@ interface StitchOutput {
|
|
|
29
27
|
readonly latencyMs: number;
|
|
30
28
|
readonly audioDurationMs?: number;
|
|
31
29
|
};
|
|
30
|
+
readonly metadataPerTurn: readonly SpeechMetadata[];
|
|
32
31
|
readonly providerMetadataPerTurn: readonly (Record<string, unknown> | undefined)[];
|
|
33
|
-
readonly timestamps?: readonly
|
|
32
|
+
readonly timestamps?: readonly ConversationWordTimestamp[];
|
|
34
33
|
readonly warnings: readonly string[];
|
|
35
34
|
}
|
|
36
35
|
export declare function runStitch<V extends Voice>(input: StitchInput<V>): Promise<StitchOutput>;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"stitch.d.ts","sourceRoot":"","sources":["../../src/conversation/stitch.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,
|
|
1
|
+
{"version":3,"file":"stitch.d.ts","sourceRoot":"","sources":["../../src/conversation/stitch.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AACrD,OAAO,KAAK,EAAE,aAAa,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;AAClE,OAAO,KAAK,EAAE,yBAAyB,EAAE,MAAM,kBAAkB,CAAC;AAQlE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AAEnD,UAAU,WAAW,CAAC,CAAC,SAAS,KAAK,GAAG,KAAK;IAC3C,QAAQ,CAAC,WAAW,CAAC,EAAE,WAAW,CAAC;IACnC,QAAQ,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC1C,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,eAAe,EAAE,SAAS,aAAa,CAAC,CAAC,CAAC,EAAE,CAAC;IACtD,QAAQ,CAAC,oBAAoB,EAAE,SAAS;QACtC,eAAe,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QACzC,SAAS,EAAE,MAAM,CAAC;KACnB,EAAE,CAAC;IACJ,QAAQ,CAAC,UAAU,EAAE,OAAO,CAAC;IAC7B,QAAQ,CAAC,uBAAuB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC3D,QAAQ,CAAC,KAAK,EAAE,SAAS,gBAAgB,CAAC,CAAC,CAAC,EAAE,CAAC;IAC/C,QAAQ,CAAC,UAAU,CAAC,EAAE,MAAM,CAAC;CAC9B;AAED,UAAU,YAAY;IACpB,QAAQ,CAAC,KAAK,EAAE,UAAU,CAAC;IAC3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,QAAQ,EAAE;QACjB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;QAC5B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;QAC3B,QAAQ,CAAC,eAAe,CAAC,EAAE,MAAM,CAAC;KACnC,CAAC;IACF,QAAQ,CAAC,eAAe,EAAE,SAAS,cAAc,EAAE,CAAC;IACpD,QAAQ,CAAC,uBAAuB,EAAE,SAAS,CACvC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GACvB,SAAS,CACZ,EAAE,CAAC;IACJ,QAAQ,CAAC,UAAU,CAAC,EAAE,SAAS,yBAAyB,EAAE,CAAC;IAC3D,QAAQ,CAAC,QAAQ,EAAE,SAAS,MAAM,EAAE,CAAC;CACtC;AA4BD,wBAAsB,SAAS,CAAC,CAAC,SAAS,KAAK,EAC7C,KAAK,EAAE,WAAW,CAAC,CAAC,CAAC,GACpB,OAAO,CAAC,YAAY,CAAC,CAkIvB"}
|
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
import { generateSpeech } from "../generate-speech.js";
|
|
2
2
|
import { debug } from "../logger.js";
|
|
3
3
|
import { concatPcmToWav, dbfsToInt16Rms, decodeToPcm16, normalizeRms, } from "./pcm-concat.js";
|
|
4
|
+
import { fillTurnTimestampsProportional } from "./proportional-fill.js";
|
|
4
5
|
const TARGET_SAMPLE_RATE = 24_000;
|
|
5
|
-
|
|
6
|
-
* Run `worker(items[i], i)` for each item, capping in-flight executions at
|
|
7
|
-
* `concurrency`. Preserves input ordering in the returned array.
|
|
8
|
-
*/
|
|
6
|
+
const WHITESPACE_RE = /\s+/;
|
|
9
7
|
async function mapWithConcurrency(items, concurrency, worker) {
|
|
10
8
|
const results = new Array(items.length);
|
|
11
9
|
let next = 0;
|
|
@@ -41,20 +39,13 @@ export async function runStitch(input) {
|
|
|
41
39
|
abortSignal: input.abortSignal,
|
|
42
40
|
headers: input.headers,
|
|
43
41
|
timestamps: input.timestamps,
|
|
44
|
-
timestampProvider: input.timestampProvider,
|
|
45
42
|
});
|
|
46
|
-
//
|
|
47
|
-
// content-type: providers' response headers often omit the sample
|
|
48
|
-
// rate (e.g. Hume sends `audio/pcm` for what is actually 48 kHz),
|
|
49
|
-
// and getStitchOptions is the authoritative declaration of what
|
|
50
|
-
// the provider returns for the requested format.
|
|
43
|
+
// Hume and others omit sample rate from content-type; prefer getStitchOptions.
|
|
51
44
|
const segment = decodeToPcm16(result.audio.uint8Array, stitchOpts.mediaType);
|
|
52
45
|
return { result, segment };
|
|
53
46
|
});
|
|
54
47
|
const segments = perTurn.map((p) => p.segment);
|
|
55
|
-
const leveledSegments = input.
|
|
56
|
-
? normalizeRms(segments, input.volumeDbfs == null ? undefined : dbfsToInt16Rms(input.volumeDbfs))
|
|
57
|
-
: segments;
|
|
48
|
+
const leveledSegments = normalizeRms(segments, input.volumeDbfs == null ? undefined : dbfsToInt16Rms(input.volumeDbfs));
|
|
58
49
|
const audio = await concatPcmToWav(leveledSegments, {
|
|
59
50
|
gapMs: input.gapMs,
|
|
60
51
|
targetSampleRate: TARGET_SAMPLE_RATE,
|
|
@@ -65,39 +56,51 @@ export async function runStitch(input) {
|
|
|
65
56
|
Math.round((input.gapMs / 1000) * TARGET_SAMPLE_RATE);
|
|
66
57
|
const audioDurationMs = Math.round((totalSamples / TARGET_SAMPLE_RATE) * 1000);
|
|
67
58
|
const warnings = perTurn.flatMap((p) => p.result.warnings ?? []);
|
|
59
|
+
const metadataPerTurn = perTurn.map((p) => p.result.metadata);
|
|
68
60
|
const providerMetadataPerTurn = perTurn.map((p) => p.result.providerMetadata);
|
|
69
|
-
//
|
|
70
|
-
// cumulative duration of prior turns + (gapMs * number of preceding gaps).
|
|
71
|
-
// Uses each segment's *source* duration (pcm.length / sampleRate) rather
|
|
72
|
-
// than the resampled target, because the offsets must match the audio the
|
|
73
|
-
// per-turn STT/native path actually saw — resampling is a constant-duration
|
|
74
|
-
// transform but rounding differences can drift by a sample or two.
|
|
61
|
+
// Use source duration (pre-resample) so offsets match what the per-turn STT/native saw.
|
|
75
62
|
const gapSeconds = input.gapMs / 1000;
|
|
76
63
|
const turnDurations = perTurn.map((p) => p.segment.pcm.length / p.segment.sampleRate);
|
|
77
|
-
const
|
|
78
|
-
perTurn.every((p) => p.result.timestamps !== undefined);
|
|
64
|
+
const fillWarnings = [];
|
|
79
65
|
let timestamps;
|
|
80
|
-
if (
|
|
66
|
+
if (input.timestamps) {
|
|
81
67
|
timestamps = [];
|
|
82
68
|
let offsetSec = 0;
|
|
69
|
+
const filledTurns = [];
|
|
83
70
|
for (let i = 0; i < perTurn.length; i++) {
|
|
84
|
-
const turnTimestamps = perTurn[i]?.result.timestamps
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
71
|
+
const turnTimestamps = perTurn[i]?.result.timestamps;
|
|
72
|
+
if (turnTimestamps && turnTimestamps.length > 0) {
|
|
73
|
+
for (const w of turnTimestamps) {
|
|
74
|
+
timestamps.push({
|
|
75
|
+
text: w.text,
|
|
76
|
+
start: w.start + offsetSec,
|
|
77
|
+
end: w.end + offsetSec,
|
|
78
|
+
turnIndex: i,
|
|
79
|
+
});
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
else {
|
|
83
|
+
const turnText = input.turns[i]?.text ?? "";
|
|
84
|
+
const tokens = turnText
|
|
85
|
+
.split(WHITESPACE_RE)
|
|
86
|
+
.filter((t) => t.length > 0);
|
|
87
|
+
const turnSec = turnDurations[i] ?? 0;
|
|
88
|
+
const filled = fillTurnTimestampsProportional({
|
|
89
|
+
turnIndex: i,
|
|
90
|
+
tokenCount: tokens.length,
|
|
91
|
+
startSec: offsetSec,
|
|
92
|
+
endSec: offsetSec + turnSec,
|
|
93
|
+
texts: tokens,
|
|
90
94
|
});
|
|
95
|
+
timestamps.push(...filled);
|
|
96
|
+
filledTurns.push(i);
|
|
91
97
|
}
|
|
92
98
|
offsetSec += (turnDurations[i] ?? 0) + gapSeconds;
|
|
93
99
|
}
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
.map((p, i) => (p.result.timestamps === undefined ? i : -1))
|
|
99
|
-
.filter((i) => i !== -1);
|
|
100
|
-
debug(`stitch: returning no timestamps — ${missing.length}/${perTurn.length} turn(s) had no alignment data (turns: ${missing.join(", ")}). Use timestamps: "on" and/or mark provider models as native/derived to get full coverage.`);
|
|
100
|
+
if (filledTurns.length > 0) {
|
|
101
|
+
fillWarnings.push(`speech-sdk: stitch path filled timestamps for turn(s) [${filledTurns.join(",")}] proportionally — provider returned no per-word alignment for those turns.`);
|
|
102
|
+
}
|
|
103
|
+
debug(`stitch: composed ${timestamps.length} word timestamps across ${perTurn.length} turn(s); ${filledTurns.length} turn(s) filled proportionally.`);
|
|
101
104
|
}
|
|
102
105
|
return {
|
|
103
106
|
audio,
|
|
@@ -107,9 +110,12 @@ export async function runStitch(input) {
|
|
|
107
110
|
latencyMs: Math.round(performance.now() - start),
|
|
108
111
|
audioDurationMs,
|
|
109
112
|
},
|
|
113
|
+
metadataPerTurn,
|
|
110
114
|
providerMetadataPerTurn,
|
|
111
115
|
timestamps,
|
|
112
|
-
warnings
|
|
116
|
+
warnings: warnings.length > 0 || fillWarnings.length > 0
|
|
117
|
+
? [...warnings, ...fillWarnings]
|
|
118
|
+
: warnings,
|
|
113
119
|
};
|
|
114
120
|
}
|
|
115
121
|
//# sourceMappingURL=stitch.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"stitch.js","sourceRoot":"","sources":["../../src/conversation/stitch.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,EAAE,KAAK,EAAE,MAAM,cAAc,CAAC;AAIrC,OAAO,EACL,cAAc,EACd,cAAc,EACd,aAAa,EACb,YAAY,GACb,MAAM,iBAAiB,CAAC;
|
|
1
|
+
{"version":3,"file":"stitch.js","sourceRoot":"","sources":["../../src/conversation/stitch.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,EAAE,KAAK,EAAE,MAAM,cAAc,CAAC;AAIrC,OAAO,EACL,cAAc,EACd,cAAc,EACd,aAAa,EACb,YAAY,GACb,MAAM,iBAAiB,CAAC;AACzB,OAAO,EAAE,8BAA8B,EAAE,MAAM,wBAAwB,CAAC;AAsCxE,MAAM,kBAAkB,GAAG,MAAM,CAAC;AAClC,MAAM,aAAa,GAAG,KAAK,CAAC;AAE5B,KAAK,UAAU,kBAAkB,CAC/B,KAAmB,EACnB,WAAmB,EACnB,MAA8C;IAE9C,MAAM,OAAO,GAAQ,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC7C,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CACxB,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE,EAC5D,KAAK,IAAI,EAAE;QACT,OAAO,IAAI,EAAE,CAAC;YACZ,MAAM,CAAC,GAAG,IAAI,EAAE,CAAC;YACjB,IAAI,CAAC,IAAI,KAAK,CAAC,MAAM,EAAE,CAAC;gBACtB,OAAO;YACT,CAAC;YACD,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QACzC,CAAC;IACH,CAAC,CACF,CAAC;IACF,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IAC3B,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,KAAqB;IAErB,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;IAEhC,MAAM,OAAO,GAAG,MAAM,kBAAkB,CACtC,KAAK,CAAC,KAAK,EACX,KAAK,CAAC,cAAc,EACpB,KAAK,EAAE,IAAI,EAAE,CAAC,EAAE,EAAE;QAChB,MAAM,QAAQ,GAAG,KAAK,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QAC1C,MAAM,UAAU,GAAG,KAAK,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC;QACjD,MAAM,qBAAqB,GAAG;YAC5B,GAAG,KAAK,CAAC,uBAAuB;YAChC,GAAG,IAAI,CAAC,eAAe;YACvB,GAAG,UAAU,CAAC,eAAe;SAC9B,CAAC;QACF,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC;YAClC,KAAK,EAAE,QAAQ;YACf,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,MAAM,EAAE,KAAK,CAAC,MAAM;YACpB,eAAe,EAAE,qBAAqB;YACtC,UAAU,EAAE,KAAK,CAAC,UAAU;YAC5B,WAAW,EAAE,KAAK,CAAC,WAAW;YAC9B,OAAO,EAAE,KAAK,CAAC,OAAO;YACtB,UAAU,EAAE,KAAK,CAAC,UAAU;SAC7B,CAAC,CAAC;QACH,+EAA+E;QAC/E,MAAM,OAAO,GAAG,aAAa,CAC3B,MAAM,CAAC,KAAK,CAAC,UAAU,EACvB,UAAU,CAAC,SAAS,CACrB,CAAC;QACF,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAC;IAC7B,CAAC,CACF,CAAC;IAEF,MAAM,QAAQ,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC
,OAAO,CAAC,CAAC;IAC/C,MAAM,eAAe,GAAG,YAAY,CAClC,QAAQ,EACR,KAAK,CAAC,UAAU,IAAI,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,cAAc,CAAC,KAAK,CAAC,UAAU,CAAC,CACxE,CAAC;IAEF,MAAM,KAAK,GAAG,MAAM,cAAc,CAAC,eAAe,EAAE;QAClD,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,gBAAgB,EAAE,kBAAkB;KACrC,CAAC,CAAC;IAEH,MAAM,YAAY,GAChB,OAAO,CAAC,MAAM,CACZ,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CACP,CAAC;QACD,IAAI,CAAC,KAAK,CACR,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,GAAG,kBAAkB,CACnE,EACH,CAAC,CACF;QACD,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC;YAClB,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,KAAK,GAAG,IAAI,CAAC,GAAG,kBAAkB,CAAC,CAAC;IAC1D,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAChC,CAAC,YAAY,GAAG,kBAAkB,CAAC,GAAG,IAAI,CAC3C,CAAC;IAEF,MAAM,QAAQ,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC;IACjE,MAAM,eAAe,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;IAC9D,MAAM,uBAAuB,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,gBAAgB,CAAC,CAAC;IAE9E,wFAAwF;IACxF,MAAM,UAAU,GAAG,KAAK,CAAC,KAAK,GAAG,IAAI,CAAC;IACtC,MAAM,aAAa,GAAG,OAAO,CAAC,GAAG,CAC/B,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,OAAO,CAAC,UAAU,CACnD,CAAC;IACF,MAAM,YAAY,GAAa,EAAE,CAAC;IAClC,IAAI,UAAmD,CAAC;IACxD,IAAI,KAAK,CAAC,UAAU,EAAE,CAAC;QACrB,UAAU,GAAG,EAAE,CAAC;QAChB,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,MAAM,WAAW,GAAa,EAAE,CAAC;QACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,cAAc,GAAG,OAAO,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC,UAAU,CAAC;YACrD,IAAI,cAAc,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAChD,KAAK,MAAM,CAAC,IAAI,cAAc,EAAE,CAAC;oBAC/B,UAAU,CAAC,IAAI,CAAC;wBACd,IAAI,EAAE,CAAC,CAAC,IAAI;wBACZ,KAAK,EAAE,CAAC,CAAC,KAAK,GAAG,SAAS;wBAC1B,GAAG,EAAE,CAAC,CAAC,GAAG,GAAG,SAAS;wBACtB,SAAS,EAAE,CAAC;qBACb,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,IAAI,EAAE,CAAC;gBAC5C,MAAM,MAAM,GAAG,QAAQ;qBACpB,KAAK,CAAC,aAAa,CAAC;qBACpB,
MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;gBAC/B,MAAM,OAAO,GAAG,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;gBACtC,MAAM,MAAM,GAAG,8BAA8B,CAAC;oBAC5C,SAAS,EAAE,CAAC;oBACZ,UAAU,EAAE,MAAM,CAAC,MAAM;oBACzB,QAAQ,EAAE,SAAS;oBACnB,MAAM,EAAE,SAAS,GAAG,OAAO;oBAC3B,KAAK,EAAE,MAAM;iBACd,CAAC,CAAC;gBACH,UAAU,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,CAAC;gBAC3B,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACtB,CAAC;YACD,SAAS,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,UAAU,CAAC;QACpD,CAAC;QACD,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC3B,YAAY,CAAC,IAAI,CACf,0DAA0D,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,6EAA6E,CAC7J,CAAC;QACJ,CAAC;QACD,KAAK,CACH,oBAAoB,UAAU,CAAC,MAAM,2BAA2B,OAAO,CAAC,MAAM,aAAa,WAAW,CAAC,MAAM,iCAAiC,CAC/I,CAAC;IACJ,CAAC;IAED,OAAO;QACL,KAAK;QACL,SAAS,EAAE,WAAW;QACtB,QAAQ,EAAE;YACR,UAAU,EAAE,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;YAC9D,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC;YAChD,eAAe;SAChB;QACD,eAAe;QACf,uBAAuB;QACvB,UAAU;QACV,QAAQ,EACN,QAAQ,CAAC,MAAM,GAAG,CAAC,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC;YAC5C,CAAC,CAAC,CAAC,GAAG,QAAQ,EAAE,GAAG,YAAY,CAAC;YAChC,CAAC,CAAC,QAAQ;KACf,CAAC;AACJ,CAAC"}
|
|
@@ -1,6 +1,4 @@
|
|
|
1
1
|
import type { ResolvedModel, Voice } from "../speech-provider.js";
|
|
2
|
-
import type { ResolvedSTTModel } from "../speech-to-text-provider.js";
|
|
3
|
-
import type { TimestampMode } from "../timestamps.js";
|
|
4
2
|
export interface ConversationTurn<V extends Voice = Voice> {
|
|
5
3
|
readonly model?: string | ResolvedModel<V>;
|
|
6
4
|
readonly providerOptions?: Record<string, unknown>;
|
|
@@ -15,41 +13,9 @@ export interface GenerateConversationOptions<V extends Voice = Voice> {
|
|
|
15
13
|
readonly maxConcurrency?: number;
|
|
16
14
|
readonly maxRetries?: number;
|
|
17
15
|
readonly model?: string | ResolvedModel<V>;
|
|
18
|
-
/**
|
|
19
|
-
* RMS-normalize the output audio to an absolute target level (see
|
|
20
|
-
* `volumeDbfs` for the level itself, default -20 dBFS — the broadcast /
|
|
21
|
-
* podcast voice standard). Every call to generateConversation produces
|
|
22
|
-
* output at the same loudness regardless of which providers or content
|
|
23
|
-
* are used, so two separate conversations can be played back-to-back
|
|
24
|
-
* without the listener adjusting volume. Roughly two O(N) passes over
|
|
25
|
-
* the int16 PCM samples — cheap. Pass `false` to skip the step entirely
|
|
26
|
-
* (~zero work) and keep the raw provider levels. Applied on both the
|
|
27
|
-
* stitch and native dialogue paths, provided the chosen provider
|
|
28
|
-
* exposes a decodable PCM/WAV mode via `getStitchOptions`. Default: true.
|
|
29
|
-
*/
|
|
30
|
-
readonly normalizeVolume?: boolean;
|
|
31
16
|
readonly providerOptions?: Record<string, unknown>;
|
|
32
|
-
|
|
33
|
-
* Override the STT provider used for the derived-timestamps path. Construct
|
|
34
|
-
* via a factory (e.g. `createOpenAISTT({ apiKey })("whisper-1")`). Only
|
|
35
|
-
* consulted when the TTS provider can't supply timestamps natively. Defaults
|
|
36
|
-
* to OpenAI Whisper read from `OPENAI_API_KEY`.
|
|
37
|
-
*/
|
|
38
|
-
readonly timestampProvider?: ResolvedSTTModel;
|
|
39
|
-
/**
|
|
40
|
-
* Controls whether the returned `SpeechResult` includes word-level
|
|
41
|
-
* timestamps. Default `"auto"`. On the stitch path each turn's timestamps
|
|
42
|
-
* are offset by cumulative duration + gap and concatenated flat; on the
|
|
43
|
-
* native path the mixed audio yields a flat list without speaker labels.
|
|
44
|
-
*/
|
|
45
|
-
readonly timestamps?: TimestampMode;
|
|
17
|
+
readonly timestamps?: boolean;
|
|
46
18
|
readonly turns: readonly ConversationTurn<V>[];
|
|
47
|
-
/**
|
|
48
|
-
* Target loudness in dBFS for `normalizeVolume`. Must be ≤ 0 (0 dBFS is
|
|
49
|
-
* the int16 ceiling). Lower values are quieter — -20 leaves ~20 dB of
|
|
50
|
-
* peak headroom so typical TTS speech doesn't clip after gain. Ignored
|
|
51
|
-
* when `normalizeVolume` is `false`. Default: -20.
|
|
52
|
-
*/
|
|
53
19
|
readonly volumeDbfs?: number;
|
|
54
20
|
}
|
|
55
21
|
//# sourceMappingURL=types.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/conversation/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/conversation/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;AAElE,MAAM,WAAW,gBAAgB,CAAC,CAAC,SAAS,KAAK,GAAG,KAAK;IACvD,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;IAC3C,QAAQ,CAAC,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACnD,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,KAAK,EAAE,CAAC,CAAC;CACnB;AAED,MAAM,WAAW,2BAA2B,CAAC,CAAC,SAAS,KAAK,GAAG,KAAK;IAClE,QAAQ,CAAC,WAAW,CAAC,EAAE,WAAW,CAAC;IACnC,QAAQ,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC1C,QAAQ,CAAC,cAAc,CAAC,EAAE,MAAM,CAAC;IACjC,QAAQ,CAAC,UAAU,CAAC,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;IAC3C,QAAQ,CAAC,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACnD,QAAQ,CAAC,UAAU,CAAC,EAAE,OAAO,CAAC;IAC9B,QAAQ,CAAC,KAAK,EAAE,SAAS,gBAAgB,CAAC,CAAC,CAAC,EAAE,CAAC;IAE/C,QAAQ,CAAC,UAAU,CAAC,EAAE,MAAM,CAAC;CAC9B"}
|
|
@@ -1,19 +1,4 @@
|
|
|
1
1
|
import type { ConversationTurn, GenerateConversationOptions } from "./types.js";
|
|
2
|
-
|
|
3
|
-
* Stable key for a voice so we can count unique voices across turns within
|
|
4
|
-
* one call. String voices and URL voices use their value; binary
|
|
5
|
-
* `Uint8Array` audio voices use object-reference identity (two distinct
|
|
6
|
-
* buffers with the same length/endpoints would otherwise collide).
|
|
7
|
-
*/
|
|
8
|
-
export declare function voiceKey(voice: ConversationTurn["voice"], refIds: WeakMap<object, number>, refCounter: {
|
|
9
|
-
next: number;
|
|
10
|
-
}): string;
|
|
11
|
-
/** Build a fresh ref-id context for a single conversation. */
|
|
12
|
-
export declare function newVoiceKeyContext(): {
|
|
13
|
-
refIds: WeakMap<object, number>;
|
|
14
|
-
refCounter: {
|
|
15
|
-
next: number;
|
|
16
|
-
};
|
|
17
|
-
};
|
|
2
|
+
export declare function newVoiceKeyer(): (voice: ConversationTurn["voice"]) => string;
|
|
18
3
|
export declare function validateConversationInput(options: GenerateConversationOptions): void;
|
|
19
4
|
//# sourceMappingURL=validate.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"validate.d.ts","sourceRoot":"","sources":["../../src/conversation/validate.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,gBAAgB,EAAE,2BAA2B,EAAE,MAAM,YAAY,CAAC;
|
|
1
|
+
{"version":3,"file":"validate.d.ts","sourceRoot":"","sources":["../../src/conversation/validate.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,gBAAgB,EAAE,2BAA2B,EAAE,MAAM,YAAY,CAAC;AAGhF,wBAAgB,aAAa,IAAI,CAAC,KAAK,EAAE,gBAAgB,CAAC,OAAO,CAAC,KAAK,MAAM,CAoB5E;AAED,wBAAgB,yBAAyB,CACvC,OAAO,EAAE,2BAA2B,GACnC,IAAI,CA2BN"}
|
|
@@ -1,43 +1,43 @@
|
|
|
1
1
|
import { ConversationInputError } from "./errors.js";
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
id
|
|
22
|
-
|
|
23
|
-
}
|
|
24
|
-
return `o:${id}`;
|
|
25
|
-
}
|
|
26
|
-
/** Build a fresh ref-id context for a single conversation. */
|
|
27
|
-
export function newVoiceKeyContext() {
|
|
28
|
-
return { refIds: new WeakMap(), refCounter: { next: 0 } };
|
|
2
|
+
// Object voices key by reference — distinct buffers with identical content must not collide.
|
|
3
|
+
export function newVoiceKeyer() {
|
|
4
|
+
const refIds = new WeakMap();
|
|
5
|
+
let nextId = 0;
|
|
6
|
+
return (voice) => {
|
|
7
|
+
if (typeof voice === "string") {
|
|
8
|
+
return `s:${voice}`;
|
|
9
|
+
}
|
|
10
|
+
if ("url" in voice) {
|
|
11
|
+
return `u:${voice.url}`;
|
|
12
|
+
}
|
|
13
|
+
if ("audio" in voice && typeof voice.audio === "string") {
|
|
14
|
+
return `a:${voice.audio}`;
|
|
15
|
+
}
|
|
16
|
+
let id = refIds.get(voice);
|
|
17
|
+
if (id === undefined) {
|
|
18
|
+
id = nextId++;
|
|
19
|
+
refIds.set(voice, id);
|
|
20
|
+
}
|
|
21
|
+
return `o:${id}`;
|
|
22
|
+
};
|
|
29
23
|
}
|
|
30
24
|
export function validateConversationInput(options) {
|
|
31
25
|
if (options.turns.length === 0) {
|
|
32
26
|
throw new ConversationInputError("generateConversation requires at least one turn.");
|
|
33
27
|
}
|
|
28
|
+
// Model placement must be all-or-nothing — partial mix hides which model actually ran where.
|
|
29
|
+
const hasTopLevel = options.model != null;
|
|
34
30
|
for (let i = 0; i < options.turns.length; i++) {
|
|
35
31
|
const turn = options.turns[i];
|
|
36
32
|
if (turn.text.trim().length === 0) {
|
|
37
33
|
throw new ConversationInputError(`turns[${i}].text must not be empty.`);
|
|
38
34
|
}
|
|
39
|
-
|
|
40
|
-
|
|
35
|
+
const hasTurnModel = turn.model != null;
|
|
36
|
+
if (hasTopLevel && hasTurnModel) {
|
|
37
|
+
throw new ConversationInputError(`turns[${i}].model is set, but options.model is also set. Set the model either at the top level for all turns, or on every turn — not both.`);
|
|
38
|
+
}
|
|
39
|
+
if (!(hasTopLevel || hasTurnModel)) {
|
|
40
|
+
throw new ConversationInputError(`turns[${i}].model is required because options.model is not set. Either set options.model for all turns, or set model on every turn.`);
|
|
41
41
|
}
|
|
42
42
|
}
|
|
43
43
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"validate.js","sourceRoot":"","sources":["../../src/conversation/validate.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,sBAAsB,EAAE,MAAM,aAAa,CAAC;AAGrD
|
|
1
|
+
{"version":3,"file":"validate.js","sourceRoot":"","sources":["../../src/conversation/validate.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,sBAAsB,EAAE,MAAM,aAAa,CAAC;AAGrD,6FAA6F;AAC7F,MAAM,UAAU,aAAa;IAC3B,MAAM,MAAM,GAAG,IAAI,OAAO,EAAkB,CAAC;IAC7C,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,OAAO,CAAC,KAAK,EAAE,EAAE;QACf,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC9B,OAAO,KAAK,KAAK,EAAE,CAAC;QACtB,CAAC;QACD,IAAI,KAAK,IAAI,KAAK,EAAE,CAAC;YACnB,OAAO,KAAK,KAAK,CAAC,GAAG,EAAE,CAAC;QAC1B,CAAC;QACD,IAAI,OAAO,IAAI,KAAK,IAAI,OAAO,KAAK,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;YACxD,OAAO,KAAK,KAAK,CAAC,KAAK,EAAE,CAAC;QAC5B,CAAC;QACD,IAAI,EAAE,GAAG,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAC3B,IAAI,EAAE,KAAK,SAAS,EAAE,CAAC;YACrB,EAAE,GAAG,MAAM,EAAE,CAAC;YACd,MAAM,CAAC,GAAG,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACxB,CAAC;QACD,OAAO,KAAK,EAAE,EAAE,CAAC;IACnB,CAAC,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,yBAAyB,CACvC,OAAoC;IAEpC,IAAI,OAAO,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/B,MAAM,IAAI,sBAAsB,CAC9B,kDAAkD,CACnD,CAAC;IACJ,CAAC;IAED,6FAA6F;IAC7F,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,IAAI,IAAI,CAAC;IAE1C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC9C,MAAM,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC9B,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAClC,MAAM,IAAI,sBAAsB,CAAC,SAAS,CAAC,2BAA2B,CAAC,CAAC;QAC1E,CAAC;QACD,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC;QACxC,IAAI,WAAW,IAAI,YAAY,EAAE,CAAC;YAChC,MAAM,IAAI,sBAAsB,CAC9B,SAAS,CAAC,kIAAkI,CAC7I,CAAC;QACJ,CAAC;QACD,IAAI,CAAC,CAAC,WAAW,IAAI,YAAY,CAAC,EAAE,CAAC;YACnC,MAAM,IAAI,sBAAsB,CAC9B,SAAS,CAAC,2HAA2H,CACtI,CAAC;QACJ,CAAC;IACH,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"default-stt-fallback.d.ts","sourceRoot":"","sources":["../src/default-stt-fallback.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAKrE,wBAAsB,qBAAqB,IAAI,OAAO,CAAC,gBAAgB,CAAC,CAOvE"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
let cached;
|
|
2
|
+
// Dynamic import keeps the OpenAI provider out of bundles for callers who never trigger the STT fallback.
|
|
3
|
+
export async function getDefaultSTTFallback() {
|
|
4
|
+
if (cached) {
|
|
5
|
+
return cached;
|
|
6
|
+
}
|
|
7
|
+
const { createOpenAI } = await import("./providers/openai/index.js");
|
|
8
|
+
cached = createOpenAI().stt("whisper-1");
|
|
9
|
+
return cached;
|
|
10
|
+
}
|
|
11
|
+
//# sourceMappingURL=default-stt-fallback.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"default-stt-fallback.js","sourceRoot":"","sources":["../src/default-stt-fallback.ts"],"names":[],"mappings":"AAEA,IAAI,MAAoC,CAAC;AAEzC,0GAA0G;AAC1G,MAAM,CAAC,KAAK,UAAU,qBAAqB;IACzC,IAAI,MAAM,EAAE,CAAC;QACX,OAAO,MAAM,CAAC;IAChB,CAAC;IACD,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,6BAA6B,CAAC,CAAC;IACrE,MAAM,GAAG,YAAY,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC;IACzC,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -1,14 +1,10 @@
|
|
|
1
1
|
import type { ResolvedSTTModel } from "./speech-to-text-provider.js";
|
|
2
2
|
import type { WordTimestamp } from "./timestamps.js";
|
|
3
|
-
/**
|
|
4
|
-
* Pipes synthesized audio through an STT provider to produce word-level
|
|
5
|
-
* timestamps. Shared between `generateSpeech()` and conversation paths.
|
|
6
|
-
*/
|
|
7
3
|
export declare function deriveTimestampsViaSTT(args: {
|
|
8
4
|
ttsModel: string;
|
|
9
5
|
audio: Uint8Array;
|
|
10
6
|
mediaType: string;
|
|
11
|
-
|
|
7
|
+
timestampFallback: ResolvedSTTModel;
|
|
12
8
|
abortSignal: AbortSignal | undefined;
|
|
13
9
|
}): Promise<readonly WordTimestamp[]>;
|
|
14
10
|
//# sourceMappingURL=derive-timestamps.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"derive-timestamps.d.ts","sourceRoot":"","sources":["../src/derive-timestamps.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;
|
|
1
|
+
{"version":3,"file":"derive-timestamps.d.ts","sourceRoot":"","sources":["../src/derive-timestamps.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAErD,wBAAsB,sBAAsB,CAAC,IAAI,EAAE;IACjD,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,EAAE,UAAU,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,iBAAiB,EAAE,gBAAgB,CAAC;IACpC,WAAW,EAAE,WAAW,GAAG,SAAS,CAAC;CACtC,GAAG,OAAO,CAAC,SAAS,aAAa,EAAE,CAAC,CAqBpC"}
|
|
@@ -1,20 +1,6 @@
|
|
|
1
1
|
import { MissingApiKeyError, TimestampKeyMissingError } from "./errors.js";
|
|
2
|
-
import { OpenAISpeechToTextProvider } from "./stt-providers/openai/index.js";
|
|
3
|
-
/**
|
|
4
|
-
* Default STT model used on the derived-timestamps path when the caller
|
|
5
|
-
* hasn't supplied a `timestampProvider` override. Reads `OPENAI_API_KEY`
|
|
6
|
-
* from the environment via the provider's own key resolution.
|
|
7
|
-
*/
|
|
8
|
-
function defaultTimestampProvider() {
|
|
9
|
-
const provider = new OpenAISpeechToTextProvider();
|
|
10
|
-
return { provider, modelId: provider.defaultModel };
|
|
11
|
-
}
|
|
12
|
-
/**
|
|
13
|
-
* Pipes synthesized audio through an STT provider to produce word-level
|
|
14
|
-
* timestamps. Shared between `generateSpeech()` and conversation paths.
|
|
15
|
-
*/
|
|
16
2
|
export async function deriveTimestampsViaSTT(args) {
|
|
17
|
-
const sttModel = args.
|
|
3
|
+
const sttModel = args.timestampFallback;
|
|
18
4
|
try {
|
|
19
5
|
const { timestamps } = await sttModel.provider.transcribe({
|
|
20
6
|
modelId: sttModel.modelId,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"derive-timestamps.js","sourceRoot":"","sources":["../src/derive-timestamps.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,kBAAkB,EAAE,wBAAwB,EAAE,MAAM,aAAa,CAAC;
|
|
1
|
+
{"version":3,"file":"derive-timestamps.js","sourceRoot":"","sources":["../src/derive-timestamps.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,kBAAkB,EAAE,wBAAwB,EAAE,MAAM,aAAa,CAAC;AAI3E,MAAM,CAAC,KAAK,UAAU,sBAAsB,CAAC,IAM5C;IACC,MAAM,QAAQ,GAAG,IAAI,CAAC,iBAAiB,CAAC;IAExC,IAAI,CAAC;QACH,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,QAAQ,CAAC,QAAQ,CAAC,UAAU,CAAC;YACxD,OAAO,EAAE,QAAQ,CAAC,OAAO;YACzB,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,SAAS,EAAE,IAAI,CAAC,SAAS;YACzB,WAAW,EAAE,IAAI,CAAC,WAAW;SAC9B,CAAC,CAAC;QACH,OAAO,UAAU,CAAC;IACpB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,IAAI,GAAG,YAAY,kBAAkB,EAAE,CAAC;YACtC,MAAM,IAAI,wBAAwB,CAAC;gBACjC,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,WAAW,EAAE,GAAG,QAAQ,CAAC,QAAQ,CAAC,EAAE,IAAI,QAAQ,CAAC,OAAO,EAAE;gBAC1D,MAAM,EAAE,GAAG,CAAC,MAAM;aACnB,CAAC,CAAC;QACL,CAAC;QACD,MAAM,GAAG,CAAC;IACZ,CAAC;AACH,CAAC"}
|