@speech-sdk/core 0.6.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -21
- package/README.md +215 -269
- package/dist/__tests__/e2e/_save-audio.d.ts +25 -2
- package/dist/__tests__/e2e/_save-audio.d.ts.map +1 -1
- package/dist/__tests__/e2e/_save-audio.js +46 -10
- package/dist/__tests__/e2e/_save-audio.js.map +1 -1
- package/dist/audio-utils.d.ts +2 -0
- package/dist/audio-utils.d.ts.map +1 -1
- package/dist/audio-utils.js +9 -0
- package/dist/audio-utils.js.map +1 -1
- package/dist/captions.d.ts +137 -0
- package/dist/captions.d.ts.map +1 -0
- package/dist/captions.js +283 -0
- package/dist/captions.js.map +1 -0
- package/dist/conversation/stitch.d.ts +5 -0
- package/dist/conversation/stitch.d.ts.map +1 -1
- package/dist/conversation/stitch.js +37 -0
- package/dist/conversation/stitch.js.map +1 -1
- package/dist/conversation/types.d.ts +16 -0
- package/dist/conversation/types.d.ts.map +1 -1
- package/dist/derive-timestamps.d.ts +14 -0
- package/dist/derive-timestamps.d.ts.map +1 -0
- package/dist/derive-timestamps.js +38 -0
- package/dist/derive-timestamps.js.map +1 -0
- package/dist/errors.d.ts +25 -0
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +28 -0
- package/dist/errors.js.map +1 -1
- package/dist/generate-conversation.d.ts +1 -1
- package/dist/generate-conversation.d.ts.map +1 -1
- package/dist/generate-conversation.js +59 -0
- package/dist/generate-conversation.js.map +1 -1
- package/dist/generate-speech.d.ts +18 -1
- package/dist/generate-speech.d.ts.map +1 -1
- package/dist/generate-speech.js +73 -16
- package/dist/generate-speech.js.map +1 -1
- package/dist/index.d.ts +6 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -1
- package/dist/index.js.map +1 -1
- package/dist/logger.d.ts +2 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +40 -0
- package/dist/logger.js.map +1 -0
- package/dist/provider-utils.d.ts +8 -0
- package/dist/provider-utils.d.ts.map +1 -1
- package/dist/provider-utils.js +16 -2
- package/dist/provider-utils.js.map +1 -1
- package/dist/providers/cartesia/alignment.d.ts +24 -0
- package/dist/providers/cartesia/alignment.d.ts.map +1 -0
- package/dist/providers/cartesia/alignment.js +23 -0
- package/dist/providers/cartesia/alignment.js.map +1 -0
- package/dist/providers/cartesia/index.d.ts +12 -2
- package/dist/providers/cartesia/index.d.ts.map +1 -1
- package/dist/providers/cartesia/index.js +137 -2
- package/dist/providers/cartesia/index.js.map +1 -1
- package/dist/providers/elevenlabs/alignment.d.ts +24 -0
- package/dist/providers/elevenlabs/alignment.d.ts.map +1 -0
- package/dist/providers/elevenlabs/alignment.js +48 -0
- package/dist/providers/elevenlabs/alignment.js.map +1 -0
- package/dist/providers/elevenlabs/index.d.ts +19 -4
- package/dist/providers/elevenlabs/index.d.ts.map +1 -1
- package/dist/providers/elevenlabs/index.js +83 -13
- package/dist/providers/elevenlabs/index.js.map +1 -1
- package/dist/providers/fal/index.d.ts +0 -25
- package/dist/providers/fal/index.d.ts.map +1 -1
- package/dist/providers/fal/index.js +3 -58
- package/dist/providers/fal/index.js.map +1 -1
- package/dist/providers/hume/alignment.d.ts +38 -0
- package/dist/providers/hume/alignment.d.ts.map +1 -0
- package/dist/providers/hume/alignment.js +31 -0
- package/dist/providers/hume/alignment.js.map +1 -0
- package/dist/providers/hume/index.d.ts +8 -1
- package/dist/providers/hume/index.d.ts.map +1 -1
- package/dist/providers/hume/index.js +75 -1
- package/dist/providers/hume/index.js.map +1 -1
- package/dist/providers/inworld/alignment.d.ts +25 -0
- package/dist/providers/inworld/alignment.d.ts.map +1 -0
- package/dist/providers/inworld/alignment.js +23 -0
- package/dist/providers/inworld/alignment.js.map +1 -0
- package/dist/providers/inworld/index.d.ts +11 -2
- package/dist/providers/inworld/index.d.ts.map +1 -1
- package/dist/providers/inworld/index.js +11 -2
- package/dist/providers/inworld/index.js.map +1 -1
- package/dist/providers/murf/alignment.d.ts +22 -0
- package/dist/providers/murf/alignment.d.ts.map +1 -0
- package/dist/providers/murf/alignment.js +17 -0
- package/dist/providers/murf/alignment.js.map +1 -0
- package/dist/providers/murf/index.d.ts +8 -1
- package/dist/providers/murf/index.d.ts.map +1 -1
- package/dist/providers/murf/index.js +10 -1
- package/dist/providers/murf/index.js.map +1 -1
- package/dist/providers/openai/index.d.ts +12 -3
- package/dist/providers/openai/index.d.ts.map +1 -1
- package/dist/providers/openai/index.js +7 -3
- package/dist/providers/openai/index.js.map +1 -1
- package/dist/providers/resemble/alignment.d.ts +32 -0
- package/dist/providers/resemble/alignment.d.ts.map +1 -0
- package/dist/providers/resemble/alignment.js +57 -0
- package/dist/providers/resemble/alignment.js.map +1 -0
- package/dist/providers/resemble/index.d.ts +7 -1
- package/dist/providers/resemble/index.d.ts.map +1 -1
- package/dist/providers/resemble/index.js +13 -1
- package/dist/providers/resemble/index.js.map +1 -1
- package/dist/resolve-provider.d.ts.map +1 -1
- package/dist/resolve-provider.js +3 -12
- package/dist/resolve-provider.js.map +1 -1
- package/dist/speech-provider.d.ts +48 -4
- package/dist/speech-provider.d.ts.map +1 -1
- package/dist/speech-provider.js +16 -0
- package/dist/speech-provider.js.map +1 -1
- package/dist/speech-result.d.ts +10 -0
- package/dist/speech-result.d.ts.map +1 -1
- package/dist/speech-result.js.map +1 -1
- package/dist/speech-to-text-provider.d.ts +40 -0
- package/dist/speech-to-text-provider.d.ts.map +1 -0
- package/dist/speech-to-text-provider.js +2 -0
- package/dist/speech-to-text-provider.js.map +1 -0
- package/dist/stt-providers/openai/index.d.ts +42 -0
- package/dist/stt-providers/openai/index.d.ts.map +1 -0
- package/dist/stt-providers/openai/index.js +184 -0
- package/dist/stt-providers/openai/index.js.map +1 -0
- package/dist/timestamps.d.ts +23 -0
- package/dist/timestamps.d.ts.map +1 -0
- package/dist/timestamps.js +2 -0
- package/dist/timestamps.js.map +1 -0
- package/package.json +6 -2
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import type { ResolvedModel, Voice } from "../speech-provider.js";
|
|
2
|
+
import type { ResolvedSTTModel } from "../speech-to-text-provider.js";
|
|
3
|
+
import type { TimestampMode, WordTimestamp } from "../timestamps.js";
|
|
2
4
|
import type { ConversationTurn } from "./types.js";
|
|
3
5
|
interface StitchInput<V extends Voice = Voice> {
|
|
4
6
|
readonly abortSignal?: AbortSignal;
|
|
@@ -13,6 +15,8 @@ interface StitchInput<V extends Voice = Voice> {
|
|
|
13
15
|
providerOptions: Record<string, unknown>;
|
|
14
16
|
mediaType: string;
|
|
15
17
|
}[];
|
|
18
|
+
readonly timestampProvider?: ResolvedSTTModel;
|
|
19
|
+
readonly timestamps: TimestampMode;
|
|
16
20
|
readonly topLevelProviderOptions?: Record<string, unknown>;
|
|
17
21
|
readonly turns: readonly ConversationTurn<V>[];
|
|
18
22
|
readonly volumeDbfs?: number;
|
|
@@ -26,6 +30,7 @@ interface StitchOutput {
|
|
|
26
30
|
readonly audioDurationMs?: number;
|
|
27
31
|
};
|
|
28
32
|
readonly providerMetadataPerTurn: readonly (Record<string, unknown> | undefined)[];
|
|
33
|
+
readonly timestamps?: readonly WordTimestamp[];
|
|
29
34
|
readonly warnings: readonly string[];
|
|
30
35
|
}
|
|
31
36
|
export declare function runStitch<V extends Voice>(input: StitchInput<V>): Promise<StitchOutput>;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"stitch.d.ts","sourceRoot":"","sources":["../../src/conversation/stitch.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"stitch.d.ts","sourceRoot":"","sources":["../../src/conversation/stitch.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;AAClE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AACtE,OAAO,KAAK,EAAE,aAAa,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAOrE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AAEnD,UAAU,WAAW,CAAC,CAAC,SAAS,KAAK,GAAG,KAAK;IAC3C,QAAQ,CAAC,WAAW,CAAC,EAAE,WAAW,CAAC;IACnC,QAAQ,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC1C,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,eAAe,EAAE,OAAO,CAAC;IAClC,QAAQ,CAAC,eAAe,EAAE,SAAS,aAAa,CAAC,CAAC,CAAC,EAAE,CAAC;IACtD,QAAQ,CAAC,oBAAoB,EAAE,SAAS;QACtC,eAAe,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QACzC,SAAS,EAAE,MAAM,CAAC;KACnB,EAAE,CAAC;IACJ,QAAQ,CAAC,iBAAiB,CAAC,EAAE,gBAAgB,CAAC;IAC9C,QAAQ,CAAC,UAAU,EAAE,aAAa,CAAC;IACnC,QAAQ,CAAC,uBAAuB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC3D,QAAQ,CAAC,KAAK,EAAE,SAAS,gBAAgB,CAAC,CAAC,CAAC,EAAE,CAAC;IAC/C,QAAQ,CAAC,UAAU,CAAC,EAAE,MAAM,CAAC;CAC9B;AAED,UAAU,YAAY;IACpB,QAAQ,CAAC,KAAK,EAAE,UAAU,CAAC;IAC3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,QAAQ,EAAE;QACjB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;QAC5B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;QAC3B,QAAQ,CAAC,eAAe,CAAC,EAAE,MAAM,CAAC;KACnC,CAAC;IACF,QAAQ,CAAC,uBAAuB,EAAE,SAAS,CACvC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GACvB,SAAS,CACZ,EAAE,CAAC;IACJ,QAAQ,CAAC,UAAU,CAAC,EAAE,SAAS,aAAa,EAAE,CAAC;IAC/C,QAAQ,CAAC,QAAQ,EAAE,SAAS,MAAM,EAAE,CAAC;CACtC;AA+BD,wBAAsB,SAAS,CAAC,CAAC,SAAS,KAAK,EAC7C,KAAK,EAAE,WAAW,CAAC,CAAC,CAAC,GACpB,OAAO,CAAC,YAAY,CAAC,CA2HvB"}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { generateSpeech } from "../generate-speech.js";
|
|
2
|
+
import { debug } from "../logger.js";
|
|
2
3
|
import { concatPcmToWav, dbfsToInt16Rms, decodeToPcm16, normalizeRms, } from "./pcm-concat.js";
|
|
3
4
|
const TARGET_SAMPLE_RATE = 24_000;
|
|
4
5
|
/**
|
|
@@ -39,6 +40,8 @@ export async function runStitch(input) {
|
|
|
39
40
|
maxRetries: input.maxRetries,
|
|
40
41
|
abortSignal: input.abortSignal,
|
|
41
42
|
headers: input.headers,
|
|
43
|
+
timestamps: input.timestamps,
|
|
44
|
+
timestampProvider: input.timestampProvider,
|
|
42
45
|
});
|
|
43
46
|
// Prefer the mediaType from getStitchOptions over the response
|
|
44
47
|
// content-type: providers' response headers often omit the sample
|
|
@@ -63,6 +66,39 @@ export async function runStitch(input) {
|
|
|
63
66
|
const audioDurationMs = Math.round((totalSamples / TARGET_SAMPLE_RATE) * 1000);
|
|
64
67
|
const warnings = perTurn.flatMap((p) => p.result.warnings ?? []);
|
|
65
68
|
const providerMetadataPerTurn = perTurn.map((p) => p.result.providerMetadata);
|
|
69
|
+
// Compose per-turn word timestamps into a single flat list, offset by the
|
|
70
|
+
// cumulative duration of prior turns + (gapMs * number of preceding gaps).
|
|
71
|
+
// Uses each segment's *source* duration (pcm.length / sampleRate) rather
|
|
72
|
+
// than the resampled target, because the offsets must match the audio the
|
|
73
|
+
// per-turn STT/native path actually saw — resampling is a constant-duration
|
|
74
|
+
// transform but rounding differences can drift by a sample or two.
|
|
75
|
+
const gapSeconds = input.gapMs / 1000;
|
|
76
|
+
const turnDurations = perTurn.map((p) => p.segment.pcm.length / p.segment.sampleRate);
|
|
77
|
+
const allTurnsHaveTimestamps = input.timestamps !== "off" &&
|
|
78
|
+
perTurn.every((p) => p.result.timestamps !== undefined);
|
|
79
|
+
let timestamps;
|
|
80
|
+
if (allTurnsHaveTimestamps) {
|
|
81
|
+
timestamps = [];
|
|
82
|
+
let offsetSec = 0;
|
|
83
|
+
for (let i = 0; i < perTurn.length; i++) {
|
|
84
|
+
const turnTimestamps = perTurn[i]?.result.timestamps ?? [];
|
|
85
|
+
for (const w of turnTimestamps) {
|
|
86
|
+
timestamps.push({
|
|
87
|
+
text: w.text,
|
|
88
|
+
start: w.start + offsetSec,
|
|
89
|
+
end: w.end + offsetSec,
|
|
90
|
+
});
|
|
91
|
+
}
|
|
92
|
+
offsetSec += (turnDurations[i] ?? 0) + gapSeconds;
|
|
93
|
+
}
|
|
94
|
+
debug(`stitch: composed ${timestamps.length} word timestamps across ${perTurn.length} turn(s).`);
|
|
95
|
+
}
|
|
96
|
+
else if (input.timestamps !== "off") {
|
|
97
|
+
const missing = perTurn
|
|
98
|
+
.map((p, i) => (p.result.timestamps === undefined ? i : -1))
|
|
99
|
+
.filter((i) => i !== -1);
|
|
100
|
+
debug(`stitch: returning no timestamps — ${missing.length}/${perTurn.length} turn(s) had no alignment data (turns: ${missing.join(", ")}). Use timestamps: "on" and/or mark provider models as native/derived to get full coverage.`);
|
|
101
|
+
}
|
|
66
102
|
return {
|
|
67
103
|
audio,
|
|
68
104
|
mediaType: "audio/wav",
|
|
@@ -72,6 +108,7 @@ export async function runStitch(input) {
|
|
|
72
108
|
audioDurationMs,
|
|
73
109
|
},
|
|
74
110
|
providerMetadataPerTurn,
|
|
111
|
+
timestamps,
|
|
75
112
|
warnings,
|
|
76
113
|
};
|
|
77
114
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"stitch.js","sourceRoot":"","sources":["../../src/conversation/stitch.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;
|
|
1
|
+
{"version":3,"file":"stitch.js","sourceRoot":"","sources":["../../src/conversation/stitch.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,MAAM,uBAAuB,CAAC;AACvD,OAAO,EAAE,KAAK,EAAE,MAAM,cAAc,CAAC;AAIrC,OAAO,EACL,cAAc,EACd,cAAc,EACd,aAAa,EACb,YAAY,GACb,MAAM,iBAAiB,CAAC;AAuCzB,MAAM,kBAAkB,GAAG,MAAM,CAAC;AAElC;;;GAGG;AACH,KAAK,UAAU,kBAAkB,CAC/B,KAAmB,EACnB,WAAmB,EACnB,MAA8C;IAE9C,MAAM,OAAO,GAAQ,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IAC7C,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CACxB,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE,EAC5D,KAAK,IAAI,EAAE;QACT,OAAO,IAAI,EAAE,CAAC;YACZ,MAAM,CAAC,GAAG,IAAI,EAAE,CAAC;YACjB,IAAI,CAAC,IAAI,KAAK,CAAC,MAAM,EAAE,CAAC;gBACtB,OAAO;YACT,CAAC;YACD,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QACzC,CAAC;IACH,CAAC,CACF,CAAC;IACF,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IAC3B,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,KAAqB;IAErB,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;IAEhC,MAAM,OAAO,GAAG,MAAM,kBAAkB,CACtC,KAAK,CAAC,KAAK,EACX,KAAK,CAAC,cAAc,EACpB,KAAK,EAAE,IAAI,EAAE,CAAC,EAAE,EAAE;QAChB,MAAM,QAAQ,GAAG,KAAK,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QAC1C,MAAM,UAAU,GAAG,KAAK,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC;QACjD,MAAM,qBAAqB,GAAG;YAC5B,GAAG,KAAK,CAAC,uBAAuB;YAChC,GAAG,IAAI,CAAC,eAAe;YACvB,GAAG,UAAU,CAAC,eAAe;SAC9B,CAAC;QACF,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC;YAClC,KAAK,EAAE,QAAQ;YACf,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,MAAM,EAAE,KAAK,CAAC,MAAM;YACpB,eAAe,EAAE,qBAAqB;YACtC,UAAU,EAAE,KAAK,CAAC,UAAU;YAC5B,WAAW,EAAE,KAAK,CAAC,WAAW;YAC9B,OAAO,EAAE,KAAK,CAAC,OAAO;YACtB,UAAU,EAAE,KAAK,CAAC,UAAU;YAC5B,iBAAiB,EAAE,KAAK,CAAC,iBAAiB;SAC3C,CAAC,CAAC;QACH,+DAA+D;QAC/D,kEAAkE;QAClE,kEAAkE;QAClE,gEAAgE;QAChE,iDAAiD;QACjD,MAAM,OAAO,GAAG,aAAa,CAC3B,MAAM,CAAC,KAAK,CAAC,UAAU,EACvB,UAAU,CAAC,SAAS,CACrB,CAAC;QACF,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,CAAC;IAC7B,CAAC,CACF,CAAC;IAEF,MAAM,QAAQ,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,E
AAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;IAC/C,MAAM,eAAe,GAAG,KAAK,CAAC,eAAe;QAC3C,CAAC,CAAC,YAAY,CACV,QAAQ,EACR,KAAK,CAAC,UAAU,IAAI,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,cAAc,CAAC,KAAK,CAAC,UAAU,CAAC,CACxE;QACH,CAAC,CAAC,QAAQ,CAAC;IAEb,MAAM,KAAK,GAAG,MAAM,cAAc,CAAC,eAAe,EAAE;QAClD,KAAK,EAAE,KAAK,CAAC,KAAK;QAClB,gBAAgB,EAAE,kBAAkB;KACrC,CAAC,CAAC;IAEH,MAAM,YAAY,GAChB,OAAO,CAAC,MAAM,CACZ,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CACP,CAAC;QACD,IAAI,CAAC,KAAK,CACR,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,GAAG,kBAAkB,CACnE,EACH,CAAC,CACF;QACD,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC;YAClB,IAAI,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,KAAK,GAAG,IAAI,CAAC,GAAG,kBAAkB,CAAC,CAAC;IAC1D,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAChC,CAAC,YAAY,GAAG,kBAAkB,CAAC,GAAG,IAAI,CAC3C,CAAC;IAEF,MAAM,QAAQ,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC;IACjE,MAAM,uBAAuB,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,gBAAgB,CAAC,CAAC;IAE9E,0EAA0E;IAC1E,2EAA2E;IAC3E,yEAAyE;IACzE,0EAA0E;IAC1E,4EAA4E;IAC5E,mEAAmE;IACnE,MAAM,UAAU,GAAG,KAAK,CAAC,KAAK,GAAG,IAAI,CAAC;IACtC,MAAM,aAAa,GAAG,OAAO,CAAC,GAAG,CAC/B,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,OAAO,CAAC,UAAU,CACnD,CAAC;IACF,MAAM,sBAAsB,GAC1B,KAAK,CAAC,UAAU,KAAK,KAAK;QAC1B,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,UAAU,KAAK,SAAS,CAAC,CAAC;IAE1D,IAAI,UAAuC,CAAC;IAC5C,IAAI,sBAAsB,EAAE,CAAC;QAC3B,UAAU,GAAG,EAAE,CAAC;QAChB,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,cAAc,GAAG,OAAO,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC,UAAU,IAAI,EAAE,CAAC;YAC3D,KAAK,MAAM,CAAC,IAAI,cAAc,EAAE,CAAC;gBAC/B,UAAU,CAAC,IAAI,CAAC;oBACd,IAAI,EAAE,CAAC,CAAC,IAAI;oBACZ,KAAK,EAAE,CAAC,CAAC,KAAK,GAAG,SAAS;oBAC1B,GAAG,EAAE,CAAC,CAAC,GAAG,GAAG,SAAS;iBACvB,CAAC,CAAC;YACL,CAAC;YACD,SAAS,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,UAAU,CAAC;QACpD,CAAC;QACD,KAAK,CACH,oBAAoB,UAAU,CAAC,MA
AM,2BAA2B,OAAO,CAAC,MAAM,WAAW,CAC1F,CAAC;IACJ,CAAC;SAAM,IAAI,KAAK,CAAC,UAAU,KAAK,KAAK,EAAE,CAAC;QACtC,MAAM,OAAO,GAAG,OAAO;aACpB,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,UAAU,KAAK,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;aAC3D,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAC3B,KAAK,CACH,qCAAqC,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC,MAAM,0CAA0C,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,6FAA6F,CAC/N,CAAC;IACJ,CAAC;IAED,OAAO;QACL,KAAK;QACL,SAAS,EAAE,WAAW;QACtB,QAAQ,EAAE;YACR,UAAU,EAAE,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;YAC9D,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC;YAChD,eAAe;SAChB;QACD,uBAAuB;QACvB,UAAU;QACV,QAAQ;KACT,CAAC;AACJ,CAAC"}
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import type { ResolvedModel, Voice } from "../speech-provider.js";
|
|
2
|
+
import type { ResolvedSTTModel } from "../speech-to-text-provider.js";
|
|
3
|
+
import type { TimestampMode } from "../timestamps.js";
|
|
2
4
|
export interface ConversationTurn<V extends Voice = Voice> {
|
|
3
5
|
readonly model?: string | ResolvedModel<V>;
|
|
4
6
|
readonly providerOptions?: Record<string, unknown>;
|
|
@@ -27,6 +29,20 @@ export interface GenerateConversationOptions<V extends Voice = Voice> {
|
|
|
27
29
|
*/
|
|
28
30
|
readonly normalizeVolume?: boolean;
|
|
29
31
|
readonly providerOptions?: Record<string, unknown>;
|
|
32
|
+
/**
|
|
33
|
+
* Override the STT provider used for the derived-timestamps path. Construct
|
|
34
|
+
* via a factory (e.g. `createOpenAISTT({ apiKey })("whisper-1")`). Only
|
|
35
|
+
* consulted when the TTS provider can't supply timestamps natively. Defaults
|
|
36
|
+
* to OpenAI Whisper read from `OPENAI_API_KEY`.
|
|
37
|
+
*/
|
|
38
|
+
readonly timestampProvider?: ResolvedSTTModel;
|
|
39
|
+
/**
|
|
40
|
+
* Controls whether the returned `SpeechResult` includes word-level
|
|
41
|
+
* timestamps. Default `"auto"`. On the stitch path each turn's timestamps
|
|
42
|
+
* are offset by cumulative duration + gap and concatenated flat; on the
|
|
43
|
+
* native path the mixed audio yields a flat list without speaker labels.
|
|
44
|
+
*/
|
|
45
|
+
readonly timestamps?: TimestampMode;
|
|
30
46
|
readonly turns: readonly ConversationTurn<V>[];
|
|
31
47
|
/**
|
|
32
48
|
* Target loudness in dBFS for `normalizeVolume`. Must be ≤ 0 (0 dBFS is
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/conversation/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/conversation/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;AAClE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AACtE,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAEtD,MAAM,WAAW,gBAAgB,CAAC,CAAC,SAAS,KAAK,GAAG,KAAK;IACvD,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;IAC3C,QAAQ,CAAC,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACnD,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,KAAK,EAAE,CAAC,CAAC;CACnB;AAED,MAAM,WAAW,2BAA2B,CAAC,CAAC,SAAS,KAAK,GAAG,KAAK;IAClE,QAAQ,CAAC,WAAW,CAAC,EAAE,WAAW,CAAC;IACnC,QAAQ,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC1C,QAAQ,CAAC,cAAc,CAAC,EAAE,MAAM,CAAC;IACjC,QAAQ,CAAC,UAAU,CAAC,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;IAC3C;;;;;;;;;;;OAWG;IACH,QAAQ,CAAC,eAAe,CAAC,EAAE,OAAO,CAAC;IACnC,QAAQ,CAAC,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACnD;;;;;OAKG;IACH,QAAQ,CAAC,iBAAiB,CAAC,EAAE,gBAAgB,CAAC;IAC9C;;;;;OAKG;IACH,QAAQ,CAAC,UAAU,CAAC,EAAE,aAAa,CAAC;IACpC,QAAQ,CAAC,KAAK,EAAE,SAAS,gBAAgB,CAAC,CAAC,CAAC,EAAE,CAAC;IAC/C;;;;;OAKG;IACH,QAAQ,CAAC,UAAU,CAAC,EAAE,MAAM,CAAC;CAC9B"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { ResolvedSTTModel } from "./speech-to-text-provider.js";
|
|
2
|
+
import type { WordTimestamp } from "./timestamps.js";
|
|
3
|
+
/**
|
|
4
|
+
* Pipes synthesized audio through an STT provider to produce word-level
|
|
5
|
+
* timestamps. Shared between `generateSpeech()` and conversation paths.
|
|
6
|
+
*/
|
|
7
|
+
export declare function deriveTimestampsViaSTT(args: {
|
|
8
|
+
ttsModel: string;
|
|
9
|
+
audio: Uint8Array;
|
|
10
|
+
mediaType: string;
|
|
11
|
+
timestampProvider: ResolvedSTTModel | undefined;
|
|
12
|
+
abortSignal: AbortSignal | undefined;
|
|
13
|
+
}): Promise<readonly WordTimestamp[]>;
|
|
14
|
+
//# sourceMappingURL=derive-timestamps.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"derive-timestamps.d.ts","sourceRoot":"","sources":["../src/derive-timestamps.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AAErE,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAYrD;;;GAGG;AACH,wBAAsB,sBAAsB,CAAC,IAAI,EAAE;IACjD,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,EAAE,UAAU,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,iBAAiB,EAAE,gBAAgB,GAAG,SAAS,CAAC;IAChD,WAAW,EAAE,WAAW,GAAG,SAAS,CAAC;CACtC,GAAG,OAAO,CAAC,SAAS,aAAa,EAAE,CAAC,CAqBpC"}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { MissingApiKeyError, TimestampKeyMissingError } from "./errors.js";
|
|
2
|
+
import { OpenAISpeechToTextProvider } from "./stt-providers/openai/index.js";
|
|
3
|
+
/**
|
|
4
|
+
* Default STT model used on the derived-timestamps path when the caller
|
|
5
|
+
* hasn't supplied a `timestampProvider` override. Reads `OPENAI_API_KEY`
|
|
6
|
+
* from the environment via the provider's own key resolution.
|
|
7
|
+
*/
|
|
8
|
+
function defaultTimestampProvider() {
|
|
9
|
+
const provider = new OpenAISpeechToTextProvider();
|
|
10
|
+
return { provider, modelId: provider.defaultModel };
|
|
11
|
+
}
|
|
12
|
+
/**
|
|
13
|
+
* Pipes synthesized audio through an STT provider to produce word-level
|
|
14
|
+
* timestamps. Shared between `generateSpeech()` and conversation paths.
|
|
15
|
+
*/
|
|
16
|
+
export async function deriveTimestampsViaSTT(args) {
|
|
17
|
+
const sttModel = args.timestampProvider ?? defaultTimestampProvider();
|
|
18
|
+
try {
|
|
19
|
+
const { timestamps } = await sttModel.provider.transcribe({
|
|
20
|
+
modelId: sttModel.modelId,
|
|
21
|
+
audio: args.audio,
|
|
22
|
+
mediaType: args.mediaType,
|
|
23
|
+
abortSignal: args.abortSignal,
|
|
24
|
+
});
|
|
25
|
+
return timestamps;
|
|
26
|
+
}
|
|
27
|
+
catch (err) {
|
|
28
|
+
if (err instanceof MissingApiKeyError) {
|
|
29
|
+
throw new TimestampKeyMissingError({
|
|
30
|
+
ttsModel: args.ttsModel,
|
|
31
|
+
sttProvider: `${sttModel.provider.id}/${sttModel.modelId}`,
|
|
32
|
+
envVar: err.envVar,
|
|
33
|
+
});
|
|
34
|
+
}
|
|
35
|
+
throw err;
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
//# sourceMappingURL=derive-timestamps.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"derive-timestamps.js","sourceRoot":"","sources":["../src/derive-timestamps.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,kBAAkB,EAAE,wBAAwB,EAAE,MAAM,aAAa,CAAC;AAE3E,OAAO,EAAE,0BAA0B,EAAE,MAAM,iCAAiC,CAAC;AAG7E;;;;GAIG;AACH,SAAS,wBAAwB;IAC/B,MAAM,QAAQ,GAAG,IAAI,0BAA0B,EAAE,CAAC;IAClD,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,CAAC,YAAY,EAAE,CAAC;AACtD,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,sBAAsB,CAAC,IAM5C;IACC,MAAM,QAAQ,GAAG,IAAI,CAAC,iBAAiB,IAAI,wBAAwB,EAAE,CAAC;IAEtE,IAAI,CAAC;QACH,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,QAAQ,CAAC,QAAQ,CAAC,UAAU,CAAC;YACxD,OAAO,EAAE,QAAQ,CAAC,OAAO;YACzB,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,SAAS,EAAE,IAAI,CAAC,SAAS;YACzB,WAAW,EAAE,IAAI,CAAC,WAAW;SAC9B,CAAC,CAAC;QACH,OAAO,UAAU,CAAC;IACpB,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,IAAI,GAAG,YAAY,kBAAkB,EAAE,CAAC;YACtC,MAAM,IAAI,wBAAwB,CAAC;gBACjC,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,WAAW,EAAE,GAAG,QAAQ,CAAC,QAAQ,CAAC,EAAE,IAAI,QAAQ,CAAC,OAAO,EAAE;gBAC1D,MAAM,EAAE,GAAG,CAAC,MAAM;aACnB,CAAC,CAAC;QACL,CAAC;QACD,MAAM,GAAG,CAAC;IACZ,CAAC;AACH,CAAC"}
|
package/dist/errors.d.ts
CHANGED
|
@@ -23,4 +23,29 @@ export declare class StreamingNotSupportedError extends SpeechSDKError {
|
|
|
23
23
|
export declare class VolumeAdjustmentUnsupportedError extends SpeechSDKError {
|
|
24
24
|
constructor(model: string);
|
|
25
25
|
}
|
|
26
|
+
/**
|
|
27
|
+
* Thrown by `resolveApiKey` when neither the `apiKey` option nor the provider's
|
|
28
|
+
* env var is set. Carries the provider name + env var so callers can build
|
|
29
|
+
* their own actionable error (see `TimestampKeyMissingError`).
|
|
30
|
+
*/
|
|
31
|
+
export declare class MissingApiKeyError extends SpeechSDKError {
|
|
32
|
+
readonly providerName: string;
|
|
33
|
+
readonly envVar: string;
|
|
34
|
+
constructor(options: {
|
|
35
|
+
providerName: string;
|
|
36
|
+
envVar: string;
|
|
37
|
+
});
|
|
38
|
+
}
|
|
39
|
+
/**
|
|
40
|
+
* Thrown when `timestamps: "on"` is requested but the SDK can't obtain word
|
|
41
|
+
* timestamps because the required API key for the fallback STT provider is
|
|
42
|
+
* missing. Message names the env vars that would unblock the request.
|
|
43
|
+
*/
|
|
44
|
+
export declare class TimestampKeyMissingError extends SpeechSDKError {
|
|
45
|
+
constructor(options: {
|
|
46
|
+
ttsModel: string;
|
|
47
|
+
sttProvider: string;
|
|
48
|
+
envVar: string;
|
|
49
|
+
});
|
|
50
|
+
}
|
|
26
51
|
//# sourceMappingURL=errors.d.ts.map
|
package/dist/errors.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"errors.d.ts","sourceRoot":"","sources":["../src/errors.ts"],"names":[],"mappings":"AAAA,qBAAa,cAAe,SAAQ,KAAK;gBAC3B,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE;QAAE,KAAK,CAAC,EAAE,OAAO,CAAA;KAAE;CAI3D;AAED,qBAAa,QAAS,SAAQ,cAAc;IAC1C,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,YAAY,CAAC,EAAE,OAAO,CAAC;IAChC,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;gBAGrB,OAAO,EAAE,MAAM,EACf,OAAO,EAAE;QACP,UAAU,EAAE,MAAM,CAAC;QACnB,KAAK,EAAE,MAAM,CAAC;QACd,YAAY,CAAC,EAAE,OAAO,CAAC;QACvB,KAAK,CAAC,EAAE,OAAO,CAAC;KACjB;CAQJ;AAED,qBAAa,sBAAuB,SAAQ,cAAc;gBAC5C,OAAO,CAAC,EAAE,MAAM;CAI7B;AAED,qBAAa,0BAA2B,SAAQ,cAAc;gBAChD,KAAK,EAAE,MAAM;CAM1B;AAED,qBAAa,gCAAiC,SAAQ,cAAc;gBACtD,KAAK,EAAE,MAAM;CAM1B"}
|
|
1
|
+
{"version":3,"file":"errors.d.ts","sourceRoot":"","sources":["../src/errors.ts"],"names":[],"mappings":"AAAA,qBAAa,cAAe,SAAQ,KAAK;gBAC3B,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE;QAAE,KAAK,CAAC,EAAE,OAAO,CAAA;KAAE;CAI3D;AAED,qBAAa,QAAS,SAAQ,cAAc;IAC1C,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,YAAY,CAAC,EAAE,OAAO,CAAC;IAChC,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;gBAGrB,OAAO,EAAE,MAAM,EACf,OAAO,EAAE;QACP,UAAU,EAAE,MAAM,CAAC;QACnB,KAAK,EAAE,MAAM,CAAC;QACd,YAAY,CAAC,EAAE,OAAO,CAAC;QACvB,KAAK,CAAC,EAAE,OAAO,CAAC;KACjB;CAQJ;AAED,qBAAa,sBAAuB,SAAQ,cAAc;gBAC5C,OAAO,CAAC,EAAE,MAAM;CAI7B;AAED,qBAAa,0BAA2B,SAAQ,cAAc;gBAChD,KAAK,EAAE,MAAM;CAM1B;AAED,qBAAa,gCAAiC,SAAQ,cAAc;gBACtD,KAAK,EAAE,MAAM;CAM1B;AAED;;;;GAIG;AACH,qBAAa,kBAAmB,SAAQ,cAAc;IACpD,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;gBAEZ,OAAO,EAAE;QAAE,YAAY,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE;CAQ9D;AAED;;;;GAIG;AACH,qBAAa,wBAAyB,SAAQ,cAAc;gBAC9C,OAAO,EAAE;QACnB,QAAQ,EAAE,MAAM,CAAC;QACjB,WAAW,EAAE,MAAM,CAAC;QACpB,MAAM,EAAE,MAAM,CAAC;KAChB;CAQF"}
|
package/dist/errors.js
CHANGED
|
@@ -34,4 +34,32 @@ export class VolumeAdjustmentUnsupportedError extends SpeechSDKError {
|
|
|
34
34
|
this.name = "VolumeAdjustmentUnsupportedError";
|
|
35
35
|
}
|
|
36
36
|
}
|
|
37
|
+
/**
|
|
38
|
+
* Thrown by `resolveApiKey` when neither the `apiKey` option nor the provider's
|
|
39
|
+
* env var is set. Carries the provider name + env var so callers can build
|
|
40
|
+
* their own actionable error (see `TimestampKeyMissingError`).
|
|
41
|
+
*/
|
|
42
|
+
export class MissingApiKeyError extends SpeechSDKError {
|
|
43
|
+
providerName;
|
|
44
|
+
envVar;
|
|
45
|
+
constructor(options) {
|
|
46
|
+
super(`${options.providerName} API key is required. Pass it via apiKey option or set the ${options.envVar} environment variable.`);
|
|
47
|
+
this.name = "MissingApiKeyError";
|
|
48
|
+
this.providerName = options.providerName;
|
|
49
|
+
this.envVar = options.envVar;
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
/**
|
|
53
|
+
* Thrown when `timestamps: "on"` is requested but the SDK can't obtain word
|
|
54
|
+
* timestamps because the required API key for the fallback STT provider is
|
|
55
|
+
* missing. Message names the env vars that would unblock the request.
|
|
56
|
+
*/
|
|
57
|
+
export class TimestampKeyMissingError extends SpeechSDKError {
|
|
58
|
+
constructor(options) {
|
|
59
|
+
super(`${options.ttsModel} does not return word timestamps natively. ` +
|
|
60
|
+
`Set ${options.envVar} to enable the ${options.sttProvider} fallback, ` +
|
|
61
|
+
`pass a configured timestampProvider, or use timestamps: 'auto' | 'off'.`);
|
|
62
|
+
this.name = "TimestampKeyMissingError";
|
|
63
|
+
}
|
|
64
|
+
}
|
|
37
65
|
//# sourceMappingURL=errors.js.map
|
package/dist/errors.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"errors.js","sourceRoot":"","sources":["../src/errors.ts"],"names":[],"mappings":"AAAA,MAAM,OAAO,cAAe,SAAQ,KAAK;IACvC,YAAY,OAAe,EAAE,OAA6B;QACxD,KAAK,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;QACxB,IAAI,CAAC,IAAI,GAAG,gBAAgB,CAAC;IAC/B,CAAC;CACF;AAED,MAAM,OAAO,QAAS,SAAQ,cAAc;IACjC,UAAU,CAAS;IACnB,YAAY,CAAW;IACvB,KAAK,CAAS;IAEvB,YACE,OAAe,EACf,OAKC;QAED,KAAK,CAAC,OAAO,EAAE,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC;QACzC,IAAI,CAAC,IAAI,GAAG,UAAU,CAAC;QACvB,IAAI,CAAC,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACrC,IAAI,CAAC,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC;QAC3B,IAAI,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC;IAC3C,CAAC;CACF;AAED,MAAM,OAAO,sBAAuB,SAAQ,cAAc;IACxD,YAAY,OAAgB;QAC1B,KAAK,CAAC,OAAO,IAAI,gCAAgC,CAAC,CAAC;QACnD,IAAI,CAAC,IAAI,GAAG,wBAAwB,CAAC;IACvC,CAAC;CACF;AAED,MAAM,OAAO,0BAA2B,SAAQ,cAAc;IAC5D,YAAY,KAAa;QACvB,KAAK,CACH,iCAAiC,KAAK,iCAAiC,CACxE,CAAC;QACF,IAAI,CAAC,IAAI,GAAG,4BAA4B,CAAC;IAC3C,CAAC;CACF;AAED,MAAM,OAAO,gCAAiC,SAAQ,cAAc;IAClE,YAAY,KAAa;QACvB,KAAK,CACH,kCAAkC,KAAK,gEAAgE,CACxG,CAAC;QACF,IAAI,CAAC,IAAI,GAAG,kCAAkC,CAAC;IACjD,CAAC;CACF"}
|
|
1
|
+
{"version":3,"file":"errors.js","sourceRoot":"","sources":["../src/errors.ts"],"names":[],"mappings":"AAAA,MAAM,OAAO,cAAe,SAAQ,KAAK;IACvC,YAAY,OAAe,EAAE,OAA6B;QACxD,KAAK,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;QACxB,IAAI,CAAC,IAAI,GAAG,gBAAgB,CAAC;IAC/B,CAAC;CACF;AAED,MAAM,OAAO,QAAS,SAAQ,cAAc;IACjC,UAAU,CAAS;IACnB,YAAY,CAAW;IACvB,KAAK,CAAS;IAEvB,YACE,OAAe,EACf,OAKC;QAED,KAAK,CAAC,OAAO,EAAE,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC;QACzC,IAAI,CAAC,IAAI,GAAG,UAAU,CAAC;QACvB,IAAI,CAAC,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACrC,IAAI,CAAC,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC;QAC3B,IAAI,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC;IAC3C,CAAC;CACF;AAED,MAAM,OAAO,sBAAuB,SAAQ,cAAc;IACxD,YAAY,OAAgB;QAC1B,KAAK,CAAC,OAAO,IAAI,gCAAgC,CAAC,CAAC;QACnD,IAAI,CAAC,IAAI,GAAG,wBAAwB,CAAC;IACvC,CAAC;CACF;AAED,MAAM,OAAO,0BAA2B,SAAQ,cAAc;IAC5D,YAAY,KAAa;QACvB,KAAK,CACH,iCAAiC,KAAK,iCAAiC,CACxE,CAAC;QACF,IAAI,CAAC,IAAI,GAAG,4BAA4B,CAAC;IAC3C,CAAC;CACF;AAED,MAAM,OAAO,gCAAiC,SAAQ,cAAc;IAClE,YAAY,KAAa;QACvB,KAAK,CACH,kCAAkC,KAAK,gEAAgE,CACxG,CAAC;QACF,IAAI,CAAC,IAAI,GAAG,kCAAkC,CAAC;IACjD,CAAC;CACF;AAED;;;;GAIG;AACH,MAAM,OAAO,kBAAmB,SAAQ,cAAc;IAC3C,YAAY,CAAS;IACrB,MAAM,CAAS;IAExB,YAAY,OAAiD;QAC3D,KAAK,CACH,GAAG,OAAO,CAAC,YAAY,8DAA8D,OAAO,CAAC,MAAM,wBAAwB,CAC5H,CAAC;QACF,IAAI,CAAC,IAAI,GAAG,oBAAoB,CAAC;QACjC,IAAI,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC;QACzC,IAAI,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IAC/B,CAAC;CACF;AAED;;;;GAIG;AACH,MAAM,OAAO,wBAAyB,SAAQ,cAAc;IAC1D,YAAY,OAIX;QACC,KAAK,CACH,GAAG,OAAO,CAAC,QAAQ,6CAA6C;YAC9D,OAAO,OAAO,CAAC,MAAM,kBAAkB,OAAO,CAAC,WAAW,aAAa;YACvE,yEAAyE,CAC5E,CAAC;QACF,IAAI,CAAC,IAAI,GAAG,0BAA0B,CAAC;IACzC,CAAC;CACF"}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { GenerateConversationOptions } from "./conversation/types.js";
|
|
2
|
-
import type
|
|
2
|
+
import { type Voice } from "./speech-provider.js";
|
|
3
3
|
import type { SpeechResult } from "./speech-result.js";
|
|
4
4
|
export { ConversationInputError, DialogueConstraintError, StitchUnsupportedError, } from "./conversation/errors.js";
|
|
5
5
|
export type { ConversationTurn, GenerateConversationOptions, } from "./conversation/types.js";
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"generate-conversation.d.ts","sourceRoot":"","sources":["../src/generate-conversation.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,2BAA2B,EAAE,MAAM,yBAAyB,CAAC;
|
|
1
|
+
{"version":3,"file":"generate-conversation.d.ts","sourceRoot":"","sources":["../src/generate-conversation.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,2BAA2B,EAAE,MAAM,yBAAyB,CAAC;AAO3E,OAAO,EAGL,KAAK,KAAK,EACX,MAAM,sBAAsB,CAAC;AAC9B,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAMvD,OAAO,EACL,sBAAsB,EACtB,uBAAuB,EACvB,sBAAsB,GACvB,MAAM,0BAA0B,CAAC;AAClC,YAAY,EACV,gBAAgB,EAChB,2BAA2B,GAC5B,MAAM,yBAAyB,CAAC;AAMjC,wBAAsB,oBAAoB,CAAC,CAAC,SAAS,KAAK,GAAG,KAAK,EAChE,OAAO,EAAE,2BAA2B,CAAC,CAAC,CAAC,GACtC,OAAO,CAAC,YAAY,CAAC,CAwFvB"}
|
|
@@ -3,8 +3,11 @@ import { computeAudioDuration } from "./audio-duration.js";
|
|
|
3
3
|
import { chooseConversationPath } from "./conversation/dispatch.js";
|
|
4
4
|
import { ConversationInputError } from "./conversation/errors.js";
|
|
5
5
|
import { validateConversationInput } from "./conversation/validate.js";
|
|
6
|
+
import { deriveTimestampsViaSTT } from "./derive-timestamps.js";
|
|
6
7
|
import { ApiError, NoSpeechGeneratedError } from "./errors.js";
|
|
8
|
+
import { debug } from "./logger.js";
|
|
7
9
|
import { resolveModel } from "./resolve-provider.js";
|
|
10
|
+
import { modelDeclaresNativeTimestamps, } from "./speech-provider.js";
|
|
8
11
|
import { DefaultGeneratedAudioFile } from "./speech-result.js";
|
|
9
12
|
// biome-ignore lint/performance/noBarrelFile: public entry point — re-export error classes so callers get fn + types + errors from one import
|
|
10
13
|
export { ConversationInputError, DialogueConstraintError, StitchUnsupportedError, } from "./conversation/errors.js";
|
|
@@ -58,6 +61,8 @@ export async function generateConversation(options) {
|
|
|
58
61
|
volumeDbfs: options.volumeDbfs,
|
|
59
62
|
abortSignal: options.abortSignal,
|
|
60
63
|
headers: options.headers,
|
|
64
|
+
timestamps: options.timestamps ?? "auto",
|
|
65
|
+
timestampProvider: options.timestampProvider,
|
|
61
66
|
});
|
|
62
67
|
if (stitched.audio.length === 0) {
|
|
63
68
|
throw new NoSpeechGeneratedError();
|
|
@@ -81,6 +86,7 @@ export async function generateConversation(options) {
|
|
|
81
86
|
metadata,
|
|
82
87
|
providerMetadata: { turns: stitched.providerMetadataPerTurn },
|
|
83
88
|
warnings: stitched.warnings.length > 0 ? [...stitched.warnings] : undefined,
|
|
89
|
+
timestamps: stitched.timestamps,
|
|
84
90
|
};
|
|
85
91
|
}
|
|
86
92
|
async function runNative(args) {
|
|
@@ -110,12 +116,30 @@ async function runNative(args) {
|
|
|
110
116
|
const dialogueProviderOptions = stitchOpts
|
|
111
117
|
? { ...options.providerOptions, ...stitchOpts.providerOptions }
|
|
112
118
|
: options.providerOptions;
|
|
119
|
+
const timestampMode = options.timestamps ?? "auto";
|
|
120
|
+
const hasNativeDialogueTimestamps = modelDeclaresNativeTimestamps(resolved);
|
|
121
|
+
const shouldRequestNative = (timestampMode === "on" || timestampMode === "auto") &&
|
|
122
|
+
hasNativeDialogueTimestamps;
|
|
123
|
+
const dialogueId = `${resolved.provider.id}/${resolved.modelId}`;
|
|
124
|
+
if (timestampMode === "off") {
|
|
125
|
+
debug(`${dialogueId} (dialogue): timestamps: "off" — skipping alignment.`);
|
|
126
|
+
}
|
|
127
|
+
else if (shouldRequestNative) {
|
|
128
|
+
debug(`${dialogueId} (dialogue): timestamps: "${timestampMode}" — requesting native dialogue alignment.`);
|
|
129
|
+
}
|
|
130
|
+
else if (timestampMode === "auto") {
|
|
131
|
+
debug(`${dialogueId} (dialogue): timestamps: "auto" — dialogue endpoint has no native alignment; skipping. Pass timestamps: "on" to derive from the mixed audio via STT (flat list, no speaker labels).`);
|
|
132
|
+
}
|
|
133
|
+
else {
|
|
134
|
+
debug(`${dialogueId} (dialogue): timestamps: "on" but no native dialogue alignment — will transcribe mixed audio via STT after rendering (adds a round-trip).`);
|
|
135
|
+
}
|
|
113
136
|
const result = await pRetry(() => generateDialogue({
|
|
114
137
|
modelId: resolved.modelId,
|
|
115
138
|
turns: options.turns.map((t) => ({ voice: t.voice, text: t.text })),
|
|
116
139
|
providerOptions: dialogueProviderOptions,
|
|
117
140
|
abortSignal: options.abortSignal,
|
|
118
141
|
headers: options.headers,
|
|
142
|
+
includeTimestamps: shouldRequestNative,
|
|
119
143
|
}), {
|
|
120
144
|
retries: maxRetries,
|
|
121
145
|
signal: options.abortSignal,
|
|
@@ -149,6 +173,15 @@ async function runNative(args) {
|
|
|
149
173
|
});
|
|
150
174
|
const computedDuration = await computeAudioDuration(audio.uint8Array, outputMediaType);
|
|
151
175
|
const audioDurationMs = computedDuration ?? result.audioDurationMs;
|
|
176
|
+
const timestamps = await resolveNativeDialogueTimestamps({
|
|
177
|
+
timestampMode,
|
|
178
|
+
nativeTimestamps: result.timestamps,
|
|
179
|
+
audio: audio.uint8Array,
|
|
180
|
+
mediaType: outputMediaType,
|
|
181
|
+
ttsModel: `${resolved.provider.id}/${resolved.modelId}`,
|
|
182
|
+
timestampProvider: options.timestampProvider,
|
|
183
|
+
abortSignal: options.abortSignal,
|
|
184
|
+
});
|
|
152
185
|
const inputChars = options.turns.reduce((n, t) => n + t.text.length, 0);
|
|
153
186
|
const metadata = {
|
|
154
187
|
latencyMs,
|
|
@@ -162,6 +195,32 @@ async function runNative(args) {
|
|
|
162
195
|
metadata,
|
|
163
196
|
providerMetadata: result.providerMetadata,
|
|
164
197
|
warnings: warnings.length > 0 ? warnings : undefined,
|
|
198
|
+
timestamps,
|
|
165
199
|
};
|
|
166
200
|
}
|
|
201
|
+
/**
 * Chooses the word timestamps to report for a conversation rendered through
 * the single-call (native) dialogue path.
 *
 * Resolution order:
 *   1. `timestampMode === "off"`            → `undefined`, even when the
 *      provider handed back alignment data.
 *   2. non-empty `nativeTimestamps`         → returned unchanged.
 *   3. `timestampMode === "on"`             → whatever `deriveTimestampsViaSTT`
 *      produces for the rendered audio. Per the original note this is a flat
 *      word list without speaker labels, a limitation of one-call dialogue
 *      rendering.
 *   4. anything else (e.g. `"auto"` with no native alignment) → `undefined`.
 *
 * @param args - `{ timestampMode, nativeTimestamps, audio, mediaType,
 *   ttsModel, timestampProvider, abortSignal }`; the last five are only read
 *   when the STT fallback runs.
 * @returns the resolved timestamps, or `undefined` when none apply.
 */
async function resolveNativeDialogueTimestamps(args) {
    const { timestampMode, nativeTimestamps } = args;
    if (timestampMode === "off") {
        return undefined;
    }
    // Native alignment always wins over the STT round-trip.
    const nativeCount = nativeTimestamps?.length ?? 0;
    if (nativeCount > 0) {
        return nativeTimestamps;
    }
    if (timestampMode !== "on") {
        return undefined;
    }
    const { ttsModel, audio, mediaType, timestampProvider, abortSignal } = args;
    return await deriveTimestampsViaSTT({
        ttsModel,
        audio,
        mediaType,
        timestampProvider,
        abortSignal,
    });
}
|
|
167
226
|
//# sourceMappingURL=generate-conversation.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"generate-conversation.js","sourceRoot":"","sources":["../src/generate-conversation.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,SAAS,CAAC;AAC7B,OAAO,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AAC3D,OAAO,EAAE,sBAAsB,EAAE,MAAM,4BAA4B,CAAC;AACpE,OAAO,EAAE,sBAAsB,EAAE,MAAM,0BAA0B,CAAC;AAElE,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AACvE,OAAO,EAAE,QAAQ,EAAE,sBAAsB,EAAE,MAAM,aAAa,CAAC;
|
|
1
|
+
{"version":3,"file":"generate-conversation.js","sourceRoot":"","sources":["../src/generate-conversation.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,SAAS,CAAC;AAC7B,OAAO,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AAC3D,OAAO,EAAE,sBAAsB,EAAE,MAAM,4BAA4B,CAAC;AACpE,OAAO,EAAE,sBAAsB,EAAE,MAAM,0BAA0B,CAAC;AAElE,OAAO,EAAE,yBAAyB,EAAE,MAAM,4BAA4B,CAAC;AACvE,OAAO,EAAE,sBAAsB,EAAE,MAAM,wBAAwB,CAAC;AAChE,OAAO,EAAE,QAAQ,EAAE,sBAAsB,EAAE,MAAM,aAAa,CAAC;AAC/D,OAAO,EAAE,KAAK,EAAE,MAAM,aAAa,CAAC;AAEpC,OAAO,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAC;AACrD,OAAO,EACL,6BAA6B,GAG9B,MAAM,sBAAsB,CAAC;AAE9B,OAAO,EAAE,yBAAyB,EAAE,MAAM,oBAAoB,CAAC;AAI/D,8IAA8I;AAC9I,OAAO,EACL,sBAAsB,EACtB,uBAAuB,EACvB,sBAAsB,GACvB,MAAM,0BAA0B,CAAC;AAMlC,MAAM,cAAc,GAAG,GAAG,CAAC;AAC3B,MAAM,uBAAuB,GAAG,CAAC,CAAC;AAClC,MAAM,mBAAmB,GAAG,CAAC,CAAC;AAE9B,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,OAAuC;IAEvC,yBAAyB,CAAC,OAAO,CAAC,CAAC;IAEnC,MAAM,eAAe,GAAuB,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QACrE,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,IAAI,OAAO,CAAC,KAAK,CAAC;QAC1C,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,MAAM,IAAI,KAAK,CAAC,yCAAyC,CAAC,CAAC;QAC7D,CAAC;QACD,OAAO,YAAY,CAAC,KAAK,EAAE,EAAE,MAAM,EAAE,OAAO,CAAC,MAAM,EAAE,CAAqB,CAAC;IAC7E,CAAC,CAAC,CAAC;IAEH,MAAM,IAAI,GAAG,sBAAsB,CAAC;QAClC,eAAe;QACf,KAAK,EAAE,OAAO,CAAC,KAAK;KACrB,CAAC,CAAC;IAEH,IAAI,IAAI,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;QAC3B,0EAA0E;QAC1E,uEAAuE;QACvE,0EAA0E;QAC1E,wEAAwE;QACxE,mEAAmE;QACnE,2BAA2B;QAC3B,MAAM,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,SAAS,CAC1C,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,KAAK,SAAS,CACvC,CAAC;QACF,IAAI,YAAY,KAAK,CAAC,CAAC,EAAE,CAAC;YACxB,MAAM,IAAI,sBAAsB,CAC9B,SAAS,YAAY,iCAAiC,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,IAAI,IAAI,CAAC,QAAQ,CAAC,OAAO,gMAAgM,CACzS,CAAC;QACJ,CAAC;QACD,OAAO,MAAM,SAAS,CAAC;YACrB,OAAO;YACP,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,UAAU,EAAE,OAAO,CAAC,UAAU,IAAI,mBAAmB;SACtD,CAAC,CAAC;IACL,CAAC;IAED,uEAAuE;IACvE,sEAAsE;IACtE,2EAA2E;IAC3E,MAAM,EAAE,SAAS,EAAE,GAAG,MAAM,MAAM,CAAC,0BAA0B,CAAC,CAAC;IAC/D,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC;QAC/B,eAAe;QACf,KAAK,EAAE,OAAO,CAAC
,KAAK;QACpB,oBAAoB,EAAE,IAAI,CAAC,oBAAoB;QAC/C,uBAAuB,EAAE,OAAO,CAAC,eAAe;QAChD,MAAM,EAAE,OAAO,CAAC,MAAM;QACtB,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,cAAc;QACtC,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,uBAAuB;QACjE,UAAU,EAAE,OAAO,CAAC,UAAU,IAAI,mBAAmB;QACrD,eAAe,EAAE,OAAO,CAAC,eAAe,IAAI,IAAI;QAChD,UAAU,EAAE,OAAO,CAAC,UAAU;QAC9B,WAAW,EAAE,OAAO,CAAC,WAAW;QAChC,OAAO,EAAE,OAAO,CAAC,OAAO;QACxB,UAAU,EAAE,OAAO,CAAC,UAAU,IAAI,MAAM;QACxC,iBAAiB,EAAE,OAAO,CAAC,iBAAiB;KAC7C,CAAC,CAAC;IAEH,IAAI,QAAQ,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAChC,MAAM,IAAI,sBAAsB,EAAE,CAAC;IACrC,CAAC;IAED,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAC1B,IAAI,GAAG,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CACnD,CAAC;IACF,MAAM,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;IAE1E,MAAM,QAAQ,GAAmB;QAC/B,SAAS,EAAE,QAAQ,CAAC,QAAQ,CAAC,SAAS;QACtC,UAAU,EAAE,QAAQ,CAAC,QAAQ,CAAC,UAAU;QACxC,QAAQ,EAAE,SAAS,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC;QACrE,KAAK,EAAE,MAAM,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;QACzD,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,eAAe,IAAI,IAAI,IAAI;YAC/C,eAAe,EAAE,QAAQ,CAAC,QAAQ,CAAC,eAAe;SACnD,CAAC;KACH,CAAC;IAEF,OAAO;QACL,KAAK,EAAE,IAAI,yBAAyB,CAAC;YACnC,IAAI,EAAE,QAAQ,CAAC,KAAK;YACpB,SAAS,EAAE,QAAQ,CAAC,SAAS;SAC9B,CAAC;QACF,QAAQ;QACR,gBAAgB,EAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,uBAAuB,EAAE;QAC7D,QAAQ,EAAE,QAAQ,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS;QAC3E,UAAU,EAAE,QAAQ,CAAC,UAAU;KAChC,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,SAAS,CAAkB,IAIzC;IACC,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,GAAG,IAAI,CAAC;IAC/C,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;IAEhC,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,gBAAgB,EAAE,CAAC;QACxC,MAAM,IAAI,KAAK,CACb,yBAAyB,QAAQ,CAAC,QAAQ,CAAC,EAAE,IAAI,QAAQ,CAAC,OAAO,oDAAoD,CACtH,CAAC;IACJ,CAAC;IAED,MAAM,gBAAgB,GAAG,QAAQ,CAAC,QAAQ,CAAC,gBAAgB,CAAC
,IAAI,CAC9D,QAAQ,CAAC,QAAQ,CAClB,CAAC;IAEF,uEAAuE;IACvE,0EAA0E;IAC1E,sEAAsE;IACtE,uEAAuE;IACvE,+CAA+C;IAC/C,MAAM,SAAS,GAAG,OAAO,CAAC,eAAe,IAAI,IAAI,CAAC;IAClD,MAAM,UAAU,GAAG,SAAS;QAC1B,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,gBAAgB,EAAE,CAAC,QAAQ,CAAC,OAAO,CAAC;QACxD,CAAC,CAAC,SAAS,CAAC;IACd,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,IAAI,SAAS,IAAI,CAAC,UAAU,EAAE,CAAC;QAC7B,QAAQ,CAAC,IAAI,CACX,GAAG,QAAQ,CAAC,QAAQ,CAAC,EAAE,IAAI,QAAQ,CAAC,OAAO,6IAA6I,CACzL,CAAC;IACJ,CAAC;IAED,sEAAsE;IACtE,wEAAwE;IACxE,uEAAuE;IACvE,kDAAkD;IAClD,MAAM,uBAAuB,GAAG,UAAU;QACxC,CAAC,CAAC,EAAE,GAAG,OAAO,CAAC,eAAe,EAAE,GAAG,UAAU,CAAC,eAAe,EAAE;QAC/D,CAAC,CAAC,OAAO,CAAC,eAAe,CAAC;IAE5B,MAAM,aAAa,GAAG,OAAO,CAAC,UAAU,IAAI,MAAM,CAAC;IACnD,MAAM,2BAA2B,GAAG,6BAA6B,CAAC,QAAQ,CAAC,CAAC;IAC5E,MAAM,mBAAmB,GACvB,CAAC,aAAa,KAAK,IAAI,IAAI,aAAa,KAAK,MAAM,CAAC;QACpD,2BAA2B,CAAC;IAE9B,MAAM,UAAU,GAAG,GAAG,QAAQ,CAAC,QAAQ,CAAC,EAAE,IAAI,QAAQ,CAAC,OAAO,EAAE,CAAC;IACjE,IAAI,aAAa,KAAK,KAAK,EAAE,CAAC;QAC5B,KAAK,CAAC,GAAG,UAAU,sDAAsD,CAAC,CAAC;IAC7E,CAAC;SAAM,IAAI,mBAAmB,EAAE,CAAC;QAC/B,KAAK,CACH,GAAG,UAAU,6BAA6B,aAAa,2CAA2C,CACnG,CAAC;IACJ,CAAC;SAAM,IAAI,aAAa,KAAK,MAAM,EAAE,CAAC;QACpC,KAAK,CACH,GAAG,UAAU,qLAAqL,CACnM,CAAC;IACJ,CAAC;SAAM,CAAC;QACN,KAAK,CACH,GAAG,UAAU,2IAA2I,CACzJ,CAAC;IACJ,CAAC;IAED,MAAM,MAAM,GAAG,MAAM,MAAM,CACzB,GAAG,EAAE,CACH,gBAAgB,CAAC;QACf,OAAO,EAAE,QAAQ,CAAC,OAAO;QACzB,KAAK,EAAE,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QACnE,eAAe,EAAE,uBAAuB;QACxC,WAAW,EAAE,OAAO,CAAC,WAAW;QAChC,OAAO,EAAE,OAAO,CAAC,OAAO;QACxB,iBAAiB,EAAE,mBAAmB;KACvC,CAAC,EACJ;QACE,OAAO,EAAE,UAAU;QACnB,MAAM,EAAE,OAAO,CAAC,WAAW;QAC3B,WAAW,EAAE,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE;YACzB,IAAI,KAAK,YAAY,QAAQ,IAAI,KAAK,CAAC,UAAU,GAAG,GAAG,EAAE,CAAC;gBACxD,OAAO,KAAK,CAAC;YACf,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC;KACF,CACF,CAAC;IAEF,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,CAAC;IAExD,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC9B,MAAM,
IAAI,sBAAsB,EAAE,CAAC;IACrC,CAAC;IAED,IAAI,UAAU,GAAwB,MAAM,CAAC,KAAK,CAAC;IACnD,wEAAwE;IACxE,qEAAqE;IACrE,IAAI,eAAe,GAAG,UAAU,EAAE,SAAS,IAAI,MAAM,CAAC,SAAS,CAAC;IAEhE,IAAI,UAAU,EAAE,CAAC;QACf,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC,CAAC;QAC5D,UAAU,GAAG,MAAM,YAAY,CAAC;YAC9B,KAAK,EAAE,MAAM,CAAC,KAAK;YACnB,SAAS,EAAE,UAAU,CAAC,SAAS;YAC/B,UAAU,EAAE,OAAO,CAAC,UAAU,IAAI,CAAC,EAAE;SACtC,CAAC,CAAC;QACH,eAAe,GAAG,WAAW,CAAC;IAChC,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,yBAAyB,CAAC;QAC1C,IAAI,EAAE,UAAU;QAChB,SAAS,EAAE,eAAe;KAC3B,CAAC,CAAC;IAEH,MAAM,gBAAgB,GAAG,MAAM,oBAAoB,CACjD,KAAK,CAAC,UAAU,EAChB,eAAe,CAChB,CAAC;IACF,MAAM,eAAe,GAAG,gBAAgB,IAAI,MAAM,CAAC,eAAe,CAAC;IAEnE,MAAM,UAAU,GAAG,MAAM,+BAA+B,CAAC;QACvD,aAAa;QACb,gBAAgB,EAAE,MAAM,CAAC,UAAU;QACnC,KAAK,EAAE,KAAK,CAAC,UAAU;QACvB,SAAS,EAAE,eAAe;QAC1B,QAAQ,EAAE,GAAG,QAAQ,CAAC,QAAQ,CAAC,EAAE,IAAI,QAAQ,CAAC,OAAO,EAAE;QACvD,iBAAiB,EAAE,OAAO,CAAC,iBAAiB;QAC5C,WAAW,EAAE,OAAO,CAAC,WAAW;KACjC,CAAC,CAAC;IAEH,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAExE,MAAM,QAAQ,GAAmB;QAC/B,SAAS;QACT,UAAU;QACV,QAAQ,EAAE,QAAQ,CAAC,QAAQ,CAAC,EAAE;QAC9B,KAAK,EAAE,QAAQ,CAAC,OAAO;QACvB,GAAG,CAAC,eAAe,IAAI,IAAI,IAAI,EAAE,eAAe,EAAE,CAAC;KACpD,CAAC;IAEF,OAAO;QACL,KAAK;QACL,QAAQ;QACR,gBAAgB,EAAE,MAAM,CAAC,gBAAgB;QACzC,QAAQ,EAAE,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS;QACpD,UAAU;KACX,CAAC;AACJ,CAAC;AAED,oDAAoD;AACpD,mDAAmD;AACnD,sDAAsD;AACtD,yEAAyE;AACzE,4EAA4E;AAC5E,0BAA0B;AAC1B,mDAAmD;AACnD,KAAK,UAAU,+BAA+B,CAAC,IAQ9C;IACC,IAAI,IAAI,CAAC,aAAa,KAAK,KAAK,EAAE,CAAC;QACjC,OAAO;IACT,CAAC;IACD,IAAI,IAAI,CAAC,gBAAgB,IAAI,IAAI,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC9D,OAAO,IAAI,CAAC,gBAAgB,CAAC;IAC/B,CAAC;IACD,IAAI,IAAI,CAAC,aAAa,KAAK,IAAI,EAAE,CAAC;QAChC,OAAO,MAAM,sBAAsB,CAAC;YAClC,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,SAAS,EAAE,IAAI,CAAC,SAAS;YACzB,iBAAiB,EAAE,IAAI,CAAC,iBAAiB;YACzC,WAAW,EAAE,IAAI,CAAC,WAAW;SAC9B,CA
AC,CAAC;IACL,CAAC;IACD,OAAO;AACT,CAAC"}
|
|
@@ -1,5 +1,7 @@
|
|
|
1
|
-
import type
|
|
1
|
+
import { type ResolvedModel, type Voice } from "./speech-provider.js";
|
|
2
2
|
import type { SpeechResult } from "./speech-result.js";
|
|
3
|
+
import type { ResolvedSTTModel } from "./speech-to-text-provider.js";
|
|
4
|
+
import type { TimestampMode } from "./timestamps.js";
|
|
3
5
|
export declare function generateSpeech<V extends Voice = Voice>(options: {
|
|
4
6
|
model: string | ResolvedModel<V>;
|
|
5
7
|
text: string;
|
|
@@ -19,5 +21,20 @@ export declare function generateSpeech<V extends Voice = Voice>(options: {
|
|
|
19
21
|
* if the provider doesn't expose a decodable output mode.
|
|
20
22
|
*/
|
|
21
23
|
volumeDbfs?: number;
|
|
24
|
+
/**
|
|
25
|
+
* Controls whether the returned `SpeechResult` includes word-level
|
|
26
|
+
* timestamps. Default `"auto"` — return natively when the TTS provider
|
|
27
|
+
* supplies alignment, otherwise omit. `"on"` forces word timestamps
|
|
28
|
+
* (falling back to STT round-trip when necessary). `"off"` suppresses
|
|
29
|
+
* them even for providers that would return them free.
|
|
30
|
+
*/
|
|
31
|
+
timestamps?: TimestampMode;
|
|
32
|
+
/**
|
|
33
|
+
* Override the STT provider used for the derived-timestamps path. Construct
|
|
34
|
+
* via a factory (e.g. `createOpenAISTT({ apiKey })("whisper-1")`). Only
|
|
35
|
+
* consulted when timestamps are requested AND the TTS provider can't supply
|
|
36
|
+
* them natively. Defaults to OpenAI Whisper read from `OPENAI_API_KEY`.
|
|
37
|
+
*/
|
|
38
|
+
timestampProvider?: ResolvedSTTModel;
|
|
22
39
|
}): Promise<SpeechResult>;
|
|
23
40
|
//# sourceMappingURL=generate-speech.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"generate-speech.d.ts","sourceRoot":"","sources":["../src/generate-speech.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"generate-speech.d.ts","sourceRoot":"","sources":["../src/generate-speech.ts"],"names":[],"mappings":"AAYA,OAAO,EAEL,KAAK,aAAa,EAClB,KAAK,KAAK,EACX,MAAM,sBAAsB,CAAC;AAC9B,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAEvD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,KAAK,EAAE,aAAa,EAAiB,MAAM,iBAAiB,CAAC;AAEpE,wBAAsB,cAAc,CAAC,CAAC,SAAS,KAAK,GAAG,KAAK,EAAE,OAAO,EAAE;IACrE,KAAK,EAAE,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;IACjC,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,CAAC,CAAC;IACT,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC1C,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC;;;;;;;;OAQG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;;;;;OAMG;IACH,UAAU,CAAC,EAAE,aAAa,CAAC;IAC3B;;;;;OAKG;IACH,iBAAiB,CAAC,EAAE,gBAAgB,CAAC;CACtC,GAAG,OAAO,CAAC,YAAY,CAAC,CAuJxB"}
|
package/dist/generate-speech.js
CHANGED
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
import pRetry from "p-retry";
|
|
2
2
|
import { computeAudioDuration } from "./audio-duration.js";
|
|
3
3
|
import { detectAudioTags, stripAudioTags } from "./audio-tags.js";
|
|
4
|
+
import { deriveTimestampsViaSTT } from "./derive-timestamps.js";
|
|
4
5
|
import { ApiError, NoSpeechGeneratedError, VolumeAdjustmentUnsupportedError, } from "./errors.js";
|
|
6
|
+
import { debug } from "./logger.js";
|
|
5
7
|
import { resolveModel } from "./resolve-provider.js";
|
|
8
|
+
import { modelDeclaresNativeTimestamps, } from "./speech-provider.js";
|
|
6
9
|
import { DefaultGeneratedAudioFile } from "./speech-result.js";
|
|
7
10
|
export async function generateSpeech(options) {
|
|
8
|
-
const { model, voice, abortSignal, headers, volumeDbfs } = options;
|
|
11
|
+
const { model, voice, abortSignal, headers, volumeDbfs, timestamps: timestampMode = "auto", timestampProvider, } = options;
|
|
9
12
|
const maxRetries = options.maxRetries ?? 2;
|
|
10
13
|
const resolved = resolveModel(model, { apiKey: options.apiKey });
|
|
11
14
|
const modelIdentifier = `${resolved.provider.id}/${resolved.modelId}`;
|
|
@@ -23,26 +26,23 @@ export async function generateSpeech(options) {
|
|
|
23
26
|
...stitchOpts.providerOptions,
|
|
24
27
|
};
|
|
25
28
|
}
|
|
26
|
-
|
|
27
|
-
let warnings;
|
|
28
|
-
if (resolved.provider.processAudioTags) {
|
|
29
|
-
({ text: processedText, warnings } = resolved.provider.processAudioTags(options.text, resolved.modelId));
|
|
30
|
-
}
|
|
31
|
-
else {
|
|
32
|
-
const tags = detectAudioTags(options.text);
|
|
33
|
-
if (tags.length > 0) {
|
|
34
|
-
({ text: processedText, warnings } = stripAudioTags(options.text, modelIdentifier));
|
|
35
|
-
}
|
|
36
|
-
else {
|
|
37
|
-
processedText = options.text;
|
|
38
|
-
warnings = [];
|
|
39
|
-
}
|
|
40
|
-
}
|
|
29
|
+
const { text: processedText, warnings } = preprocessText(resolved, options.text, modelIdentifier);
|
|
41
30
|
if (processedText.trim().length === 0) {
|
|
42
31
|
throw new NoSpeechGeneratedError(warnings.length > 0
|
|
43
32
|
? `Text is empty after removing unsupported audio tags for ${modelIdentifier}.`
|
|
44
33
|
: "Text must not be empty.");
|
|
45
34
|
}
|
|
35
|
+
const hasNativeTimestamps = modelDeclaresNativeTimestamps(resolved);
|
|
36
|
+
// For "on" we still ask the provider natively first — if it has native
|
|
37
|
+
// alignment, we skip the STT round-trip.
|
|
38
|
+
const shouldRequestNative = (timestampMode === "on" || timestampMode === "auto") && hasNativeTimestamps;
|
|
39
|
+
logTimestampDecision({
|
|
40
|
+
modelIdentifier,
|
|
41
|
+
mode: timestampMode,
|
|
42
|
+
hasNative: hasNativeTimestamps,
|
|
43
|
+
willRequestNative: shouldRequestNative,
|
|
44
|
+
timestampProvider,
|
|
45
|
+
});
|
|
46
46
|
const startTime = performance.now();
|
|
47
47
|
const result = await pRetry(() => resolved.provider.generate({
|
|
48
48
|
modelId: resolved.modelId,
|
|
@@ -51,6 +51,7 @@ export async function generateSpeech(options) {
|
|
|
51
51
|
providerOptions,
|
|
52
52
|
abortSignal,
|
|
53
53
|
headers,
|
|
54
|
+
includeTimestamps: shouldRequestNative,
|
|
54
55
|
}), {
|
|
55
56
|
retries: maxRetries,
|
|
56
57
|
signal: abortSignal,
|
|
@@ -83,6 +84,23 @@ export async function generateSpeech(options) {
|
|
|
83
84
|
});
|
|
84
85
|
const audioDurationMs = (await computeAudioDuration(audio.uint8Array, outputMediaType)) ??
|
|
85
86
|
result.audioDurationMs;
|
|
87
|
+
let timestamps;
|
|
88
|
+
if (timestampMode !== "off") {
|
|
89
|
+
if (result.timestamps && result.timestamps.length > 0) {
|
|
90
|
+
debug(`${modelIdentifier}: returned ${result.timestamps.length} native word timestamps.`);
|
|
91
|
+
timestamps = result.timestamps;
|
|
92
|
+
}
|
|
93
|
+
else if (timestampMode === "on") {
|
|
94
|
+
timestamps = await deriveTimestampsViaSTT({
|
|
95
|
+
ttsModel: modelIdentifier,
|
|
96
|
+
audio: audio.uint8Array,
|
|
97
|
+
mediaType: outputMediaType,
|
|
98
|
+
timestampProvider,
|
|
99
|
+
abortSignal,
|
|
100
|
+
});
|
|
101
|
+
debug(`${modelIdentifier}: derived ${timestamps.length} word timestamps via STT fallback.`);
|
|
102
|
+
}
|
|
103
|
+
}
|
|
86
104
|
const metadata = {
|
|
87
105
|
latencyMs,
|
|
88
106
|
inputChars: processedText.length,
|
|
@@ -95,6 +113,45 @@ export async function generateSpeech(options) {
|
|
|
95
113
|
metadata,
|
|
96
114
|
providerMetadata: result.providerMetadata,
|
|
97
115
|
warnings: warnings.length > 0 ? warnings : undefined,
|
|
116
|
+
timestamps,
|
|
98
117
|
};
|
|
99
118
|
}
|
|
119
|
+
/**
 * Prepares the caller's text for synthesis and collects any warnings.
 *
 * - When the provider exposes `processAudioTags`, its result for
 *   `(rawText, resolved.modelId)` is returned directly.
 * - Otherwise, if `detectAudioTags` finds at least one tag, the result of
 *   `stripAudioTags(rawText, modelIdentifier)` is returned.
 * - Otherwise the text is passed through untouched with an empty warnings list.
 *
 * @param resolved - resolved model; `provider` and `modelId` are read.
 * @param rawText - text as supplied by the caller.
 * @param modelIdentifier - identifier forwarded to `stripAudioTags`.
 * @returns an object with `text` and `warnings`.
 */
function preprocessText(resolved, rawText, modelIdentifier) {
    const { provider } = resolved;
    if (provider.processAudioTags) {
        // Invoked as a method so the provider keeps its own `this`.
        return provider.processAudioTags(rawText, resolved.modelId);
    }
    const containsTags = detectAudioTags(rawText).length > 0;
    return containsTags
        ? stripAudioTags(rawText, modelIdentifier)
        : { text: rawText, warnings: [] };
}
|
|
129
|
+
/**
 * Emits one debug line describing how word timestamps will be obtained, so a
 * developer can tell why alignment data is or is not present in the result.
 * Per the original note, output is silent unless `DEBUG` includes
 * `speech-sdk` (or `*`); that gating is not visible in this function.
 *
 * Branch priority (first match wins):
 *   1. mode is "off"
 *   2. native alignment will be requested
 *   3. mode is "auto" (no native alignment, so nothing is derived)
 *   4. otherwise: mode "on" without native alignment, STT fallback
 *
 * @param args - `{ modelIdentifier, mode, willRequestNative, timestampProvider }`;
 *   `timestampProvider` is only consulted for the STT-fallback message.
 */
function logTimestampDecision(args) {
    const { modelIdentifier, mode, willRequestNative } = args;
    let detail;
    if (mode === "off") {
        detail = `timestamps: "off" — skipping alignment.`;
    }
    else if (willRequestNative) {
        detail = `timestamps: "${mode}" — requesting native alignment from the provider.`;
    }
    else if (mode === "auto") {
        detail = `timestamps: "auto" — model has no native alignment; skipping. Pass timestamps: "on" to derive via STT (adds a round-trip of the synthesized audio through Whisper by default).`;
    }
    else {
        // Only this branch needs to name the STT target.
        const sttTarget = describeSTTTarget(args.timestampProvider);
        detail = `timestamps: "on" but no native alignment available — will pipe synthesized audio through ${sttTarget} for word timestamps (adds a round-trip).`;
    }
    debug(`${modelIdentifier}: ${detail}`);
}
|
|
151
|
+
/**
 * Builds the human-readable name of the speech-to-text model used in log
 * messages.
 *
 * @param provider - optional resolved STT model; when truthy its
 *   `provider.id` and `modelId` are read.
 * @returns `"<provider id>/<model id>"` for a supplied model, otherwise the
 *   literal `"openai/whisper-1 (default)"`.
 */
function describeSTTTarget(provider) {
    return provider
        ? `${provider.provider.id}/${provider.modelId}`
        : "openai/whisper-1 (default)";
}
|
|
100
157
|
//# sourceMappingURL=generate-speech.js.map
|