@speech-sdk/core 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -21
- package/README.md +215 -269
- package/dist/__tests__/e2e/_save-audio.d.ts +51 -2
- package/dist/__tests__/e2e/_save-audio.d.ts.map +1 -1
- package/dist/__tests__/e2e/_save-audio.js +139 -11
- package/dist/__tests__/e2e/_save-audio.js.map +1 -1
- package/dist/audio-utils.d.ts +2 -0
- package/dist/audio-utils.d.ts.map +1 -1
- package/dist/audio-utils.js +9 -0
- package/dist/audio-utils.js.map +1 -1
- package/dist/captions.d.ts +137 -0
- package/dist/captions.d.ts.map +1 -0
- package/dist/captions.js +283 -0
- package/dist/captions.js.map +1 -0
- package/dist/conversation/stitch.d.ts +5 -0
- package/dist/conversation/stitch.d.ts.map +1 -1
- package/dist/conversation/stitch.js +37 -0
- package/dist/conversation/stitch.js.map +1 -1
- package/dist/conversation/types.d.ts +16 -0
- package/dist/conversation/types.d.ts.map +1 -1
- package/dist/conversation/validate.d.ts.map +1 -1
- package/dist/conversation/validate.js +0 -6
- package/dist/conversation/validate.js.map +1 -1
- package/dist/derive-timestamps.d.ts +14 -0
- package/dist/derive-timestamps.d.ts.map +1 -0
- package/dist/derive-timestamps.js +38 -0
- package/dist/derive-timestamps.js.map +1 -0
- package/dist/errors.d.ts +25 -0
- package/dist/errors.d.ts.map +1 -1
- package/dist/errors.js +28 -0
- package/dist/errors.js.map +1 -1
- package/dist/generate-conversation.d.ts +2 -1
- package/dist/generate-conversation.d.ts.map +1 -1
- package/dist/generate-conversation.js +72 -0
- package/dist/generate-conversation.js.map +1 -1
- package/dist/generate-speech.d.ts +18 -1
- package/dist/generate-speech.d.ts.map +1 -1
- package/dist/generate-speech.js +73 -16
- package/dist/generate-speech.js.map +1 -1
- package/dist/index.d.ts +6 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -1
- package/dist/index.js.map +1 -1
- package/dist/logger.d.ts +2 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/logger.js +40 -0
- package/dist/logger.js.map +1 -0
- package/dist/provider-utils.d.ts +8 -0
- package/dist/provider-utils.d.ts.map +1 -1
- package/dist/provider-utils.js +16 -2
- package/dist/provider-utils.js.map +1 -1
- package/dist/providers/cartesia/alignment.d.ts +24 -0
- package/dist/providers/cartesia/alignment.d.ts.map +1 -0
- package/dist/providers/cartesia/alignment.js +23 -0
- package/dist/providers/cartesia/alignment.js.map +1 -0
- package/dist/providers/cartesia/index.d.ts +12 -2
- package/dist/providers/cartesia/index.d.ts.map +1 -1
- package/dist/providers/cartesia/index.js +137 -2
- package/dist/providers/cartesia/index.js.map +1 -1
- package/dist/providers/elevenlabs/alignment.d.ts +24 -0
- package/dist/providers/elevenlabs/alignment.d.ts.map +1 -0
- package/dist/providers/elevenlabs/alignment.js +48 -0
- package/dist/providers/elevenlabs/alignment.js.map +1 -0
- package/dist/providers/elevenlabs/index.d.ts +19 -4
- package/dist/providers/elevenlabs/index.d.ts.map +1 -1
- package/dist/providers/elevenlabs/index.js +83 -13
- package/dist/providers/elevenlabs/index.js.map +1 -1
- package/dist/providers/fal/index.d.ts +0 -25
- package/dist/providers/fal/index.d.ts.map +1 -1
- package/dist/providers/fal/index.js +3 -58
- package/dist/providers/fal/index.js.map +1 -1
- package/dist/providers/hume/alignment.d.ts +38 -0
- package/dist/providers/hume/alignment.d.ts.map +1 -0
- package/dist/providers/hume/alignment.js +31 -0
- package/dist/providers/hume/alignment.js.map +1 -0
- package/dist/providers/hume/index.d.ts +8 -1
- package/dist/providers/hume/index.d.ts.map +1 -1
- package/dist/providers/hume/index.js +75 -1
- package/dist/providers/hume/index.js.map +1 -1
- package/dist/providers/inworld/alignment.d.ts +25 -0
- package/dist/providers/inworld/alignment.d.ts.map +1 -0
- package/dist/providers/inworld/alignment.js +23 -0
- package/dist/providers/inworld/alignment.js.map +1 -0
- package/dist/providers/inworld/index.d.ts +11 -2
- package/dist/providers/inworld/index.d.ts.map +1 -1
- package/dist/providers/inworld/index.js +11 -2
- package/dist/providers/inworld/index.js.map +1 -1
- package/dist/providers/murf/alignment.d.ts +22 -0
- package/dist/providers/murf/alignment.d.ts.map +1 -0
- package/dist/providers/murf/alignment.js +17 -0
- package/dist/providers/murf/alignment.js.map +1 -0
- package/dist/providers/murf/index.d.ts +8 -1
- package/dist/providers/murf/index.d.ts.map +1 -1
- package/dist/providers/murf/index.js +10 -1
- package/dist/providers/murf/index.js.map +1 -1
- package/dist/providers/openai/index.d.ts +12 -3
- package/dist/providers/openai/index.d.ts.map +1 -1
- package/dist/providers/openai/index.js +7 -3
- package/dist/providers/openai/index.js.map +1 -1
- package/dist/providers/resemble/alignment.d.ts +32 -0
- package/dist/providers/resemble/alignment.d.ts.map +1 -0
- package/dist/providers/resemble/alignment.js +57 -0
- package/dist/providers/resemble/alignment.js.map +1 -0
- package/dist/providers/resemble/index.d.ts +7 -1
- package/dist/providers/resemble/index.d.ts.map +1 -1
- package/dist/providers/resemble/index.js +13 -1
- package/dist/providers/resemble/index.js.map +1 -1
- package/dist/resolve-provider.d.ts.map +1 -1
- package/dist/resolve-provider.js +3 -12
- package/dist/resolve-provider.js.map +1 -1
- package/dist/speech-provider.d.ts +48 -4
- package/dist/speech-provider.d.ts.map +1 -1
- package/dist/speech-provider.js +16 -0
- package/dist/speech-provider.js.map +1 -1
- package/dist/speech-result.d.ts +10 -0
- package/dist/speech-result.d.ts.map +1 -1
- package/dist/speech-result.js.map +1 -1
- package/dist/speech-to-text-provider.d.ts +40 -0
- package/dist/speech-to-text-provider.d.ts.map +1 -0
- package/dist/speech-to-text-provider.js +2 -0
- package/dist/speech-to-text-provider.js.map +1 -0
- package/dist/stt-providers/openai/index.d.ts +42 -0
- package/dist/stt-providers/openai/index.d.ts.map +1 -0
- package/dist/stt-providers/openai/index.js +184 -0
- package/dist/stt-providers/openai/index.js.map +1 -0
- package/dist/timestamps.d.ts +23 -0
- package/dist/timestamps.d.ts.map +1 -0
- package/dist/timestamps.js +2 -0
- package/dist/timestamps.js.map +1 -0
- package/package.json +6 -2
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"speech-result.d.ts","sourceRoot":"","sources":["../src/speech-result.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;
|
|
1
|
+
{"version":3,"file":"speech-result.d.ts","sourceRoot":"","sources":["../src/speech-result.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AACpD,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAErD,MAAM,WAAW,kBAAkB;IACjC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;CACjC;AAED,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,KAAK,EAAE,kBAAkB,CAAC;IACnC,QAAQ,CAAC,QAAQ,EAAE,cAAc,CAAC;IAClC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACpD;;;;;;;OAOG;IACH,QAAQ,CAAC,UAAU,CAAC,EAAE,SAAS,aAAa,EAAE,CAAC;IAC/C,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAC;CAC9B;AAED,qBAAa,yBAA0B,YAAW,kBAAkB;IAClE,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAE3B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAsB;IAC5C,OAAO,CAAC,WAAW,CAAC,CAAa;IACjC,OAAO,CAAC,OAAO,CAAC,CAAS;gBAEb,EACV,IAAI,EACJ,SAAS,GACV,EAAE;QAAE,IAAI,EAAE,MAAM,GAAG,UAAU,CAAC;QAAC,SAAS,EAAE,MAAM,CAAA;KAAE;IAKnD,IAAI,UAAU,IAAI,UAAU,CAe3B;IAED,IAAI,MAAM,IAAI,MAAM,CAcnB;CACF"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"speech-result.js","sourceRoot":"","sources":["../src/speech-result.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"speech-result.js","sourceRoot":"","sources":["../src/speech-result.ts"],"names":[],"mappings":"AAyBA,MAAM,OAAO,yBAAyB;IAC3B,SAAS,CAAS;IAEV,KAAK,CAAsB;IACpC,WAAW,CAAc;IACzB,OAAO,CAAU;IAEzB,YAAY,EACV,IAAI,EACJ,SAAS,GACwC;QACjD,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC;QAClB,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;IAC7B,CAAC;IAED,IAAI,UAAU;QACZ,IAAI,IAAI,CAAC,WAAW,IAAI,IAAI,EAAE,CAAC;YAC7B,OAAO,IAAI,CAAC,WAAW,CAAC;QAC1B,CAAC;QACD,IAAI,IAAI,CAAC,KAAK,YAAY,UAAU,EAAE,CAAC;YACrC,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC;QAChC,CAAC;aAAM,CAAC;YACN,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACtC,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;YAClD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC7C,KAAK,CAAC,CAAC,CAAC,GAAG,YAAY,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YACxC,CAAC;YACD,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC;QAC3B,CAAC;QACD,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED,IAAI,MAAM;QACR,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,EAAE,CAAC;YACzB,OAAO,IAAI,CAAC,OAAO,CAAC;QACtB,CAAC;QACD,IAAI,OAAO,IAAI,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;YACnC,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC;QAC5B,CAAC;aAAM,CAAC;YACN,IAAI,YAAY,GAAG,EAAE,CAAC;YACtB,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;gBAC9B,YAAY,IAAI,MAAM,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;YAC5C,CAAC;YACD,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC;QACpC,CAAC;QACD,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;CACF"}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import type { WordTimestamp } from "./timestamps.js";
|
|
2
|
+
/**
|
|
3
|
+
* Minimal info about an STT model. Parallels `ModelInfo` on the TTS side.
|
|
4
|
+
*/
|
|
5
|
+
export interface STTModelInfo {
|
|
6
|
+
readonly id: string; // Model identifier, e.g. "whisper-1".
|
|
7
|
+
readonly languages: readonly string[]; // Language codes the model accepts (the OpenAI adapter lists ISO-639-1 codes).
|
|
8
|
+
readonly releaseDate: string; // Release date string; the OpenAI adapter uses "YYYY-MM-DD" (e.g. "2023-03-01").
|
|
9
|
+
}
|
|
10
|
+
/**
|
|
11
|
+
* Transcribes generated audio and returns word-level timestamps. This is the
|
|
12
|
+
* "derived" path for `timestamps: "on"` — used when the TTS provider doesn't
|
|
13
|
+
* return alignment data natively.
|
|
14
|
+
*
|
|
15
|
+
* Providers return `WordTimestamp[]` with start/end in seconds. Normalization
|
|
16
|
+
* (ms → seconds, char/phoneme aggregation, tuple → object) happens inside the
|
|
17
|
+
* provider adapter so the public surface is uniform.
|
|
18
|
+
*/
|
|
19
|
+
export interface SpeechToTextProvider {
|
|
20
|
+
readonly defaultModel: string; // Model id to fall back to when the caller does not name one (see `createOpenAISTT`).
|
|
21
|
+
readonly id: string; // Provider identifier, e.g. "openai".
|
|
22
|
+
readonly models: readonly STTModelInfo[]; // Models this provider advertises.
|
|
23
|
+
transcribe(options: {
|
|
24
|
+
modelId: string; // Which of the provider's models to run.
|
|
25
|
+
audio: Uint8Array; // Audio bytes to transcribe.
|
|
26
|
+
mediaType: string; // MIME type of `audio`; may carry parameters (the OpenAI adapter reads a `rate` parameter for "audio/pcm").
|
|
27
|
+
language?: string; // Optional language hint; the OpenAI adapter forwards it as the `language` form field.
|
|
28
|
+
abortSignal?: AbortSignal; // Optional cancellation signal; the OpenAI adapter passes it to fetch as `signal`.
|
|
29
|
+
headers?: Record<string, string>; // Extra request headers; the OpenAI adapter spreads them after its own defaults.
|
|
30
|
+
}): Promise<{
|
|
31
|
+
timestamps: WordTimestamp[]; // One entry per word, start/end in seconds.
|
|
32
|
+
text?: string; // Full transcript, when the provider returns one.
|
|
33
|
+
providerMetadata?: Record<string, unknown>; // Provider-specific extras; the OpenAI adapter in this package does not set it.
|
|
34
|
+
}>;
|
|
35
|
+
}
|
|
36
|
+
/** A provider instance paired with the model id to run on it; this is the shape `createOpenAISTT(...)(modelId)` returns. */
export interface ResolvedSTTModel {
|
|
37
|
+
readonly modelId: string; // Model id to pass to `provider.transcribe`.
|
|
38
|
+
readonly provider: SpeechToTextProvider; // The adapter that performs the transcription.
|
|
39
|
+
}
|
|
40
|
+
//# sourceMappingURL=speech-to-text-provider.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"speech-to-text-provider.d.ts","sourceRoot":"","sources":["../src/speech-to-text-provider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAErD;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,SAAS,EAAE,SAAS,MAAM,EAAE,CAAC;IACtC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;CAC9B;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,MAAM,EAAE,SAAS,YAAY,EAAE,CAAC;IAEzC,UAAU,CAAC,OAAO,EAAE;QAClB,OAAO,EAAE,MAAM,CAAC;QAChB,KAAK,EAAE,UAAU,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,WAAW,CAAC,EAAE,WAAW,CAAC;QAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,GAAG,OAAO,CAAC;QACV,UAAU,EAAE,aAAa,EAAE,CAAC;QAC5B,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KAC5C,CAAC,CAAC;CACJ;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,QAAQ,EAAE,oBAAoB,CAAC;CACzC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"speech-to-text-provider.js","sourceRoot":"","sources":["../src/speech-to-text-provider.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import type { ResolvedSTTModel, SpeechToTextProvider } from "../../speech-to-text-provider.js";
|
|
2
|
+
import type { WordTimestamp } from "../../timestamps.js";
|
|
3
|
+
/** Construction options for the OpenAI speech-to-text adapter. All fields are optional. */
export interface OpenAISpeechToTextProviderConfig {
|
|
4
|
+
apiKey?: string; // Passed to `resolveApiKey(apiKey, "OPENAI_API_KEY", "OpenAI")`; presumably falls back to that env var when omitted — confirm in provider-utils.
|
|
5
|
+
baseURL?: string; // API root; the implementation defaults to "https://api.openai.com/v1".
|
|
6
|
+
fetch?: typeof globalThis.fetch; // Custom fetch implementation; the implementation defaults to `globalThis.fetch`.
|
|
7
|
+
}
|
|
8
|
+
/**
|
|
9
|
+
* OpenAI Whisper (`whisper-1`) adapter for the SDK's derived-timestamps
|
|
10
|
+
* path. Uses `/v1/audio/transcriptions` with `timestamp_granularities: ["word"]`
|
|
11
|
+
* and `response_format: "verbose_json"`.
|
|
12
|
+
*
|
|
13
|
+
* Note: only `whisper-1` is listed — the gpt-4o-transcribe family (including
|
|
14
|
+
* the `-diarize` variant) does not support `timestamp_granularities`.
|
|
15
|
+
*/
|
|
16
|
+
export declare class OpenAISpeechToTextProvider implements SpeechToTextProvider {
|
|
17
|
+
readonly id = "openai";
|
|
18
|
+
readonly defaultModel = "whisper-1"; // Used by `createOpenAISTT` when no model id is given.
|
|
19
|
+
readonly models: readonly [{
|
|
20
|
+
readonly id: "whisper-1";
|
|
21
|
+
readonly releaseDate: "2023-03-01";
|
|
22
|
+
readonly languages: readonly ["af", "ar", "az", "be", "bg", "bn", "bs", "ca", "cs", "cy", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "gl", "he", "hi", "hr", "hu", "hy", "id", "is", "it", "ja", "kk", "kn", "ko", "lt", "lv", "mi", "mk", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sr", "sv", "sw", "ta", "th", "tl", "tr", "uk", "ur", "vi", "zh"];
|
|
23
|
+
}];
|
|
24
|
+
private readonly apiKey;
|
|
25
|
+
private readonly baseURL;
|
|
26
|
+
private readonly fetchFn;
|
|
27
|
+
constructor(config?: OpenAISpeechToTextProviderConfig);
|
|
28
|
+
transcribe(options: {
|
|
29
|
+
modelId: string; // Sent as the `model` form field.
|
|
30
|
+
audio: Uint8Array; // "audio/pcm" input is wrapped in a WAV container before upload; other types are sent as-is.
|
|
31
|
+
mediaType: string; // Also determines the upload filename extension.
|
|
32
|
+
language?: string; // Sent as the `language` form field when non-empty.
|
|
33
|
+
abortSignal?: AbortSignal; // Passed to fetch as `signal`.
|
|
34
|
+
headers?: Record<string, string>; // Spread after the adapter's own headers, so entries here win on conflict.
|
|
35
|
+
}): Promise<{
|
|
36
|
+
timestamps: WordTimestamp[]; // Mapped from the response's `words`; empty when that field is absent.
|
|
37
|
+
text?: string; // The response's `text` field.
|
|
38
|
+
providerMetadata?: Record<string, unknown>; // Not populated by this adapter.
|
|
39
|
+
}>;
|
|
40
|
+
}
|
|
41
|
+
/**
 * Builds one OpenAI STT provider from `config` and returns a resolver. Calling
 * the resolver yields `{ provider, modelId }`, with `modelId` falling back to
 * the provider's `defaultModel` when omitted. Every call shares the same
 * provider instance.
 */
export declare function createOpenAISTT(config?: OpenAISpeechToTextProviderConfig): (modelId?: string) => ResolvedSTTModel;
|
|
42
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/stt-providers/openai/index.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EACV,gBAAgB,EAChB,oBAAoB,EACrB,MAAM,kCAAkC,CAAC;AAC1C,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAEzD,MAAM,WAAW,gCAAgC;IAC/C,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,OAAO,UAAU,CAAC,KAAK,CAAC;CACjC;AAiED;;;;;;;GAOG;AACH,qBAAa,0BAA2B,YAAW,oBAAoB;IACrE,QAAQ,CAAC,EAAE,YAAY;IACvB,QAAQ,CAAC,YAAY,eAAe;IAMpC,QAAQ,CAAC,MAAM;;;;OAMJ;IAEX,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAqB;IAC5C,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAA0B;gBAEtC,MAAM,GAAE,gCAAqC;IAMnD,UAAU,CAAC,OAAO,EAAE;QACxB,OAAO,EAAE,MAAM,CAAC;QAChB,KAAK,EAAE,UAAU,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,WAAW,CAAC,EAAE,WAAW,CAAC;QAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,GAAG,OAAO,CAAC;QACV,UAAU,EAAE,aAAa,EAAE,CAAC;QAC5B,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KAC5C,CAAC;CAsDH;AAED,wBAAgB,eAAe,CAAC,MAAM,GAAE,gCAAqC,IAGjD,UAAU,MAAM,KAAG,gBAAgB,CAM9D"}
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
import { parseMediaTypeParam, wrapPcm16Mono } from "../../audio-utils.js";
|
|
2
|
+
import { handleErrorResponse, resolveApiKey, SDK_USER_AGENT, } from "../../provider-utils.js";
|
|
3
|
+
// Whisper is advertised as supporting 50+ languages. These are the
// ISO-639-1 codes accepted by the API's `language` parameter; the list
// mirrors the one used by the TTS provider.
const OPENAI_STT_LANGUAGES = [
    "af", "ar", "az",
    "be", "bg", "bn", "bs",
    "ca", "cs", "cy",
    "da", "de",
    "el", "en", "es", "et",
    "fa", "fi", "fr",
    "gl",
    "he", "hi", "hr", "hu", "hy",
    "id", "is", "it",
    "ja",
    "kk", "kn", "ko",
    "lt", "lv",
    "mi", "mk", "mr", "ms",
    "ne", "nl", "no",
    "pl", "pt",
    "ro", "ru",
    "sk", "sl", "sr", "sv", "sw",
    "ta", "th", "tl", "tr",
    "uk", "ur",
    "vi",
    "zh",
];
|
|
65
|
+
/**
 * OpenAI Whisper adapter for the SDK's derived-timestamps path.
 *
 * Posts the audio to `{baseURL}/audio/transcriptions` as multipart form data
 * with `response_format: "verbose_json"` and
 * `timestamp_granularities[]: "word"`, then maps the response's `words`
 * array to `{ text, start, end }` entries.
 *
 * Only `whisper-1` is listed: per the note on `models` below, the
 * gpt-4o-transcribe family (including `gpt-4o-transcribe-diarize`) does not
 * expose `timestamp_granularities`, so it cannot satisfy this contract.
 */
export class OpenAISpeechToTextProvider {
    id = "openai";
    defaultModel = "whisper-1";
    // Only whisper-1 supports word-level timestamps. The newer
    // gpt-4o-transcribe / gpt-4o-mini-transcribe models accept `json` /
    // `text` response formats only and don't expose `timestamp_granularities`,
    // so they can't satisfy this provider's contract.
    models = [
        {
            id: "whisper-1",
            releaseDate: "2023-03-01",
            languages: OPENAI_STT_LANGUAGES,
        },
    ];
    apiKey;
    baseURL;
    fetchFn;

    /**
     * @param {object} [config]
     * @param {string} [config.apiKey] - Handed to `resolveApiKey` at request time.
     * @param {string} [config.baseURL] - API root; defaults to
     *   "https://api.openai.com/v1". Trailing slashes are removed.
     * @param {typeof globalThis.fetch} [config.fetch] - Custom fetch; defaults
     *   to `globalThis.fetch` bound to `globalThis`.
     */
    constructor(config = {}) {
        this.apiKey = config.apiKey;
        // Strip trailing slashes so a base URL such as "https://host/v1/" does
        // not yield "https://host/v1//audio/transcriptions" below.
        this.baseURL = String(config.baseURL ?? "https://api.openai.com/v1").replace(/\/+$/, "");
        this.fetchFn = config.fetch ?? globalThis.fetch.bind(globalThis);
    }

    /**
     * Transcribes `options.audio` and returns word-level timestamps.
     *
     * @param {object} options
     * @param {string} options.modelId - Sent as the `model` form field.
     * @param {Uint8Array} options.audio - Audio bytes; passed through
     *   `normalizeAudioForOpenAI` before upload.
     * @param {string} options.mediaType - MIME type of `audio`.
     * @param {string} [options.language] - Sent as `language` when truthy.
     * @param {AbortSignal} [options.abortSignal] - Passed to fetch as `signal`.
     * @param {Record<string, string>} [options.headers] - Extra headers.
     * @returns {Promise<{timestamps: Array<{text: string, start: number, end: number}>, text: (string|undefined)}>}
     *   `timestamps` is empty when the response carries no `words` field.
     */
    async transcribe(options) {
        const { audio, mediaType } = await normalizeAudioForOpenAI(options.audio, options.mediaType);
        const form = new FormData();
        // The upload's filename extension is derived from the normalized media
        // type; the Blob carries the same media type.
        const filename = `audio.${mediaTypeToExtension(mediaType)}`;
        form.append("file", new Blob([audio], { type: mediaType }), filename);
        form.append("model", options.modelId);
        form.append("response_format", "verbose_json");
        form.append("timestamp_granularities[]", "word");
        if (options.language) {
            form.append("language", options.language);
        }
        const response = await this.fetchFn(`${this.baseURL}/audio/transcriptions`, {
            method: "POST",
            headers: {
                Authorization: `Bearer ${resolveApiKey(this.apiKey, "OPENAI_API_KEY", "OpenAI")}`,
                "X-User-Agent": SDK_USER_AGENT,
                // Spread last: caller-supplied headers override the two above.
                ...options.headers,
            },
            body: form,
            signal: options.abortSignal,
        });
        // NOTE(review): presumably throws for non-2xx responses — confirm in
        // provider-utils.
        await handleErrorResponse(response, `openai/${options.modelId}`);
        const data = (await response.json());
        const timestamps = (data.words ?? []).map((w) => ({
            text: w.word,
            start: w.start,
            end: w.end,
        }));
        return {
            timestamps,
            text: data.text,
        };
    }
}
|
|
131
|
+
/**
 * Creates a single OpenAI speech-to-text provider and returns a resolver for it.
 *
 * @param {object} [config] - Forwarded unchanged to `OpenAISpeechToTextProvider`.
 * @returns {(modelId?: string) => {provider: OpenAISpeechToTextProvider, modelId: string}}
 *   Resolver that pairs the shared provider with `modelId`, or with the
 *   provider's `defaultModel` when `modelId` is null/undefined.
 */
export function createOpenAISTT(config = {}) {
    const provider = new OpenAISpeechToTextProvider(config);

    function openaiSTT(modelId) {
        const resolvedModelId = modelId ?? provider.defaultModel;
        return { provider, modelId: resolvedModelId };
    }

    return openaiSTT;
}
|
|
140
|
+
// The OpenAI transcription endpoint takes mp3/mp4/mpeg/mpga/m4a/wav/webm/
// flac/ogg/opus but refuses raw PCM. Raw little-endian PCM handed over by a
// TTS provider (stitch mode) is therefore wrapped in a WAV header first.
// `audio/l16` is deliberately left alone: RFC 2586 defines it as big-endian,
// while `wrapPcm16Mono` writes little-endian, so wrapping it as-is would
// silently corrupt the audio. No provider emits L16 today; if one starts to,
// add an explicit byte-swap branch here.
/**
 * @param {Uint8Array} audio - Audio bytes as produced by the TTS side.
 * @param {string} mediaType - MIME type of `audio`, optionally with parameters.
 * @returns {Promise<{audio: *, mediaType: string}>} For "audio/pcm", the
 *   result of `wrapPcm16Mono` labelled "audio/wav" (sample rate taken from the
 *   `rate` parameter, else 24000); otherwise the inputs unchanged.
 */
async function normalizeAudioForOpenAI(audio, mediaType) {
    const isRawPcm = mediaTypeBase(mediaType) === "audio/pcm";
    if (!isRawPcm) {
        return { audio, mediaType };
    }
    const declaredRate = parseMediaTypeParam(mediaType, "rate");
    const wavBytes = await wrapPcm16Mono(audio, declaredRate ?? 24_000);
    return { audio: wavBytes, mediaType: "audio/wav" };
}
|
|
157
|
+
/**
 * Reduces a media type to its bare `type/subtype` form: everything from the
 * first ";" onward is dropped, surrounding whitespace is trimmed and the
 * result is lower-cased.
 *
 * @param {string} mediaType - e.g. "Audio/PCM; rate=24000".
 * @returns {string} e.g. "audio/pcm".
 */
function mediaTypeBase(mediaType) {
    const [essence] = mediaType.split(";");
    return essence?.trim().toLowerCase() ?? "";
}
|
|
160
|
+
// Upload filename extension for each recognized media type. Keys are bare,
// lower-case `type/subtype` values as produced by `mediaTypeBase`.
const EXTENSION_BY_MEDIA_TYPE = new Map([
    ["audio/mpeg", "mp3"],
    ["audio/mp3", "mp3"],
    ["audio/wav", "wav"],
    ["audio/x-wav", "wav"],
    // Common WAV aliases that previously fell through to the mp3 default.
    ["audio/wave", "wav"],
    ["audio/vnd.wave", "wav"],
    ["audio/ogg", "ogg"],
    ["audio/opus", "opus"],
    ["audio/flac", "flac"],
    // Common FLAC alias that previously fell through to the mp3 default.
    ["audio/x-flac", "flac"],
    ["audio/webm", "webm"],
    ["audio/mp4", "m4a"],
    ["audio/m4a", "m4a"],
    ["audio/x-m4a", "m4a"],
]);

/**
 * Picks the filename extension used for the multipart upload.
 *
 * @param {string} mediaType - MIME type, optionally with parameters; matched
 *   case-insensitively on its bare `type/subtype` part.
 * @returns {string} The mapped extension, or "mp3" for any media type that
 *   is not in the table.
 */
function mediaTypeToExtension(mediaType) {
    return EXTENSION_BY_MEDIA_TYPE.get(mediaTypeBase(mediaType)) ?? "mp3";
}
|
|
184
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/stt-providers/openai/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AAC1E,OAAO,EACL,mBAAmB,EACnB,aAAa,EACb,cAAc,GACf,MAAM,yBAAyB,CAAC;AAajC,6EAA6E;AAC7E,2EAA2E;AAC3E,MAAM,oBAAoB,GAAG;IAC3B,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;CACI,CAAC;AAEX;;;;;;;GAOG;AACH,MAAM,OAAO,0BAA0B;IAC5B,EAAE,GAAG,QAAQ,CAAC;IACd,YAAY,GAAG,WAAW,CAAC;IAEpC,2DAA2D;IAC3D,oEAAoE;IACpE,2EAA2E;IAC3E,kDAAkD;IACzC,MAAM,GAAG;QAChB;YACE,EAAE,EAAE,WAAW;YACf,WAAW,EAAE,YAAY;YACzB,SAAS,EAAE,oBAAoB;SAChC;KACO,CAAC;IAEM,MAAM,CAAqB;IAC3B,OAAO,CAAS;IAChB,OAAO,CAA0B;IAElD,YAAY,SAA2C,EAAE;QACvD,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC;QAC5B,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC,OAAO,IAAI,2BAA2B,CAAC;QAC7D,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC,KAAK,IAAI,UAAU,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IACnE,CAAC;IAED,KAAK,CAAC,UAAU,CAAC,OAOhB;QAKC,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,GAAG,MAAM,uBAAuB,CACxD,OAAO,CAAC,KAAK,EACb,OAAO,CAAC,SAAS,CAClB,CAAC;QAEF,MAAM,IAAI,GAAG,IAAI,QAAQ,EAAE,CAAC;QAC5B,MAAM,QAAQ,GAAG,SAAS,oBAAoB,CAAC,SAAS,CAAC,EAAE,CAAC;QAC5D,oEAAoE;QACpE,oEAAoE;QACpE,IAAI,CAAC,MAAM,CACT,MAAM,EACN,IAAI,IAAI,CAAC,CAAC,KAAiB,CAAC,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC,EAClD,QAAQ,CACT,CAAC;QACF,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;QACtC,IAAI,CAAC,MAAM,CAAC,iBAAiB,EAAE,cAAc,CAAC,CAAC;QAC/C,IAAI,CAAC,MAAM,CAAC,2BAA2B,EAAE,MAAM,CAAC,CAAC;QACjD,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;YACrB,IAAI
,CAAC,MAAM,CAAC,UAAU,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;QAC5C,CAAC;QAED,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CACjC,GAAG,IAAI,CAAC,OAAO,uBAAuB,EACtC;YACE,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,aAAa,EAAE,UAAU,aAAa,CAAC,IAAI,CAAC,MAAM,EAAE,gBAAgB,EAAE,QAAQ,CAAC,EAAE;gBACjF,cAAc,EAAE,cAAc;gBAC9B,GAAG,OAAO,CAAC,OAAO;aACnB;YACD,IAAI,EAAE,IAAI;YACV,MAAM,EAAE,OAAO,CAAC,WAAW;SAC5B,CACF,CAAC;QAEF,MAAM,mBAAmB,CAAC,QAAQ,EAAE,UAAU,OAAO,CAAC,OAAO,EAAE,CAAC,CAAC;QAEjE,MAAM,IAAI,GAAG,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAGlC,CAAC;QAEF,MAAM,UAAU,GAAoB,CAAC,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACjE,IAAI,EAAE,CAAC,CAAC,IAAI;YACZ,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,GAAG,EAAE,CAAC,CAAC,GAAG;SACX,CAAC,CAAC,CAAC;QAEJ,OAAO;YACL,UAAU;YACV,IAAI,EAAE,IAAI,CAAC,IAAI;SAChB,CAAC;IACJ,CAAC;CACF;AAED,MAAM,UAAU,eAAe,CAAC,SAA2C,EAAE;IAC3E,MAAM,QAAQ,GAAG,IAAI,0BAA0B,CAAC,MAAM,CAAC,CAAC;IAExD,OAAO,SAAS,SAAS,CAAC,OAAgB;QACxC,OAAO;YACL,QAAQ;YACR,OAAO,EAAE,OAAO,IAAI,QAAQ,CAAC,YAAY;SAC1C,CAAC;IACJ,CAAC,CAAC;AACJ,CAAC;AAED,4EAA4E;AAC5E,0EAA0E;AAC1E,uEAAuE;AACvE,0EAA0E;AAC1E,yEAAyE;AACzE,sEAAsE;AACtE,8CAA8C;AAC9C,KAAK,UAAU,uBAAuB,CACpC,KAAiB,EACjB,SAAiB;IAEjB,IAAI,aAAa,CAAC,SAAS,CAAC,KAAK,WAAW,EAAE,CAAC;QAC7C,MAAM,UAAU,GAAG,mBAAmB,CAAC,SAAS,EAAE,MAAM,CAAC,IAAI,MAAM,CAAC;QACpE,OAAO;YACL,KAAK,EAAE,MAAM,aAAa,CAAC,KAAK,EAAE,UAAU,CAAC;YAC7C,SAAS,EAAE,WAAW;SACvB,CAAC;IACJ,CAAC;IACD,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC;AAC9B,CAAC;AAED,SAAS,aAAa,CAAC,SAAiB;IACtC,OAAO,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;AAC7D,CAAC;AAED,SAAS,oBAAoB,CAAC,SAAiB;IAC7C,QAAQ,aAAa,CAAC,SAAS,CAAC,EAAE,CAAC;QACjC,KAAK,YAAY,CAAC;QAClB,KAAK,WAAW;YACd,OAAO,KAAK,CAAC;QACf,KAAK,WAAW,CAAC;QACjB,KAAK,aAAa;YAChB,OAAO,KAAK,CAAC;QACf,KAAK,WAAW;YACd,OAAO,KAAK,CAAC;QACf,KAAK,YAAY;YACf,OAAO,MAAM,CAAC;QAChB,KAAK,YAAY;YACf,OAAO,MAAM,CAAC;QAChB,KAAK,YAAY;YACf,OAAO,MAAM,CAAC;QAChB,KAAK,WAAW,CAAC;QACjB,KAAK,WAAW,CAAC;QACjB,KAAK,aAAa;YAChB,OAAO,KAAK,CAAC;QACf;YACE,OAAO,KAAK,CAAC;IACjB,CAAC;AA
CH,CAAC"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Word-granularity alignment data. Timestamps are always in seconds from
|
|
3
|
+
* the start of the generated audio. Providers that natively return character
|
|
4
|
+
* or phoneme granularity are aggregated to words internally.
|
|
5
|
+
*/
|
|
6
|
+
export interface WordTimestamp {
|
|
7
|
+
readonly end: number; // When the word ends, in seconds from the start of the audio.
|
|
8
|
+
readonly start: number; // When the word begins, in seconds from the start of the audio.
|
|
9
|
+
readonly text: string; // The word itself (the OpenAI STT adapter fills this from the response's `word` field).
|
|
10
|
+
}
|
|
11
|
+
/**
|
|
12
|
+
* Controls whether `generateSpeech()` returns word timestamps.
|
|
13
|
+
*
|
|
14
|
+
* - `"auto"` (default): return timestamps only if the TTS provider supplies
|
|
15
|
+
* them natively. Free, no extra API calls.
|
|
16
|
+
* - `"on"`: always return timestamps. Uses native data when available;
|
|
17
|
+
* otherwise falls back to a speech-to-text round-trip of the synthesized
|
|
18
|
+
* audio via a `SpeechToTextProvider` (cost + latency implications).
|
|
19
|
+
* - `"off"`: never return timestamps, even when the provider would give them
|
|
20
|
+
* away for free.
|
|
21
|
+
*/
|
|
22
|
+
export type TimestampMode = "on" | "auto" | "off";
|
|
23
|
+
//# sourceMappingURL=timestamps.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"timestamps.d.ts","sourceRoot":"","sources":["../src/timestamps.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;CACvB;AAED;;;;;;;;;;GAUG;AACH,MAAM,MAAM,aAAa,GAAG,IAAI,GAAG,MAAM,GAAG,KAAK,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"timestamps.js","sourceRoot":"","sources":["../src/timestamps.ts"],"names":[],"mappings":""}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@speech-sdk/core",
|
|
3
|
-
"version": "0.6.1",
|
|
3
|
+
"version": "0.7.0",
|
|
4
4
|
"description": "Universal, cross-platform text-to-speech SDK with multi-provider support.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -69,6 +69,10 @@
|
|
|
69
69
|
"./xai": {
|
|
70
70
|
"types": "./dist/providers/xai/index.d.ts",
|
|
71
71
|
"default": "./dist/providers/xai/index.js"
|
|
72
|
+
},
|
|
73
|
+
"./stt/openai": {
|
|
74
|
+
"types": "./dist/stt-providers/openai/index.d.ts",
|
|
75
|
+
"default": "./dist/stt-providers/openai/index.js"
|
|
72
76
|
}
|
|
73
77
|
},
|
|
74
78
|
"files": [
|
|
@@ -84,7 +88,7 @@
|
|
|
84
88
|
"inworld",
|
|
85
89
|
"ai"
|
|
86
90
|
],
|
|
87
|
-
"license": "
|
|
91
|
+
"license": "Apache-2.0",
|
|
88
92
|
"repository": {
|
|
89
93
|
"type": "git",
|
|
90
94
|
"url": "https://github.com/Jellypod-Inc/speech-sdk"
|