@speech-sdk/core 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. package/LICENSE +202 -21
  2. package/README.md +215 -269
  3. package/dist/__tests__/e2e/_save-audio.d.ts +51 -2
  4. package/dist/__tests__/e2e/_save-audio.d.ts.map +1 -1
  5. package/dist/__tests__/e2e/_save-audio.js +139 -11
  6. package/dist/__tests__/e2e/_save-audio.js.map +1 -1
  7. package/dist/audio-utils.d.ts +2 -0
  8. package/dist/audio-utils.d.ts.map +1 -1
  9. package/dist/audio-utils.js +9 -0
  10. package/dist/audio-utils.js.map +1 -1
  11. package/dist/captions.d.ts +137 -0
  12. package/dist/captions.d.ts.map +1 -0
  13. package/dist/captions.js +283 -0
  14. package/dist/captions.js.map +1 -0
  15. package/dist/conversation/stitch.d.ts +5 -0
  16. package/dist/conversation/stitch.d.ts.map +1 -1
  17. package/dist/conversation/stitch.js +37 -0
  18. package/dist/conversation/stitch.js.map +1 -1
  19. package/dist/conversation/types.d.ts +16 -0
  20. package/dist/conversation/types.d.ts.map +1 -1
  21. package/dist/conversation/validate.d.ts.map +1 -1
  22. package/dist/conversation/validate.js +0 -6
  23. package/dist/conversation/validate.js.map +1 -1
  24. package/dist/derive-timestamps.d.ts +14 -0
  25. package/dist/derive-timestamps.d.ts.map +1 -0
  26. package/dist/derive-timestamps.js +38 -0
  27. package/dist/derive-timestamps.js.map +1 -0
  28. package/dist/errors.d.ts +25 -0
  29. package/dist/errors.d.ts.map +1 -1
  30. package/dist/errors.js +28 -0
  31. package/dist/errors.js.map +1 -1
  32. package/dist/generate-conversation.d.ts +2 -1
  33. package/dist/generate-conversation.d.ts.map +1 -1
  34. package/dist/generate-conversation.js +72 -0
  35. package/dist/generate-conversation.js.map +1 -1
  36. package/dist/generate-speech.d.ts +18 -1
  37. package/dist/generate-speech.d.ts.map +1 -1
  38. package/dist/generate-speech.js +73 -16
  39. package/dist/generate-speech.js.map +1 -1
  40. package/dist/index.d.ts +6 -2
  41. package/dist/index.d.ts.map +1 -1
  42. package/dist/index.js +2 -1
  43. package/dist/index.js.map +1 -1
  44. package/dist/logger.d.ts +2 -0
  45. package/dist/logger.d.ts.map +1 -0
  46. package/dist/logger.js +40 -0
  47. package/dist/logger.js.map +1 -0
  48. package/dist/provider-utils.d.ts +8 -0
  49. package/dist/provider-utils.d.ts.map +1 -1
  50. package/dist/provider-utils.js +16 -2
  51. package/dist/provider-utils.js.map +1 -1
  52. package/dist/providers/cartesia/alignment.d.ts +24 -0
  53. package/dist/providers/cartesia/alignment.d.ts.map +1 -0
  54. package/dist/providers/cartesia/alignment.js +23 -0
  55. package/dist/providers/cartesia/alignment.js.map +1 -0
  56. package/dist/providers/cartesia/index.d.ts +12 -2
  57. package/dist/providers/cartesia/index.d.ts.map +1 -1
  58. package/dist/providers/cartesia/index.js +137 -2
  59. package/dist/providers/cartesia/index.js.map +1 -1
  60. package/dist/providers/elevenlabs/alignment.d.ts +24 -0
  61. package/dist/providers/elevenlabs/alignment.d.ts.map +1 -0
  62. package/dist/providers/elevenlabs/alignment.js +48 -0
  63. package/dist/providers/elevenlabs/alignment.js.map +1 -0
  64. package/dist/providers/elevenlabs/index.d.ts +19 -4
  65. package/dist/providers/elevenlabs/index.d.ts.map +1 -1
  66. package/dist/providers/elevenlabs/index.js +83 -13
  67. package/dist/providers/elevenlabs/index.js.map +1 -1
  68. package/dist/providers/fal/index.d.ts +0 -25
  69. package/dist/providers/fal/index.d.ts.map +1 -1
  70. package/dist/providers/fal/index.js +3 -58
  71. package/dist/providers/fal/index.js.map +1 -1
  72. package/dist/providers/hume/alignment.d.ts +38 -0
  73. package/dist/providers/hume/alignment.d.ts.map +1 -0
  74. package/dist/providers/hume/alignment.js +31 -0
  75. package/dist/providers/hume/alignment.js.map +1 -0
  76. package/dist/providers/hume/index.d.ts +8 -1
  77. package/dist/providers/hume/index.d.ts.map +1 -1
  78. package/dist/providers/hume/index.js +75 -1
  79. package/dist/providers/hume/index.js.map +1 -1
  80. package/dist/providers/inworld/alignment.d.ts +25 -0
  81. package/dist/providers/inworld/alignment.d.ts.map +1 -0
  82. package/dist/providers/inworld/alignment.js +23 -0
  83. package/dist/providers/inworld/alignment.js.map +1 -0
  84. package/dist/providers/inworld/index.d.ts +11 -2
  85. package/dist/providers/inworld/index.d.ts.map +1 -1
  86. package/dist/providers/inworld/index.js +11 -2
  87. package/dist/providers/inworld/index.js.map +1 -1
  88. package/dist/providers/murf/alignment.d.ts +22 -0
  89. package/dist/providers/murf/alignment.d.ts.map +1 -0
  90. package/dist/providers/murf/alignment.js +17 -0
  91. package/dist/providers/murf/alignment.js.map +1 -0
  92. package/dist/providers/murf/index.d.ts +8 -1
  93. package/dist/providers/murf/index.d.ts.map +1 -1
  94. package/dist/providers/murf/index.js +10 -1
  95. package/dist/providers/murf/index.js.map +1 -1
  96. package/dist/providers/openai/index.d.ts +12 -3
  97. package/dist/providers/openai/index.d.ts.map +1 -1
  98. package/dist/providers/openai/index.js +7 -3
  99. package/dist/providers/openai/index.js.map +1 -1
  100. package/dist/providers/resemble/alignment.d.ts +32 -0
  101. package/dist/providers/resemble/alignment.d.ts.map +1 -0
  102. package/dist/providers/resemble/alignment.js +57 -0
  103. package/dist/providers/resemble/alignment.js.map +1 -0
  104. package/dist/providers/resemble/index.d.ts +7 -1
  105. package/dist/providers/resemble/index.d.ts.map +1 -1
  106. package/dist/providers/resemble/index.js +13 -1
  107. package/dist/providers/resemble/index.js.map +1 -1
  108. package/dist/resolve-provider.d.ts.map +1 -1
  109. package/dist/resolve-provider.js +3 -12
  110. package/dist/resolve-provider.js.map +1 -1
  111. package/dist/speech-provider.d.ts +48 -4
  112. package/dist/speech-provider.d.ts.map +1 -1
  113. package/dist/speech-provider.js +16 -0
  114. package/dist/speech-provider.js.map +1 -1
  115. package/dist/speech-result.d.ts +10 -0
  116. package/dist/speech-result.d.ts.map +1 -1
  117. package/dist/speech-result.js.map +1 -1
  118. package/dist/speech-to-text-provider.d.ts +40 -0
  119. package/dist/speech-to-text-provider.d.ts.map +1 -0
  120. package/dist/speech-to-text-provider.js +2 -0
  121. package/dist/speech-to-text-provider.js.map +1 -0
  122. package/dist/stt-providers/openai/index.d.ts +42 -0
  123. package/dist/stt-providers/openai/index.d.ts.map +1 -0
  124. package/dist/stt-providers/openai/index.js +184 -0
  125. package/dist/stt-providers/openai/index.js.map +1 -0
  126. package/dist/timestamps.d.ts +23 -0
  127. package/dist/timestamps.d.ts.map +1 -0
  128. package/dist/timestamps.js +2 -0
  129. package/dist/timestamps.js.map +1 -0
  130. package/package.json +6 -2
@@ -1 +1 @@
1
- {"version":3,"file":"speech-result.d.ts","sourceRoot":"","sources":["../src/speech-result.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAEpD,MAAM,WAAW,kBAAkB;IACjC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;CACjC;AAED,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,KAAK,EAAE,kBAAkB,CAAC;IACnC,QAAQ,CAAC,QAAQ,EAAE,cAAc,CAAC;IAClC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACpD,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAC;CAC9B;AAED,qBAAa,yBAA0B,YAAW,kBAAkB;IAClE,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAE3B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAsB;IAC5C,OAAO,CAAC,WAAW,CAAC,CAAa;IACjC,OAAO,CAAC,OAAO,CAAC,CAAS;gBAEb,EACV,IAAI,EACJ,SAAS,GACV,EAAE;QAAE,IAAI,EAAE,MAAM,GAAG,UAAU,CAAC;QAAC,SAAS,EAAE,MAAM,CAAA;KAAE;IAKnD,IAAI,UAAU,IAAI,UAAU,CAe3B;IAED,IAAI,MAAM,IAAI,MAAM,CAcnB;CACF"}
1
+ {"version":3,"file":"speech-result.d.ts","sourceRoot":"","sources":["../src/speech-result.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AACpD,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAErD,MAAM,WAAW,kBAAkB;IACjC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;CACjC;AAED,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,KAAK,EAAE,kBAAkB,CAAC;IACnC,QAAQ,CAAC,QAAQ,EAAE,cAAc,CAAC;IAClC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACpD;;;;;;;OAOG;IACH,QAAQ,CAAC,UAAU,CAAC,EAAE,SAAS,aAAa,EAAE,CAAC;IAC/C,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAC;CAC9B;AAED,qBAAa,yBAA0B,YAAW,kBAAkB;IAClE,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAE3B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAsB;IAC5C,OAAO,CAAC,WAAW,CAAC,CAAa;IACjC,OAAO,CAAC,OAAO,CAAC,CAAS;gBAEb,EACV,IAAI,EACJ,SAAS,GACV,EAAE;QAAE,IAAI,EAAE,MAAM,GAAG,UAAU,CAAC;QAAC,SAAS,EAAE,MAAM,CAAA;KAAE;IAKnD,IAAI,UAAU,IAAI,UAAU,CAe3B;IAED,IAAI,MAAM,IAAI,MAAM,CAcnB;CACF"}
@@ -1 +1 @@
1
- {"version":3,"file":"speech-result.js","sourceRoot":"","sources":["../src/speech-result.ts"],"names":[],"mappings":"AAeA,MAAM,OAAO,yBAAyB;IAC3B,SAAS,CAAS;IAEV,KAAK,CAAsB;IACpC,WAAW,CAAc;IACzB,OAAO,CAAU;IAEzB,YAAY,EACV,IAAI,EACJ,SAAS,GACwC;QACjD,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC;QAClB,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;IAC7B,CAAC;IAED,IAAI,UAAU;QACZ,IAAI,IAAI,CAAC,WAAW,IAAI,IAAI,EAAE,CAAC;YAC7B,OAAO,IAAI,CAAC,WAAW,CAAC;QAC1B,CAAC;QACD,IAAI,IAAI,CAAC,KAAK,YAAY,UAAU,EAAE,CAAC;YACrC,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC;QAChC,CAAC;aAAM,CAAC;YACN,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACtC,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;YAClD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC7C,KAAK,CAAC,CAAC,CAAC,GAAG,YAAY,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YACxC,CAAC;YACD,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC;QAC3B,CAAC;QACD,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED,IAAI,MAAM;QACR,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,EAAE,CAAC;YACzB,OAAO,IAAI,CAAC,OAAO,CAAC;QACtB,CAAC;QACD,IAAI,OAAO,IAAI,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;YACnC,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC;QAC5B,CAAC;aAAM,CAAC;YACN,IAAI,YAAY,GAAG,EAAE,CAAC;YACtB,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;gBAC9B,YAAY,IAAI,MAAM,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;YAC5C,CAAC;YACD,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC;QACpC,CAAC;QACD,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;CACF"}
1
+ {"version":3,"file":"speech-result.js","sourceRoot":"","sources":["../src/speech-result.ts"],"names":[],"mappings":"AAyBA,MAAM,OAAO,yBAAyB;IAC3B,SAAS,CAAS;IAEV,KAAK,CAAsB;IACpC,WAAW,CAAc;IACzB,OAAO,CAAU;IAEzB,YAAY,EACV,IAAI,EACJ,SAAS,GACwC;QACjD,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC;QAClB,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;IAC7B,CAAC;IAED,IAAI,UAAU;QACZ,IAAI,IAAI,CAAC,WAAW,IAAI,IAAI,EAAE,CAAC;YAC7B,OAAO,IAAI,CAAC,WAAW,CAAC;QAC1B,CAAC;QACD,IAAI,IAAI,CAAC,KAAK,YAAY,UAAU,EAAE,CAAC;YACrC,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC;QAChC,CAAC;aAAM,CAAC;YACN,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACtC,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;YAClD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC7C,KAAK,CAAC,CAAC,CAAC,GAAG,YAAY,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YACxC,CAAC;YACD,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC;QAC3B,CAAC;QACD,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED,IAAI,MAAM;QACR,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,EAAE,CAAC;YACzB,OAAO,IAAI,CAAC,OAAO,CAAC;QACtB,CAAC;QACD,IAAI,OAAO,IAAI,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;YACnC,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC;QAC5B,CAAC;aAAM,CAAC;YACN,IAAI,YAAY,GAAG,EAAE,CAAC;YACtB,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;gBAC9B,YAAY,IAAI,MAAM,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;YAC5C,CAAC;YACD,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC;QACpC,CAAC;QACD,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;CACF"}
@@ -0,0 +1,40 @@
1
+ import type { WordTimestamp } from "./timestamps.js";
2
+ /**
3
+ * Minimal info about a speech-to-text (STT) model: its id, supported languages and release date. Parallels `ModelInfo` on the TTS side.
4
+ */
5
+ export interface STTModelInfo {
6
+ readonly id: string; // model identifier, e.g. "whisper-1"
7
+ readonly languages: readonly string[]; // language codes the model accepts, e.g. "en"
8
+ readonly releaseDate: string; // release date as a string, e.g. "2023-03-01"
9
+ }
10
+ /**
11
+ * Transcribes generated audio and returns word-level timestamps. This is the
12
+ * "derived" path for `timestamps: "on"` — used when the TTS provider doesn't
13
+ * return alignment data natively.
14
+ *
15
+ * Providers return `WordTimestamp[]` with start/end in seconds. Normalization
16
+ * (ms → seconds, char/phoneme aggregation, tuple → object) happens inside the
17
+ * provider adapter so the public surface is uniform.
18
+ */
19
+ export interface SpeechToTextProvider {
20
+ readonly defaultModel: string; // model id used when the caller does not choose one
21
+ readonly id: string; // provider identifier, e.g. "openai"
22
+ readonly models: readonly STTModelInfo[]; // models this provider can transcribe with
23
+ transcribe(options: {
24
+ modelId: string; // which of the provider's models to use
25
+ audio: Uint8Array; // audio bytes to transcribe
26
+ mediaType: string; // media type of `audio`; may carry parameters such as `rate`
27
+ language?: string; // optional language hint
28
+ abortSignal?: AbortSignal; // lets the caller cancel the request
29
+ headers?: Record<string, string>; // extra request headers supplied by the caller
30
+ }): Promise<{
31
+ timestamps: WordTimestamp[]; // word timings, start/end in seconds
32
+ text?: string; // full transcript, when the provider returns one
33
+ providerMetadata?: Record<string, unknown>; // provider-specific extras, if any
34
+ }>;
35
+ }
36
+ export interface ResolvedSTTModel { // pairs a provider instance with the model id to use
37
+ readonly modelId: string; // id of the model to transcribe with
38
+ readonly provider: SpeechToTextProvider; // provider that serves `modelId`
39
+ }
40
+ //# sourceMappingURL=speech-to-text-provider.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"speech-to-text-provider.d.ts","sourceRoot":"","sources":["../src/speech-to-text-provider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAErD;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,SAAS,EAAE,SAAS,MAAM,EAAE,CAAC;IACtC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;CAC9B;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,MAAM,EAAE,SAAS,YAAY,EAAE,CAAC;IAEzC,UAAU,CAAC,OAAO,EAAE;QAClB,OAAO,EAAE,MAAM,CAAC;QAChB,KAAK,EAAE,UAAU,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,WAAW,CAAC,EAAE,WAAW,CAAC;QAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,GAAG,OAAO,CAAC;QACV,UAAU,EAAE,aAAa,EAAE,CAAC;QAC5B,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KAC5C,CAAC,CAAC;CACJ;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,QAAQ,EAAE,oBAAoB,CAAC;CACzC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=speech-to-text-provider.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"speech-to-text-provider.js","sourceRoot":"","sources":["../src/speech-to-text-provider.ts"],"names":[],"mappings":""}
@@ -0,0 +1,42 @@
1
+ import type { ResolvedSTTModel, SpeechToTextProvider } from "../../speech-to-text-provider.js";
2
+ import type { WordTimestamp } from "../../timestamps.js";
3
+ export interface OpenAISpeechToTextProviderConfig { // all fields optional; see the constructor for defaults
4
+ apiKey?: string; // presumably falls back to the OPENAI_API_KEY environment variable when omitted — confirm in resolveApiKey
5
+ baseURL?: string; // defaults to "https://api.openai.com/v1"
6
+ fetch?: typeof globalThis.fetch; // custom fetch implementation; defaults to globalThis.fetch
7
+ }
8
+ /**
9
+ * OpenAI Whisper (`whisper-1`) adapter for the SDK's derived-timestamps
10
+ * path. Uses `/v1/audio/transcriptions` with `timestamp_granularities: ["word"]`
11
+ * and `response_format: "verbose_json"`.
12
+ *
13
+ * Note: only `whisper-1` is listed; `gpt-4o-transcribe-diarize` is intentionally omitted — that
14
+ * variant does not support `timestamp_granularities`.
15
+ */
16
+ export declare class OpenAISpeechToTextProvider implements SpeechToTextProvider {
17
+ readonly id = "openai";
18
+ readonly defaultModel = "whisper-1";
19
+ readonly models: readonly [{ // single-entry tuple: whisper-1 is the only model exposed
20
+ readonly id: "whisper-1";
21
+ readonly releaseDate: "2023-03-01";
22
+ readonly languages: readonly ["af", "ar", "az", "be", "bg", "bn", "bs", "ca", "cs", "cy", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "gl", "he", "hi", "hr", "hu", "hy", "id", "is", "it", "ja", "kk", "kn", "ko", "lt", "lv", "mi", "mk", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sr", "sv", "sw", "ta", "th", "tl", "tr", "uk", "ur", "vi", "zh"];
23
+ }];
24
+ private readonly apiKey;
25
+ private readonly baseURL;
26
+ private readonly fetchFn;
27
+ constructor(config?: OpenAISpeechToTextProviderConfig);
28
+ transcribe(options: { // same option and result shape as SpeechToTextProvider.transcribe
29
+ modelId: string;
30
+ audio: Uint8Array;
31
+ mediaType: string;
32
+ language?: string;
33
+ abortSignal?: AbortSignal;
34
+ headers?: Record<string, string>;
35
+ }): Promise<{
36
+ timestamps: WordTimestamp[];
37
+ text?: string;
38
+ providerMetadata?: Record<string, unknown>;
39
+ }>;
40
+ }
41
+ export declare function createOpenAISTT(config?: OpenAISpeechToTextProviderConfig): (modelId?: string) => ResolvedSTTModel; // returns a resolver; an omitted modelId falls back to the provider's default model
42
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/stt-providers/openai/index.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EACV,gBAAgB,EAChB,oBAAoB,EACrB,MAAM,kCAAkC,CAAC;AAC1C,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAEzD,MAAM,WAAW,gCAAgC;IAC/C,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,OAAO,UAAU,CAAC,KAAK,CAAC;CACjC;AAiED;;;;;;;GAOG;AACH,qBAAa,0BAA2B,YAAW,oBAAoB;IACrE,QAAQ,CAAC,EAAE,YAAY;IACvB,QAAQ,CAAC,YAAY,eAAe;IAMpC,QAAQ,CAAC,MAAM;;;;OAMJ;IAEX,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAqB;IAC5C,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAA0B;gBAEtC,MAAM,GAAE,gCAAqC;IAMnD,UAAU,CAAC,OAAO,EAAE;QACxB,OAAO,EAAE,MAAM,CAAC;QAChB,KAAK,EAAE,UAAU,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,WAAW,CAAC,EAAE,WAAW,CAAC;QAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,GAAG,OAAO,CAAC;QACV,UAAU,EAAE,aAAa,EAAE,CAAC;QAC5B,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KAC5C,CAAC;CAsDH;AAED,wBAAgB,eAAe,CAAC,MAAM,GAAE,gCAAqC,IAGjD,UAAU,MAAM,KAAG,gBAAgB,CAM9D"}
@@ -0,0 +1,184 @@
1
+ import { parseMediaTypeParam, wrapPcm16Mono } from "../../audio-utils.js";
2
+ import { handleErrorResponse, resolveApiKey, SDK_USER_AGENT, } from "../../provider-utils.js";
3
+ // OpenAI Whisper is advertised as supporting 50+ languages; we list the 58 ISO-639-1 codes
4
+ // that the API's `language` parameter accepts. Matches the TTS provider's list.
5
+ const OPENAI_STT_LANGUAGES = [
6
+ "af",
7
+ "ar",
8
+ "az",
9
+ "be",
10
+ "bg",
11
+ "bn",
12
+ "bs",
13
+ "ca",
14
+ "cs",
15
+ "cy",
16
+ "da",
17
+ "de",
18
+ "el",
19
+ "en",
20
+ "es",
21
+ "et",
22
+ "fa",
23
+ "fi",
24
+ "fr",
25
+ "gl",
26
+ "he",
27
+ "hi",
28
+ "hr",
29
+ "hu",
30
+ "hy",
31
+ "id",
32
+ "is",
33
+ "it",
34
+ "ja",
35
+ "kk",
36
+ "kn",
37
+ "ko",
38
+ "lt",
39
+ "lv",
40
+ "mi",
41
+ "mk",
42
+ "mr",
43
+ "ms",
44
+ "ne",
45
+ "nl",
46
+ "no",
47
+ "pl",
48
+ "pt",
49
+ "ro",
50
+ "ru",
51
+ "sk",
52
+ "sl",
53
+ "sr",
54
+ "sv",
55
+ "sw",
56
+ "ta",
57
+ "th",
58
+ "tl",
59
+ "tr",
60
+ "uk",
61
+ "ur",
62
+ "vi",
63
+ "zh",
64
+ ];
65
+ /**
66
+ * OpenAI Whisper (`whisper-1`) adapter for the SDK's derived-timestamps
67
+ * path. Uses `/v1/audio/transcriptions` with `timestamp_granularities: ["word"]`
68
+ * and `response_format: "verbose_json"`.
69
+ *
70
+ * Note: the gpt-4o-transcribe family, including `gpt-4o-transcribe-diarize`, is intentionally not listed — those
71
+ * variants do not support `timestamp_granularities` (see the comment on `models`).
72
+ */
73
+ export class OpenAISpeechToTextProvider {
74
+ id = "openai";
75
+ defaultModel = "whisper-1";
76
+ // Only whisper-1 supports word-level timestamps. The newer
77
+ // gpt-4o-transcribe / gpt-4o-mini-transcribe models accept `json` /
78
+ // `text` response formats only and don't expose `timestamp_granularities`,
79
+ // so they can't satisfy this provider's contract.
80
+ models = [
81
+ {
82
+ id: "whisper-1",
83
+ releaseDate: "2023-03-01",
84
+ languages: OPENAI_STT_LANGUAGES,
85
+ },
86
+ ];
87
+ apiKey;
88
+ baseURL;
89
+ fetchFn;
90
+ constructor(config = {}) {
91
+ this.apiKey = config.apiKey; // may be undefined; resolved per request via resolveApiKey
92
+ this.baseURL = config.baseURL ?? "https://api.openai.com/v1";
93
+ this.fetchFn = config.fetch ?? globalThis.fetch.bind(globalThis); // bound so the default fetch keeps its receiver
94
+ }
95
+ async transcribe(options) {
96
+ const { audio, mediaType } = await normalizeAudioForOpenAI(options.audio, options.mediaType); // raw PCM is wrapped as WAV first
97
+ const form = new FormData();
98
+ const filename = `audio.${mediaTypeToExtension(mediaType)}`;
99
+ // Cast via BlobPart (in the TypeScript source; erased in this compiled output): TS narrowing of Uint8Array<ArrayBufferLike> vs
100
+ // Blob's required ArrayBuffer-backed view is stricter than runtime.
101
+ form.append("file", new Blob([audio], { type: mediaType }), filename);
102
+ form.append("model", options.modelId);
103
+ form.append("response_format", "verbose_json");
104
+ form.append("timestamp_granularities[]", "word");
105
+ if (options.language) { // an empty-string language is skipped as well
106
+ form.append("language", options.language);
107
+ }
108
+ const response = await this.fetchFn(`${this.baseURL}/audio/transcriptions`, {
109
+ method: "POST",
110
+ headers: {
111
+ Authorization: `Bearer ${resolveApiKey(this.apiKey, "OPENAI_API_KEY", "OpenAI")}`,
112
+ "X-User-Agent": SDK_USER_AGENT,
113
+ ...options.headers, // spread last, so caller-supplied headers override the two defaults above
114
+ },
115
+ body: form,
116
+ signal: options.abortSignal,
117
+ });
118
+ await handleErrorResponse(response, `openai/${options.modelId}`); // presumably throws on a non-OK response — confirm in provider-utils
119
+ const data = (await response.json());
120
+ const timestamps = (data.words ?? []).map((w) => ({ // a response without `words` yields an empty array
121
+ text: w.word,
122
+ start: w.start,
123
+ end: w.end,
124
+ }));
125
+ return {
126
+ timestamps,
127
+ text: data.text,
128
+ };
129
+ }
130
+ }
131
+ export function createOpenAISTT(config = {}) { // factory: builds one provider and returns a model resolver bound to it
132
+ const provider = new OpenAISpeechToTextProvider(config); // created once and shared by every resolver call
133
+ return function openaiSTT(modelId) {
134
+ return {
135
+ provider,
136
+ modelId: modelId ?? provider.defaultModel, // null/undefined falls back to the provider's default model
137
+ };
138
+ };
139
+ }
140
+ // OpenAI transcription accepts mp3/mp4/mpeg/mpga/m4a/wav/webm/flac/ogg/opus
140
+ // but rejects raw PCM. When a TTS provider hands us raw little-endian PCM
141
+ // (stitch mode), we wrap it with a WAV header so the STT endpoint will
142
+ // parse it. `audio/l16` is intentionally NOT handled: RFC 2586 defines it
143
+ // as big-endian and `wrapPcm16Mono` writes little-endian — silently
144
+ // mis-wrapping would corrupt audio. No current provider emits L16; add an
145
+ // explicit byte-swap branch here if one does.
146
+ async function normalizeAudioForOpenAI(audio, mediaType) {
147
+ if (mediaTypeBase(mediaType) === "audio/pcm") {
148
+ const sampleRate = parseMediaTypeParam(mediaType, "rate") ?? 24_000; // 24 kHz is used when no `rate` parameter is found
149
+ return {
150
+ audio: await wrapPcm16Mono(audio, sampleRate),
151
+ mediaType: "audio/wav",
152
+ };
153
+ }
154
+ return { audio, mediaType }; // every other media type is passed through unchanged
155
+ }
157
+ function mediaTypeBase(mediaType) { // returns the part before the first ";" (parameters dropped), trimmed and lowercased
157
+ return mediaType.split(";")[0]?.trim().toLowerCase() ?? "";
158
+ }
160
+ function mediaTypeToExtension(mediaType) { // maps a media type to the extension used for the upload filename
160
+ switch (mediaTypeBase(mediaType)) { // parameters and letter case are ignored
161
+ case "audio/mpeg":
162
+ case "audio/mp3":
163
+ return "mp3";
164
+ case "audio/wav":
165
+ case "audio/x-wav":
166
+ return "wav";
167
+ case "audio/ogg":
168
+ return "ogg";
169
+ case "audio/opus":
170
+ return "opus";
171
+ case "audio/flac":
172
+ return "flac";
173
+ case "audio/webm":
174
+ return "webm";
175
+ case "audio/mp4":
176
+ case "audio/m4a":
177
+ case "audio/x-m4a":
178
+ return "m4a";
179
+ default: // any unrecognized media type is labelled as mp3
180
+ return "mp3";
181
+ }
182
+ }
184
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/stt-providers/openai/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AAC1E,OAAO,EACL,mBAAmB,EACnB,aAAa,EACb,cAAc,GACf,MAAM,yBAAyB,CAAC;AAajC,6EAA6E;AAC7E,2EAA2E;AAC3E,MAAM,oBAAoB,GAAG;IAC3B,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;CACI,CAAC;AAEX;;;;;;;GAOG;AACH,MAAM,OAAO,0BAA0B;IAC5B,EAAE,GAAG,QAAQ,CAAC;IACd,YAAY,GAAG,WAAW,CAAC;IAEpC,2DAA2D;IAC3D,oEAAoE;IACpE,2EAA2E;IAC3E,kDAAkD;IACzC,MAAM,GAAG;QAChB;YACE,EAAE,EAAE,WAAW;YACf,WAAW,EAAE,YAAY;YACzB,SAAS,EAAE,oBAAoB;SAChC;KACO,CAAC;IAEM,MAAM,CAAqB;IAC3B,OAAO,CAAS;IAChB,OAAO,CAA0B;IAElD,YAAY,SAA2C,EAAE;QACvD,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC;QAC5B,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC,OAAO,IAAI,2BAA2B,CAAC;QAC7D,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC,KAAK,IAAI,UAAU,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IACnE,CAAC;IAED,KAAK,CAAC,UAAU,CAAC,OAOhB;QAKC,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,GAAG,MAAM,uBAAuB,CACxD,OAAO,CAAC,KAAK,EACb,OAAO,CAAC,SAAS,CAClB,CAAC;QAEF,MAAM,IAAI,GAAG,IAAI,QAAQ,EAAE,CAAC;QAC5B,MAAM,QAAQ,GAAG,SAAS,oBAAoB,CAAC,SAAS,CAAC,EAAE,CAAC;QAC5D,oEAAoE;QACpE,oEAAoE;QACpE,IAAI,CAAC,MAAM,CACT,MAAM,EACN,IAAI,IAAI,CAAC,CAAC,KAAiB,CAAC,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC,EAClD,QAAQ,CACT,CAAC;QACF,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;QACtC,IAAI,CAAC,MAAM,CAAC,iBAAiB,EAAE,cAAc,CAAC,CAAC;QAC/C,IAAI,CAAC,MAAM,CAAC,2BAA2B,EAAE,MAAM,CAAC,CAAC;QACjD,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;YACrB,IA
AI,CAAC,MAAM,CAAC,UAAU,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;QAC5C,CAAC;QAED,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CACjC,GAAG,IAAI,CAAC,OAAO,uBAAuB,EACtC;YACE,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,aAAa,EAAE,UAAU,aAAa,CAAC,IAAI,CAAC,MAAM,EAAE,gBAAgB,EAAE,QAAQ,CAAC,EAAE;gBACjF,cAAc,EAAE,cAAc;gBAC9B,GAAG,OAAO,CAAC,OAAO;aACnB;YACD,IAAI,EAAE,IAAI;YACV,MAAM,EAAE,OAAO,CAAC,WAAW;SAC5B,CACF,CAAC;QAEF,MAAM,mBAAmB,CAAC,QAAQ,EAAE,UAAU,OAAO,CAAC,OAAO,EAAE,CAAC,CAAC;QAEjE,MAAM,IAAI,GAAG,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAGlC,CAAC;QAEF,MAAM,UAAU,GAAoB,CAAC,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACjE,IAAI,EAAE,CAAC,CAAC,IAAI;YACZ,KAAK,EAAE,CAAC,CAAC,KAAK;YACd,GAAG,EAAE,CAAC,CAAC,GAAG;SACX,CAAC,CAAC,CAAC;QAEJ,OAAO;YACL,UAAU;YACV,IAAI,EAAE,IAAI,CAAC,IAAI;SAChB,CAAC;IACJ,CAAC;CACF;AAED,MAAM,UAAU,eAAe,CAAC,SAA2C,EAAE;IAC3E,MAAM,QAAQ,GAAG,IAAI,0BAA0B,CAAC,MAAM,CAAC,CAAC;IAExD,OAAO,SAAS,SAAS,CAAC,OAAgB;QACxC,OAAO;YACL,QAAQ;YACR,OAAO,EAAE,OAAO,IAAI,QAAQ,CAAC,YAAY;SAC1C,CAAC;IACJ,CAAC,CAAC;AACJ,CAAC;AAED,4EAA4E;AAC5E,0EAA0E;AAC1E,uEAAuE;AACvE,0EAA0E;AAC1E,yEAAyE;AACzE,sEAAsE;AACtE,8CAA8C;AAC9C,KAAK,UAAU,uBAAuB,CACpC,KAAiB,EACjB,SAAiB;IAEjB,IAAI,aAAa,CAAC,SAAS,CAAC,KAAK,WAAW,EAAE,CAAC;QAC7C,MAAM,UAAU,GAAG,mBAAmB,CAAC,SAAS,EAAE,MAAM,CAAC,IAAI,MAAM,CAAC;QACpE,OAAO;YACL,KAAK,EAAE,MAAM,aAAa,CAAC,KAAK,EAAE,UAAU,CAAC;YAC7C,SAAS,EAAE,WAAW;SACvB,CAAC;IACJ,CAAC;IACD,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC;AAC9B,CAAC;AAED,SAAS,aAAa,CAAC,SAAiB;IACtC,OAAO,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,CAAC;AAC7D,CAAC;AAED,SAAS,oBAAoB,CAAC,SAAiB;IAC7C,QAAQ,aAAa,CAAC,SAAS,CAAC,EAAE,CAAC;QACjC,KAAK,YAAY,CAAC;QAClB,KAAK,WAAW;YACd,OAAO,KAAK,CAAC;QACf,KAAK,WAAW,CAAC;QACjB,KAAK,aAAa;YAChB,OAAO,KAAK,CAAC;QACf,KAAK,WAAW;YACd,OAAO,KAAK,CAAC;QACf,KAAK,YAAY;YACf,OAAO,MAAM,CAAC;QAChB,KAAK,YAAY;YACf,OAAO,MAAM,CAAC;QAChB,KAAK,YAAY;YACf,OAAO,MAAM,CAAC;QAChB,KAAK,WAAW,CAAC;QACjB,KAAK,WAAW,CAAC;QACjB,KAAK,aAAa;YAChB,OAAO,KAAK,CAAC;QACf;YACE,OAAO,KAAK,CAAC;IACjB,CAAC;
AACH,CAAC"}
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Word-granularity alignment data. Timestamps are always in seconds from
3
+ * the start of the generated audio. Data from providers that natively return
4
+ * character or phoneme granularity is aggregated to words internally.
5
+ */
6
+ export interface WordTimestamp {
7
+ readonly end: number; // end time, in seconds from the start of the audio
8
+ readonly start: number; // start time, in seconds from the start of the audio
9
+ readonly text: string; // the word itself
10
+ }
11
+ /**
12
+ * Controls whether `generateSpeech()` returns word timestamps.
13
+ *
14
+ * - `"auto"` (default): return timestamps only if the TTS provider supplies
15
+ * them natively. Free, no extra API calls.
16
+ * - `"on"`: always return timestamps. Uses native data when available;
17
+ * otherwise falls back to a speech-to-text round-trip of the synthesized
18
+ * audio (which adds cost and latency).
19
+ * - `"off"`: never return timestamps, even when the provider would supply them
20
+ * at no extra cost.
21
+ */
22
+ export type TimestampMode = "on" | "auto" | "off";
23
+ //# sourceMappingURL=timestamps.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"timestamps.d.ts","sourceRoot":"","sources":["../src/timestamps.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;CACvB;AAED;;;;;;;;;;GAUG;AACH,MAAM,MAAM,aAAa,GAAG,IAAI,GAAG,MAAM,GAAG,KAAK,CAAC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=timestamps.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"timestamps.js","sourceRoot":"","sources":["../src/timestamps.ts"],"names":[],"mappings":""}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@speech-sdk/core",
3
- "version": "0.6.1",
3
+ "version": "0.7.0",
4
4
  "description": "Universal, cross-platform text-to-speech SDK with multi-provider support.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -69,6 +69,10 @@
69
69
  "./xai": {
70
70
  "types": "./dist/providers/xai/index.d.ts",
71
71
  "default": "./dist/providers/xai/index.js"
72
+ },
73
+ "./stt/openai": {
74
+ "types": "./dist/stt-providers/openai/index.d.ts",
75
+ "default": "./dist/stt-providers/openai/index.js"
72
76
  }
73
77
  },
74
78
  "files": [
@@ -84,7 +88,7 @@
84
88
  "inworld",
85
89
  "ai"
86
90
  ],
87
- "license": "MIT",
91
+ "license": "Apache-2.0",
88
92
  "repository": {
89
93
  "type": "git",
90
94
  "url": "https://github.com/Jellypod-Inc/speech-sdk"