@speech-sdk/core 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. package/README.md +227 -108
  2. package/dist/__tests__/e2e/_save-audio.d.ts +0 -42
  3. package/dist/__tests__/e2e/_save-audio.d.ts.map +1 -1
  4. package/dist/__tests__/e2e/_save-audio.js +0 -59
  5. package/dist/__tests__/e2e/_save-audio.js.map +1 -1
  6. package/dist/audio-decode.d.ts +7 -0
  7. package/dist/audio-decode.d.ts.map +1 -0
  8. package/dist/audio-decode.js +109 -0
  9. package/dist/audio-decode.js.map +1 -0
  10. package/dist/audio-duration.d.ts +0 -5
  11. package/dist/audio-duration.d.ts.map +1 -1
  12. package/dist/audio-duration.js +5 -21
  13. package/dist/audio-duration.js.map +1 -1
  14. package/dist/audio-output.d.ts +39 -0
  15. package/dist/audio-output.d.ts.map +1 -0
  16. package/dist/audio-output.js +111 -0
  17. package/dist/audio-output.js.map +1 -0
  18. package/dist/audio-utils.d.ts +2 -10
  19. package/dist/audio-utils.d.ts.map +1 -1
  20. package/dist/audio-utils.js +57 -15
  21. package/dist/audio-utils.js.map +1 -1
  22. package/dist/captions.d.ts +0 -108
  23. package/dist/captions.d.ts.map +1 -1
  24. package/dist/captions.js +8 -98
  25. package/dist/captions.js.map +1 -1
  26. package/dist/conversation/attribute-timestamps.d.ts +26 -0
  27. package/dist/conversation/attribute-timestamps.d.ts.map +1 -0
  28. package/dist/conversation/attribute-timestamps.js +276 -0
  29. package/dist/conversation/attribute-timestamps.js.map +1 -0
  30. package/dist/conversation/dispatch.d.ts +5 -5
  31. package/dist/conversation/dispatch.d.ts.map +1 -1
  32. package/dist/conversation/dispatch.js +18 -8
  33. package/dist/conversation/dispatch.js.map +1 -1
  34. package/dist/conversation/errors.d.ts +3 -0
  35. package/dist/conversation/errors.d.ts.map +1 -1
  36. package/dist/conversation/errors.js +6 -0
  37. package/dist/conversation/errors.js.map +1 -1
  38. package/dist/conversation/pcm-concat.d.ts +0 -24
  39. package/dist/conversation/pcm-concat.d.ts.map +1 -1
  40. package/dist/conversation/pcm-concat.js +8 -183
  41. package/dist/conversation/pcm-concat.js.map +1 -1
  42. package/dist/conversation/proportional-fill.d.ts +10 -0
  43. package/dist/conversation/proportional-fill.d.ts.map +1 -0
  44. package/dist/conversation/proportional-fill.js +64 -0
  45. package/dist/conversation/proportional-fill.js.map +1 -0
  46. package/dist/conversation/silence-detection.d.ts +14 -0
  47. package/dist/conversation/silence-detection.d.ts.map +1 -0
  48. package/dist/conversation/silence-detection.js +52 -0
  49. package/dist/conversation/silence-detection.js.map +1 -0
  50. package/dist/conversation/stitch.d.ts +9 -6
  51. package/dist/conversation/stitch.d.ts.map +1 -1
  52. package/dist/conversation/stitch.js +72 -51
  53. package/dist/conversation/stitch.js.map +1 -1
  54. package/dist/conversation/types.d.ts +7 -37
  55. package/dist/conversation/types.d.ts.map +1 -1
  56. package/dist/conversation/validate.d.ts +1 -16
  57. package/dist/conversation/validate.d.ts.map +1 -1
  58. package/dist/conversation/validate.js +29 -29
  59. package/dist/conversation/validate.js.map +1 -1
  60. package/dist/default-stt-fallback.d.ts +3 -0
  61. package/dist/default-stt-fallback.d.ts.map +1 -0
  62. package/dist/default-stt-fallback.js +11 -0
  63. package/dist/default-stt-fallback.js.map +1 -0
  64. package/dist/derive-timestamps.d.ts +1 -5
  65. package/dist/derive-timestamps.d.ts.map +1 -1
  66. package/dist/derive-timestamps.js +1 -15
  67. package/dist/derive-timestamps.js.map +1 -1
  68. package/dist/encoders/mp3.d.ts +6 -0
  69. package/dist/encoders/mp3.d.ts.map +1 -0
  70. package/dist/encoders/mp3.js +54 -0
  71. package/dist/encoders/mp3.js.map +1 -0
  72. package/dist/errors.d.ts +20 -13
  73. package/dist/errors.d.ts.map +1 -1
  74. package/dist/errors.js +49 -15
  75. package/dist/errors.js.map +1 -1
  76. package/dist/generate-conversation.d.ts +5 -4
  77. package/dist/generate-conversation.d.ts.map +1 -1
  78. package/dist/generate-conversation.js +250 -93
  79. package/dist/generate-conversation.js.map +1 -1
  80. package/dist/generate-speech.d.ts +7 -28
  81. package/dist/generate-speech.d.ts.map +1 -1
  82. package/dist/generate-speech.js +185 -94
  83. package/dist/generate-speech.js.map +1 -1
  84. package/dist/index.d.ts +7 -11
  85. package/dist/index.d.ts.map +1 -1
  86. package/dist/index.js +6 -4
  87. package/dist/index.js.map +1 -1
  88. package/dist/logger.d.ts.map +1 -1
  89. package/dist/logger.js +2 -13
  90. package/dist/logger.js.map +1 -1
  91. package/dist/metadata.d.ts +0 -22
  92. package/dist/metadata.d.ts.map +1 -1
  93. package/dist/pronunciations/errors.d.ts +5 -0
  94. package/dist/pronunciations/errors.d.ts.map +1 -0
  95. package/dist/pronunciations/errors.js +8 -0
  96. package/dist/pronunciations/errors.js.map +1 -0
  97. package/dist/pronunciations/inverse-align.d.ts +4 -0
  98. package/dist/pronunciations/inverse-align.d.ts.map +1 -0
  99. package/dist/pronunciations/inverse-align.js +54 -0
  100. package/dist/pronunciations/inverse-align.js.map +1 -0
  101. package/dist/pronunciations/merge.d.ts +4 -0
  102. package/dist/pronunciations/merge.d.ts.map +1 -0
  103. package/dist/pronunciations/merge.js +13 -0
  104. package/dist/pronunciations/merge.js.map +1 -0
  105. package/dist/pronunciations/substitute.d.ts +6 -0
  106. package/dist/pronunciations/substitute.d.ts.map +1 -0
  107. package/dist/pronunciations/substitute.js +67 -0
  108. package/dist/pronunciations/substitute.js.map +1 -0
  109. package/dist/pronunciations/types.d.ts +18 -0
  110. package/dist/pronunciations/types.d.ts.map +1 -0
  111. package/dist/pronunciations/types.js +2 -0
  112. package/dist/pronunciations/types.js.map +1 -0
  113. package/dist/pronunciations/validate.d.ts +3 -0
  114. package/dist/pronunciations/validate.d.ts.map +1 -0
  115. package/dist/pronunciations/validate.js +26 -0
  116. package/dist/pronunciations/validate.js.map +1 -0
  117. package/dist/provider-utils.d.ts +4 -9
  118. package/dist/provider-utils.d.ts.map +1 -1
  119. package/dist/provider-utils.js +60 -51
  120. package/dist/provider-utils.js.map +1 -1
  121. package/dist/providers/cartesia/alignment.d.ts +0 -16
  122. package/dist/providers/cartesia/alignment.d.ts.map +1 -1
  123. package/dist/providers/cartesia/alignment.js +1 -6
  124. package/dist/providers/cartesia/alignment.js.map +1 -1
  125. package/dist/providers/cartesia/index.d.ts +29 -19
  126. package/dist/providers/cartesia/index.d.ts.map +1 -1
  127. package/dist/providers/cartesia/index.js +116 -80
  128. package/dist/providers/cartesia/index.js.map +1 -1
  129. package/dist/providers/deepgram/index.d.ts +23 -8
  130. package/dist/providers/deepgram/index.d.ts.map +1 -1
  131. package/dist/providers/deepgram/index.js +51 -18
  132. package/dist/providers/deepgram/index.js.map +1 -1
  133. package/dist/providers/elevenlabs/alignment.d.ts +7 -21
  134. package/dist/providers/elevenlabs/alignment.d.ts.map +1 -1
  135. package/dist/providers/elevenlabs/alignment.js +8 -9
  136. package/dist/providers/elevenlabs/alignment.js.map +1 -1
  137. package/dist/providers/elevenlabs/index.d.ts +14 -38
  138. package/dist/providers/elevenlabs/index.d.ts.map +1 -1
  139. package/dist/providers/elevenlabs/index.js +186 -169
  140. package/dist/providers/elevenlabs/index.js.map +1 -1
  141. package/dist/providers/fal/index.d.ts +11 -20
  142. package/dist/providers/fal/index.d.ts.map +1 -1
  143. package/dist/providers/fal/index.js +49 -37
  144. package/dist/providers/fal/index.js.map +1 -1
  145. package/dist/providers/fish-audio/index.d.ts +14 -8
  146. package/dist/providers/fish-audio/index.d.ts.map +1 -1
  147. package/dist/providers/fish-audio/index.js +47 -19
  148. package/dist/providers/fish-audio/index.js.map +1 -1
  149. package/dist/providers/gateway/index.d.ts +76 -0
  150. package/dist/providers/gateway/index.d.ts.map +1 -0
  151. package/dist/providers/gateway/index.js +251 -0
  152. package/dist/providers/gateway/index.js.map +1 -0
  153. package/dist/providers/google/index.d.ts +12 -20
  154. package/dist/providers/google/index.d.ts.map +1 -1
  155. package/dist/providers/google/index.js +180 -162
  156. package/dist/providers/google/index.js.map +1 -1
  157. package/dist/providers/hume/alignment.d.ts +30 -35
  158. package/dist/providers/hume/alignment.d.ts.map +1 -1
  159. package/dist/providers/hume/alignment.js +14 -8
  160. package/dist/providers/hume/alignment.js.map +1 -1
  161. package/dist/providers/hume/index.d.ts +16 -16
  162. package/dist/providers/hume/index.d.ts.map +1 -1
  163. package/dist/providers/hume/index.js +79 -65
  164. package/dist/providers/hume/index.js.map +1 -1
  165. package/dist/providers/inworld/alignment.d.ts +8 -22
  166. package/dist/providers/inworld/alignment.d.ts.map +1 -1
  167. package/dist/providers/inworld/alignment.js +9 -8
  168. package/dist/providers/inworld/alignment.js.map +1 -1
  169. package/dist/providers/inworld/index.d.ts +17 -20
  170. package/dist/providers/inworld/index.d.ts.map +1 -1
  171. package/dist/providers/inworld/index.js +79 -47
  172. package/dist/providers/inworld/index.js.map +1 -1
  173. package/dist/providers/mistral/index.d.ts +14 -8
  174. package/dist/providers/mistral/index.d.ts.map +1 -1
  175. package/dist/providers/mistral/index.js +63 -48
  176. package/dist/providers/mistral/index.js.map +1 -1
  177. package/dist/providers/murf/alignment.d.ts +10 -19
  178. package/dist/providers/murf/alignment.d.ts.map +1 -1
  179. package/dist/providers/murf/alignment.js +10 -5
  180. package/dist/providers/murf/alignment.js.map +1 -1
  181. package/dist/providers/murf/index.d.ts +15 -16
  182. package/dist/providers/murf/index.d.ts.map +1 -1
  183. package/dist/providers/murf/index.js +105 -58
  184. package/dist/providers/murf/index.js.map +1 -1
  185. package/dist/providers/openai/index.d.ts +43 -29
  186. package/dist/providers/openai/index.d.ts.map +1 -1
  187. package/dist/providers/openai/index.js +294 -106
  188. package/dist/providers/openai/index.js.map +1 -1
  189. package/dist/providers/resemble/alignment.d.ts +8 -29
  190. package/dist/providers/resemble/alignment.d.ts.map +1 -1
  191. package/dist/providers/resemble/alignment.js +9 -12
  192. package/dist/providers/resemble/alignment.js.map +1 -1
  193. package/dist/providers/resemble/index.d.ts +21 -11
  194. package/dist/providers/resemble/index.d.ts.map +1 -1
  195. package/dist/providers/resemble/index.js +89 -49
  196. package/dist/providers/resemble/index.js.map +1 -1
  197. package/dist/providers/smallest-ai/index.d.ts +47 -0
  198. package/dist/providers/smallest-ai/index.d.ts.map +1 -0
  199. package/dist/providers/smallest-ai/index.js +107 -0
  200. package/dist/providers/smallest-ai/index.js.map +1 -0
  201. package/dist/providers/xai/index.d.ts +25 -9
  202. package/dist/providers/xai/index.d.ts.map +1 -1
  203. package/dist/providers/xai/index.js +63 -40
  204. package/dist/providers/xai/index.js.map +1 -1
  205. package/dist/providers.d.ts +31 -0
  206. package/dist/providers.d.ts.map +1 -0
  207. package/dist/providers.js +16 -0
  208. package/dist/providers.js.map +1 -0
  209. package/dist/resolve-provider.d.ts.map +1 -1
  210. package/dist/resolve-provider.js +8 -51
  211. package/dist/resolve-provider.js.map +1 -1
  212. package/dist/retry-options.d.ts +6 -0
  213. package/dist/retry-options.d.ts.map +1 -0
  214. package/dist/retry-options.js +48 -0
  215. package/dist/retry-options.js.map +1 -0
  216. package/dist/speech-provider.d.ts +28 -53
  217. package/dist/speech-provider.d.ts.map +1 -1
  218. package/dist/speech-provider.js +5 -26
  219. package/dist/speech-provider.js.map +1 -1
  220. package/dist/speech-result.d.ts +8 -9
  221. package/dist/speech-result.d.ts.map +1 -1
  222. package/dist/speech-result.js.map +1 -1
  223. package/dist/speech-to-text-provider.d.ts +0 -12
  224. package/dist/speech-to-text-provider.d.ts.map +1 -1
  225. package/dist/stream-speech.d.ts +4 -2
  226. package/dist/stream-speech.d.ts.map +1 -1
  227. package/dist/stream-speech.js +36 -22
  228. package/dist/stream-speech.js.map +1 -1
  229. package/dist/timestamps.d.ts +3 -17
  230. package/dist/timestamps.d.ts.map +1 -1
  231. package/dist/turns.d.ts +9 -0
  232. package/dist/turns.d.ts.map +1 -0
  233. package/dist/turns.js +21 -0
  234. package/dist/turns.js.map +1 -0
  235. package/dist/types.d.ts +31 -0
  236. package/dist/types.d.ts.map +1 -1
  237. package/dist/volume-adjust.d.ts +0 -6
  238. package/dist/volume-adjust.d.ts.map +1 -1
  239. package/dist/volume-adjust.js +4 -16
  240. package/dist/volume-adjust.js.map +1 -1
  241. package/package.json +13 -66
  242. package/dist/stt-providers/openai/index.d.ts +0 -42
  243. package/dist/stt-providers/openai/index.d.ts.map +0 -1
  244. package/dist/stt-providers/openai/index.js +0 -184
  245. package/dist/stt-providers/openai/index.js.map +0 -1
@@ -1,5 +1,5 @@
1
1
  import type { SpeechMetadata } from "./metadata.js";
2
- import type { WordTimestamp } from "./timestamps.js";
2
+ import type { ConversationWordTimestamp, WordTimestamp } from "./timestamps.js";
3
3
  export interface GeneratedAudioFile {
4
4
  readonly base64: string;
5
5
  readonly mediaType: string;
@@ -9,17 +9,16 @@ export interface SpeechResult {
9
9
  readonly audio: GeneratedAudioFile;
10
10
  readonly metadata: SpeechMetadata;
11
11
  readonly providerMetadata?: Record<string, unknown>;
12
- /**
13
- * Word-level alignment data. Populated when `timestamps: "on"` or when
14
- * `timestamps: "auto"` (default) is combined with a TTS provider that
15
- * returns alignment natively. Undefined otherwise.
16
- *
17
- * Timestamps are always word-granularity with start/end in seconds.
18
- * Character- or phoneme-level native data is aggregated internally.
19
- */
20
12
  readonly timestamps?: readonly WordTimestamp[];
21
13
  readonly warnings?: string[];
22
14
  }
15
+ export interface ConversationMetadata extends SpeechMetadata {
16
+ readonly perTurn?: readonly SpeechMetadata[];
17
+ }
18
+ export interface ConversationResult extends Omit<SpeechResult, "metadata" | "timestamps"> {
19
+ readonly metadata: ConversationMetadata;
20
+ readonly timestamps?: readonly ConversationWordTimestamp[];
21
+ }
23
22
  export declare class DefaultGeneratedAudioFile implements GeneratedAudioFile {
24
23
  readonly mediaType: string;
25
24
  private readonly _data;
@@ -1 +1 @@
1
- {"version":3,"file":"speech-result.d.ts","sourceRoot":"","sources":["../src/speech-result.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AACpD,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAErD,MAAM,WAAW,kBAAkB;IACjC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;CACjC;AAED,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,KAAK,EAAE,kBAAkB,CAAC;IACnC,QAAQ,CAAC,QAAQ,EAAE,cAAc,CAAC;IAClC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACpD;;;;;;;OAOG;IACH,QAAQ,CAAC,UAAU,CAAC,EAAE,SAAS,aAAa,EAAE,CAAC;IAC/C,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAC;CAC9B;AAED,qBAAa,yBAA0B,YAAW,kBAAkB;IAClE,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAE3B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAsB;IAC5C,OAAO,CAAC,WAAW,CAAC,CAAa;IACjC,OAAO,CAAC,OAAO,CAAC,CAAS;gBAEb,EACV,IAAI,EACJ,SAAS,GACV,EAAE;QAAE,IAAI,EAAE,MAAM,GAAG,UAAU,CAAC;QAAC,SAAS,EAAE,MAAM,CAAA;KAAE;IAKnD,IAAI,UAAU,IAAI,UAAU,CAe3B;IAED,IAAI,MAAM,IAAI,MAAM,CAcnB;CACF"}
1
+ {"version":3,"file":"speech-result.d.ts","sourceRoot":"","sources":["../src/speech-result.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AACpD,OAAO,KAAK,EAAE,yBAAyB,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAEhF,MAAM,WAAW,kBAAkB;IACjC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;CACjC;AAED,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,KAAK,EAAE,kBAAkB,CAAC;IACnC,QAAQ,CAAC,QAAQ,EAAE,cAAc,CAAC;IAClC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACpD,QAAQ,CAAC,UAAU,CAAC,EAAE,SAAS,aAAa,EAAE,CAAC;IAC/C,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,CAAC;CAC9B;AAED,MAAM,WAAW,oBAAqB,SAAQ,cAAc;IAG1D,QAAQ,CAAC,OAAO,CAAC,EAAE,SAAS,cAAc,EAAE,CAAC;CAC9C;AAED,MAAM,WAAW,kBACf,SAAQ,IAAI,CAAC,YAAY,EAAE,UAAU,GAAG,YAAY,CAAC;IACrD,QAAQ,CAAC,QAAQ,EAAE,oBAAoB,CAAC;IACxC,QAAQ,CAAC,UAAU,CAAC,EAAE,SAAS,yBAAyB,EAAE,CAAC;CAC5D;AAED,qBAAa,yBAA0B,YAAW,kBAAkB;IAClE,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAE3B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAsB;IAC5C,OAAO,CAAC,WAAW,CAAC,CAAa;IACjC,OAAO,CAAC,OAAO,CAAC,CAAS;gBAEb,EACV,IAAI,EACJ,SAAS,GACV,EAAE;QAAE,IAAI,EAAE,MAAM,GAAG,UAAU,CAAC;QAAC,SAAS,EAAE,MAAM,CAAA;KAAE;IAKnD,IAAI,UAAU,IAAI,UAAU,CAe3B;IAED,IAAI,MAAM,IAAI,MAAM,CAcnB;CACF"}
@@ -1 +1 @@
1
- {"version":3,"file":"speech-result.js","sourceRoot":"","sources":["../src/speech-result.ts"],"names":[],"mappings":"AAyBA,MAAM,OAAO,yBAAyB;IAC3B,SAAS,CAAS;IAEV,KAAK,CAAsB;IACpC,WAAW,CAAc;IACzB,OAAO,CAAU;IAEzB,YAAY,EACV,IAAI,EACJ,SAAS,GACwC;QACjD,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC;QAClB,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;IAC7B,CAAC;IAED,IAAI,UAAU;QACZ,IAAI,IAAI,CAAC,WAAW,IAAI,IAAI,EAAE,CAAC;YAC7B,OAAO,IAAI,CAAC,WAAW,CAAC;QAC1B,CAAC;QACD,IAAI,IAAI,CAAC,KAAK,YAAY,UAAU,EAAE,CAAC;YACrC,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC;QAChC,CAAC;aAAM,CAAC;YACN,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACtC,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;YAClD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC7C,KAAK,CAAC,CAAC,CAAC,GAAG,YAAY,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YACxC,CAAC;YACD,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC;QAC3B,CAAC;QACD,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED,IAAI,MAAM;QACR,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,EAAE,CAAC;YACzB,OAAO,IAAI,CAAC,OAAO,CAAC;QACtB,CAAC;QACD,IAAI,OAAO,IAAI,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;YACnC,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC;QAC5B,CAAC;aAAM,CAAC;YACN,IAAI,YAAY,GAAG,EAAE,CAAC;YACtB,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;gBAC9B,YAAY,IAAI,MAAM,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;YAC5C,CAAC;YACD,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC;QACpC,CAAC;QACD,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;CACF"}
1
+ {"version":3,"file":"speech-result.js","sourceRoot":"","sources":["../src/speech-result.ts"],"names":[],"mappings":"AA6BA,MAAM,OAAO,yBAAyB;IAC3B,SAAS,CAAS;IAEV,KAAK,CAAsB;IACpC,WAAW,CAAc;IACzB,OAAO,CAAU;IAEzB,YAAY,EACV,IAAI,EACJ,SAAS,GACwC;QACjD,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC;QAClB,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;IAC7B,CAAC;IAED,IAAI,UAAU;QACZ,IAAI,IAAI,CAAC,WAAW,IAAI,IAAI,EAAE,CAAC;YAC7B,OAAO,IAAI,CAAC,WAAW,CAAC;QAC1B,CAAC;QACD,IAAI,IAAI,CAAC,KAAK,YAAY,UAAU,EAAE,CAAC;YACrC,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC;QAChC,CAAC;aAAM,CAAC;YACN,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACtC,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;YAClD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC7C,KAAK,CAAC,CAAC,CAAC,GAAG,YAAY,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YACxC,CAAC;YACD,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC;QAC3B,CAAC;QACD,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAED,IAAI,MAAM;QACR,IAAI,IAAI,CAAC,OAAO,IAAI,IAAI,EAAE,CAAC;YACzB,OAAO,IAAI,CAAC,OAAO,CAAC;QACtB,CAAC;QACD,IAAI,OAAO,IAAI,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;YACnC,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC;QAC5B,CAAC;aAAM,CAAC;YACN,IAAI,YAAY,GAAG,EAAE,CAAC;YACtB,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;gBAC9B,YAAY,IAAI,MAAM,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;YAC5C,CAAC;YACD,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC;QACpC,CAAC;QACD,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;CACF"}
@@ -1,21 +1,9 @@
1
1
  import type { WordTimestamp } from "./timestamps.js";
2
- /**
3
- * Minimal info about an STT model. Parallels `ModelInfo` on the TTS side.
4
- */
5
2
  export interface STTModelInfo {
6
3
  readonly id: string;
7
4
  readonly languages: readonly string[];
8
5
  readonly releaseDate: string;
9
6
  }
10
- /**
11
- * Transcribes generated audio and returns word-level timestamps. This is the
12
- * "derived" path for `timestamps: "on"` — used when the TTS provider doesn't
13
- * return alignment data natively.
14
- *
15
- * Providers return `WordTimestamp[]` with start/end in seconds. Normalization
16
- * (ms → seconds, char/phoneme aggregation, tuple → object) happens inside the
17
- * provider adapter so the public surface is uniform.
18
- */
19
7
  export interface SpeechToTextProvider {
20
8
  readonly defaultModel: string;
21
9
  readonly id: string;
@@ -1 +1 @@
1
- {"version":3,"file":"speech-to-text-provider.d.ts","sourceRoot":"","sources":["../src/speech-to-text-provider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAErD;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,SAAS,EAAE,SAAS,MAAM,EAAE,CAAC;IACtC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;CAC9B;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,MAAM,EAAE,SAAS,YAAY,EAAE,CAAC;IAEzC,UAAU,CAAC,OAAO,EAAE;QAClB,OAAO,EAAE,MAAM,CAAC;QAChB,KAAK,EAAE,UAAU,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,WAAW,CAAC,EAAE,WAAW,CAAC;QAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,GAAG,OAAO,CAAC;QACV,UAAU,EAAE,aAAa,EAAE,CAAC;QAC5B,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KAC5C,CAAC,CAAC;CACJ;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,QAAQ,EAAE,oBAAoB,CAAC;CACzC"}
1
+ {"version":3,"file":"speech-to-text-provider.d.ts","sourceRoot":"","sources":["../src/speech-to-text-provider.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAErD,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,SAAS,EAAE,SAAS,MAAM,EAAE,CAAC;IACtC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;CAC9B;AAED,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,MAAM,EAAE,SAAS,YAAY,EAAE,CAAC;IAEzC,UAAU,CAAC,OAAO,EAAE;QAClB,OAAO,EAAE,MAAM,CAAC;QAChB,KAAK,EAAE,UAAU,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,WAAW,CAAC,EAAE,WAAW,CAAC;QAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,GAAG,OAAO,CAAC;QACV,UAAU,EAAE,aAAa,EAAE,CAAC;QAC5B,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KAC5C,CAAC,CAAC;CACJ;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,QAAQ,EAAE,oBAAoB,CAAC;CACzC"}
@@ -1,7 +1,8 @@
1
+ import type { PronunciationsFor } from "./pronunciations/types.js";
1
2
  import { type ResolvedModel, type Voice } from "./speech-provider.js";
2
3
  import type { StreamSpeechResult } from "./stream-speech-result.js";
3
- export declare function streamSpeech<V extends Voice = Voice>(options: {
4
- model: string | ResolvedModel<V>;
4
+ export declare function streamSpeech<V extends Voice = Voice, M extends string | ResolvedModel<V> = string | ResolvedModel<V>>(options: {
5
+ model: M;
5
6
  text: string;
6
7
  voice: V;
7
8
  apiKey?: string;
@@ -9,5 +10,6 @@ export declare function streamSpeech<V extends Voice = Voice>(options: {
9
10
  maxRetries?: number;
10
11
  abortSignal?: AbortSignal;
11
12
  headers?: Record<string, string>;
13
+ pronunciations?: PronunciationsFor<M>;
12
14
  }): Promise<StreamSpeechResult>;
13
15
  //# sourceMappingURL=stream-speech.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"stream-speech.d.ts","sourceRoot":"","sources":["../src/stream-speech.ts"],"names":[],"mappings":"AASA,OAAO,EAGL,KAAK,aAAa,EAClB,KAAK,KAAK,EACX,MAAM,sBAAsB,CAAC;AAC9B,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAEpE,wBAAsB,YAAY,CAAC,CAAC,SAAS,KAAK,GAAG,KAAK,EAAE,OAAO,EAAE;IACnE,KAAK,EAAE,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;IACjC,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,CAAC,CAAC;IACT,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC1C,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAClC,GAAG,OAAO,CAAC,kBAAkB,CAAC,CA4F9B"}
1
+ {"version":3,"file":"stream-speech.d.ts","sourceRoot":"","sources":["../src/stream-speech.ts"],"names":[],"mappings":"AASA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,2BAA2B,CAAC;AAKnE,OAAO,EAIL,KAAK,aAAa,EAClB,KAAK,KAAK,EACX,MAAM,sBAAsB,CAAC;AAC9B,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAEpE,wBAAsB,YAAY,CAChC,CAAC,SAAS,KAAK,GAAG,KAAK,EACvB,CAAC,SAAS,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,GAAG,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,EAC/D,OAAO,EAAE;IACT,KAAK,EAAE,CAAC,CAAC;IACT,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,CAAC,CAAC;IACT,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC1C,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,cAAc,CAAC,EAAE,iBAAiB,CAAC,CAAC,CAAC,CAAC;CACvC,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAmG9B"}
@@ -1,13 +1,19 @@
1
1
  import pRetry from "p-retry";
2
2
  import { detectAudioTags, stripAudioTags } from "./audio-tags.js";
3
- import { ApiError, NoSpeechGeneratedError, StreamingNotSupportedError, } from "./errors.js";
3
+ import { NoSpeechGeneratedError, StreamingNotSupportedError, } from "./errors.js";
4
+ import { mergeRules } from "./pronunciations/merge.js";
5
+ import { substitute } from "./pronunciations/substitute.js";
6
+ import { validatePronunciationsInput } from "./pronunciations/validate.js";
4
7
  import { resolveModel } from "./resolve-provider.js";
5
- import { FEATURES, hasFeature, } from "./speech-provider.js";
8
+ import { buildRetryOptions } from "./retry-options.js";
9
+ import { FEATURES, hasFeature, isSpeechGatewayModel, } from "./speech-provider.js";
6
10
  export async function streamSpeech(options) {
7
11
  const { model, voice, providerOptions, abortSignal, headers } = options;
8
12
  const maxRetries = options.maxRetries ?? 2;
9
13
  const resolved = resolveModel(model, { apiKey: options.apiKey });
10
14
  const modelIdentifier = `${resolved.provider.id}/${resolved.modelId}`;
15
+ const isGateway = isSpeechGatewayModel(resolved);
16
+ validatePronunciationsInput(options.pronunciations, isGateway);
11
17
  const modelInfo = resolved.provider.models.find((m) => m.id === resolved.modelId);
12
18
  if (modelInfo && !hasFeature(modelInfo, FEATURES.STREAMING)) {
13
19
  throw new StreamingNotSupportedError(modelIdentifier);
@@ -35,32 +41,40 @@ export async function streamSpeech(options) {
35
41
  ? `Text is empty after removing unsupported audio tags for ${modelIdentifier}.`
36
42
  : "Text must not be empty.");
37
43
  }
44
+ let textToSend = processedText;
45
+ if (!isGateway && options.pronunciations?.rules?.length) {
46
+ const ruleMap = mergeRules(options.pronunciations.rules);
47
+ textToSend = substitute(processedText, ruleMap).text;
48
+ }
38
49
  const streamFn = resolved.provider.stream.bind(resolved.provider);
39
50
  const startTime = performance.now();
40
- const result = await pRetry(() => streamFn({
41
- modelId: resolved.modelId,
42
- text: processedText,
43
- voice,
44
- providerOptions,
45
- abortSignal,
46
- headers,
47
- }), {
48
- retries: maxRetries,
49
- signal: abortSignal,
50
- shouldRetry: ({ error }) => {
51
- if (error instanceof ApiError && error.statusCode < 500) {
52
- return false;
53
- }
54
- return true;
55
- },
56
- });
51
+ const result = await pRetry(() => {
52
+ if (isGateway) {
53
+ const gatewayProvider = resolved.provider;
54
+ return gatewayProvider.stream({
55
+ modelId: resolved.modelId,
56
+ text: textToSend,
57
+ voice: voice,
58
+ providerOptions,
59
+ abortSignal,
60
+ headers,
61
+ pronunciations: options.pronunciations,
62
+ });
63
+ }
64
+ return streamFn({
65
+ modelId: resolved.modelId,
66
+ text: textToSend,
67
+ voice,
68
+ providerOptions,
69
+ abortSignal,
70
+ headers,
71
+ });
72
+ }, buildRetryOptions({ maxRetries, abortSignal }));
57
73
  const ttfbMs = Math.round(performance.now() - startTime);
58
74
  const metadata = {
59
75
  latencyMs: ttfbMs,
60
76
  ttfbMs,
61
- inputChars: processedText.length,
62
- provider: resolved.provider.id,
63
- model: resolved.modelId,
77
+ inputChars: options.text.length,
64
78
  ...(result.audioDurationMs != null && {
65
79
  audioDurationMs: result.audioDurationMs,
66
80
  }),
@@ -1 +1 @@
1
- {"version":3,"file":"stream-speech.js","sourceRoot":"","sources":["../src/stream-speech.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,SAAS,CAAC;AAC7B,OAAO,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAClE,OAAO,EACL,QAAQ,EACR,sBAAsB,EACtB,0BAA0B,GAC3B,MAAM,aAAa,CAAC;AAErB,OAAO,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAC;AACrD,OAAO,EACL,QAAQ,EACR,UAAU,GAGX,MAAM,sBAAsB,CAAC;AAG9B,MAAM,CAAC,KAAK,UAAU,YAAY,CAA0B,OAS3D;IACC,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,eAAe,EAAE,WAAW,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC;IACxE,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,IAAI,CAAC,CAAC;IAE3C,MAAM,QAAQ,GAAG,YAAY,CAAC,KAAK,EAAE,EAAE,MAAM,EAAE,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;IACjE,MAAM,eAAe,GAAG,GAAG,QAAQ,CAAC,QAAQ,CAAC,EAAE,IAAI,QAAQ,CAAC,OAAO,EAAE,CAAC;IAEtE,MAAM,SAAS,GAAG,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAC7C,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,QAAQ,CAAC,OAAO,CACjC,CAAC;IACF,IAAI,SAAS,IAAI,CAAC,UAAU,CAAC,SAAS,EAAE,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;QAC5D,MAAM,IAAI,0BAA0B,CAAC,eAAe,CAAC,CAAC;IACxD,CAAC;IACD,IAAI,OAAO,QAAQ,CAAC,QAAQ,CAAC,MAAM,KAAK,UAAU,EAAE,CAAC;QACnD,MAAM,IAAI,0BAA0B,CAAC,eAAe,CAAC,CAAC;IACxD,CAAC;IAED,IAAI,aAAqB,CAAC;IAC1B,IAAI,QAAkB,CAAC;IAEvB,IAAI,QAAQ,CAAC,QAAQ,CAAC,gBAAgB,EAAE,CAAC;QACvC,CAAC,EAAE,IAAI,EAAE,aAAa,EAAE,QAAQ,EAAE,GAAG,QAAQ,CAAC,QAAQ,CAAC,gBAAgB,CACrE,OAAO,CAAC,IAAI,EACZ,QAAQ,CAAC,OAAO,CACjB,CAAC,CAAC;IACL,CAAC;SAAM,CAAC;QACN,MAAM,IAAI,GAAG,eAAe,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAC3C,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACpB,CAAC,EAAE,IAAI,EAAE,aAAa,EAAE,QAAQ,EAAE,GAAG,cAAc,CACjD,OAAO,CAAC,IAAI,EACZ,eAAe,CAChB,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,aAAa,GAAG,OAAO,CAAC,IAAI,CAAC;YAC7B,QAAQ,GAAG,EAAE,CAAC;QAChB,CAAC;IACH,CAAC;IAED,IAAI,aAAa,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,sBAAsB,CAC9B,QAAQ,CAAC,MAAM,GAAG,CAAC;YACjB,CAAC,CAAC,2DAA2D,eAAe,GAAG;YAC/E,CAAC,CAAC,yBAAyB,CAC9B,CAAC;IACJ,CAAC;IAED,MAAM,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAElE,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;IAEpC,MAAM,MAAM,GAAG,MAAM,MAAM,CACzB,GAAG,EAAE,CACH,QAAQ,CAAC;QACP,OAAO,EAAE,QAAQ,CAAC,OAAO;QACzB,IAAI,EAAE,aAAa;QACnB,KAAK;QACL,eAAe;QACf,WAAW;QACX,OAAO;KACR,CAAC,EACJ;QACE,OAAO,EAAE,UAAU;QACnB,MAAM,EAAE,WAAW;QACnB,WAAW,EAAE,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE;YACzB,IAAI,KAAK,YAAY,QAAQ,IAAI,KAAK,CAAC,UAAU,GAAG,GAAG,EAAE,CAAC;gBACxD,OAAO,KAAK,CAAC;YACf,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC;KACF,CACF,CAAC;IAEF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC,CAAC;IAEzD,MAAM,QAAQ,GAAmB;QAC/B,SAAS,EAAE,MAAM;QACjB,MAAM;QACN,UAAU,EAAE,aAAa,CAAC,MAAM;QAChC,QAAQ,EAAE,QAAQ,CAAC,QAAQ,CAAC,EAAE;QAC9B,KAAK,EAAE,QAAQ,CAAC,OAAO;QACvB,GAAG,CAAC,MAAM,CAAC,eAAe,IAAI,IAAI,IAAI;YACpC,eAAe,EAAE,MAAM,CAAC,eAAe;SACxC,CAAC;KACH,CAAC;IAEF,OAAO;QACL,KAAK,EAAE,MAAM,CAAC,MAAM;QACpB,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,QAAQ;QACR,gBAAgB,EAAE,MAAM,CAAC,gBAAgB;QACzC,QAAQ,EAAE,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS;KACrD,CAAC;AACJ,CAAC"}
1
+ {"version":3,"file":"stream-speech.js","sourceRoot":"","sources":["../src/stream-speech.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,SAAS,CAAC;AAC7B,OAAO,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AAClE,OAAO,EACL,sBAAsB,EACtB,0BAA0B,GAC3B,MAAM,aAAa,CAAC;AAErB,OAAO,EAAE,UAAU,EAAE,MAAM,2BAA2B,CAAC;AACvD,OAAO,EAAE,UAAU,EAAE,MAAM,gCAAgC,CAAC;AAE5D,OAAO,EAAE,2BAA2B,EAAE,MAAM,8BAA8B,CAAC;AAE3E,OAAO,EAAE,YAAY,EAAE,MAAM,uBAAuB,CAAC;AACrD,OAAO,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAC;AACvD,OAAO,EACL,QAAQ,EACR,UAAU,EACV,oBAAoB,GAGrB,MAAM,sBAAsB,CAAC;AAG9B,MAAM,CAAC,KAAK,UAAU,YAAY,CAGhC,OAUD;IACC,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,eAAe,EAAE,WAAW,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC;IACxE,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,IAAI,CAAC,CAAC;IAE3C,MAAM,QAAQ,GAAG,YAAY,CAAC,KAAK,EAAE,EAAE,MAAM,EAAE,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;IACjE,MAAM,eAAe,GAAG,GAAG,QAAQ,CAAC,QAAQ,CAAC,EAAE,IAAI,QAAQ,CAAC,OAAO,EAAE,CAAC;IACtE,MAAM,SAAS,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IACjD,2BAA2B,CAAC,OAAO,CAAC,cAAc,EAAE,SAAS,CAAC,CAAC;IAE/D,MAAM,SAAS,GAAG,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAC7C,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,QAAQ,CAAC,OAAO,CACjC,CAAC;IACF,IAAI,SAAS,IAAI,CAAC,UAAU,CAAC,SAAS,EAAE,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;QAC5D,MAAM,IAAI,0BAA0B,CAAC,eAAe,CAAC,CAAC;IACxD,CAAC;IACD,IAAI,OAAO,QAAQ,CAAC,QAAQ,CAAC,MAAM,KAAK,UAAU,EAAE,CAAC;QACnD,MAAM,IAAI,0BAA0B,CAAC,eAAe,CAAC,CAAC;IACxD,CAAC;IAED,IAAI,aAAqB,CAAC;IAC1B,IAAI,QAAkB,CAAC;IAEvB,IAAI,QAAQ,CAAC,QAAQ,CAAC,gBAAgB,EAAE,CAAC;QACvC,CAAC,EAAE,IAAI,EAAE,aAAa,EAAE,QAAQ,EAAE,GAAG,QAAQ,CAAC,QAAQ,CAAC,gBAAgB,CACrE,OAAO,CAAC,IAAI,EACZ,QAAQ,CAAC,OAAO,CACjB,CAAC,CAAC;IACL,CAAC;SAAM,CAAC;QACN,MAAM,IAAI,GAAG,eAAe,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAC3C,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACpB,CAAC,EAAE,IAAI,EAAE,aAAa,EAAE,QAAQ,EAAE,GAAG,cAAc,CACjD,OAAO,CAAC,IAAI,EACZ,eAAe,CAChB,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,aAAa,GAAG,OAAO,CAAC,IAAI,CAAC;YAC7B,QAAQ,GAAG,EAAE,CAAC;QAChB,CAAC;IACH,CAAC;IAED,IAAI,aAAa,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,sBAAsB,CAC9B,QAAQ,CAAC,MAAM,GAAG,CAAC;YACjB,CAAC,CAAC,2DAA2D,eAAe,GAAG;YAC/E,CAAC,CAAC,yBAAyB,CAC9B,CAAC;IACJ,CAAC;IAED,IAAI,UAAU,GAAG,aAAa,CAAC;IAC/B,IAAI,CAAC,SAAS,IAAI,OAAO,CAAC,cAAc,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC;QACxD,MAAM,OAAO,GAAG,UAAU,CAAC,OAAO,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC;QACzD,UAAU,GAAG,UAAU,CAAC,aAAa,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;IACvD,CAAC;IAED,MAAM,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAElE,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;IAEpC,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,GAAG,EAAE;QAC/B,IAAI,SAAS,EAAE,CAAC;YACd,MAAM,eAAe,GAAG,QAAQ,CAAC,QAAiC,CAAC;YACnE,OAAO,eAAe,CAAC,MAAM,CAAC;gBAC5B,OAAO,EAAE,QAAQ,CAAC,OAAO;gBACzB,IAAI,EAAE,UAAU;gBAChB,KAAK,EAAE,KAA0B;gBACjC,eAAe;gBACf,WAAW;gBACX,OAAO;gBACP,cAAc,EAAE,OAAO,CAAC,cAAc;aACvC,CAAC,CAAC;QACL,CAAC;QACD,OAAO,QAAQ,CAAC;YACd,OAAO,EAAE,QAAQ,CAAC,OAAO;YACzB,IAAI,EAAE,UAAU;YAChB,KAAK;YACL,eAAe;YACf,WAAW;YACX,OAAO;SACR,CAAC,CAAC;IACL,CAAC,EAAE,iBAAiB,CAAC,EAAE,UAAU,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC;IAEnD,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC,CAAC;IAEzD,MAAM,QAAQ,GAAmB;QAC/B,SAAS,EAAE,MAAM;QACjB,MAAM;QACN,UAAU,EAAE,OAAO,CAAC,IAAI,CAAC,MAAM;QAC/B,GAAG,CAAC,MAAM,CAAC,eAAe,IAAI,IAAI,IAAI;YACpC,eAAe,EAAE,MAAM,CAAC,eAAe;SACxC,CAAC;KACH,CAAC;IAEF,OAAO;QACL,KAAK,EAAE,MAAM,CAAC,MAAM;QACpB,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,QAAQ;QACR,gBAAgB,EAAE,MAAM,CAAC,gBAAgB;QACzC,QAAQ,EAAE,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS;KACrD,CAAC;AACJ,CAAC"}
@@ -1,23 +1,9 @@
1
- /**
2
- * Word-granularity alignment data. Timestamps are always in seconds from
3
- * the start of the generated audio. Providers that natively return character
4
- * or phoneme granularity are aggregated to words internally.
5
- */
6
1
  export interface WordTimestamp {
7
2
  readonly end: number;
8
3
  readonly start: number;
9
4
  readonly text: string;
10
5
  }
11
- /**
12
- * Controls whether `generateSpeech()` returns word timestamps.
13
- *
14
- * - `"auto"` (default): return timestamps only if the TTS provider supplies
15
- * them natively. Free, no extra API calls.
16
- * - `"on"`: always return timestamps. Uses native data when available;
17
- * otherwise falls back to a speech-to-text round-trip of the synthesized
18
- * audio (cost + latency implications).
19
- * - `"off"`: never return timestamps, even when the provider would give them
20
- * away for free.
21
- */
22
- export type TimestampMode = "on" | "auto" | "off";
6
+ export interface ConversationWordTimestamp extends WordTimestamp {
7
+ readonly turnIndex: number;
8
+ }
23
9
  //# sourceMappingURL=timestamps.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"timestamps.d.ts","sourceRoot":"","sources":["../src/timestamps.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;CACvB;AAED;;;;;;;;;;GAUG;AACH,MAAM,MAAM,aAAa,GAAG,IAAI,GAAG,MAAM,GAAG,KAAK,CAAC"}
1
+ {"version":3,"file":"timestamps.d.ts","sourceRoot":"","sources":["../src/timestamps.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;CACvB;AAED,MAAM,WAAW,yBAA0B,SAAQ,aAAa;IAC9D,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;CAC5B"}
@@ -0,0 +1,9 @@
1
+ import type { ConversationWordTimestamp } from "./timestamps.js";
2
+ export interface TurnTimestamp {
3
+ readonly end: number;
4
+ readonly start: number;
5
+ readonly text: string;
6
+ readonly turnIndex: number;
7
+ }
8
+ export declare function timestampsToTurns(timestamps: readonly ConversationWordTimestamp[]): readonly TurnTimestamp[];
9
+ //# sourceMappingURL=turns.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"turns.d.ts","sourceRoot":"","sources":["../src/turns.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,yBAAyB,EAAE,MAAM,iBAAiB,CAAC;AAEjE,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;CAC5B;AAKD,wBAAgB,iBAAiB,CAC/B,UAAU,EAAE,SAAS,yBAAyB,EAAE,GAC/C,SAAS,aAAa,EAAE,CAiB1B"}
package/dist/turns.js ADDED
@@ -0,0 +1,21 @@
1
+ // Assumes turnIndex runs are monotonic; non-adjacent runs of the same turnIndex would produce duplicate entries.
2
+ export function timestampsToTurns(timestamps) {
3
+ const turns = [];
4
+ for (const word of timestamps) {
5
+ const last = turns.at(-1);
6
+ if (last && last.turnIndex === word.turnIndex) {
7
+ last.end = word.end;
8
+ last.text = `${last.text} ${word.text}`;
9
+ }
10
+ else {
11
+ turns.push({
12
+ turnIndex: word.turnIndex,
13
+ start: word.start,
14
+ end: word.end,
15
+ text: word.text,
16
+ });
17
+ }
18
+ }
19
+ return turns;
20
+ }
21
+ //# sourceMappingURL=turns.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"turns.js","sourceRoot":"","sources":["../src/turns.ts"],"names":[],"mappings":"AAWA,iHAAiH;AACjH,MAAM,UAAU,iBAAiB,CAC/B,UAAgD;IAEhD,MAAM,KAAK,GAA6B,EAAE,CAAC;IAC3C,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC9B,MAAM,IAAI,GAAG,KAAK,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAC1B,IAAI,IAAI,IAAI,IAAI,CAAC,SAAS,KAAK,IAAI,CAAC,SAAS,EAAE,CAAC;YAC9C,IAAI,CAAC,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC;YACpB,IAAI,CAAC,IAAI,GAAG,GAAG,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;QAC1C,CAAC;aAAM,CAAC;YACN,KAAK,CAAC,IAAI,CAAC;gBACT,SAAS,EAAE,IAAI,CAAC,SAAS;gBACzB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,GAAG,EAAE,IAAI,CAAC,GAAG;gBACb,IAAI,EAAE,IAAI,CAAC,IAAI;aAChB,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC"}
package/dist/types.d.ts CHANGED
@@ -1,12 +1,43 @@
1
+ import type { AudioOutput } from "./audio-output.js";
2
+ import type { PronunciationsInput } from "./pronunciations/types.js";
1
3
  import type { ResolvedModel, Voice } from "./speech-provider.js";
4
+ export type { AudioOutput, AudioOutputFormat } from "./audio-output.js";
5
+ export type { CaptionFormat, CaptionsOptions } from "./captions.js";
6
+ export type { ConversationTurn, GenerateConversationOptions, } from "./conversation/types.js";
7
+ export type { SpeechMetadata } from "./metadata.js";
8
+ export type { Pronunciation, PronunciationsInput, } from "./pronunciations/types.js";
9
+ export type { CartesiaSpeechProviderConfig } from "./providers/cartesia/index.js";
10
+ export type { DeepgramSpeechProviderConfig } from "./providers/deepgram/index.js";
11
+ export type { ElevenLabsSpeechProviderConfig } from "./providers/elevenlabs/index.js";
12
+ export type { FalSpeechProviderConfig } from "./providers/fal/index.js";
13
+ export type { FishAudioSpeechProviderConfig } from "./providers/fish-audio/index.js";
14
+ export type { SpeechGatewayProviderConfig } from "./providers/gateway/index.js";
15
+ export type { GoogleSpeechProviderConfig } from "./providers/google/index.js";
16
+ export type { HumeSpeechProviderConfig } from "./providers/hume/index.js";
17
+ export type { InworldSpeechProviderConfig } from "./providers/inworld/index.js";
18
+ export type { MistralSpeechProviderConfig } from "./providers/mistral/index.js";
19
+ export type { MurfSpeechProviderConfig } from "./providers/murf/index.js";
20
+ export type { OpenAISpeechProviderConfig } from "./providers/openai/index.js";
21
+ export type { ResembleSpeechProviderConfig } from "./providers/resemble/index.js";
22
+ export type { XaiSpeechProviderConfig } from "./providers/xai/index.js";
23
+ export type { Feature, ModelInfo, ResolvedModel, SpeechProvider, Voice, } from "./speech-provider.js";
24
+ export type { ConversationResult, GeneratedAudioFile, SpeechResult, } from "./speech-result.js";
25
+ export type { ResolvedSTTModel, SpeechToTextProvider, STTModelInfo, } from "./speech-to-text-provider.js";
26
+ export type { StreamSpeechResult } from "./stream-speech-result.js";
27
+ export type { ConversationWordTimestamp, WordTimestamp, } from "./timestamps.js";
28
+ export type { TurnTimestamp } from "./turns.js";
2
29
  export interface GenerateSpeechOptions<V extends Voice = Voice> {
3
30
  abortSignal?: AbortSignal;
4
31
  apiKey?: string;
5
32
  headers?: Record<string, string>;
6
33
  maxRetries?: number;
7
34
  model: string | ResolvedModel<V>;
35
+ output?: AudioOutput;
36
+ pronunciations?: PronunciationsInput;
8
37
  providerOptions?: Record<string, unknown>;
9
38
  text: string;
39
+ timestamps?: boolean;
10
40
  voice: V;
41
+ volumeDbfs?: number;
11
42
  }
12
43
  //# sourceMappingURL=types.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,KAAK,EAAE,MAAM,sBAAsB,CAAC;AAEjE,MAAM,WAAW,qBAAqB,CAAC,CAAC,SAAS,KAAK,GAAG,KAAK;IAC5D,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;IACjC,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC1C,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,CAAC,CAAC;CACV"}
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,2BAA2B,CAAC;AACrE,OAAO,KAAK,EAAE,aAAa,EAAE,KAAK,EAAE,MAAM,sBAAsB,CAAC;AAEjE,YAAY,EAAE,WAAW,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AACxE,YAAY,EAAE,aAAa,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AACpE,YAAY,EACV,gBAAgB,EAChB,2BAA2B,GAC5B,MAAM,yBAAyB,CAAC;AACjC,YAAY,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AACpD,YAAY,EACV,aAAa,EACb,mBAAmB,GACpB,MAAM,2BAA2B,CAAC;AACnC,YAAY,EAAE,4BAA4B,EAAE,MAAM,+BAA+B,CAAC;AAClF,YAAY,EAAE,4BAA4B,EAAE,MAAM,+BAA+B,CAAC;AAClF,YAAY,EAAE,8BAA8B,EAAE,MAAM,iCAAiC,CAAC;AACtF,YAAY,EAAE,uBAAuB,EAAE,MAAM,0BAA0B,CAAC;AACxE,YAAY,EAAE,6BAA6B,EAAE,MAAM,iCAAiC,CAAC;AACrF,YAAY,EAAE,2BAA2B,EAAE,MAAM,8BAA8B,CAAC;AAChF,YAAY,EAAE,0BAA0B,EAAE,MAAM,6BAA6B,CAAC;AAC9E,YAAY,EAAE,wBAAwB,EAAE,MAAM,2BAA2B,CAAC;AAC1E,YAAY,EAAE,2BAA2B,EAAE,MAAM,8BAA8B,CAAC;AAChF,YAAY,EAAE,2BAA2B,EAAE,MAAM,8BAA8B,CAAC;AAChF,YAAY,EAAE,wBAAwB,EAAE,MAAM,2BAA2B,CAAC;AAC1E,YAAY,EAAE,0BAA0B,EAAE,MAAM,6BAA6B,CAAC;AAC9E,YAAY,EAAE,4BAA4B,EAAE,MAAM,+BAA+B,CAAC;AAClF,YAAY,EAAE,uBAAuB,EAAE,MAAM,0BAA0B,CAAC;AACxE,YAAY,EACV,OAAO,EACP,SAAS,EACT,aAAa,EACb,cAAc,EACd,KAAK,GACN,MAAM,sBAAsB,CAAC;AAC9B,YAAY,EACV,kBAAkB,EAClB,kBAAkB,EAClB,YAAY,GACb,MAAM,oBAAoB,CAAC;AAC5B,YAAY,EACV,gBAAgB,EAChB,oBAAoB,EACpB,YAAY,GACb,MAAM,8BAA8B,CAAC;AACtC,YAAY,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AACpE,YAAY,EACV,yBAAyB,EACzB,aAAa,GACd,MAAM,iBAAiB,CAAC;AACzB,YAAY,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC;AAEhD,MAAM,WAAW,qBAAqB,CAAC,CAAC,SAAS,KAAK,GAAG,KAAK;IAC5D,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;IACjC,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,cAAc,CAAC,EAAE,mBAAmB,CAAC;IACrC,eAAe,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAC1C,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,KAAK,EAAE,CAAC,CAAC;IACT,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB"}
@@ -3,12 +3,6 @@ interface AdjustVolumeInput {
3
3
  readonly mediaType: string;
4
4
  readonly volumeDbfs: number;
5
5
  }
6
- /**
7
- * Decode the provider's PCM/WAV output, RMS-normalize to the target dBFS,
8
- * and re-encode as 16-bit mono WAV. Lazy-loaded by generateSpeech only when
9
- * `volumeDbfs` is set so callers that never use volume adjustment don't pay
10
- * for the WAV mux dependency chain at import time.
11
- */
12
6
  export declare function adjustVolume(input: AdjustVolumeInput): Promise<Uint8Array>;
13
7
  export {};
14
8
  //# sourceMappingURL=volume-adjust.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"volume-adjust.d.ts","sourceRoot":"","sources":["../src/volume-adjust.ts"],"names":[],"mappings":"AAOA,UAAU,iBAAiB;IACzB,QAAQ,CAAC,KAAK,EAAE,MAAM,GAAG,UAAU,CAAC;IACpC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;CAC7B;AAED;;;;;GAKG;AACH,wBAAsB,YAAY,CAChC,KAAK,EAAE,iBAAiB,GACvB,OAAO,CAAC,UAAU,CAAC,CAgBrB"}
1
+ {"version":3,"file":"volume-adjust.d.ts","sourceRoot":"","sources":["../src/volume-adjust.ts"],"names":[],"mappings":"AAQA,UAAU,iBAAiB;IACzB,QAAQ,CAAC,KAAK,EAAE,MAAM,GAAG,UAAU,CAAC;IACpC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;CAC7B;AAED,wBAAsB,YAAY,CAChC,KAAK,EAAE,iBAAiB,GACvB,OAAO,CAAC,UAAU,CAAC,CAgBrB"}
@@ -1,27 +1,15 @@
1
- import { concatPcmToWav, dbfsToInt16Rms, decodeToPcm16, normalizeRms, } from "./conversation/pcm-concat.js";
2
- /**
3
- * Decode the provider's PCM/WAV output, RMS-normalize to the target dBFS,
4
- * and re-encode as 16-bit mono WAV. Lazy-loaded by generateSpeech only when
5
- * `volumeDbfs` is set so callers that never use volume adjustment don't pay
6
- * for the WAV mux dependency chain at import time.
7
- */
1
+ import { decodeAudioToPcm16 } from "./audio-decode.js";
2
+ import { base64ToUint8Array } from "./audio-utils.js";
3
+ import { concatPcmToWav, dbfsToInt16Rms, normalizeRms, } from "./conversation/pcm-concat.js";
8
4
  export async function adjustVolume(input) {
9
5
  const bytes = input.audio instanceof Uint8Array
10
6
  ? input.audio
11
7
  : base64ToUint8Array(input.audio);
12
- const segment = decodeToPcm16(bytes, input.mediaType);
8
+ const segment = await decodeAudioToPcm16(bytes, input.mediaType);
13
9
  const [normalized] = normalizeRms([segment], dbfsToInt16Rms(input.volumeDbfs));
14
10
  return await concatPcmToWav([normalized], {
15
11
  gapMs: 0,
16
12
  targetSampleRate: normalized.sampleRate,
17
13
  });
18
14
  }
19
- function base64ToUint8Array(b64) {
20
- const binaryString = atob(b64);
21
- const out = new Uint8Array(binaryString.length);
22
- for (let i = 0; i < binaryString.length; i++) {
23
- out[i] = binaryString.charCodeAt(i);
24
- }
25
- return out;
26
- }
27
15
  //# sourceMappingURL=volume-adjust.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"volume-adjust.js","sourceRoot":"","sources":["../src/volume-adjust.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,cAAc,EACd,cAAc,EACd,aAAa,EACb,YAAY,GACb,MAAM,8BAA8B,CAAC;AAQtC;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,KAAwB;IAExB,MAAM,KAAK,GACT,KAAK,CAAC,KAAK,YAAY,UAAU;QAC/B,CAAC,CAAC,KAAK,CAAC,KAAK;QACb,CAAC,CAAC,kBAAkB,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IAEtC,MAAM,OAAO,GAAG,aAAa,CAAC,KAAK,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IACtD,MAAM,CAAC,UAAU,CAAC,GAAG,YAAY,CAC/B,CAAC,OAAO,CAAC,EACT,cAAc,CAAC,KAAK,CAAC,UAAU,CAAC,CACjC,CAAC;IAEF,OAAO,MAAM,cAAc,CAAC,CAAC,UAAU,CAAC,EAAE;QACxC,KAAK,EAAE,CAAC;QACR,gBAAgB,EAAE,UAAU,CAAC,UAAU;KACxC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,kBAAkB,CAAC,GAAW;IACrC,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC;IAC/B,MAAM,GAAG,GAAG,IAAI,UAAU,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;IAChD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC7C,GAAG,CAAC,CAAC,CAAC,GAAG,YAAY,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;IACtC,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC"}
1
+ {"version":3,"file":"volume-adjust.js","sourceRoot":"","sources":["../src/volume-adjust.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,kBAAkB,EAAE,MAAM,mBAAmB,CAAC;AACvD,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACtD,OAAO,EACL,cAAc,EACd,cAAc,EACd,YAAY,GACb,MAAM,8BAA8B,CAAC;AAQtC,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,KAAwB;IAExB,MAAM,KAAK,GACT,KAAK,CAAC,KAAK,YAAY,UAAU;QAC/B,CAAC,CAAC,KAAK,CAAC,KAAK;QACb,CAAC,CAAC,kBAAkB,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IAEtC,MAAM,OAAO,GAAG,MAAM,kBAAkB,CAAC,KAAK,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;IACjE,MAAM,CAAC,UAAU,CAAC,GAAG,YAAY,CAC/B,CAAC,OAAO,CAAC,EACT,cAAc,CAAC,KAAK,CAAC,UAAU,CAAC,CACjC,CAAC;IAEF,OAAO,MAAM,cAAc,CAAC,CAAC,UAAU,CAAC,EAAE;QACxC,KAAK,EAAE,CAAC;QACR,gBAAgB,EAAE,UAAU,CAAC,UAAU;KACxC,CAAC,CAAC;AACL,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@speech-sdk/core",
3
- "version": "0.7.0",
3
+ "version": "0.8.0",
4
4
  "description": "Universal, cross-platform text-to-speech SDK with multi-provider support.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -10,75 +10,20 @@
10
10
  "types": "./dist/index.d.ts",
11
11
  "default": "./dist/index.js"
12
12
  },
13
- "./conversation": {
14
- "types": "./dist/generate-conversation.d.ts",
15
- "default": "./dist/generate-conversation.js"
13
+ "./providers": {
14
+ "types": "./dist/providers.d.ts",
15
+ "default": "./dist/providers.js"
16
16
  },
17
- "./conversation/errors": {
18
- "types": "./dist/conversation/errors.d.ts",
19
- "default": "./dist/conversation/errors.js"
20
- },
21
- "./openai": {
22
- "types": "./dist/providers/openai/index.d.ts",
23
- "default": "./dist/providers/openai/index.js"
24
- },
25
- "./elevenlabs": {
26
- "types": "./dist/providers/elevenlabs/index.d.ts",
27
- "default": "./dist/providers/elevenlabs/index.js"
28
- },
29
- "./deepgram": {
30
- "types": "./dist/providers/deepgram/index.d.ts",
31
- "default": "./dist/providers/deepgram/index.js"
32
- },
33
- "./cartesia": {
34
- "types": "./dist/providers/cartesia/index.d.ts",
35
- "default": "./dist/providers/cartesia/index.js"
36
- },
37
- "./hume": {
38
- "types": "./dist/providers/hume/index.d.ts",
39
- "default": "./dist/providers/hume/index.js"
40
- },
41
- "./inworld": {
42
- "types": "./dist/providers/inworld/index.d.ts",
43
- "default": "./dist/providers/inworld/index.js"
44
- },
45
- "./google": {
46
- "types": "./dist/providers/google/index.d.ts",
47
- "default": "./dist/providers/google/index.js"
48
- },
49
- "./fish-audio": {
50
- "types": "./dist/providers/fish-audio/index.d.ts",
51
- "default": "./dist/providers/fish-audio/index.js"
52
- },
53
- "./murf": {
54
- "types": "./dist/providers/murf/index.d.ts",
55
- "default": "./dist/providers/murf/index.js"
56
- },
57
- "./resemble": {
58
- "types": "./dist/providers/resemble/index.d.ts",
59
- "default": "./dist/providers/resemble/index.js"
60
- },
61
- "./fal-ai": {
62
- "types": "./dist/providers/fal/index.d.ts",
63
- "default": "./dist/providers/fal/index.js"
64
- },
65
- "./mistral": {
66
- "types": "./dist/providers/mistral/index.d.ts",
67
- "default": "./dist/providers/mistral/index.js"
68
- },
69
- "./xai": {
70
- "types": "./dist/providers/xai/index.d.ts",
71
- "default": "./dist/providers/xai/index.js"
72
- },
73
- "./stt/openai": {
74
- "types": "./dist/stt-providers/openai/index.d.ts",
75
- "default": "./dist/stt-providers/openai/index.js"
17
+ "./types": {
18
+ "types": "./dist/types.d.ts",
19
+ "default": "./dist/types.js"
76
20
  }
77
21
  },
78
22
  "files": [
79
23
  "dist",
80
24
  "README.md"
81
25
  ],
26
+ "sideEffects": false,
82
27
  "keywords": [
83
28
  "tts",
84
29
  "text-to-speech",
@@ -94,15 +39,17 @@
94
39
  "url": "https://github.com/Jellypod-Inc/speech-sdk"
95
40
  },
96
41
  "dependencies": {
42
+ "@mediabunny/mp3-encoder": "^1.42.0",
97
43
  "mediabunny": "^1.40.1",
98
- "p-retry": "^8.0.0"
44
+ "p-retry": "^8.0.0",
45
+ "zod": "^4.3.6"
99
46
  },
100
47
  "devDependencies": {
101
- "@biomejs/biome": "2.4.12",
48
+ "@biomejs/biome": "2.4.13",
102
49
  "@types/node": "^25.5.0",
103
50
  "dotenv": "^17.3.1",
104
51
  "typescript": "^5.8.0",
105
- "ultracite": "7.5.9",
52
+ "ultracite": "7.6.2",
106
53
  "vite": "^7.3.2",
107
54
  "vitest": "^4.1.3"
108
55
  },
@@ -1,42 +0,0 @@
1
- import type { ResolvedSTTModel, SpeechToTextProvider } from "../../speech-to-text-provider.js";
2
- import type { WordTimestamp } from "../../timestamps.js";
3
- export interface OpenAISpeechToTextProviderConfig {
4
- apiKey?: string;
5
- baseURL?: string;
6
- fetch?: typeof globalThis.fetch;
7
- }
8
- /**
9
- * OpenAI Whisper / gpt-4o-transcribe adapter for the SDK's derived-timestamps
10
- * path. Uses `/v1/audio/transcriptions` with `timestamp_granularities: ["word"]`
11
- * and `response_format: "verbose_json"`.
12
- *
13
- * Note: `gpt-4o-transcribe-diarize` is intentionally not listed — that
14
- * variant does not support `timestamp_granularities`.
15
- */
16
- export declare class OpenAISpeechToTextProvider implements SpeechToTextProvider {
17
- readonly id = "openai";
18
- readonly defaultModel = "whisper-1";
19
- readonly models: readonly [{
20
- readonly id: "whisper-1";
21
- readonly releaseDate: "2023-03-01";
22
- readonly languages: readonly ["af", "ar", "az", "be", "bg", "bn", "bs", "ca", "cs", "cy", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "gl", "he", "hi", "hr", "hu", "hy", "id", "is", "it", "ja", "kk", "kn", "ko", "lt", "lv", "mi", "mk", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sr", "sv", "sw", "ta", "th", "tl", "tr", "uk", "ur", "vi", "zh"];
23
- }];
24
- private readonly apiKey;
25
- private readonly baseURL;
26
- private readonly fetchFn;
27
- constructor(config?: OpenAISpeechToTextProviderConfig);
28
- transcribe(options: {
29
- modelId: string;
30
- audio: Uint8Array;
31
- mediaType: string;
32
- language?: string;
33
- abortSignal?: AbortSignal;
34
- headers?: Record<string, string>;
35
- }): Promise<{
36
- timestamps: WordTimestamp[];
37
- text?: string;
38
- providerMetadata?: Record<string, unknown>;
39
- }>;
40
- }
41
- export declare function createOpenAISTT(config?: OpenAISpeechToTextProviderConfig): (modelId?: string) => ResolvedSTTModel;
42
- //# sourceMappingURL=index.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/stt-providers/openai/index.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EACV,gBAAgB,EAChB,oBAAoB,EACrB,MAAM,kCAAkC,CAAC;AAC1C,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAEzD,MAAM,WAAW,gCAAgC;IAC/C,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,OAAO,UAAU,CAAC,KAAK,CAAC;CACjC;AAiED;;;;;;;GAOG;AACH,qBAAa,0BAA2B,YAAW,oBAAoB;IACrE,QAAQ,CAAC,EAAE,YAAY;IACvB,QAAQ,CAAC,YAAY,eAAe;IAMpC,QAAQ,CAAC,MAAM;;;;OAMJ;IAEX,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAqB;IAC5C,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAA0B;gBAEtC,MAAM,GAAE,gCAAqC;IAMnD,UAAU,CAAC,OAAO,EAAE;QACxB,OAAO,EAAE,MAAM,CAAC;QAChB,KAAK,EAAE,UAAU,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,WAAW,CAAC,EAAE,WAAW,CAAC;QAC1B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;KAClC,GAAG,OAAO,CAAC;QACV,UAAU,EAAE,aAAa,EAAE,CAAC;QAC5B,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KAC5C,CAAC;CAsDH;AAED,wBAAgB,eAAe,CAAC,MAAM,GAAE,gCAAqC,IAGjD,UAAU,MAAM,KAAG,gBAAgB,CAM9D"}