@huggingface/transformers 4.0.0-next.0 → 4.0.0-next.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (364)
  1. package/README.md +32 -6
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +31 -31
  3. package/dist/transformers.js +9261 -1709
  4. package/dist/transformers.min.js +25 -18
  5. package/dist/transformers.node.cjs +6119 -3174
  6. package/dist/transformers.node.min.cjs +25 -23
  7. package/dist/transformers.node.min.mjs +25 -23
  8. package/dist/transformers.node.mjs +6034 -3168
  9. package/dist/transformers.web.js +4255 -1381
  10. package/dist/transformers.web.min.js +23 -19
  11. package/package.json +6 -6
  12. package/src/backends/onnx.js +128 -53
  13. package/src/backends/utils/cacheWasm.js +28 -46
  14. package/src/cache_utils.js +62 -0
  15. package/src/configs.js +123 -23
  16. package/src/env.js +100 -11
  17. package/src/generation/logits_sampler.js +3 -15
  18. package/src/generation/parameters.js +1 -1
  19. package/src/generation/streamers.js +21 -0
  20. package/src/image_processors_utils.js +29 -23
  21. package/src/models/afmoe/modeling_afmoe.js +5 -0
  22. package/src/models/auto/image_processing_auto.js +2 -1
  23. package/src/models/auto/modeling_auto.js +16 -2
  24. package/src/models/auto/tokenization_auto.js +2 -1
  25. package/src/models/chatterbox/modeling_chatterbox.js +1 -1
  26. package/src/models/chmv2/image_processing_chmv2.js +3 -0
  27. package/src/models/chmv2/modeling_chmv2.js +4 -0
  28. package/src/models/clap/feature_extraction_clap.js +2 -1
  29. package/src/models/cohere2/modeling_cohere2.js +5 -0
  30. package/src/models/cohere_asr/feature_extraction_cohere_asr.js +117 -0
  31. package/src/models/cohere_asr/modeling_cohere_asr.js +11 -0
  32. package/src/models/cohere_asr/processing_cohere_asr.js +55 -0
  33. package/src/models/cohere_asr/tokenization_cohere_asr.js +3 -0
  34. package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
  35. package/src/models/detr/image_processing_detr.js +1 -1
  36. package/src/models/eurobert/modeling_eurobert.js +41 -0
  37. package/src/models/feature_extractors.js +3 -0
  38. package/src/models/gemma3/image_processing_gemma3.js +3 -0
  39. package/src/models/gemma3/modeling_gemma3.js +4 -1
  40. package/src/models/gemma3/processing_gemma3.js +45 -0
  41. package/src/models/gemma3n/modeling_gemma3n.js +2 -0
  42. package/src/models/glm46v/image_processing_glm46v.js +12 -0
  43. package/src/models/glm46v/processing_glm46v.js +5 -0
  44. package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
  45. package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
  46. package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
  47. package/src/models/granite_speech/modeling_granite_speech.js +5 -0
  48. package/src/models/granite_speech/processing_granite_speech.js +62 -0
  49. package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
  50. package/src/models/idefics3/modeling_idefics3.js +5 -32
  51. package/src/models/image_processors.js +4 -0
  52. package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
  53. package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
  54. package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
  55. package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
  56. package/src/models/llava/modeling_llava.js +1 -1
  57. package/src/models/marian/tokenization_marian.js +3 -2
  58. package/src/models/mistral3/modeling_mistral3.js +2 -2
  59. package/src/models/mistral4/modeling_mistral4.js +5 -0
  60. package/src/models/modeling_utils.js +283 -300
  61. package/src/models/models.js +26 -1
  62. package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
  63. package/src/models/olmo_hybrid/modeling_olmo_hybrid.js +5 -0
  64. package/src/models/paligemma/modeling_paligemma.js +2 -25
  65. package/src/models/paligemma/processing_paligemma.js +3 -2
  66. package/src/models/processors.js +8 -0
  67. package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +9 -0
  68. package/src/models/qwen2_5_vl/processing_qwen2_5_vl.js +3 -0
  69. package/src/models/qwen2_moe/modeling_qwen2_moe.js +5 -0
  70. package/src/models/qwen2_vl/image_processing_qwen2_vl.js +15 -1
  71. package/src/models/qwen2_vl/modeling_qwen2_vl.js +240 -143
  72. package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
  73. package/src/models/qwen3_5/modeling_qwen3_5.js +4 -0
  74. package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +4 -0
  75. package/src/models/qwen3_moe/modeling_qwen3_moe.js +5 -0
  76. package/src/models/qwen3_next/modeling_qwen3_next.js +5 -0
  77. package/src/models/qwen3_vl/modeling_qwen3_vl.js +4 -0
  78. package/src/models/qwen3_vl/processing_qwen3_vl.js +3 -0
  79. package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +4 -0
  80. package/src/models/registry.js +61 -5
  81. package/src/models/sam/image_processing_sam.js +1 -1
  82. package/src/models/session.js +33 -56
  83. package/src/models/smolvlm/modeling_smolvlm.js +7 -0
  84. package/src/models/solar_open/modeling_solar_open.js +5 -0
  85. package/src/models/tokenizers.js +1 -0
  86. package/src/models/ultravox/modeling_ultravox.js +1 -3
  87. package/src/models/voxtral/modeling_voxtral.js +3 -0
  88. package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
  89. package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
  90. package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
  91. package/src/models/whisper/feature_extraction_whisper.js +4 -13
  92. package/src/models/whisper/modeling_whisper.js +6 -5
  93. package/src/models/xlm/tokenization_xlm.js +2 -1
  94. package/src/pipelines/automatic-speech-recognition.js +47 -3
  95. package/src/pipelines/document-question-answering.js +1 -1
  96. package/src/pipelines/image-to-text.js +2 -2
  97. package/src/pipelines/index.js +313 -0
  98. package/src/pipelines/summarization.js +1 -1
  99. package/src/pipelines/text-generation.js +5 -1
  100. package/src/pipelines/text-to-audio.js +4 -2
  101. package/src/pipelines/text2text-generation.js +1 -1
  102. package/src/pipelines/translation.js +1 -1
  103. package/src/pipelines/zero-shot-classification.js +3 -2
  104. package/src/pipelines.js +140 -428
  105. package/src/tokenization_utils.js +42 -21
  106. package/src/transformers.js +10 -1
  107. package/src/utils/audio.js +20 -3
  108. package/src/utils/cache/CrossOriginStorageCache.js +251 -0
  109. package/src/utils/cache/FileCache.js +128 -0
  110. package/src/utils/cache/cross-origin-storage.d.ts +38 -0
  111. package/src/utils/cache.js +12 -4
  112. package/src/utils/core.js +23 -1
  113. package/src/utils/devices.js +22 -0
  114. package/src/utils/dtypes.js +55 -0
  115. package/src/utils/hub/{files.js → FileResponse.js} +0 -90
  116. package/src/utils/hub/utils.js +45 -5
  117. package/src/utils/hub.js +67 -23
  118. package/src/utils/image.js +14 -14
  119. package/src/utils/logger.js +67 -0
  120. package/src/utils/lru_cache.js +67 -0
  121. package/src/utils/memoize_promise.js +45 -0
  122. package/src/utils/model-loader.js +35 -17
  123. package/src/utils/model_registry/ModelRegistry.js +382 -0
  124. package/src/utils/model_registry/clear_cache.js +128 -0
  125. package/src/utils/model_registry/get_available_dtypes.js +68 -0
  126. package/src/utils/model_registry/get_file_metadata.js +162 -0
  127. package/src/utils/model_registry/get_files.js +42 -0
  128. package/src/utils/model_registry/get_model_files.js +114 -0
  129. package/src/utils/model_registry/get_pipeline_files.js +44 -0
  130. package/src/utils/model_registry/get_processor_files.js +20 -0
  131. package/src/utils/model_registry/get_tokenizer_files.js +21 -0
  132. package/src/utils/model_registry/is_cached.js +169 -0
  133. package/src/utils/model_registry/resolve_model_type.js +66 -0
  134. package/src/utils/random.js +225 -0
  135. package/src/utils/tensor.js +26 -23
  136. package/src/utils/video.js +2 -2
  137. package/types/backends/onnx.d.ts.map +1 -1
  138. package/types/backends/utils/cacheWasm.d.ts +3 -17
  139. package/types/backends/utils/cacheWasm.d.ts.map +1 -1
  140. package/types/cache_utils.d.ts +29 -0
  141. package/types/cache_utils.d.ts.map +1 -0
  142. package/types/configs.d.ts.map +1 -1
  143. package/types/env.d.ts +60 -27
  144. package/types/env.d.ts.map +1 -1
  145. package/types/generation/logits_sampler.d.ts +2 -2
  146. package/types/generation/logits_sampler.d.ts.map +1 -1
  147. package/types/generation/parameters.d.ts +1 -1
  148. package/types/generation/parameters.d.ts.map +1 -1
  149. package/types/generation/streamers.d.ts +1 -0
  150. package/types/generation/streamers.d.ts.map +1 -1
  151. package/types/image_processors_utils.d.ts +18 -1
  152. package/types/image_processors_utils.d.ts.map +1 -1
  153. package/types/models/afmoe/modeling_afmoe.d.ts +8 -0
  154. package/types/models/afmoe/modeling_afmoe.d.ts.map +1 -0
  155. package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
  156. package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
  157. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  158. package/types/models/auto/modeling_auto.d.ts +6 -0
  159. package/types/models/auto/modeling_auto.d.ts.map +1 -1
  160. package/types/models/auto/tokenization_auto.d.ts.map +1 -1
  161. package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
  162. package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
  163. package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
  164. package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
  165. package/types/models/clap/feature_extraction_clap.d.ts.map +1 -1
  166. package/types/models/cohere2/modeling_cohere2.d.ts +8 -0
  167. package/types/models/cohere2/modeling_cohere2.d.ts.map +1 -0
  168. package/types/models/cohere_asr/feature_extraction_cohere_asr.d.ts +25 -0
  169. package/types/models/cohere_asr/feature_extraction_cohere_asr.d.ts.map +1 -0
  170. package/types/models/cohere_asr/modeling_cohere_asr.d.ts +9 -0
  171. package/types/models/cohere_asr/modeling_cohere_asr.d.ts.map +1 -0
  172. package/types/models/cohere_asr/processing_cohere_asr.d.ts +27 -0
  173. package/types/models/cohere_asr/processing_cohere_asr.d.ts.map +1 -0
  174. package/types/models/cohere_asr/tokenization_cohere_asr.d.ts +4 -0
  175. package/types/models/cohere_asr/tokenization_cohere_asr.d.ts.map +1 -0
  176. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
  177. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
  178. package/types/models/detr/image_processing_detr.d.ts +1 -1
  179. package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
  180. package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
  181. package/types/models/feature_extractors.d.ts +3 -0
  182. package/types/models/gemma3/image_processing_gemma3.d.ts +4 -0
  183. package/types/models/gemma3/image_processing_gemma3.d.ts.map +1 -0
  184. package/types/models/gemma3/modeling_gemma3.d.ts +4 -1
  185. package/types/models/gemma3/modeling_gemma3.d.ts.map +1 -1
  186. package/types/models/gemma3/processing_gemma3.d.ts +20 -0
  187. package/types/models/gemma3/processing_gemma3.d.ts.map +1 -0
  188. package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
  189. package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
  190. package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
  191. package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
  192. package/types/models/glm46v/processing_glm46v.d.ts +4 -0
  193. package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
  194. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
  195. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
  196. package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
  197. package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
  198. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
  199. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
  200. package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
  201. package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
  202. package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
  203. package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
  204. package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
  205. package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
  206. package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
  207. package/types/models/image_processors.d.ts +4 -0
  208. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
  209. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
  210. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
  211. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
  212. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
  213. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
  214. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
  215. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
  216. package/types/models/marian/tokenization_marian.d.ts.map +1 -1
  217. package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
  218. package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
  219. package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
  220. package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
  221. package/types/models/modeling_utils.d.ts +46 -27
  222. package/types/models/modeling_utils.d.ts.map +1 -1
  223. package/types/models/models.d.ts +26 -1
  224. package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
  225. package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
  226. package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts +8 -0
  227. package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts.map +1 -0
  228. package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
  229. package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
  230. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
  231. package/types/models/processors.d.ts +8 -0
  232. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +7 -0
  233. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -0
  234. package/types/models/qwen2_5_vl/processing_qwen2_5_vl.d.ts +4 -0
  235. package/types/models/qwen2_5_vl/processing_qwen2_5_vl.d.ts.map +1 -0
  236. package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts +8 -0
  237. package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts.map +1 -0
  238. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts +3 -0
  239. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
  240. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +44 -6
  241. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  242. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
  243. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  244. package/types/models/qwen3_5/modeling_qwen3_5.d.ts +6 -0
  245. package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -0
  246. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +7 -0
  247. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -0
  248. package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts +8 -0
  249. package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts.map +1 -0
  250. package/types/models/qwen3_next/modeling_qwen3_next.d.ts +8 -0
  251. package/types/models/qwen3_next/modeling_qwen3_next.d.ts.map +1 -0
  252. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +7 -0
  253. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -0
  254. package/types/models/qwen3_vl/processing_qwen3_vl.d.ts +4 -0
  255. package/types/models/qwen3_vl/processing_qwen3_vl.d.ts.map +1 -0
  256. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +7 -0
  257. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -0
  258. package/types/models/registry.d.ts +2 -1
  259. package/types/models/registry.d.ts.map +1 -1
  260. package/types/models/sam/image_processing_sam.d.ts +1 -1
  261. package/types/models/session.d.ts +3 -2
  262. package/types/models/session.d.ts.map +1 -1
  263. package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
  264. package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
  265. package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
  266. package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
  267. package/types/models/tokenizers.d.ts +1 -0
  268. package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
  269. package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
  270. package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
  271. package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
  272. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
  273. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
  274. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
  275. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
  276. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
  277. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
  278. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  279. package/types/models/whisper/modeling_whisper.d.ts.map +1 -1
  280. package/types/models/xlm/tokenization_xlm.d.ts.map +1 -1
  281. package/types/pipelines/automatic-speech-recognition.d.ts +7 -2
  282. package/types/pipelines/automatic-speech-recognition.d.ts.map +1 -1
  283. package/types/pipelines/document-question-answering.d.ts +2 -2
  284. package/types/pipelines/document-question-answering.d.ts.map +1 -1
  285. package/types/pipelines/image-to-text.d.ts +4 -4
  286. package/types/pipelines/image-to-text.d.ts.map +1 -1
  287. package/types/pipelines/index.d.ts +265 -0
  288. package/types/pipelines/index.d.ts.map +1 -0
  289. package/types/pipelines/summarization.d.ts +2 -2
  290. package/types/pipelines/summarization.d.ts.map +1 -1
  291. package/types/pipelines/text-generation.d.ts +7 -3
  292. package/types/pipelines/text-generation.d.ts.map +1 -1
  293. package/types/pipelines/text-to-audio.d.ts.map +1 -1
  294. package/types/pipelines/text2text-generation.d.ts +3 -3
  295. package/types/pipelines/text2text-generation.d.ts.map +1 -1
  296. package/types/pipelines/translation.d.ts +2 -2
  297. package/types/pipelines/translation.d.ts.map +1 -1
  298. package/types/pipelines/zero-shot-classification.d.ts.map +1 -1
  299. package/types/pipelines.d.ts +51 -291
  300. package/types/pipelines.d.ts.map +1 -1
  301. package/types/tokenization_utils.d.ts +44 -26
  302. package/types/tokenization_utils.d.ts.map +1 -1
  303. package/types/transformers.d.ts +7 -1
  304. package/types/transformers.d.ts.map +1 -1
  305. package/types/utils/audio.d.ts +5 -2
  306. package/types/utils/audio.d.ts.map +1 -1
  307. package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
  308. package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
  309. package/types/utils/cache/FileCache.d.ts +39 -0
  310. package/types/utils/cache/FileCache.d.ts.map +1 -0
  311. package/types/utils/cache.d.ts +10 -4
  312. package/types/utils/cache.d.ts.map +1 -1
  313. package/types/utils/core.d.ts +59 -2
  314. package/types/utils/core.d.ts.map +1 -1
  315. package/types/utils/devices.d.ts +15 -0
  316. package/types/utils/devices.d.ts.map +1 -1
  317. package/types/utils/dtypes.d.ts +17 -1
  318. package/types/utils/dtypes.d.ts.map +1 -1
  319. package/types/utils/hub/{files.d.ts → FileResponse.d.ts} +1 -32
  320. package/types/utils/hub/FileResponse.d.ts.map +1 -0
  321. package/types/utils/hub/utils.d.ts +19 -3
  322. package/types/utils/hub/utils.d.ts.map +1 -1
  323. package/types/utils/hub.d.ts +36 -7
  324. package/types/utils/hub.d.ts.map +1 -1
  325. package/types/utils/image.d.ts +1 -1
  326. package/types/utils/logger.d.ts +28 -0
  327. package/types/utils/logger.d.ts.map +1 -0
  328. package/types/utils/lru_cache.d.ts +38 -0
  329. package/types/utils/lru_cache.d.ts.map +1 -0
  330. package/types/utils/memoize_promise.d.ts +14 -0
  331. package/types/utils/memoize_promise.d.ts.map +1 -0
  332. package/types/utils/model-loader.d.ts +15 -0
  333. package/types/utils/model-loader.d.ts.map +1 -1
  334. package/types/utils/model_registry/ModelRegistry.d.ts +298 -0
  335. package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -0
  336. package/types/utils/model_registry/clear_cache.d.ts +74 -0
  337. package/types/utils/model_registry/clear_cache.d.ts.map +1 -0
  338. package/types/utils/model_registry/get_available_dtypes.d.ts +26 -0
  339. package/types/utils/model_registry/get_available_dtypes.d.ts.map +1 -0
  340. package/types/utils/model_registry/get_file_metadata.d.ts +20 -0
  341. package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -0
  342. package/types/utils/model_registry/get_files.d.ts +23 -0
  343. package/types/utils/model_registry/get_files.d.ts.map +1 -0
  344. package/types/utils/model_registry/get_model_files.d.ts +48 -0
  345. package/types/utils/model_registry/get_model_files.d.ts.map +1 -0
  346. package/types/utils/model_registry/get_pipeline_files.d.ts +22 -0
  347. package/types/utils/model_registry/get_pipeline_files.d.ts.map +1 -0
  348. package/types/utils/model_registry/get_processor_files.d.ts +9 -0
  349. package/types/utils/model_registry/get_processor_files.d.ts.map +1 -0
  350. package/types/utils/model_registry/get_tokenizer_files.d.ts +9 -0
  351. package/types/utils/model_registry/get_tokenizer_files.d.ts.map +1 -0
  352. package/types/utils/model_registry/is_cached.d.ts +105 -0
  353. package/types/utils/model_registry/is_cached.d.ts.map +1 -0
  354. package/types/utils/model_registry/resolve_model_type.d.ts +24 -0
  355. package/types/utils/model_registry/resolve_model_type.d.ts.map +1 -0
  356. package/types/utils/random.d.ts +86 -0
  357. package/types/utils/random.d.ts.map +1 -0
  358. package/types/utils/tensor.d.ts.map +1 -1
  359. package/src/utils/data-structures.js +0 -572
  360. package/types/models/ast/modeling_ast.d.ts.map +0 -1
  361. package/types/utils/data-structures.d.ts +0 -294
  362. package/types/utils/data-structures.d.ts.map +0 -1
  363. package/types/utils/hub/files.d.ts.map +0 -1
  364. /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
@@ -0,0 +1,239 @@
1
+ import { PreTrainedModel } from '../modeling_utils.js';
2
+ import { sessionRun } from '../session.js';
3
+ import { getCacheShapes } from '../../configs.js';
4
+ import { Tensor, ones } from '../../utils/tensor.js';
5
+ import { DataTypeMap } from '../../utils/dtypes.js';
6
+ import { pick } from '../../utils/core.js';
7
+ import { DynamicCache } from '../../cache_utils.js';
8
+ import { StoppingCriteria, StoppingCriteriaList } from '../../generation/stopping_criteria.js';
9
+
10
// Amount of left padding (in frames) carried for the encoder's two causal convolutions.
const CONV1_LEFT_PAD = 2;
const CONV2_LEFT_PAD = 1;

/**
 * Holds the per-instance encoder streaming state while a generation session
 * is active. Keyed weakly so the state is reclaimed together with the model
 * and is never exposed as a property on the model instance itself.
 * @private
 * @type {WeakMap<VoxtralRealtimeForConditionalGeneration, Object>}
 */
const states = new WeakMap();
22
+
23
/**
 * Builds the encoder streaming state for one VoxtralRealtime generation session.
 *
 * Allocates a zero-initialized encoder KV cache and causal-conv padding cache,
 * and wraps `input_features` in a (sync or async) iterator that the generation
 * loop pulls mel-spectrogram chunks from.
 *
 * @param {VoxtralRealtimeForConditionalGeneration} model
 * @param {Iterable<Tensor>|AsyncIterable<Tensor>} input_features
 * @returns {Object} Encoder state object.
 * @private
 */
function createEncoderState(model, input_features) {
  const { text_config, audio_config } = /** @type {any} */ (model.config);
  const encoder_session = model.sessions['audio_encoder'];

  // Padding-cache channel count combines mel bins and encoder hidden size.
  const { num_mel_bins, hidden_size } = audio_config;
  const padding_channels = num_mel_bins + hidden_size;

  // Zero-filled encoder KV cache, typed to match the session's KV dtype.
  const dtype = encoder_session?.config?.kv_cache_dtype ?? 'float32';
  const TypedArrayCls = dtype === 'float16' ? DataTypeMap.float16 : DataTypeMap.float32;
  const cache = new DynamicCache();
  const shapes = getCacheShapes(audio_config, { batch_size: 1 });
  for (const [name, dims] of Object.entries(shapes)) {
    const numel = dims.reduce((acc, d) => acc * d, 1);
    cache[name] = new Tensor(dtype, new TypedArrayCls(numel), dims);
  }

  const padding_cache = new Tensor(
    dtype,
    new TypedArrayCls(padding_channels * CONV1_LEFT_PAD),
    [1, padding_channels, CONV1_LEFT_PAD],
  );

  // Accept either an async iterable or a plain sync iterable of chunks.
  const chunks_iter = input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
  if (!chunks_iter) {
    throw new Error('input_features must be iterable or async iterable');
  }

  return {
    encoder_session,
    enc_kv_cache: cache,
    enc_padding_cache: padding_cache,
    enc_past_seq_len: 0,
    audio_embed_queue: [],
    audio_embed_total_tokens: 0,
    audio_queue_offset: 0,
    audio_consumed: 0,
    stream_exhausted: false,
    chunks_iter,
    text_hidden_size: text_config.hidden_size,
  };
}
73
+
74
/**
 * Runs one mel-spectrogram chunk through the streaming audio encoder,
 * updating the encoder KV cache and causal-conv padding cache in place.
 *
 * @param {Object} s Encoder state.
 * @param {Tensor} chunk_features Mel spectrogram chunk [1, num_mel_bins, seq_len].
 * @returns {Promise<Tensor>} Audio embeddings.
 * @private
 */
async function encodeChunk(s, chunk_features) {
  const frames = chunk_features.dims[2];
  // Output length after the second downsampling conv. NOTE(review): formula
  // matches a kernel-3, stride-2 causal conv — confirm against the exported model.
  const out_len = Math.floor((CONV2_LEFT_PAD + frames - 3) / 2) + 1;

  const position_ids = new Tensor(
    'int64',
    BigInt64Array.from({ length: out_len }, (_, i) => BigInt(s.enc_past_seq_len + i)),
    [1, out_len],
  );

  const total_len = s.enc_past_seq_len + out_len;
  const { audio_embeds, present_padding_cache, ...present_cache } = await sessionRun(s.encoder_session, {
    input_features: chunk_features,
    attention_mask: ones([1, total_len]),
    position_ids,
    past_padding_cache: s.enc_padding_cache,
    ...s.enc_kv_cache,
  });

  // Swap in the new padding cache, releasing the old GPU buffer if any.
  if (s.enc_padding_cache.location === 'gpu-buffer') {
    s.enc_padding_cache.dispose();
  }
  s.enc_padding_cache = present_padding_cache;

  // Promote `present.*` outputs to `past_key_values.*` inputs for the next
  // step, disposing the stale GPU-backed tensors they replace.
  for (const [name, tensor] of Object.entries(present_cache)) {
    if (!name.startsWith('present.')) continue;
    const pastName = name.replace('present', 'past_key_values');
    const stale = s.enc_kv_cache[pastName];
    if (stale?.location === 'gpu-buffer') {
      stale.dispose();
    }
    s.enc_kv_cache[pastName] = tensor;
  }
  s.enc_past_seq_len = total_len;
  return audio_embeds;
}
120
+
121
/**
 * Pulls and encodes audio chunks until at least `needed` audio tokens are
 * buffered, or the input stream ends (which sets `stream_exhausted`).
 *
 * @param {Object} s Encoder state.
 * @param {number} needed Total number of audio tokens needed.
 * @private
 */
async function fillAudioBuffer(s, needed) {
  while (!s.stream_exhausted && s.audio_embed_total_tokens < needed) {
    const { done, value } = await s.chunks_iter.next();
    if (done) {
      s.stream_exhausted = true;
      return;
    }
    const embeds = await encodeChunk(s, value);
    const tokens = embeds.dims[1];
    s.audio_embed_queue.push({ data: embeds.data, tokens });
    s.audio_embed_total_tokens += tokens;
  }
}
139
+
140
/**
 * Adds buffered audio embeddings onto the text embeddings, element-wise,
 * consuming up to `current_len` audio tokens from the front of the queue.
 *
 * @param {Object} s Encoder state.
 * @param {Tensor} inputs_embeds Text embeddings tensor (modified in-place).
 * @param {number} current_len Number of tokens to consume.
 * @private
 */
function addAudioEmbeddings(s, inputs_embeds, current_len) {
  const queue = s.audio_embed_queue;
  if (queue.length === 0) return;

  const H = s.text_hidden_size;
  const out = inputs_embeds.data;
  let written = 0;

  while (written < current_len && queue.length > 0) {
    const head = queue[0];
    // Consume as much of the head buffer as still fits.
    const take = Math.min(current_len - written, head.tokens - s.audio_queue_offset);

    const src = s.audio_queue_offset * H;
    const dst = written * H;
    for (let i = 0; i < take * H; ++i) {
      out[dst + i] += head.data[src + i];
    }

    written += take;
    s.audio_queue_offset += take;

    // Fully drained this buffer: drop it and rewind the offset.
    if (s.audio_queue_offset >= head.tokens) {
      queue.shift();
      s.audio_queue_offset = 0;
    }
  }
  s.audio_consumed += written;
}
175
+
176
/**
 * Stopping criterion: halts generation once the input audio stream has ended
 * and every buffered audio embedding has been consumed.
 * @private
 */
class AudioExhaustedCriteria extends StoppingCriteria {
  constructor(enc_state) {
    super();
    this._s = enc_state;
  }
  _call(input_ids) {
    const exhausted = this._s.stream_exhausted && this._s.audio_embed_queue.length === 0;
    // Same verdict for every sequence in the batch.
    return input_ids.map(() => exhausted);
  }
}
191
+
192
/**
 * Base class for Voxtral Realtime models, declaring the decoder's forward inputs.
 */
export class VoxtralRealtimePreTrainedModel extends PreTrainedModel {
  forward_params = ['input_ids', 'attention_mask', 'position_ids', 'past_key_values'];
}
195
+
196
/**
 * Voxtral Realtime model for streaming speech-conditioned text generation.
 *
 * Audio arrives as a (sync or async) iterable of mel-spectrogram chunks; each
 * forward pass encodes just enough audio to cover the new text positions and
 * adds the resulting audio embeddings onto the token embeddings.
 */
export class VoxtralRealtimeForConditionalGeneration extends VoxtralRealtimePreTrainedModel {
  /**
   * Embeds the input tokens, mixes in streamed audio embeddings (when a
   * generation session is active), and runs the merged decoder.
   * @param {Object} inputs Forward inputs (`input_ids`, `past_key_values`, ...).
   * @returns {Promise<Object>} Decoder session outputs.
   */
  async forward({ input_ids, past_key_values, ...kwargs }) {
    const current_len = input_ids.dims[1];

    const enc = states.get(this);
    if (enc) {
      // Encode enough audio chunks to cover the tokens about to be embedded.
      await fillAudioBuffer(enc, enc.audio_consumed + current_len);
    }

    const { inputs_embeds } = await sessionRun(this.sessions['embed_tokens'], { input_ids });
    if (enc) {
      addAudioEmbeddings(enc, inputs_embeds, current_len);
    }

    const decoder_feeds = { inputs_embeds, ...kwargs };
    this.addPastKeyValues(decoder_feeds, past_key_values);

    const session = this.sessions['decoder_model_merged'];
    return await sessionRun(session, pick(decoder_feeds, session.inputNames));
  }

  /**
   * Generates text conditioned on a stream of audio feature chunks.
   * Generation stops automatically once the stream is exhausted and all
   * buffered audio has been consumed, in addition to any user-supplied criteria.
   * @param {Object} options
   * @param {Iterable<Tensor>|AsyncIterable<Tensor>} options.input_features Audio chunk stream.
   * @param {StoppingCriteriaList} [options.stopping_criteria] Extra stopping criteria.
   * @returns {Promise<*>} Result of the base `generate` implementation.
   * @throws {Error} If `input_features` is missing.
   */
  async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
    if (!input_features) {
      throw new Error('input_features (generator/iterable) must be provided');
    }

    const enc_state = createEncoderState(this, input_features);
    states.set(this, enc_state);

    const stopping_criteria = new StoppingCriteriaList();
    stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
    if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);

    try {
      return await super.generate({ ...kwargs, stopping_criteria });
    } finally {
      // Cleanup encoder state.
      enc_state.enc_kv_cache.dispose();
      // Fix: the final padding cache was previously leaked; release it the
      // same way encodeChunk releases superseded GPU-backed buffers.
      if (enc_state.enc_padding_cache?.location === 'gpu-buffer') {
        enc_state.enc_padding_cache.dispose();
      }
      states.delete(this);
    }
  }
}
@@ -0,0 +1,113 @@
1
+ import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js';
2
+ import { AutoTokenizer } from '../auto/tokenization_auto.js';
3
+ import { Processor } from '../../processing_utils.js';
4
+ import { Tensor } from '../../utils/tensor.js';
5
+ import { validate_audio_inputs } from '../../feature_extraction_utils.js';
6
+
7
// Voxtral Realtime audio config constants (from mistral_common AudioConfig)
// Number of [STREAMING_PAD] tokens' worth of silence prepended before the
// first streaming audio chunk.
const NUM_LEFT_PAD_TOKENS = 32;
// Token-level lag between audio intake and text emission — presumably the
// model's lookahead delay; see mistral_common AudioConfig for the definition.
const NUM_DELAY_TOKENS = 6;
// Number of mel frames consumed per generated text token.
const AUDIO_LENGTH_PER_TOK = 8;
// Extra buffer tokens added to the right padding in offline (non-streaming)
// mode so the model transcribes the full audio.
const OFFLINE_STREAMING_BUFFER_TOKENS = 10;

/** Token ID for [STREAMING_PAD] in the Voxtral tokenizer. */
const STREAMING_PAD_TOKEN_ID = 32;
15
+
16
/**
 * Processor for Voxtral Realtime: pads raw audio according to the
 * mistral_common AudioConfig framing rules, extracts mel features via the
 * feature extractor, and (for the first streaming chunk) builds the initial
 * `input_ids` prompt of [STREAMING_PAD] tokens.
 */
export class VoxtralRealtimeProcessor extends Processor {
    static tokenizer_class = AutoTokenizer;
    static feature_extractor_class = AutoFeatureExtractor;
    static uses_processor_config = false;

    /**
     * Number of mel frames in the first audio chunk.
     * Covers the delay tokens plus the first emitted token.
     * @returns {number}
     */
    get num_mel_frames_first_audio_chunk() {
        return (NUM_DELAY_TOKENS + 1) * AUDIO_LENGTH_PER_TOK;
    }

    /**
     * Number of raw audio samples in the first audio chunk.
     * NOTE(review): this inverts the centered-STFT framing formula
     * (hop per frame plus half-window tail) — confirm against the
     * feature extractor's frame count for edge sizes.
     * @returns {number}
     */
    get num_samples_first_audio_chunk() {
        const { hop_length, n_fft } = this.feature_extractor.config;
        return (this.num_mel_frames_first_audio_chunk - 1) * hop_length + Math.floor(n_fft / 2);
    }

    /**
     * Number of raw audio samples per subsequent audio chunk
     * (processed with `center=false`, hence the full `n_fft` window).
     * @returns {number}
     */
    get num_samples_per_audio_chunk() {
        const { hop_length, n_fft } = this.feature_extractor.config;
        return AUDIO_LENGTH_PER_TOK * hop_length + n_fft;
    }

    /**
     * Number of right-pad tokens for non-streaming mode.
     * @returns {number}
     */
    get num_right_pad_tokens() {
        return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
    }

    /**
     * Number of mel frames per text token.
     * @returns {number}
     */
    get audio_length_per_tok() {
        return AUDIO_LENGTH_PER_TOK;
    }

    /**
     * Number of raw audio samples per token.
     * @returns {number}
     */
    get raw_audio_length_per_tok() {
        return AUDIO_LENGTH_PER_TOK * this.feature_extractor.config.hop_length;
    }

    /**
     * Process audio input for VoxtralRealtime.
     *
     * In streaming mode with `is_first_audio_chunk=true`, the audio is left-padded
     * with silence and mel features are extracted with `center=true`.
     * Returns `{ input_ids, input_features }`.
     *
     * In streaming mode with `is_first_audio_chunk=false`, the audio chunk is
     * processed with `center=false` and only `{ input_features }` is returned.
     *
     * In non-streaming mode, the audio is right-padded to ensure the model
     * transcribes the full audio, then processed with `center=true`.
     * Returns `{ input_features }`.
     *
     * @param {Float32Array|Float64Array} audio The audio waveform.
     * @param {Object} [options]
     * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
     * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
     * @returns {Promise<Object>}
     * @throws {Error} If `is_streaming=false` is combined with `is_first_audio_chunk=false`.
     */
    async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
        validate_audio_inputs(audio, 'VoxtralRealtimeProcessor');

        if (!is_streaming && !is_first_audio_chunk) {
            throw new Error('In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.');
        }

        if (is_first_audio_chunk) {
            if (is_streaming) {
                // Streaming first chunk: left-pad audio with silence, extract mel with center=true, build input_ids
                const num_left_pad_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
                // New Float32Array is zero-initialized, so the left region is silence.
                const padded_audio = new Float32Array(num_left_pad_samples + audio.length);
                padded_audio.set(audio, num_left_pad_samples);

                const audio_encoding = await this.feature_extractor(padded_audio, { center: true });

                // Build input_ids: BOS + (num_left_pad_tokens + num_delay_tokens) * [STREAMING_PAD]
                const num_pad_tokens = NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS;
                const num_input_tokens = 1 + num_pad_tokens;
                const input_ids_data = new BigInt64Array(num_input_tokens).fill(BigInt(STREAMING_PAD_TOKEN_ID));
                input_ids_data[0] = 1n; // BOS
                const input_ids = new Tensor('int64', input_ids_data, [1, num_input_tokens]);

                return {
                    input_ids,
                    ...audio_encoding,
                };
            } else {
                // Non-streaming: right-pad audio to ensure full transcription, extract mel with center=true
                const right_pad_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
                const padded_audio = new Float32Array(audio.length + right_pad_samples);
                padded_audio.set(audio);

                return await this.feature_extractor(padded_audio, { center: true });
            }
        } else {
            // Subsequent streaming chunks: extract mel with center=false
            return await this.feature_extractor(audio, { center: false });
        }
    }
}
@@ -1,7 +1,7 @@
1
1
  import { FeatureExtractor, validate_audio_inputs } from '../../feature_extraction_utils.js';
2
2
  import { Tensor } from '../../utils/tensor.js';
3
3
  import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
4
- import { max } from '../../utils/maths.js';
4
+ import { logger } from '../../utils/logger.js';
5
5
 
6
6
  export class WhisperFeatureExtractor extends FeatureExtractor {
7
7
  constructor(config) {
@@ -27,7 +27,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
27
27
  * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
28
28
  */
29
29
  async _extract_fbank_features(waveform) {
30
- const features = await spectrogram(
30
+ return await spectrogram(
31
31
  waveform,
32
32
  this.window, // window
33
33
  this.config.n_fft, // frame_length
@@ -35,7 +35,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
35
35
  {
36
36
  power: 2.0,
37
37
  mel_filters: this.config.mel_filters,
38
- log_mel: 'log10',
38
+ log_mel: 'log10_max_norm',
39
39
 
40
40
  // Custom
41
41
  max_num_frames: Math.min(
@@ -44,15 +44,6 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
44
44
  ),
45
45
  },
46
46
  );
47
-
48
- const data = features.data;
49
- const maxValue = max(/** @type {Float32Array} */ (data))[0];
50
-
51
- for (let i = 0; i < data.length; ++i) {
52
- data[i] = (Math.max(data[i], maxValue - 8.0) + 4.0) / 4.0;
53
- }
54
-
55
- return features;
56
47
  }
57
48
 
58
49
  /**
@@ -67,7 +58,7 @@ export class WhisperFeatureExtractor extends FeatureExtractor {
67
58
  const length = max_length ?? this.config.n_samples;
68
59
  if (audio.length > length) {
69
60
  if (audio.length > this.config.n_samples) {
70
- console.warn(
61
+ logger.warn(
71
62
  'Attempting to extract features for audio longer than 30 seconds. ' +
72
63
  'If using a pipeline to extract transcript from a long audio clip, ' +
73
64
  'remember to specify `chunk_length_s` and/or `stride_length_s`.',
@@ -10,6 +10,7 @@ import {
10
10
  import { medianFilter, dynamic_time_warping } from '../../utils/maths.js';
11
11
  import { mergeArrays } from '../../utils/core.js';
12
12
  import { ModelOutput } from '../modeling_outputs.js';
13
+ import { logger } from '../../utils/logger.js';
13
14
 
14
15
  export class WhisperPreTrainedModel extends PreTrainedModel {
15
16
  requires_attention_mask = false;
@@ -56,7 +57,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
56
57
  if (generation_config.is_multilingual) {
57
58
  if (!language) {
58
59
  // TODO: Implement language detection
59
- console.warn('No language specified - defaulting to English (en).');
60
+ logger.warn('No language specified - defaulting to English (en).');
60
61
  language = 'en';
61
62
  }
62
63
 
@@ -85,7 +86,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
85
86
  generation_config.return_timestamps &&
86
87
  init_tokens.at(-1) === generation_config.no_timestamps_token_id
87
88
  ) {
88
- console.warn(
89
+ logger.warn(
89
90
  '<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `true`.',
90
91
  );
91
92
  init_tokens.pop();
@@ -138,7 +139,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
138
139
  }
139
140
 
140
141
  if (generation_config.task === 'translate') {
141
- console.warn("Token-level timestamps may not be reliable for task 'translate'.");
142
+ logger.warn("Token-level timestamps may not be reliable for task 'translate'.");
142
143
  }
143
144
 
144
145
  generation_config.output_attentions = true;
@@ -185,7 +186,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
185
186
  );
186
187
  }
187
188
  if (num_frames == null) {
188
- console.warn(
189
+ logger.warn(
189
190
  '`num_frames` has not been set, meaning the entire audio will be analyzed. ' +
190
191
  'This may lead to inaccurate token-level timestamps for short audios (< 30 seconds).',
191
192
  );
@@ -194,7 +195,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
194
195
  // @ts-expect-error TS2339
195
196
  let median_filter_width = this.config.median_filter_width;
196
197
  if (median_filter_width === undefined) {
197
- console.warn('Model config has no `median_filter_width`, using default value of 7.');
198
+ logger.warn('Model config has no `median_filter_width`, using default value of 7.');
198
199
  median_filter_width = 7;
199
200
  }
200
201
 
@@ -1,11 +1,12 @@
1
1
  import { PreTrainedTokenizer } from '../../tokenization_utils.js';
2
+ import { logger } from '../../utils/logger.js';
2
3
 
3
4
  export class XLMTokenizer extends PreTrainedTokenizer {
4
5
  return_token_type_ids = true;
5
6
 
6
7
  constructor(tokenizerJSON, tokenizerConfig) {
7
8
  super(tokenizerJSON, tokenizerConfig);
8
- console.warn(
9
+ logger.warn(
9
10
  'WARNING: `XLMTokenizer` is not yet supported by Hugging Face\'s "fast" tokenizers library. Therefore, you may experience slightly inaccurate results.',
10
11
  );
11
12
  }
@@ -2,6 +2,7 @@ import { Pipeline, prepareAudios } from './_base.js';
2
2
 
3
3
  import { Tensor } from '../utils/tensor.js';
4
4
  import { max, round } from '../utils/maths.js';
5
+ import { logger } from '../utils/logger.js';
5
6
 
6
7
  /**
7
8
  * @typedef {import('./_base.js').TextAudioPipelineConstructorArgs} TextAudioPipelineConstructorArgs
@@ -29,7 +30,7 @@ import { max, round } from '../utils/maths.js';
29
30
  * @property {string} [language] The source language. Default is `null`, meaning it should be auto-detected. Use this to potentially improve performance if the source language is known.
30
31
  * @property {string} [task] The task to perform. Default is `null`, meaning it should be auto-detected.
31
32
  * @property {number} [num_frames] The number of frames in the input audio.
32
- * @typedef {import('../generation/configuration_utils.js').GenerationConfig & AutomaticSpeechRecognitionSpecificParams} AutomaticSpeechRecognitionConfig
33
+ * @typedef {import('../generation/parameters.js').GenerationFunctionParameters & AutomaticSpeechRecognitionSpecificParams} AutomaticSpeechRecognitionConfig
33
34
  *
34
35
  * @callback AutomaticSpeechRecognitionPipelineCallbackSingle Transcribe the audio sequence given as inputs to text.
35
36
  * @param {AudioInput} audio The input audio file(s) to be transcribed. The input is either:
@@ -153,6 +154,8 @@ export class AutomaticSpeechRecognitionPipeline
153
154
  return this._call_wav2vec2(audio, kwargs);
154
155
  case 'moonshine':
155
156
  return this._call_moonshine(audio, kwargs);
157
+ case 'cohere_asr':
158
+ return this._call_cohere_asr(audio, kwargs);
156
159
  default:
157
160
  throw new Error(
158
161
  `AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`,
@@ -164,10 +167,10 @@ export class AutomaticSpeechRecognitionPipeline
164
167
  // TODO use kwargs
165
168
 
166
169
  if (kwargs.language) {
167
- console.warn('`language` parameter is not yet supported for `wav2vec2` models, defaulting to "English".');
170
+ logger.warn('`language` parameter is not yet supported for `wav2vec2` models, defaulting to "English".');
168
171
  }
169
172
  if (kwargs.task) {
170
- console.warn('`task` parameter is not yet supported for `wav2vec2` models, defaulting to "transcribe".');
173
+ logger.warn('`task` parameter is not yet supported for `wav2vec2` models, defaulting to "transcribe".');
171
174
  }
172
175
 
173
176
  const single = !Array.isArray(audio);
@@ -319,4 +322,45 @@ export class AutomaticSpeechRecognitionPipeline
319
322
  }
320
323
  return single ? toReturn[0] : toReturn;
321
324
  }
325
+
326
+ async _call_cohere_asr(audio, kwargs) {
327
+ const single = !Array.isArray(audio);
328
+ const batchedAudio = single ? [audio] : audio;
329
+
330
+ const feature_extractor = this.processor.feature_extractor;
331
+ const sampling_rate = feature_extractor.config.sampling_rate;
332
+ const preparedAudios = await prepareAudios(batchedAudio, sampling_rate);
333
+
334
+ const language = kwargs.language ?? 'en';
335
+ // @ts-expect-error TS2339
336
+ const decoder_input_ids = this.processor.get_decoder_prompt_ids(language);
337
+
338
+ const toReturn = [];
339
+ for (const aud of preparedAudios) {
340
+ // Split long audio at energy-based boundaries
341
+ // @ts-expect-error TS2339
342
+ const audioChunks = feature_extractor.split_audio(aud);
343
+
344
+ const chunk_texts = [];
345
+ for (const chunk of audioChunks) {
346
+ const inputs = await this.processor(chunk);
347
+
348
+ const outputs = await this.model.generate({
349
+ ...inputs,
350
+ decoder_input_ids,
351
+ ...kwargs,
352
+ });
353
+
354
+ const text = this.tokenizer
355
+ .decode(/** @type {Tensor} */ (outputs)[0].tolist(), { skip_special_tokens: true })
356
+ .trim();
357
+ chunk_texts.push(text);
358
+ }
359
+
360
+ // @ts-expect-error TS2339
361
+ const full_text = this.processor.constructor.join_chunks(chunk_texts, language);
362
+ toReturn.push({ text: full_text });
363
+ }
364
+ return single ? toReturn[0] : toReturn;
365
+ }
322
366
  }
@@ -16,7 +16,7 @@ import { Tensor } from '../utils/tensor.js';
16
16
  * @callback DocumentQuestionAnsweringPipelineCallback Answer the question given as input by using the document.
17
17
  * @param {ImageInput|ImageInput[]} image The image of the document to use.
18
18
  * @param {string} question A question to ask of the document.
19
- * @param {Partial<import('../generation/configuration_utils.js').GenerationConfig>} [options] Additional keyword arguments to pass along to the generate method of the model.
19
+ * @param {Partial<import('../generation/parameters.js').GenerationFunctionParameters>} [options] Additional keyword arguments to pass along to the generate method of the model.
20
20
  * @returns {Promise<DocumentQuestionAnsweringOutput>} An object (or array of objects) containing the answer(s).
21
21
  *
22
22
  * @typedef {TextImagePipelineConstructorArgs & DocumentQuestionAnsweringPipelineCallback & Disposable} DocumentQuestionAnsweringPipelineType
@@ -15,12 +15,12 @@ import { Tensor } from '../utils/tensor.js';
15
15
  *
16
16
  * @callback ImageToTextPipelineCallbackSingle Assign labels to the image passed as input.
17
17
  * @param {ImageInput} texts The image to be captioned.
18
- * @param {Partial<import('../generation/configuration_utils.js').GenerationConfig>} [options] Additional keyword arguments to pass along to the generate method of the model.
18
+ * @param {Partial<import('../generation/parameters.js').GenerationFunctionParameters>} [options] Additional keyword arguments to pass along to the generate method of the model.
19
19
  * @returns {Promise<ImageToTextOutput>} An object containing the generated text(s).
20
20
  *
21
21
  * @callback ImageToTextPipelineCallbackBatch Assign labels to the images passed as inputs.
22
22
  * @param {ImageInput[]} texts The images to be captioned.
23
- * @param {Partial<import('../generation/configuration_utils.js').GenerationConfig>} [options] Additional keyword arguments to pass along to the generate method of the model.
23
+ * @param {Partial<import('../generation/parameters.js').GenerationFunctionParameters>} [options] Additional keyword arguments to pass along to the generate method of the model.
24
24
  * @returns {Promise<ImageToTextOutput[]>} An array containing the generated text(s) for each image.
25
25
  *
26
26
  * @typedef {ImageToTextPipelineCallbackSingle & ImageToTextPipelineCallbackBatch} ImageToTextPipelineCallback