@huggingface/tasks 0.13.1-test → 0.13.1-test2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198) hide show
  1. package/package.json +4 -2
  2. package/src/dataset-libraries.ts +89 -0
  3. package/src/default-widget-inputs.ts +718 -0
  4. package/src/gguf.ts +40 -0
  5. package/src/hardware.ts +482 -0
  6. package/src/index.ts +59 -0
  7. package/src/library-to-tasks.ts +76 -0
  8. package/src/local-apps.ts +412 -0
  9. package/src/model-data.ts +149 -0
  10. package/src/model-libraries-downloads.ts +18 -0
  11. package/src/model-libraries-snippets.ts +1128 -0
  12. package/src/model-libraries.ts +820 -0
  13. package/src/pipelines.ts +698 -0
  14. package/src/snippets/common.ts +39 -0
  15. package/src/snippets/curl.spec.ts +94 -0
  16. package/src/snippets/curl.ts +120 -0
  17. package/src/snippets/index.ts +7 -0
  18. package/src/snippets/inputs.ts +167 -0
  19. package/src/snippets/js.spec.ts +148 -0
  20. package/src/snippets/js.ts +305 -0
  21. package/src/snippets/python.spec.ts +144 -0
  22. package/src/snippets/python.ts +321 -0
  23. package/src/snippets/types.ts +16 -0
  24. package/src/tasks/audio-classification/about.md +86 -0
  25. package/src/tasks/audio-classification/data.ts +81 -0
  26. package/src/tasks/audio-classification/inference.ts +52 -0
  27. package/src/tasks/audio-classification/spec/input.json +35 -0
  28. package/src/tasks/audio-classification/spec/output.json +11 -0
  29. package/src/tasks/audio-to-audio/about.md +56 -0
  30. package/src/tasks/audio-to-audio/data.ts +70 -0
  31. package/src/tasks/automatic-speech-recognition/about.md +90 -0
  32. package/src/tasks/automatic-speech-recognition/data.ts +82 -0
  33. package/src/tasks/automatic-speech-recognition/inference.ts +160 -0
  34. package/src/tasks/automatic-speech-recognition/spec/input.json +35 -0
  35. package/src/tasks/automatic-speech-recognition/spec/output.json +38 -0
  36. package/src/tasks/chat-completion/inference.ts +322 -0
  37. package/src/tasks/chat-completion/spec/input.json +350 -0
  38. package/src/tasks/chat-completion/spec/output.json +206 -0
  39. package/src/tasks/chat-completion/spec/stream_output.json +213 -0
  40. package/src/tasks/common-definitions.json +100 -0
  41. package/src/tasks/depth-estimation/about.md +45 -0
  42. package/src/tasks/depth-estimation/data.ts +70 -0
  43. package/src/tasks/depth-estimation/inference.ts +35 -0
  44. package/src/tasks/depth-estimation/spec/input.json +25 -0
  45. package/src/tasks/depth-estimation/spec/output.json +16 -0
  46. package/src/tasks/document-question-answering/about.md +53 -0
  47. package/src/tasks/document-question-answering/data.ts +85 -0
  48. package/src/tasks/document-question-answering/inference.ts +110 -0
  49. package/src/tasks/document-question-answering/spec/input.json +85 -0
  50. package/src/tasks/document-question-answering/spec/output.json +36 -0
  51. package/src/tasks/feature-extraction/about.md +72 -0
  52. package/src/tasks/feature-extraction/data.ts +57 -0
  53. package/src/tasks/feature-extraction/inference.ts +40 -0
  54. package/src/tasks/feature-extraction/spec/input.json +47 -0
  55. package/src/tasks/feature-extraction/spec/output.json +15 -0
  56. package/src/tasks/fill-mask/about.md +51 -0
  57. package/src/tasks/fill-mask/data.ts +79 -0
  58. package/src/tasks/fill-mask/inference.ts +62 -0
  59. package/src/tasks/fill-mask/spec/input.json +38 -0
  60. package/src/tasks/fill-mask/spec/output.json +29 -0
  61. package/src/tasks/image-classification/about.md +50 -0
  62. package/src/tasks/image-classification/data.ts +88 -0
  63. package/src/tasks/image-classification/inference.ts +52 -0
  64. package/src/tasks/image-classification/spec/input.json +35 -0
  65. package/src/tasks/image-classification/spec/output.json +11 -0
  66. package/src/tasks/image-feature-extraction/about.md +23 -0
  67. package/src/tasks/image-feature-extraction/data.ts +59 -0
  68. package/src/tasks/image-segmentation/about.md +63 -0
  69. package/src/tasks/image-segmentation/data.ts +99 -0
  70. package/src/tasks/image-segmentation/inference.ts +69 -0
  71. package/src/tasks/image-segmentation/spec/input.json +45 -0
  72. package/src/tasks/image-segmentation/spec/output.json +26 -0
  73. package/src/tasks/image-text-to-text/about.md +76 -0
  74. package/src/tasks/image-text-to-text/data.ts +102 -0
  75. package/src/tasks/image-to-3d/about.md +62 -0
  76. package/src/tasks/image-to-3d/data.ts +75 -0
  77. package/src/tasks/image-to-image/about.md +129 -0
  78. package/src/tasks/image-to-image/data.ts +101 -0
  79. package/src/tasks/image-to-image/inference.ts +68 -0
  80. package/src/tasks/image-to-image/spec/input.json +55 -0
  81. package/src/tasks/image-to-image/spec/output.json +12 -0
  82. package/src/tasks/image-to-text/about.md +61 -0
  83. package/src/tasks/image-to-text/data.ts +82 -0
  84. package/src/tasks/image-to-text/inference.ts +143 -0
  85. package/src/tasks/image-to-text/spec/input.json +34 -0
  86. package/src/tasks/image-to-text/spec/output.json +14 -0
  87. package/src/tasks/index.ts +312 -0
  88. package/src/tasks/keypoint-detection/about.md +57 -0
  89. package/src/tasks/keypoint-detection/data.ts +50 -0
  90. package/src/tasks/mask-generation/about.md +65 -0
  91. package/src/tasks/mask-generation/data.ts +55 -0
  92. package/src/tasks/object-detection/about.md +37 -0
  93. package/src/tasks/object-detection/data.ts +86 -0
  94. package/src/tasks/object-detection/inference.ts +75 -0
  95. package/src/tasks/object-detection/spec/input.json +31 -0
  96. package/src/tasks/object-detection/spec/output.json +50 -0
  97. package/src/tasks/placeholder/about.md +15 -0
  98. package/src/tasks/placeholder/data.ts +21 -0
  99. package/src/tasks/placeholder/spec/input.json +35 -0
  100. package/src/tasks/placeholder/spec/output.json +17 -0
  101. package/src/tasks/question-answering/about.md +56 -0
  102. package/src/tasks/question-answering/data.ts +75 -0
  103. package/src/tasks/question-answering/inference.ts +99 -0
  104. package/src/tasks/question-answering/spec/input.json +67 -0
  105. package/src/tasks/question-answering/spec/output.json +29 -0
  106. package/src/tasks/reinforcement-learning/about.md +167 -0
  107. package/src/tasks/reinforcement-learning/data.ts +75 -0
  108. package/src/tasks/sentence-similarity/about.md +97 -0
  109. package/src/tasks/sentence-similarity/data.ts +101 -0
  110. package/src/tasks/sentence-similarity/inference.ts +32 -0
  111. package/src/tasks/sentence-similarity/spec/input.json +40 -0
  112. package/src/tasks/sentence-similarity/spec/output.json +12 -0
  113. package/src/tasks/summarization/about.md +58 -0
  114. package/src/tasks/summarization/data.ts +76 -0
  115. package/src/tasks/summarization/inference.ts +57 -0
  116. package/src/tasks/summarization/spec/input.json +42 -0
  117. package/src/tasks/summarization/spec/output.json +14 -0
  118. package/src/tasks/table-question-answering/about.md +43 -0
  119. package/src/tasks/table-question-answering/data.ts +59 -0
  120. package/src/tasks/table-question-answering/inference.ts +61 -0
  121. package/src/tasks/table-question-answering/spec/input.json +44 -0
  122. package/src/tasks/table-question-answering/spec/output.json +40 -0
  123. package/src/tasks/tabular-classification/about.md +65 -0
  124. package/src/tasks/tabular-classification/data.ts +68 -0
  125. package/src/tasks/tabular-regression/about.md +87 -0
  126. package/src/tasks/tabular-regression/data.ts +57 -0
  127. package/src/tasks/text-classification/about.md +173 -0
  128. package/src/tasks/text-classification/data.ts +103 -0
  129. package/src/tasks/text-classification/inference.ts +51 -0
  130. package/src/tasks/text-classification/spec/input.json +35 -0
  131. package/src/tasks/text-classification/spec/output.json +11 -0
  132. package/src/tasks/text-generation/about.md +154 -0
  133. package/src/tasks/text-generation/data.ts +114 -0
  134. package/src/tasks/text-generation/inference.ts +200 -0
  135. package/src/tasks/text-generation/spec/input.json +219 -0
  136. package/src/tasks/text-generation/spec/output.json +179 -0
  137. package/src/tasks/text-generation/spec/stream_output.json +103 -0
  138. package/src/tasks/text-to-3d/about.md +62 -0
  139. package/src/tasks/text-to-3d/data.ts +56 -0
  140. package/src/tasks/text-to-audio/inference.ts +143 -0
  141. package/src/tasks/text-to-audio/spec/input.json +31 -0
  142. package/src/tasks/text-to-audio/spec/output.json +17 -0
  143. package/src/tasks/text-to-image/about.md +96 -0
  144. package/src/tasks/text-to-image/data.ts +100 -0
  145. package/src/tasks/text-to-image/inference.ts +75 -0
  146. package/src/tasks/text-to-image/spec/input.json +63 -0
  147. package/src/tasks/text-to-image/spec/output.json +13 -0
  148. package/src/tasks/text-to-speech/about.md +63 -0
  149. package/src/tasks/text-to-speech/data.ts +79 -0
  150. package/src/tasks/text-to-speech/inference.ts +145 -0
  151. package/src/tasks/text-to-speech/spec/input.json +31 -0
  152. package/src/tasks/text-to-speech/spec/output.json +7 -0
  153. package/src/tasks/text-to-video/about.md +41 -0
  154. package/src/tasks/text-to-video/data.ts +102 -0
  155. package/src/tasks/text2text-generation/inference.ts +55 -0
  156. package/src/tasks/text2text-generation/spec/input.json +55 -0
  157. package/src/tasks/text2text-generation/spec/output.json +14 -0
  158. package/src/tasks/token-classification/about.md +76 -0
  159. package/src/tasks/token-classification/data.ts +92 -0
  160. package/src/tasks/token-classification/inference.ts +85 -0
  161. package/src/tasks/token-classification/spec/input.json +65 -0
  162. package/src/tasks/token-classification/spec/output.json +37 -0
  163. package/src/tasks/translation/about.md +65 -0
  164. package/src/tasks/translation/data.ts +70 -0
  165. package/src/tasks/translation/inference.ts +67 -0
  166. package/src/tasks/translation/spec/input.json +50 -0
  167. package/src/tasks/translation/spec/output.json +14 -0
  168. package/src/tasks/unconditional-image-generation/about.md +50 -0
  169. package/src/tasks/unconditional-image-generation/data.ts +72 -0
  170. package/src/tasks/video-classification/about.md +37 -0
  171. package/src/tasks/video-classification/data.ts +84 -0
  172. package/src/tasks/video-classification/inference.ts +59 -0
  173. package/src/tasks/video-classification/spec/input.json +42 -0
  174. package/src/tasks/video-classification/spec/output.json +10 -0
  175. package/src/tasks/video-text-to-text/about.md +98 -0
  176. package/src/tasks/video-text-to-text/data.ts +66 -0
  177. package/src/tasks/visual-question-answering/about.md +48 -0
  178. package/src/tasks/visual-question-answering/data.ts +97 -0
  179. package/src/tasks/visual-question-answering/inference.ts +62 -0
  180. package/src/tasks/visual-question-answering/spec/input.json +41 -0
  181. package/src/tasks/visual-question-answering/spec/output.json +21 -0
  182. package/src/tasks/zero-shot-classification/about.md +40 -0
  183. package/src/tasks/zero-shot-classification/data.ts +70 -0
  184. package/src/tasks/zero-shot-classification/inference.ts +67 -0
  185. package/src/tasks/zero-shot-classification/spec/input.json +50 -0
  186. package/src/tasks/zero-shot-classification/spec/output.json +11 -0
  187. package/src/tasks/zero-shot-image-classification/about.md +75 -0
  188. package/src/tasks/zero-shot-image-classification/data.ts +84 -0
  189. package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
  190. package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
  191. package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
  192. package/src/tasks/zero-shot-object-detection/about.md +45 -0
  193. package/src/tasks/zero-shot-object-detection/data.ts +67 -0
  194. package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
  195. package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
  196. package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
  197. package/src/tokenizer-data.ts +32 -0
  198. package/src/widget-example.ts +125 -0
@@ -0,0 +1,70 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ description: "512-element X-vector embeddings of speakers from CMU ARCTIC dataset.",
7
+ id: "Matthijs/cmu-arctic-xvectors",
8
+ },
9
+ ],
10
+ demo: {
11
+ inputs: [
12
+ {
13
+ filename: "input.wav",
14
+ type: "audio",
15
+ },
16
+ ],
17
+ outputs: [
18
+ {
19
+ filename: "label-0.wav",
20
+ type: "audio",
21
+ },
22
+ {
23
+ filename: "label-1.wav",
24
+ type: "audio",
25
+ },
26
+ ],
27
+ },
28
+ metrics: [
29
+ {
30
+ description:
31
+ "The Signal-to-Noise ratio is the relationship between the target signal level and the background noise level. It is calculated as the logarithm of the target signal divided by the background noise, in decibels.",
32
+ id: "snri",
33
+ },
34
+ {
35
+ description:
36
+ "The Signal-to-Distortion ratio is the relationship between the target signal and the sum of noise, interference, and artifact errors",
37
+ id: "sdri",
38
+ },
39
+ ],
40
+ models: [
41
+ {
42
+ description: "A solid model of audio source separation.",
43
+ id: "speechbrain/sepformer-wham",
44
+ },
45
+ {
46
+ description: "A speech enhancement model.",
47
+ id: "ResembleAI/resemble-enhance",
48
+ },
49
+ {
50
+ description: "A model that can change the voice in a speech recording.",
51
+ id: "microsoft/speecht5_vc",
52
+ },
53
+ ],
54
+ spaces: [
55
+ {
56
+ description: "An application for speech separation.",
57
+ id: "younver/speechbrain-speech-separation",
58
+ },
59
+ {
60
+ description: "An application for audio style transfer.",
61
+ id: "nakas/audio-diffusion_style_transfer",
62
+ },
63
+ ],
64
+ summary:
65
+ "Audio-to-Audio is a family of tasks in which the input is an audio and the output is one or multiple generated audios. Some example tasks are speech enhancement and source separation.",
66
+ widgetModels: ["speechbrain/sepformer-wham"],
67
+ youtubeId: "iohj7nCCYoM",
68
+ };
69
+
70
+ export default taskData;
@@ -0,0 +1,90 @@
1
+ ## Use Cases
2
+
3
+ ### Virtual Speech Assistants
4
+
5
+ Many edge devices have an embedded virtual assistant to interact with the end users better. These assistants rely on ASR models to recognize different voice commands and perform various tasks. For instance, you can ask your phone to dial a phone number, answer a general question, or schedule a meeting.
6
+
7
+ ### Caption Generation
8
+
9
+ A caption generation model takes audio as input and generates automatic captions through transcription, for live-streamed or recorded videos. This can help with content accessibility. For example, an audience watching a video that includes a non-native language can rely on captions to interpret the content. It can also help with information retention in online class environments, improving knowledge assimilation and making it faster to take notes.
10
+
11
+ ## Task Variants
12
+
13
+ ### Multilingual ASR
14
+
15
+ Multilingual ASR models can convert audio inputs with multiple languages into transcripts. Some multilingual ASR models include [language identification](https://huggingface.co/tasks/audio-classification) blocks to improve the performance.
16
+
17
+ The use of multilingual ASR has become popular: maintaining just a single model for all languages can simplify the production pipeline. Take a look at [Whisper](https://huggingface.co/openai/whisper-large-v2) to get an idea of how a single model can process 100+ languages.
18
+
19
+ ## Inference
20
+
21
+ The Hub contains over [17,000 ASR models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=downloads) that you can test right away in your browser using the model page widgets. You can also use any model as a service using the Serverless Inference API. We also support libraries such as [transformers](https://huggingface.co/models?library=transformers&pipeline_tag=automatic-speech-recognition&sort=downloads), [speechbrain](https://huggingface.co/models?library=speechbrain&pipeline_tag=automatic-speech-recognition&sort=downloads), [NeMo](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&library=nemo&sort=downloads) and [espnet](https://huggingface.co/models?library=espnet&pipeline_tag=automatic-speech-recognition&sort=downloads) via the Serverless Inference API. Here's a simple code snippet to run inference:
22
+
23
+ ```python
24
+ import json
25
+ import requests
26
+
27
+ headers = {"Authorization": f"Bearer {API_TOKEN}"}
28
+ API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"
29
+
30
+ def query(filename):
31
+ with open(filename, "rb") as f:
32
+ data = f.read()
33
+ response = requests.request("POST", API_URL, headers=headers, data=data)
34
+ return json.loads(response.content.decode("utf-8"))
35
+
36
+ data = query("sample1.flac")
37
+ ```
38
+
39
+ You can also use [huggingface.js](https://github.com/huggingface/huggingface.js), the JavaScript client, to transcribe audio with the Serverless Inference API.
40
+
41
+ ```javascript
42
+ import { HfInference } from "@huggingface/inference";
43
+
44
+ const inference = new HfInference(HF_TOKEN);
45
+ await inference.automaticSpeechRecognition({
46
+ data: await (await fetch("sample.flac")).blob(),
47
+ model: "openai/whisper-large-v3",
48
+ });
49
+ ```
50
+
51
+ For transformers-compatible models like Whisper, Wav2Vec2, and HuBERT, you can also run inference with the library as follows:
52
+
53
+ ```python
54
+ # pip install --upgrade transformers
55
+
56
+ from transformers import pipeline
57
+
58
+ pipe = pipeline("automatic-speech-recognition", "openai/whisper-large-v3")
59
+
60
+ pipe("sample.flac")
61
+ # {'text': "GOING ALONG SLUSHY COUNTRY ROADS AND SPEAKING TO DAMP AUDIENCES IN DRAUGHTY SCHOOL ROOMS DAY AFTER DAY FOR A FORTNIGHT HE'LL HAVE TO PUT IN AN APPEARANCE AT SOME PLACE OF WORSHIP ON SUNDAY MORNING AND HE CAN COME TO US IMMEDIATELY AFTERWARDS"}
62
+ ```
63
+
64
+ ## Solving ASR for your own data
65
+
66
+ We have some great news! You can fine-tune (transfer learning) a foundational speech model on a specific language without tonnes of data. Pretrained models such as Whisper, Wav2Vec2-MMS and HuBERT exist. [OpenAI's Whisper model](https://huggingface.co/openai/whisper-large-v3) is a large multilingual model trained on 100+ languages and with 4 Million hours of speech.
67
+
68
+ The following detailed [blog post](https://huggingface.co/blog/fine-tune-whisper) shows how to fine-tune a pre-trained Whisper checkpoint on labeled data for ASR. With the right data and strategy you can fine-tune a high-performing model on a free Google Colab instance too. We suggest reading the blog post for more info!
69
+
70
+ ## Hugging Face Whisper Event
71
+
72
+ In December 2022, over 450 participants collaborated, fine-tuned and shared 600+ ASR Whisper models in 100+ different languages. You can compare these models on the event's speech recognition [leaderboard](https://huggingface.co/spaces/whisper-event/leaderboard?dataset=mozilla-foundation%2Fcommon_voice_11_0&config=ar&split=test).
73
+
74
+ These events help democratize ASR for all languages, including low-resource languages. In addition to the trained models, the [event](https://github.com/huggingface/community-events/tree/main/whisper-fine-tuning-event) helps to build practical collaborative knowledge.
75
+
76
+ ## Useful Resources
77
+
78
+ - [Hugging Face Audio Course](https://huggingface.co/learn/audio-course/chapter5/introduction)
79
+ - [Fine-tuning MetaAI's MMS Adapter Models for Multi-Lingual ASR](https://huggingface.co/blog/mms_adapters)
80
+ - [Making automatic speech recognition work on large files with Wav2Vec2 in 🤗 Transformers](https://huggingface.co/blog/asr-chunking)
81
+ - [Boosting Wav2Vec2 with n-grams in 🤗 Transformers](https://huggingface.co/blog/wav2vec2-with-ngram)
82
+ - [ML for Audio Study Group - Intro to Audio and ASR Deep Dive](https://www.youtube.com/watch?v=D-MH6YjuIlE)
83
+ - [Massively Multilingual ASR: 50 Languages, 1 Model, 1 Billion Parameters](https://arxiv.org/pdf/2007.03001.pdf)
84
+ - An ASR toolkit made by [NVIDIA: NeMo](https://github.com/NVIDIA/NeMo) with code and pretrained models useful for new ASR models. Watch the [introductory video](https://www.youtube.com/embed/wBgpMf_KQVw) for an overview.
85
+ - [An introduction to SpeechT5, a multi-purpose speech recognition and synthesis model](https://huggingface.co/blog/speecht5)
86
+ - [Fine-tune Whisper For Multilingual ASR with 🤗Transformers](https://huggingface.co/blog/fine-tune-whisper)
87
+ - [Automatic speech recognition task guide](https://huggingface.co/docs/transformers/tasks/asr)
88
+ - [Speech Synthesis, Recognition, and More With SpeechT5](https://huggingface.co/blog/speecht5)
89
+ - [Fine-Tune W2V2-Bert for low-resource ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-w2v2-bert)
90
+ - [Speculative Decoding for 2x Faster Whisper Inference](https://huggingface.co/blog/whisper-speculative-decoding)
@@ -0,0 +1,82 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ description: "31,175 hours of multilingual audio-text dataset in 108 languages.",
7
+ id: "mozilla-foundation/common_voice_17_0",
8
+ },
9
+ {
10
+ description: "A dataset with 44.6k hours of English speaker data and 6k hours of other language speakers.",
11
+ id: "parler-tts/mls_eng",
12
+ },
13
+ {
14
+ description: "A multi-lingual audio dataset with 370K hours of audio.",
15
+ id: "espnet/yodas",
16
+ },
17
+ ],
18
+ demo: {
19
+ inputs: [
20
+ {
21
+ filename: "input.flac",
22
+ type: "audio",
23
+ },
24
+ ],
25
+ outputs: [
26
+ {
27
+ /// GOING ALONG SLUSHY COUNTRY ROADS AND SPEAKING TO DAMP AUDIENCES I
28
+ label: "Transcript",
29
+ content: "Going along slushy country roads and speaking to damp audiences in...",
30
+ type: "text",
31
+ },
32
+ ],
33
+ },
34
+ metrics: [
35
+ {
36
+ description: "",
37
+ id: "wer",
38
+ },
39
+ {
40
+ description: "",
41
+ id: "cer",
42
+ },
43
+ ],
44
+ models: [
45
+ {
46
+ description: "A powerful ASR model by OpenAI.",
47
+ id: "openai/whisper-large-v3",
48
+ },
49
+ {
50
+ description: "A good generic speech model by MetaAI for fine-tuning.",
51
+ id: "facebook/w2v-bert-2.0",
52
+ },
53
+ {
54
+ description: "An end-to-end model that performs ASR and Speech Translation by MetaAI.",
55
+ id: "facebook/seamless-m4t-v2-large",
56
+ },
57
+ {
58
+ description: "Powerful speaker diarization model.",
59
+ id: "pyannote/speaker-diarization-3.1",
60
+ },
61
+ ],
62
+ spaces: [
63
+ {
64
+ description: "A powerful general-purpose speech recognition application.",
65
+ id: "hf-audio/whisper-large-v3",
66
+ },
67
+ {
68
+ description: "Fastest speech recognition application.",
69
+ id: "sanchit-gandhi/whisper-jax",
70
+ },
71
+ {
72
+ description: "A high quality speech and text translation model by Meta.",
73
+ id: "facebook/seamless_m4t",
74
+ },
75
+ ],
76
+ summary:
77
+ "Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. It has many applications, such as voice user interfaces.",
78
+ widgetModels: ["openai/whisper-large-v3"],
79
+ youtubeId: "TksaY_FDgnk",
80
+ };
81
+
82
+ export default taskData;
@@ -0,0 +1,160 @@
1
+ /**
2
+ * Inference code generated from the JSON schema spec in ./spec
3
+ *
4
+ * Using src/scripts/inference-codegen
5
+ */
6
+
7
+ /**
8
+ * Inputs for Automatic Speech Recognition inference
9
+ */
10
+ export interface AutomaticSpeechRecognitionInput {
11
+ /**
12
+ * The input audio data as a base64-encoded string. If no `parameters` are provided, you can
13
+ * also provide the audio data as a raw bytes payload.
14
+ */
15
+ inputs: string;
16
+ /**
17
+ * Additional inference parameters
18
+ */
19
+ parameters?: AutomaticSpeechRecognitionParameters;
20
+ [property: string]: unknown;
21
+ }
22
+
23
+ /**
24
+ * Additional inference parameters
25
+ *
26
+ * Additional inference parameters for Automatic Speech Recognition
27
+ */
28
+ export interface AutomaticSpeechRecognitionParameters {
29
+ /**
30
+ * Parametrization of the text generation process
31
+ */
32
+ generation_parameters?: GenerationParameters;
33
+ /**
34
+ * Whether to output corresponding timestamps with the generated text
35
+ */
36
+ return_timestamps?: boolean;
37
+ [property: string]: unknown;
38
+ }
39
+
40
+ /**
41
+ * Parametrization of the text generation process
42
+ *
43
+ * Ad-hoc parametrization of the text generation process
44
+ */
45
+ export interface GenerationParameters {
46
+ /**
47
+ * Whether to use sampling instead of greedy decoding when generating new tokens.
48
+ */
49
+ do_sample?: boolean;
50
+ /**
51
+ * Controls the stopping condition for beam-based methods.
52
+ */
53
+ early_stopping?: EarlyStoppingUnion;
54
+ /**
55
+ * If set to float strictly between 0 and 1, only tokens with a conditional probability
56
+ * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
57
+ * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
58
+ * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
59
+ */
60
+ epsilon_cutoff?: number;
61
+ /**
62
+ * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
63
+ * float strictly between 0 and 1, a token is only considered if it is greater than either
64
+ * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
65
+ * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
66
+ * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
67
+ * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
68
+ * for more details.
69
+ */
70
+ eta_cutoff?: number;
71
+ /**
72
+ * The maximum length (in tokens) of the generated text, including the input.
73
+ */
74
+ max_length?: number;
75
+ /**
76
+ * The maximum number of tokens to generate. Takes precedence over max_length.
77
+ */
78
+ max_new_tokens?: number;
79
+ /**
80
+ * The minimum length (in tokens) of the generated text, including the input.
81
+ */
82
+ min_length?: number;
83
+ /**
84
+ * The minimum number of tokens to generate. Takes precedence over min_length.
85
+ */
86
+ min_new_tokens?: number;
87
+ /**
88
+ * Number of groups to divide num_beams into in order to ensure diversity among different
89
+ * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
90
+ */
91
+ num_beam_groups?: number;
92
+ /**
93
+ * Number of beams to use for beam search.
94
+ */
95
+ num_beams?: number;
96
+ /**
97
+ * The value balances the model confidence and the degeneration penalty in contrastive
98
+ * search decoding.
99
+ */
100
+ penalty_alpha?: number;
101
+ /**
102
+ * The value used to modulate the next token probabilities.
103
+ */
104
+ temperature?: number;
105
+ /**
106
+ * The number of highest probability vocabulary tokens to keep for top-k-filtering.
107
+ */
108
+ top_k?: number;
109
+ /**
110
+ * If set to float < 1, only the smallest set of most probable tokens with probabilities
111
+ * that add up to top_p or higher are kept for generation.
112
+ */
113
+ top_p?: number;
114
+ /**
115
+ * Local typicality measures how similar the conditional probability of predicting a target
116
+ * token next is to the expected conditional probability of predicting a random token next,
117
+ * given the partial text already generated. If set to float < 1, the smallest set of the
118
+ * most locally typical tokens with probabilities that add up to typical_p or higher are
119
+ * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
120
+ */
121
+ typical_p?: number;
122
+ /**
123
+ * Whether the model should use the past last key/values attentions to speed up decoding
124
+ */
125
+ use_cache?: boolean;
126
+ [property: string]: unknown;
127
+ }
128
+
129
+ /**
130
+ * Controls the stopping condition for beam-based methods.
131
+ */
132
+ export type EarlyStoppingUnion = boolean | "never";
133
+
134
+ /**
135
+ * Outputs of inference for the Automatic Speech Recognition task
136
+ */
137
+ export interface AutomaticSpeechRecognitionOutput {
138
+ /**
139
+ * When returnTimestamps is enabled, chunks contains a list of audio chunks identified by
140
+ * the model.
141
+ */
142
+ chunks?: AutomaticSpeechRecognitionOutputChunk[];
143
+ /**
144
+ * The recognized text.
145
+ */
146
+ text: string;
147
+ [property: string]: unknown;
148
+ }
149
+
150
+ export interface AutomaticSpeechRecognitionOutputChunk {
151
+ /**
152
+ * A chunk of text identified by the model
153
+ */
154
+ text: string;
155
+ /**
156
+ * The start and end timestamps corresponding with the text
157
+ */
158
+ timestamps: number[];
159
+ [property: string]: unknown;
160
+ }
@@ -0,0 +1,35 @@
1
+ {
2
+ "$id": "/inference/schemas/automatic-speech-recognition/input.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Inputs for Automatic Speech Recognition inference",
5
+ "title": "AutomaticSpeechRecognitionInput",
6
+ "type": "object",
7
+ "properties": {
8
+ "inputs": {
9
+ "description": "The input audio data as a base64-encoded string. If no `parameters` are provided, you can also provide the audio data as a raw bytes payload.",
10
+ "type": "string"
11
+ },
12
+ "parameters": {
13
+ "description": "Additional inference parameters",
14
+ "$ref": "#/$defs/AutomaticSpeechRecognitionParameters"
15
+ }
16
+ },
17
+ "$defs": {
18
+ "AutomaticSpeechRecognitionParameters": {
19
+ "title": "AutomaticSpeechRecognitionParameters",
20
+ "description": "Additional inference parameters for Automatic Speech Recognition",
21
+ "type": "object",
22
+ "properties": {
23
+ "return_timestamps": {
24
+ "type": "boolean",
25
+ "description": "Whether to output corresponding timestamps with the generated text"
26
+ },
27
+ "generation_parameters": {
28
+ "description": "Parametrization of the text generation process",
29
+ "$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters"
30
+ }
31
+ }
32
+ }
33
+ },
34
+ "required": ["inputs"]
35
+ }
@@ -0,0 +1,38 @@
1
+ {
2
+ "$id": "/inference/schemas/automatic-speech-recognition/output.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Outputs of inference for the Automatic Speech Recognition task",
5
+ "title": "AutomaticSpeechRecognitionOutput",
6
+ "type": "object",
7
+ "properties": {
8
+ "text": {
9
+ "type": "string",
10
+ "description": "The recognized text."
11
+ },
12
+ "chunks": {
13
+ "type": "array",
14
+ "description": "When returnTimestamps is enabled, chunks contains a list of audio chunks identified by the model.",
15
+ "items": {
16
+ "type": "object",
17
+ "title": "AutomaticSpeechRecognitionOutputChunk",
18
+ "properties": {
19
+ "text": {
20
+ "type": "string",
21
+ "description": "A chunk of text identified by the model"
22
+ },
23
+ "timestamps": {
24
+ "type": "array",
25
+ "description": "The start and end timestamps corresponding with the text",
26
+ "items": {
27
+ "type": "number"
28
+ },
29
+ "minItems": 2,
30
+ "maxItems": 2
31
+ }
32
+ },
33
+ "required": ["text", "timestamps"]
34
+ }
35
+ }
36
+ },
37
+ "required": ["text"]
38
+ }