@huggingface/tasks 0.13.1-test → 0.13.1-test2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198) hide show
  1. package/package.json +4 -2
  2. package/src/dataset-libraries.ts +89 -0
  3. package/src/default-widget-inputs.ts +718 -0
  4. package/src/gguf.ts +40 -0
  5. package/src/hardware.ts +482 -0
  6. package/src/index.ts +59 -0
  7. package/src/library-to-tasks.ts +76 -0
  8. package/src/local-apps.ts +412 -0
  9. package/src/model-data.ts +149 -0
  10. package/src/model-libraries-downloads.ts +18 -0
  11. package/src/model-libraries-snippets.ts +1128 -0
  12. package/src/model-libraries.ts +820 -0
  13. package/src/pipelines.ts +698 -0
  14. package/src/snippets/common.ts +39 -0
  15. package/src/snippets/curl.spec.ts +94 -0
  16. package/src/snippets/curl.ts +120 -0
  17. package/src/snippets/index.ts +7 -0
  18. package/src/snippets/inputs.ts +167 -0
  19. package/src/snippets/js.spec.ts +148 -0
  20. package/src/snippets/js.ts +305 -0
  21. package/src/snippets/python.spec.ts +144 -0
  22. package/src/snippets/python.ts +321 -0
  23. package/src/snippets/types.ts +16 -0
  24. package/src/tasks/audio-classification/about.md +86 -0
  25. package/src/tasks/audio-classification/data.ts +81 -0
  26. package/src/tasks/audio-classification/inference.ts +52 -0
  27. package/src/tasks/audio-classification/spec/input.json +35 -0
  28. package/src/tasks/audio-classification/spec/output.json +11 -0
  29. package/src/tasks/audio-to-audio/about.md +56 -0
  30. package/src/tasks/audio-to-audio/data.ts +70 -0
  31. package/src/tasks/automatic-speech-recognition/about.md +90 -0
  32. package/src/tasks/automatic-speech-recognition/data.ts +82 -0
  33. package/src/tasks/automatic-speech-recognition/inference.ts +160 -0
  34. package/src/tasks/automatic-speech-recognition/spec/input.json +35 -0
  35. package/src/tasks/automatic-speech-recognition/spec/output.json +38 -0
  36. package/src/tasks/chat-completion/inference.ts +322 -0
  37. package/src/tasks/chat-completion/spec/input.json +350 -0
  38. package/src/tasks/chat-completion/spec/output.json +206 -0
  39. package/src/tasks/chat-completion/spec/stream_output.json +213 -0
  40. package/src/tasks/common-definitions.json +100 -0
  41. package/src/tasks/depth-estimation/about.md +45 -0
  42. package/src/tasks/depth-estimation/data.ts +70 -0
  43. package/src/tasks/depth-estimation/inference.ts +35 -0
  44. package/src/tasks/depth-estimation/spec/input.json +25 -0
  45. package/src/tasks/depth-estimation/spec/output.json +16 -0
  46. package/src/tasks/document-question-answering/about.md +53 -0
  47. package/src/tasks/document-question-answering/data.ts +85 -0
  48. package/src/tasks/document-question-answering/inference.ts +110 -0
  49. package/src/tasks/document-question-answering/spec/input.json +85 -0
  50. package/src/tasks/document-question-answering/spec/output.json +36 -0
  51. package/src/tasks/feature-extraction/about.md +72 -0
  52. package/src/tasks/feature-extraction/data.ts +57 -0
  53. package/src/tasks/feature-extraction/inference.ts +40 -0
  54. package/src/tasks/feature-extraction/spec/input.json +47 -0
  55. package/src/tasks/feature-extraction/spec/output.json +15 -0
  56. package/src/tasks/fill-mask/about.md +51 -0
  57. package/src/tasks/fill-mask/data.ts +79 -0
  58. package/src/tasks/fill-mask/inference.ts +62 -0
  59. package/src/tasks/fill-mask/spec/input.json +38 -0
  60. package/src/tasks/fill-mask/spec/output.json +29 -0
  61. package/src/tasks/image-classification/about.md +50 -0
  62. package/src/tasks/image-classification/data.ts +88 -0
  63. package/src/tasks/image-classification/inference.ts +52 -0
  64. package/src/tasks/image-classification/spec/input.json +35 -0
  65. package/src/tasks/image-classification/spec/output.json +11 -0
  66. package/src/tasks/image-feature-extraction/about.md +23 -0
  67. package/src/tasks/image-feature-extraction/data.ts +59 -0
  68. package/src/tasks/image-segmentation/about.md +63 -0
  69. package/src/tasks/image-segmentation/data.ts +99 -0
  70. package/src/tasks/image-segmentation/inference.ts +69 -0
  71. package/src/tasks/image-segmentation/spec/input.json +45 -0
  72. package/src/tasks/image-segmentation/spec/output.json +26 -0
  73. package/src/tasks/image-text-to-text/about.md +76 -0
  74. package/src/tasks/image-text-to-text/data.ts +102 -0
  75. package/src/tasks/image-to-3d/about.md +62 -0
  76. package/src/tasks/image-to-3d/data.ts +75 -0
  77. package/src/tasks/image-to-image/about.md +129 -0
  78. package/src/tasks/image-to-image/data.ts +101 -0
  79. package/src/tasks/image-to-image/inference.ts +68 -0
  80. package/src/tasks/image-to-image/spec/input.json +55 -0
  81. package/src/tasks/image-to-image/spec/output.json +12 -0
  82. package/src/tasks/image-to-text/about.md +61 -0
  83. package/src/tasks/image-to-text/data.ts +82 -0
  84. package/src/tasks/image-to-text/inference.ts +143 -0
  85. package/src/tasks/image-to-text/spec/input.json +34 -0
  86. package/src/tasks/image-to-text/spec/output.json +14 -0
  87. package/src/tasks/index.ts +312 -0
  88. package/src/tasks/keypoint-detection/about.md +57 -0
  89. package/src/tasks/keypoint-detection/data.ts +50 -0
  90. package/src/tasks/mask-generation/about.md +65 -0
  91. package/src/tasks/mask-generation/data.ts +55 -0
  92. package/src/tasks/object-detection/about.md +37 -0
  93. package/src/tasks/object-detection/data.ts +86 -0
  94. package/src/tasks/object-detection/inference.ts +75 -0
  95. package/src/tasks/object-detection/spec/input.json +31 -0
  96. package/src/tasks/object-detection/spec/output.json +50 -0
  97. package/src/tasks/placeholder/about.md +15 -0
  98. package/src/tasks/placeholder/data.ts +21 -0
  99. package/src/tasks/placeholder/spec/input.json +35 -0
  100. package/src/tasks/placeholder/spec/output.json +17 -0
  101. package/src/tasks/question-answering/about.md +56 -0
  102. package/src/tasks/question-answering/data.ts +75 -0
  103. package/src/tasks/question-answering/inference.ts +99 -0
  104. package/src/tasks/question-answering/spec/input.json +67 -0
  105. package/src/tasks/question-answering/spec/output.json +29 -0
  106. package/src/tasks/reinforcement-learning/about.md +167 -0
  107. package/src/tasks/reinforcement-learning/data.ts +75 -0
  108. package/src/tasks/sentence-similarity/about.md +97 -0
  109. package/src/tasks/sentence-similarity/data.ts +101 -0
  110. package/src/tasks/sentence-similarity/inference.ts +32 -0
  111. package/src/tasks/sentence-similarity/spec/input.json +40 -0
  112. package/src/tasks/sentence-similarity/spec/output.json +12 -0
  113. package/src/tasks/summarization/about.md +58 -0
  114. package/src/tasks/summarization/data.ts +76 -0
  115. package/src/tasks/summarization/inference.ts +57 -0
  116. package/src/tasks/summarization/spec/input.json +42 -0
  117. package/src/tasks/summarization/spec/output.json +14 -0
  118. package/src/tasks/table-question-answering/about.md +43 -0
  119. package/src/tasks/table-question-answering/data.ts +59 -0
  120. package/src/tasks/table-question-answering/inference.ts +61 -0
  121. package/src/tasks/table-question-answering/spec/input.json +44 -0
  122. package/src/tasks/table-question-answering/spec/output.json +40 -0
  123. package/src/tasks/tabular-classification/about.md +65 -0
  124. package/src/tasks/tabular-classification/data.ts +68 -0
  125. package/src/tasks/tabular-regression/about.md +87 -0
  126. package/src/tasks/tabular-regression/data.ts +57 -0
  127. package/src/tasks/text-classification/about.md +173 -0
  128. package/src/tasks/text-classification/data.ts +103 -0
  129. package/src/tasks/text-classification/inference.ts +51 -0
  130. package/src/tasks/text-classification/spec/input.json +35 -0
  131. package/src/tasks/text-classification/spec/output.json +11 -0
  132. package/src/tasks/text-generation/about.md +154 -0
  133. package/src/tasks/text-generation/data.ts +114 -0
  134. package/src/tasks/text-generation/inference.ts +200 -0
  135. package/src/tasks/text-generation/spec/input.json +219 -0
  136. package/src/tasks/text-generation/spec/output.json +179 -0
  137. package/src/tasks/text-generation/spec/stream_output.json +103 -0
  138. package/src/tasks/text-to-3d/about.md +62 -0
  139. package/src/tasks/text-to-3d/data.ts +56 -0
  140. package/src/tasks/text-to-audio/inference.ts +143 -0
  141. package/src/tasks/text-to-audio/spec/input.json +31 -0
  142. package/src/tasks/text-to-audio/spec/output.json +17 -0
  143. package/src/tasks/text-to-image/about.md +96 -0
  144. package/src/tasks/text-to-image/data.ts +100 -0
  145. package/src/tasks/text-to-image/inference.ts +75 -0
  146. package/src/tasks/text-to-image/spec/input.json +63 -0
  147. package/src/tasks/text-to-image/spec/output.json +13 -0
  148. package/src/tasks/text-to-speech/about.md +63 -0
  149. package/src/tasks/text-to-speech/data.ts +79 -0
  150. package/src/tasks/text-to-speech/inference.ts +145 -0
  151. package/src/tasks/text-to-speech/spec/input.json +31 -0
  152. package/src/tasks/text-to-speech/spec/output.json +7 -0
  153. package/src/tasks/text-to-video/about.md +41 -0
  154. package/src/tasks/text-to-video/data.ts +102 -0
  155. package/src/tasks/text2text-generation/inference.ts +55 -0
  156. package/src/tasks/text2text-generation/spec/input.json +55 -0
  157. package/src/tasks/text2text-generation/spec/output.json +14 -0
  158. package/src/tasks/token-classification/about.md +76 -0
  159. package/src/tasks/token-classification/data.ts +92 -0
  160. package/src/tasks/token-classification/inference.ts +85 -0
  161. package/src/tasks/token-classification/spec/input.json +65 -0
  162. package/src/tasks/token-classification/spec/output.json +37 -0
  163. package/src/tasks/translation/about.md +65 -0
  164. package/src/tasks/translation/data.ts +70 -0
  165. package/src/tasks/translation/inference.ts +67 -0
  166. package/src/tasks/translation/spec/input.json +50 -0
  167. package/src/tasks/translation/spec/output.json +14 -0
  168. package/src/tasks/unconditional-image-generation/about.md +50 -0
  169. package/src/tasks/unconditional-image-generation/data.ts +72 -0
  170. package/src/tasks/video-classification/about.md +37 -0
  171. package/src/tasks/video-classification/data.ts +84 -0
  172. package/src/tasks/video-classification/inference.ts +59 -0
  173. package/src/tasks/video-classification/spec/input.json +42 -0
  174. package/src/tasks/video-classification/spec/output.json +10 -0
  175. package/src/tasks/video-text-to-text/about.md +98 -0
  176. package/src/tasks/video-text-to-text/data.ts +66 -0
  177. package/src/tasks/visual-question-answering/about.md +48 -0
  178. package/src/tasks/visual-question-answering/data.ts +97 -0
  179. package/src/tasks/visual-question-answering/inference.ts +62 -0
  180. package/src/tasks/visual-question-answering/spec/input.json +41 -0
  181. package/src/tasks/visual-question-answering/spec/output.json +21 -0
  182. package/src/tasks/zero-shot-classification/about.md +40 -0
  183. package/src/tasks/zero-shot-classification/data.ts +70 -0
  184. package/src/tasks/zero-shot-classification/inference.ts +67 -0
  185. package/src/tasks/zero-shot-classification/spec/input.json +50 -0
  186. package/src/tasks/zero-shot-classification/spec/output.json +11 -0
  187. package/src/tasks/zero-shot-image-classification/about.md +75 -0
  188. package/src/tasks/zero-shot-image-classification/data.ts +84 -0
  189. package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
  190. package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
  191. package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
  192. package/src/tasks/zero-shot-object-detection/about.md +45 -0
  193. package/src/tasks/zero-shot-object-detection/data.ts +67 -0
  194. package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
  195. package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
  196. package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
  197. package/src/tokenizer-data.ts +32 -0
  198. package/src/widget-example.ts +125 -0
@@ -0,0 +1,321 @@
1
+ import type { PipelineType } from "../pipelines.js";
2
+ import type { ChatCompletionInputMessage, GenerationParameters } from "../tasks/index.js";
3
+ import { stringifyGenerationConfig, stringifyMessages } from "./common.js";
4
+ import { getModelInputSnippet } from "./inputs.js";
5
+ import type { InferenceSnippet, ModelDataMinimal } from "./types.js";
6
+
7
+ const snippetImportInferenceClient = (model: ModelDataMinimal, accessToken: string): string =>
8
+ `from huggingface_hub import InferenceClient
9
+ client = InferenceClient("${model.id}", token="${accessToken || "{API_TOKEN}"}")
10
+ `;
11
+
12
+ export const snippetConversational = (
13
+ model: ModelDataMinimal,
14
+ accessToken: string,
15
+ opts?: {
16
+ streaming?: boolean;
17
+ messages?: ChatCompletionInputMessage[];
18
+ temperature?: GenerationParameters["temperature"];
19
+ max_tokens?: GenerationParameters["max_tokens"];
20
+ top_p?: GenerationParameters["top_p"];
21
+ }
22
+ ): InferenceSnippet[] => {
23
+ const streaming = opts?.streaming ?? true;
24
+ const exampleMessages = getModelInputSnippet(model) as ChatCompletionInputMessage[];
25
+ const messages = opts?.messages ?? exampleMessages;
26
+ const messagesStr = stringifyMessages(messages, { attributeKeyQuotes: true });
27
+
28
+ const config = {
29
+ ...(opts?.temperature ? { temperature: opts.temperature } : undefined),
30
+ max_tokens: opts?.max_tokens ?? 500,
31
+ ...(opts?.top_p ? { top_p: opts.top_p } : undefined),
32
+ };
33
+ const configStr = stringifyGenerationConfig(config, {
34
+ indent: "\n\t",
35
+ attributeValueConnector: "=",
36
+ });
37
+
38
+ if (streaming) {
39
+ return [
40
+ {
41
+ client: "huggingface_hub",
42
+ content: `from huggingface_hub import InferenceClient
43
+
44
+ client = InferenceClient(api_key="${accessToken || "{API_TOKEN}"}")
45
+
46
+ messages = ${messagesStr}
47
+
48
+ stream = client.chat.completions.create(
49
+ model="${model.id}",
50
+ messages=messages,
51
+ ${configStr},
52
+ stream=True
53
+ )
54
+
55
+ for chunk in stream:
56
+ print(chunk.choices[0].delta.content, end="")`,
57
+ },
58
+ {
59
+ client: "openai",
60
+ content: `from openai import OpenAI
61
+
62
+ client = OpenAI(
63
+ base_url="https://api-inference.huggingface.co/v1/",
64
+ api_key="${accessToken || "{API_TOKEN}"}"
65
+ )
66
+
67
+ messages = ${messagesStr}
68
+
69
+ stream = client.chat.completions.create(
70
+ model="${model.id}",
71
+ messages=messages,
72
+ ${configStr},
73
+ stream=True
74
+ )
75
+
76
+ for chunk in stream:
77
+ print(chunk.choices[0].delta.content, end="")`,
78
+ },
79
+ ];
80
+ } else {
81
+ return [
82
+ {
83
+ client: "huggingface_hub",
84
+ content: `from huggingface_hub import InferenceClient
85
+
86
+ client = InferenceClient(api_key="${accessToken || "{API_TOKEN}"}")
87
+
88
+ messages = ${messagesStr}
89
+
90
+ completion = client.chat.completions.create(
91
+ model="${model.id}",
92
+ messages=messages,
93
+ ${configStr}
94
+ )
95
+
96
+ print(completion.choices[0].message)`,
97
+ },
98
+ {
99
+ client: "openai",
100
+ content: `from openai import OpenAI
101
+
102
+ client = OpenAI(
103
+ base_url="https://api-inference.huggingface.co/v1/",
104
+ api_key="${accessToken || "{API_TOKEN}"}"
105
+ )
106
+
107
+ messages = ${messagesStr}
108
+
109
+ completion = client.chat.completions.create(
110
+ model="${model.id}",
111
+ messages=messages,
112
+ ${configStr}
113
+ )
114
+
115
+ print(completion.choices[0].message)`,
116
+ },
117
+ ];
118
+ }
119
+ };
120
+
121
+ export const snippetZeroShotClassification = (model: ModelDataMinimal): InferenceSnippet => ({
122
+ content: `def query(payload):
123
+ response = requests.post(API_URL, headers=headers, json=payload)
124
+ return response.json()
125
+
126
+ output = query({
127
+ "inputs": ${getModelInputSnippet(model)},
128
+ "parameters": {"candidate_labels": ["refund", "legal", "faq"]},
129
+ })`,
130
+ });
131
+
132
+ export const snippetZeroShotImageClassification = (model: ModelDataMinimal): InferenceSnippet => ({
133
+ content: `def query(data):
134
+ with open(data["image_path"], "rb") as f:
135
+ img = f.read()
136
+ payload={
137
+ "parameters": data["parameters"],
138
+ "inputs": base64.b64encode(img).decode("utf-8")
139
+ }
140
+ response = requests.post(API_URL, headers=headers, json=payload)
141
+ return response.json()
142
+
143
+ output = query({
144
+ "image_path": ${getModelInputSnippet(model)},
145
+ "parameters": {"candidate_labels": ["cat", "dog", "llama"]},
146
+ })`,
147
+ });
148
+
149
+ export const snippetBasic = (model: ModelDataMinimal): InferenceSnippet => ({
150
+ content: `def query(payload):
151
+ response = requests.post(API_URL, headers=headers, json=payload)
152
+ return response.json()
153
+
154
+ output = query({
155
+ "inputs": ${getModelInputSnippet(model)},
156
+ })`,
157
+ });
158
+
159
+ export const snippetFile = (model: ModelDataMinimal): InferenceSnippet => ({
160
+ content: `def query(filename):
161
+ with open(filename, "rb") as f:
162
+ data = f.read()
163
+ response = requests.post(API_URL, headers=headers, data=data)
164
+ return response.json()
165
+
166
+ output = query(${getModelInputSnippet(model)})`,
167
+ });
168
+
169
+ export const snippetTextToImage = (model: ModelDataMinimal, accessToken: string): InferenceSnippet[] => [
170
+ {
171
+ client: "huggingface_hub",
172
+ content: `${snippetImportInferenceClient(model, accessToken)}
173
+ # output is a PIL.Image object
174
+ image = client.text_to_image(${getModelInputSnippet(model)})`,
175
+ },
176
+ {
177
+ client: "requests",
178
+ content: `def query(payload):
179
+ response = requests.post(API_URL, headers=headers, json=payload)
180
+ return response.content
181
+ image_bytes = query({
182
+ "inputs": ${getModelInputSnippet(model)},
183
+ })
184
+
185
+ # You can access the image with PIL.Image for example
186
+ import io
187
+ from PIL import Image
188
+ image = Image.open(io.BytesIO(image_bytes))`,
189
+ },
190
+ ];
191
+
192
+ export const snippetTabular = (model: ModelDataMinimal): InferenceSnippet => ({
193
+ content: `def query(payload):
194
+ response = requests.post(API_URL, headers=headers, json=payload)
195
+ return response.content
196
+ response = query({
197
+ "inputs": {"data": ${getModelInputSnippet(model)}},
198
+ })`,
199
+ });
200
+
201
+ export const snippetTextToAudio = (model: ModelDataMinimal): InferenceSnippet => {
202
+ // Transformers TTS pipeline and api-inference-community (AIC) pipeline outputs are diverged
203
+ // with the latest update to inference-api (IA).
204
+ // Transformers IA returns a byte object (wav file), whereas AIC returns wav and sampling_rate.
205
+ if (model.library_name === "transformers") {
206
+ return {
207
+ content: `def query(payload):
208
+ response = requests.post(API_URL, headers=headers, json=payload)
209
+ return response.content
210
+
211
+ audio_bytes = query({
212
+ "inputs": ${getModelInputSnippet(model)},
213
+ })
214
+ # You can access the audio with IPython.display for example
215
+ from IPython.display import Audio
216
+ Audio(audio_bytes)`,
217
+ };
218
+ } else {
219
+ return {
220
+ content: `def query(payload):
221
+ response = requests.post(API_URL, headers=headers, json=payload)
222
+ return response.json()
223
+
224
+ audio, sampling_rate = query({
225
+ "inputs": ${getModelInputSnippet(model)},
226
+ })
227
+ # You can access the audio with IPython.display for example
228
+ from IPython.display import Audio
229
+ Audio(audio, rate=sampling_rate)`,
230
+ };
231
+ }
232
+ };
233
+
234
+ export const snippetDocumentQuestionAnswering = (model: ModelDataMinimal): InferenceSnippet => ({
235
+ content: `def query(payload):
236
+ with open(payload["image"], "rb") as f:
237
+ img = f.read()
238
+ payload["image"] = base64.b64encode(img).decode("utf-8")
239
+ response = requests.post(API_URL, headers=headers, json=payload)
240
+ return response.json()
241
+
242
+ output = query({
243
+ "inputs": ${getModelInputSnippet(model)},
244
+ })`,
245
+ });
246
+
247
+ export const pythonSnippets: Partial<
248
+ Record<
249
+ PipelineType,
250
+ (
251
+ model: ModelDataMinimal,
252
+ accessToken: string,
253
+ opts?: Record<string, unknown>
254
+ ) => InferenceSnippet | InferenceSnippet[]
255
+ >
256
+ > = {
257
+ // Same order as in tasks/src/pipelines.ts
258
+ "text-classification": snippetBasic,
259
+ "token-classification": snippetBasic,
260
+ "table-question-answering": snippetBasic,
261
+ "question-answering": snippetBasic,
262
+ "zero-shot-classification": snippetZeroShotClassification,
263
+ translation: snippetBasic,
264
+ summarization: snippetBasic,
265
+ "feature-extraction": snippetBasic,
266
+ "text-generation": snippetBasic,
267
+ "text2text-generation": snippetBasic,
268
+ "image-text-to-text": snippetConversational,
269
+ "fill-mask": snippetBasic,
270
+ "sentence-similarity": snippetBasic,
271
+ "automatic-speech-recognition": snippetFile,
272
+ "text-to-image": snippetTextToImage,
273
+ "text-to-speech": snippetTextToAudio,
274
+ "text-to-audio": snippetTextToAudio,
275
+ "audio-to-audio": snippetFile,
276
+ "audio-classification": snippetFile,
277
+ "image-classification": snippetFile,
278
+ "tabular-regression": snippetTabular,
279
+ "tabular-classification": snippetTabular,
280
+ "object-detection": snippetFile,
281
+ "image-segmentation": snippetFile,
282
+ "document-question-answering": snippetDocumentQuestionAnswering,
283
+ "image-to-text": snippetFile,
284
+ "zero-shot-image-classification": snippetZeroShotImageClassification,
285
+ };
286
+
287
+ export function getPythonInferenceSnippet(
288
+ model: ModelDataMinimal,
289
+ accessToken: string,
290
+ opts?: Record<string, unknown>
291
+ ): InferenceSnippet | InferenceSnippet[] {
292
+ if (model.tags.includes("conversational")) {
293
+ // Conversational model detected, so we display a code snippet that features the Messages API
294
+ return snippetConversational(model, accessToken, opts);
295
+ } else {
296
+ let snippets =
297
+ model.pipeline_tag && model.pipeline_tag in pythonSnippets
298
+ ? pythonSnippets[model.pipeline_tag]?.(model, accessToken) ?? { content: "" }
299
+ : { content: "" };
300
+
301
+ snippets = Array.isArray(snippets) ? snippets : [snippets];
302
+
303
+ return snippets.map((snippet) => {
304
+ return {
305
+ ...snippet,
306
+ content: snippet.content.includes("requests")
307
+ ? `import requests
308
+
309
+ API_URL = "https://api-inference.huggingface.co/models/${model.id}"
310
+ headers = {"Authorization": ${accessToken ? `"Bearer ${accessToken}"` : `f"Bearer {API_TOKEN}"`}}
311
+
312
+ ${snippet.content}`
313
+ : snippet.content,
314
+ };
315
+ });
316
+ }
317
+ }
318
+
319
+ export function hasPythonInferenceSnippet(model: ModelDataMinimal): boolean {
320
+ return !!model.pipeline_tag && model.pipeline_tag in pythonSnippets;
321
+ }
@@ -0,0 +1,16 @@
1
+ import type { ModelData } from "../model-data.js";
2
+
3
+ /**
4
+ * Minimal model data required for snippets.
5
+ *
6
+ * Add more fields as needed.
7
+ */
8
+ export type ModelDataMinimal = Pick<
9
+ ModelData,
10
+ "id" | "pipeline_tag" | "mask_token" | "library_name" | "config" | "tags" | "inference"
11
+ >;
12
+
13
+ export interface InferenceSnippet {
14
+ content: string;
15
+ client?: string; // for instance: `client` could be `huggingface_hub` or `openai` client for Python snippets
16
+ }
@@ -0,0 +1,86 @@
1
+ ## Use Cases
2
+
3
+ ### Command Recognition
4
+
5
+ Command recognition or keyword spotting classifies utterances into a predefined set of commands. This is often done on-device for fast response time.
6
+
7
+ As an example, using the Google Speech Commands dataset, given an input, a model can classify which of the following commands the user is saying:
8
+
9
+ ```
10
+ 'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'unknown', 'silence'
11
+ ```
12
+
13
+ Speechbrain models can easily perform this task with just a couple of lines of code!
14
+
15
+ ```python
16
+ from speechbrain.pretrained import EncoderClassifier
17
+ model = EncoderClassifier.from_hparams(
18
+ "speechbrain/google_speech_command_xvector"
19
+ )
20
+ model.classify_file("file.wav")
21
+ ```
22
+
23
+ ### Language Identification
24
+
25
+ Datasets such as VoxLingua107 allow anyone to train language identification models for up to 107 languages! This can be extremely useful as a preprocessing step for other systems. Here's an example [model](https://huggingface.co/TalTechNLP/voxlingua107-epaca-tdnn) trained on VoxLingua107.
26
+
27
+ ### Emotion recognition
28
+
29
+ Emotion recognition is self explanatory. In addition to trying the widgets, you can use Inference Endpoints to perform audio classification. Here is a simple example that uses a [HuBERT](https://huggingface.co/superb/hubert-large-superb-er) model fine-tuned for this task.
30
+
31
+ ```python
32
+ import json
33
+ import requests
34
+
35
+ headers = {"Authorization": f"Bearer {API_TOKEN}"}
36
+ API_URL = "https://api-inference.huggingface.co/models/superb/hubert-large-superb-er"
37
+
38
+ def query(filename):
39
+ with open(filename, "rb") as f:
40
+ data = f.read()
41
+ response = requests.request("POST", API_URL, headers=headers, data=data)
42
+ return json.loads(response.content.decode("utf-8"))
43
+
44
+ data = query("sample1.flac")
45
+ # [{'label': 'neu', 'score': 0.60},
46
+ # {'label': 'hap', 'score': 0.20},
47
+ # {'label': 'ang', 'score': 0.13},
48
+ # {'label': 'sad', 'score': 0.07}]
49
+ ```
50
+
51
+ You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer with audio classification models on Hugging Face Hub.
52
+
53
+ ```javascript
54
+ import { HfInference } from "@huggingface/inference";
55
+
56
+ const inference = new HfInference(HF_TOKEN);
57
+ await inference.audioClassification({
58
+ data: await (await fetch("sample.flac")).blob(),
59
+ model: "facebook/mms-lid-126",
60
+ });
61
+ ```
62
+
63
+ ### Speaker Identification
64
+
65
+ Speaker Identification is classifying the audio of the person speaking. Speakers are usually predefined. You can try out this task with [this model](https://huggingface.co/superb/wav2vec2-base-superb-sid). A useful dataset for this task is VoxCeleb1.
66
+
67
+ ## Solving audio classification for your own data
68
+
69
+ We have some great news! You can do fine-tuning (transfer learning) to train a well-performing model without requiring as much data. Pretrained models such as Wav2Vec2 and HuBERT exist. [Facebook's Wav2Vec2 XLS-R model](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2) is a large multilingual model trained on 128 languages and with 436K hours of speech. Similarly, you can also use [OpenAI's Whisper](https://huggingface.co/docs/transformers/model_doc/whisper) trained on up to 4 Million hours of multilingual speech data for this task too!
70
+
71
+ ## Useful Resources
72
+
73
+ Would you like to learn more about the topic? Awesome! Here you can find some curated resources that you may find helpful!
74
+
75
+ ### Notebooks
76
+
77
+ - [PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/audio_classification.ipynb)
78
+
79
+ ### Scripts for training
80
+
81
+ - [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/audio-classification)
82
+
83
+ ### Documentation
84
+
85
+ - [Hugging Face Audio Course](https://huggingface.co/learn/audio-course/chapter4/introduction)
86
+ - [Audio classification task guide](https://huggingface.co/docs/transformers/tasks/audio_classification)
@@ -0,0 +1,81 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ description: "A benchmark of 10 different audio tasks.",
7
+ id: "s3prl/superb",
8
+ },
9
+ {
10
+ description: "A dataset of YouTube clips and their sound categories.",
11
+ id: "agkphysics/AudioSet",
12
+ },
13
+ ],
14
+ demo: {
15
+ inputs: [
16
+ {
17
+ filename: "audio.wav",
18
+ type: "audio",
19
+ },
20
+ ],
21
+ outputs: [
22
+ {
23
+ data: [
24
+ {
25
+ label: "Up",
26
+ score: 0.2,
27
+ },
28
+ {
29
+ label: "Down",
30
+ score: 0.8,
31
+ },
32
+ ],
33
+ type: "chart",
34
+ },
35
+ ],
36
+ },
37
+ metrics: [
38
+ {
39
+ description: "",
40
+ id: "accuracy",
41
+ },
42
+ {
43
+ description: "",
44
+ id: "recall",
45
+ },
46
+ {
47
+ description: "",
48
+ id: "precision",
49
+ },
50
+ {
51
+ description: "",
52
+ id: "f1",
53
+ },
54
+ ],
55
+ models: [
56
+ {
57
+ description: "An easy-to-use model for command recognition.",
58
+ id: "speechbrain/google_speech_command_xvector",
59
+ },
60
+ {
61
+ description: "An emotion recognition model.",
62
+ id: "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
63
+ },
64
+ {
65
+ description: "A language identification model.",
66
+ id: "facebook/mms-lid-126",
67
+ },
68
+ ],
69
+ spaces: [
70
+ {
71
+ description: "An application that can classify music into different genre.",
72
+ id: "kurianbenoy/audioclassification",
73
+ },
74
+ ],
75
+ summary:
76
+ "Audio classification is the task of assigning a label or class to a given audio. It can be used for recognizing which command a user is giving or the emotion of a statement, as well as identifying a speaker.",
77
+ widgetModels: ["MIT/ast-finetuned-audioset-10-10-0.4593"],
78
+ youtubeId: "KWwzcmG98Ds",
79
+ };
80
+
81
+ export default taskData;
@@ -0,0 +1,52 @@
1
+ /**
2
+ * Inference code generated from the JSON schema spec in ./spec
3
+ *
4
+ * Using src/scripts/inference-codegen
5
+ */
6
+ /**
7
+ * Inputs for Audio Classification inference
8
+ */
9
+ export interface AudioClassificationInput {
10
+ /**
11
+ * The input audio data as a base64-encoded string. If no `parameters` are provided, you can
12
+ * also provide the audio data as a raw bytes payload.
13
+ */
14
+ inputs: string;
15
+ /**
16
+ * Additional inference parameters
17
+ */
18
+ parameters?: AudioClassificationParameters;
19
+ [property: string]: unknown;
20
+ }
21
+ /**
22
+ * Additional inference parameters
23
+ *
24
+ * Additional inference parameters for Audio Classification
25
+ */
26
+ export interface AudioClassificationParameters {
27
+ function_to_apply?: ClassificationOutputTransform;
28
+ /**
29
+ * When specified, limits the output to the top K most probable classes.
30
+ */
31
+ top_k?: number;
32
+ [property: string]: unknown;
33
+ }
34
+ /**
35
+ * The function to apply to the model outputs in order to retrieve the scores.
36
+ */
37
+ export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
38
+ export type AudioClassificationOutput = AudioClassificationOutputElement[];
39
+ /**
40
+ * Outputs for Audio Classification inference
41
+ */
42
+ export interface AudioClassificationOutputElement {
43
+ /**
44
+ * The predicted class label.
45
+ */
46
+ label: string;
47
+ /**
48
+ * The corresponding probability.
49
+ */
50
+ score: number;
51
+ [property: string]: unknown;
52
+ }
@@ -0,0 +1,35 @@
1
+ {
2
+ "$id": "/inference/schemas/audio-classification/input.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Inputs for Audio Classification inference",
5
+ "title": "AudioClassificationInput",
6
+ "type": "object",
7
+ "properties": {
8
+ "inputs": {
9
+ "description": "The input audio data as a base64-encoded string. If no `parameters` are provided, you can also provide the audio data as a raw bytes payload.",
10
+ "type": "string"
11
+ },
12
+ "parameters": {
13
+ "description": "Additional inference parameters",
14
+ "$ref": "#/$defs/AudioClassificationParameters"
15
+ }
16
+ },
17
+ "$defs": {
18
+ "AudioClassificationParameters": {
19
+ "title": "AudioClassificationParameters",
20
+ "description": "Additional inference parameters for Audio Classification",
21
+ "type": "object",
22
+ "properties": {
23
+ "function_to_apply": {
24
+ "title": "AudioClassificationOutputTransform",
25
+ "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
26
+ },
27
+ "top_k": {
28
+ "type": "integer",
29
+ "description": "When specified, limits the output to the top K most probable classes."
30
+ }
31
+ }
32
+ }
33
+ },
34
+ "required": ["inputs"]
35
+ }
@@ -0,0 +1,11 @@
1
+ {
2
+ "$id": "/inference/schemas/audio-classification/output.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "title": "AudioClassificationOutput",
5
+ "description": "Outputs for Audio Classification inference",
6
+ "type": "array",
7
+ "items": {
8
+ "type": "object",
9
+ "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
10
+ }
11
+ }
@@ -0,0 +1,56 @@
1
+ ## Use Cases
2
+
3
+ ### Speech Enhancement (Noise removal)
4
+
5
+ Speech Enhancement is a bit self explanatory. It improves (or enhances) the quality of an audio by removing noise. There are multiple libraries to solve this task, such as Speechbrain, Asteroid and ESPNet. Here is a simple example using Speechbrain
6
+
7
+ ```python
8
+ from speechbrain.pretrained import SpectralMaskEnhancement
9
+ model = SpectralMaskEnhancement.from_hparams(
10
+ "speechbrain/mtl-mimic-voicebank"
11
+ )
12
+ model.enhance_file("file.wav")
13
+ ```
14
+
15
+ Alternatively, you can use [Inference Endpoints](https://huggingface.co/inference-endpoints) to solve this task
16
+
17
+ ```python
18
+ import json
19
+ import requests
20
+
21
+ headers = {"Authorization": f"Bearer {API_TOKEN}"}
22
+ API_URL = "https://api-inference.huggingface.co/models/speechbrain/mtl-mimic-voicebank"
23
+
24
+ def query(filename):
25
+ with open(filename, "rb") as f:
26
+ data = f.read()
27
+ response = requests.request("POST", API_URL, headers=headers, data=data)
28
+ return json.loads(response.content.decode("utf-8"))
29
+
30
+ data = query("sample1.flac")
31
+ ```
32
+
33
+ You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer with audio-to-audio models on Hugging Face Hub.
34
+
35
+ ```javascript
36
+ import { HfInference } from "@huggingface/inference";
37
+
38
+ const inference = new HfInference(HF_TOKEN);
39
+ await inference.audioToAudio({
40
+ data: await (await fetch("sample.flac")).blob(),
41
+ model: "speechbrain/sepformer-wham",
42
+ });
43
+ ```
44
+
45
+ ### Audio Source Separation
46
+
47
+ Audio Source Separation allows you to isolate different sounds from individual sources. For example, if you have an audio file with multiple people speaking, you can get an audio file for each of them. You can then use an Automatic Speech Recognition system to extract the text from each of these sources as an initial step for your system!
48
+
49
+ Audio-to-Audio can also be used to remove noise from audio files: you get one audio for the person speaking and another audio for the noise. This can also be useful when you have multi-person audio with some noise: you can get one audio for each person and then one audio for the noise.
50
+
51
+ ## Training a model for your own data
52
+
53
+ If you want to learn how to train models for the Audio-to-Audio task, we recommend the following tutorials:
54
+
55
+ - [Speech Enhancement](https://speechbrain.github.io/tutorial_enhancement.html)
56
+ - [Source Separation](https://speechbrain.github.io/tutorial_separation.html)