@huggingface/tasks 0.13.1-test → 0.13.1-test2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -2
- package/src/dataset-libraries.ts +89 -0
- package/src/default-widget-inputs.ts +718 -0
- package/src/gguf.ts +40 -0
- package/src/hardware.ts +482 -0
- package/src/index.ts +59 -0
- package/src/library-to-tasks.ts +76 -0
- package/src/local-apps.ts +412 -0
- package/src/model-data.ts +149 -0
- package/src/model-libraries-downloads.ts +18 -0
- package/src/model-libraries-snippets.ts +1128 -0
- package/src/model-libraries.ts +820 -0
- package/src/pipelines.ts +698 -0
- package/src/snippets/common.ts +39 -0
- package/src/snippets/curl.spec.ts +94 -0
- package/src/snippets/curl.ts +120 -0
- package/src/snippets/index.ts +7 -0
- package/src/snippets/inputs.ts +167 -0
- package/src/snippets/js.spec.ts +148 -0
- package/src/snippets/js.ts +305 -0
- package/src/snippets/python.spec.ts +144 -0
- package/src/snippets/python.ts +321 -0
- package/src/snippets/types.ts +16 -0
- package/src/tasks/audio-classification/about.md +86 -0
- package/src/tasks/audio-classification/data.ts +81 -0
- package/src/tasks/audio-classification/inference.ts +52 -0
- package/src/tasks/audio-classification/spec/input.json +35 -0
- package/src/tasks/audio-classification/spec/output.json +11 -0
- package/src/tasks/audio-to-audio/about.md +56 -0
- package/src/tasks/audio-to-audio/data.ts +70 -0
- package/src/tasks/automatic-speech-recognition/about.md +90 -0
- package/src/tasks/automatic-speech-recognition/data.ts +82 -0
- package/src/tasks/automatic-speech-recognition/inference.ts +160 -0
- package/src/tasks/automatic-speech-recognition/spec/input.json +35 -0
- package/src/tasks/automatic-speech-recognition/spec/output.json +38 -0
- package/src/tasks/chat-completion/inference.ts +322 -0
- package/src/tasks/chat-completion/spec/input.json +350 -0
- package/src/tasks/chat-completion/spec/output.json +206 -0
- package/src/tasks/chat-completion/spec/stream_output.json +213 -0
- package/src/tasks/common-definitions.json +100 -0
- package/src/tasks/depth-estimation/about.md +45 -0
- package/src/tasks/depth-estimation/data.ts +70 -0
- package/src/tasks/depth-estimation/inference.ts +35 -0
- package/src/tasks/depth-estimation/spec/input.json +25 -0
- package/src/tasks/depth-estimation/spec/output.json +16 -0
- package/src/tasks/document-question-answering/about.md +53 -0
- package/src/tasks/document-question-answering/data.ts +85 -0
- package/src/tasks/document-question-answering/inference.ts +110 -0
- package/src/tasks/document-question-answering/spec/input.json +85 -0
- package/src/tasks/document-question-answering/spec/output.json +36 -0
- package/src/tasks/feature-extraction/about.md +72 -0
- package/src/tasks/feature-extraction/data.ts +57 -0
- package/src/tasks/feature-extraction/inference.ts +40 -0
- package/src/tasks/feature-extraction/spec/input.json +47 -0
- package/src/tasks/feature-extraction/spec/output.json +15 -0
- package/src/tasks/fill-mask/about.md +51 -0
- package/src/tasks/fill-mask/data.ts +79 -0
- package/src/tasks/fill-mask/inference.ts +62 -0
- package/src/tasks/fill-mask/spec/input.json +38 -0
- package/src/tasks/fill-mask/spec/output.json +29 -0
- package/src/tasks/image-classification/about.md +50 -0
- package/src/tasks/image-classification/data.ts +88 -0
- package/src/tasks/image-classification/inference.ts +52 -0
- package/src/tasks/image-classification/spec/input.json +35 -0
- package/src/tasks/image-classification/spec/output.json +11 -0
- package/src/tasks/image-feature-extraction/about.md +23 -0
- package/src/tasks/image-feature-extraction/data.ts +59 -0
- package/src/tasks/image-segmentation/about.md +63 -0
- package/src/tasks/image-segmentation/data.ts +99 -0
- package/src/tasks/image-segmentation/inference.ts +69 -0
- package/src/tasks/image-segmentation/spec/input.json +45 -0
- package/src/tasks/image-segmentation/spec/output.json +26 -0
- package/src/tasks/image-text-to-text/about.md +76 -0
- package/src/tasks/image-text-to-text/data.ts +102 -0
- package/src/tasks/image-to-3d/about.md +62 -0
- package/src/tasks/image-to-3d/data.ts +75 -0
- package/src/tasks/image-to-image/about.md +129 -0
- package/src/tasks/image-to-image/data.ts +101 -0
- package/src/tasks/image-to-image/inference.ts +68 -0
- package/src/tasks/image-to-image/spec/input.json +55 -0
- package/src/tasks/image-to-image/spec/output.json +12 -0
- package/src/tasks/image-to-text/about.md +61 -0
- package/src/tasks/image-to-text/data.ts +82 -0
- package/src/tasks/image-to-text/inference.ts +143 -0
- package/src/tasks/image-to-text/spec/input.json +34 -0
- package/src/tasks/image-to-text/spec/output.json +14 -0
- package/src/tasks/index.ts +312 -0
- package/src/tasks/keypoint-detection/about.md +57 -0
- package/src/tasks/keypoint-detection/data.ts +50 -0
- package/src/tasks/mask-generation/about.md +65 -0
- package/src/tasks/mask-generation/data.ts +55 -0
- package/src/tasks/object-detection/about.md +37 -0
- package/src/tasks/object-detection/data.ts +86 -0
- package/src/tasks/object-detection/inference.ts +75 -0
- package/src/tasks/object-detection/spec/input.json +31 -0
- package/src/tasks/object-detection/spec/output.json +50 -0
- package/src/tasks/placeholder/about.md +15 -0
- package/src/tasks/placeholder/data.ts +21 -0
- package/src/tasks/placeholder/spec/input.json +35 -0
- package/src/tasks/placeholder/spec/output.json +17 -0
- package/src/tasks/question-answering/about.md +56 -0
- package/src/tasks/question-answering/data.ts +75 -0
- package/src/tasks/question-answering/inference.ts +99 -0
- package/src/tasks/question-answering/spec/input.json +67 -0
- package/src/tasks/question-answering/spec/output.json +29 -0
- package/src/tasks/reinforcement-learning/about.md +167 -0
- package/src/tasks/reinforcement-learning/data.ts +75 -0
- package/src/tasks/sentence-similarity/about.md +97 -0
- package/src/tasks/sentence-similarity/data.ts +101 -0
- package/src/tasks/sentence-similarity/inference.ts +32 -0
- package/src/tasks/sentence-similarity/spec/input.json +40 -0
- package/src/tasks/sentence-similarity/spec/output.json +12 -0
- package/src/tasks/summarization/about.md +58 -0
- package/src/tasks/summarization/data.ts +76 -0
- package/src/tasks/summarization/inference.ts +57 -0
- package/src/tasks/summarization/spec/input.json +42 -0
- package/src/tasks/summarization/spec/output.json +14 -0
- package/src/tasks/table-question-answering/about.md +43 -0
- package/src/tasks/table-question-answering/data.ts +59 -0
- package/src/tasks/table-question-answering/inference.ts +61 -0
- package/src/tasks/table-question-answering/spec/input.json +44 -0
- package/src/tasks/table-question-answering/spec/output.json +40 -0
- package/src/tasks/tabular-classification/about.md +65 -0
- package/src/tasks/tabular-classification/data.ts +68 -0
- package/src/tasks/tabular-regression/about.md +87 -0
- package/src/tasks/tabular-regression/data.ts +57 -0
- package/src/tasks/text-classification/about.md +173 -0
- package/src/tasks/text-classification/data.ts +103 -0
- package/src/tasks/text-classification/inference.ts +51 -0
- package/src/tasks/text-classification/spec/input.json +35 -0
- package/src/tasks/text-classification/spec/output.json +11 -0
- package/src/tasks/text-generation/about.md +154 -0
- package/src/tasks/text-generation/data.ts +114 -0
- package/src/tasks/text-generation/inference.ts +200 -0
- package/src/tasks/text-generation/spec/input.json +219 -0
- package/src/tasks/text-generation/spec/output.json +179 -0
- package/src/tasks/text-generation/spec/stream_output.json +103 -0
- package/src/tasks/text-to-3d/about.md +62 -0
- package/src/tasks/text-to-3d/data.ts +56 -0
- package/src/tasks/text-to-audio/inference.ts +143 -0
- package/src/tasks/text-to-audio/spec/input.json +31 -0
- package/src/tasks/text-to-audio/spec/output.json +17 -0
- package/src/tasks/text-to-image/about.md +96 -0
- package/src/tasks/text-to-image/data.ts +100 -0
- package/src/tasks/text-to-image/inference.ts +75 -0
- package/src/tasks/text-to-image/spec/input.json +63 -0
- package/src/tasks/text-to-image/spec/output.json +13 -0
- package/src/tasks/text-to-speech/about.md +63 -0
- package/src/tasks/text-to-speech/data.ts +79 -0
- package/src/tasks/text-to-speech/inference.ts +145 -0
- package/src/tasks/text-to-speech/spec/input.json +31 -0
- package/src/tasks/text-to-speech/spec/output.json +7 -0
- package/src/tasks/text-to-video/about.md +41 -0
- package/src/tasks/text-to-video/data.ts +102 -0
- package/src/tasks/text2text-generation/inference.ts +55 -0
- package/src/tasks/text2text-generation/spec/input.json +55 -0
- package/src/tasks/text2text-generation/spec/output.json +14 -0
- package/src/tasks/token-classification/about.md +76 -0
- package/src/tasks/token-classification/data.ts +92 -0
- package/src/tasks/token-classification/inference.ts +85 -0
- package/src/tasks/token-classification/spec/input.json +65 -0
- package/src/tasks/token-classification/spec/output.json +37 -0
- package/src/tasks/translation/about.md +65 -0
- package/src/tasks/translation/data.ts +70 -0
- package/src/tasks/translation/inference.ts +67 -0
- package/src/tasks/translation/spec/input.json +50 -0
- package/src/tasks/translation/spec/output.json +14 -0
- package/src/tasks/unconditional-image-generation/about.md +50 -0
- package/src/tasks/unconditional-image-generation/data.ts +72 -0
- package/src/tasks/video-classification/about.md +37 -0
- package/src/tasks/video-classification/data.ts +84 -0
- package/src/tasks/video-classification/inference.ts +59 -0
- package/src/tasks/video-classification/spec/input.json +42 -0
- package/src/tasks/video-classification/spec/output.json +10 -0
- package/src/tasks/video-text-to-text/about.md +98 -0
- package/src/tasks/video-text-to-text/data.ts +66 -0
- package/src/tasks/visual-question-answering/about.md +48 -0
- package/src/tasks/visual-question-answering/data.ts +97 -0
- package/src/tasks/visual-question-answering/inference.ts +62 -0
- package/src/tasks/visual-question-answering/spec/input.json +41 -0
- package/src/tasks/visual-question-answering/spec/output.json +21 -0
- package/src/tasks/zero-shot-classification/about.md +40 -0
- package/src/tasks/zero-shot-classification/data.ts +70 -0
- package/src/tasks/zero-shot-classification/inference.ts +67 -0
- package/src/tasks/zero-shot-classification/spec/input.json +50 -0
- package/src/tasks/zero-shot-classification/spec/output.json +11 -0
- package/src/tasks/zero-shot-image-classification/about.md +75 -0
- package/src/tasks/zero-shot-image-classification/data.ts +84 -0
- package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
- package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
- package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
- package/src/tasks/zero-shot-object-detection/about.md +45 -0
- package/src/tasks/zero-shot-object-detection/data.ts +67 -0
- package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
- package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
- package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
- package/src/tokenizer-data.ts +32 -0
- package/src/widget-example.ts +125 -0
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
import type { PipelineType } from "../pipelines.js";
|
|
2
|
+
import type { ChatCompletionInputMessage, GenerationParameters } from "../tasks/index.js";
|
|
3
|
+
import { stringifyGenerationConfig, stringifyMessages } from "./common.js";
|
|
4
|
+
import { getModelInputSnippet } from "./inputs.js";
|
|
5
|
+
import type { InferenceSnippet, ModelDataMinimal } from "./types.js";
|
|
6
|
+
|
|
7
|
+
// Shared Python preamble: imports `InferenceClient` and instantiates it bound to
// this model's id. Falls back to the literal "{API_TOKEN}" placeholder when no
// access token is provided.
const snippetImportInferenceClient = (model: ModelDataMinimal, accessToken: string): string =>
	`from huggingface_hub import InferenceClient
client = InferenceClient("${model.id}", token="${accessToken || "{API_TOKEN}"}")
`;
|
|
11
|
+
|
|
12
|
+
export const snippetConversational = (
|
|
13
|
+
model: ModelDataMinimal,
|
|
14
|
+
accessToken: string,
|
|
15
|
+
opts?: {
|
|
16
|
+
streaming?: boolean;
|
|
17
|
+
messages?: ChatCompletionInputMessage[];
|
|
18
|
+
temperature?: GenerationParameters["temperature"];
|
|
19
|
+
max_tokens?: GenerationParameters["max_tokens"];
|
|
20
|
+
top_p?: GenerationParameters["top_p"];
|
|
21
|
+
}
|
|
22
|
+
): InferenceSnippet[] => {
|
|
23
|
+
const streaming = opts?.streaming ?? true;
|
|
24
|
+
const exampleMessages = getModelInputSnippet(model) as ChatCompletionInputMessage[];
|
|
25
|
+
const messages = opts?.messages ?? exampleMessages;
|
|
26
|
+
const messagesStr = stringifyMessages(messages, { attributeKeyQuotes: true });
|
|
27
|
+
|
|
28
|
+
const config = {
|
|
29
|
+
...(opts?.temperature ? { temperature: opts.temperature } : undefined),
|
|
30
|
+
max_tokens: opts?.max_tokens ?? 500,
|
|
31
|
+
...(opts?.top_p ? { top_p: opts.top_p } : undefined),
|
|
32
|
+
};
|
|
33
|
+
const configStr = stringifyGenerationConfig(config, {
|
|
34
|
+
indent: "\n\t",
|
|
35
|
+
attributeValueConnector: "=",
|
|
36
|
+
});
|
|
37
|
+
|
|
38
|
+
if (streaming) {
|
|
39
|
+
return [
|
|
40
|
+
{
|
|
41
|
+
client: "huggingface_hub",
|
|
42
|
+
content: `from huggingface_hub import InferenceClient
|
|
43
|
+
|
|
44
|
+
client = InferenceClient(api_key="${accessToken || "{API_TOKEN}"}")
|
|
45
|
+
|
|
46
|
+
messages = ${messagesStr}
|
|
47
|
+
|
|
48
|
+
stream = client.chat.completions.create(
|
|
49
|
+
model="${model.id}",
|
|
50
|
+
messages=messages,
|
|
51
|
+
${configStr},
|
|
52
|
+
stream=True
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
for chunk in stream:
|
|
56
|
+
print(chunk.choices[0].delta.content, end="")`,
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
client: "openai",
|
|
60
|
+
content: `from openai import OpenAI
|
|
61
|
+
|
|
62
|
+
client = OpenAI(
|
|
63
|
+
base_url="https://api-inference.huggingface.co/v1/",
|
|
64
|
+
api_key="${accessToken || "{API_TOKEN}"}"
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
messages = ${messagesStr}
|
|
68
|
+
|
|
69
|
+
stream = client.chat.completions.create(
|
|
70
|
+
model="${model.id}",
|
|
71
|
+
messages=messages,
|
|
72
|
+
${configStr},
|
|
73
|
+
stream=True
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
for chunk in stream:
|
|
77
|
+
print(chunk.choices[0].delta.content, end="")`,
|
|
78
|
+
},
|
|
79
|
+
];
|
|
80
|
+
} else {
|
|
81
|
+
return [
|
|
82
|
+
{
|
|
83
|
+
client: "huggingface_hub",
|
|
84
|
+
content: `from huggingface_hub import InferenceClient
|
|
85
|
+
|
|
86
|
+
client = InferenceClient(api_key="${accessToken || "{API_TOKEN}"}")
|
|
87
|
+
|
|
88
|
+
messages = ${messagesStr}
|
|
89
|
+
|
|
90
|
+
completion = client.chat.completions.create(
|
|
91
|
+
model="${model.id}",
|
|
92
|
+
messages=messages,
|
|
93
|
+
${configStr}
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
print(completion.choices[0].message)`,
|
|
97
|
+
},
|
|
98
|
+
{
|
|
99
|
+
client: "openai",
|
|
100
|
+
content: `from openai import OpenAI
|
|
101
|
+
|
|
102
|
+
client = OpenAI(
|
|
103
|
+
base_url="https://api-inference.huggingface.co/v1/",
|
|
104
|
+
api_key="${accessToken || "{API_TOKEN}"}"
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
messages = ${messagesStr}
|
|
108
|
+
|
|
109
|
+
completion = client.chat.completions.create(
|
|
110
|
+
model="${model.id}",
|
|
111
|
+
messages=messages,
|
|
112
|
+
${configStr}
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
print(completion.choices[0].message)`,
|
|
116
|
+
},
|
|
117
|
+
];
|
|
118
|
+
}
|
|
119
|
+
};
|
|
120
|
+
|
|
121
|
+
// Zero-shot classification via raw `requests`: posts the model's example inputs
// together with hard-coded example candidate labels under `parameters`.
export const snippetZeroShotClassification = (model: ModelDataMinimal): InferenceSnippet => ({
	content: `def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()

output = query({
    "inputs": ${getModelInputSnippet(model)},
    "parameters": {"candidate_labels": ["refund", "legal", "faq"]},
})`,
});
|
|
131
|
+
|
|
132
|
+
// Zero-shot image classification via raw `requests`: reads the local image file,
// base64-encodes it as `inputs`, and forwards the candidate labels as `parameters`.
export const snippetZeroShotImageClassification = (model: ModelDataMinimal): InferenceSnippet => ({
	content: `def query(data):
	with open(data["image_path"], "rb") as f:
		img = f.read()
	payload={
		"parameters": data["parameters"],
		"inputs": base64.b64encode(img).decode("utf-8")
	}
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()

output = query({
    "image_path": ${getModelInputSnippet(model)},
    "parameters": {"candidate_labels": ["cat", "dog", "llama"]},
})`,
});
|
|
148
|
+
|
|
149
|
+
// Generic JSON-in / JSON-out snippet via raw `requests`, used for most text pipelines.
export const snippetBasic = (model: ModelDataMinimal): InferenceSnippet => ({
	content: `def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()

output = query({
	"inputs": ${getModelInputSnippet(model)},
})`,
});
|
|
158
|
+
|
|
159
|
+
// File-upload snippet via raw `requests`: posts the raw bytes of a local file
// (audio/image) as the request body, used for binary-input pipelines.
export const snippetFile = (model: ModelDataMinimal): InferenceSnippet => ({
	content: `def query(filename):
	with open(filename, "rb") as f:
		data = f.read()
	response = requests.post(API_URL, headers=headers, data=data)
	return response.json()

output = query(${getModelInputSnippet(model)})`,
});
|
|
168
|
+
|
|
169
|
+
// Text-to-image: returns two variants — the `huggingface_hub` client (yields a
// PIL.Image directly) and raw `requests` (yields image bytes to open with PIL).
export const snippetTextToImage = (model: ModelDataMinimal, accessToken: string): InferenceSnippet[] => [
	{
		client: "huggingface_hub",
		content: `${snippetImportInferenceClient(model, accessToken)}
# output is a PIL.Image object
image = client.text_to_image(${getModelInputSnippet(model)})`,
	},
	{
		client: "requests",
		content: `def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.content
image_bytes = query({
	"inputs": ${getModelInputSnippet(model)},
})

# You can access the image with PIL.Image for example
import io
from PIL import Image
image = Image.open(io.BytesIO(image_bytes))`,
	},
];
|
|
191
|
+
|
|
192
|
+
// Tabular classification/regression via raw `requests`; the payload wraps rows
// under "inputs.data".
// NOTE(review): the query helper returns `response.content` (raw bytes) rather
// than `response.json()` — presumably matching this endpoint's output format;
// confirm against the inference API behavior.
export const snippetTabular = (model: ModelDataMinimal): InferenceSnippet => ({
	content: `def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.content
response = query({
	"inputs": {"data": ${getModelInputSnippet(model)}},
})`,
});
|
|
200
|
+
|
|
201
|
+
// Text-to-speech / text-to-audio snippet via raw `requests`. The response shape
// depends on which backend serves the model (see comment below), so two variants
// are generated based on `library_name`.
export const snippetTextToAudio = (model: ModelDataMinimal): InferenceSnippet => {
	// Transformers TTS pipeline and api-inference-community (AIC) pipeline outputs are diverged
	// with the latest update to inference-api (IA).
	// Transformers IA returns a byte object (wav file), whereas AIC returns wav and sampling_rate.
	if (model.library_name === "transformers") {
		return {
			content: `def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.content

audio_bytes = query({
	"inputs": ${getModelInputSnippet(model)},
})
# You can access the audio with IPython.display for example
from IPython.display import Audio
Audio(audio_bytes)`,
		};
	} else {
		return {
			content: `def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()

audio, sampling_rate = query({
	"inputs": ${getModelInputSnippet(model)},
})
# You can access the audio with IPython.display for example
from IPython.display import Audio
Audio(audio, rate=sampling_rate)`,
		};
	}
};
|
|
233
|
+
|
|
234
|
+
// Document question answering via raw `requests`: reads the image file referenced
// by payload["image"], base64-encodes it in place, then posts the JSON payload.
export const snippetDocumentQuestionAnswering = (model: ModelDataMinimal): InferenceSnippet => ({
	content: `def query(payload):
	with open(payload["image"], "rb") as f:
		img = f.read()
	payload["image"] = base64.b64encode(img).decode("utf-8")
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()

output = query({
    "inputs": ${getModelInputSnippet(model)},
})`,
});
|
|
246
|
+
|
|
247
|
+
/**
 * Maps each supported pipeline type to the function generating its Python snippet(s).
 * Pipeline types absent from this record have no Python snippet.
 */
export const pythonSnippets: Partial<
	Record<
		PipelineType,
		(
			model: ModelDataMinimal,
			accessToken: string,
			opts?: Record<string, unknown>
		) => InferenceSnippet | InferenceSnippet[]
	>
> = {
	// Same order as in tasks/src/pipelines.ts
	"text-classification": snippetBasic,
	"token-classification": snippetBasic,
	"table-question-answering": snippetBasic,
	"question-answering": snippetBasic,
	"zero-shot-classification": snippetZeroShotClassification,
	translation: snippetBasic,
	summarization: snippetBasic,
	"feature-extraction": snippetBasic,
	"text-generation": snippetBasic,
	"text2text-generation": snippetBasic,
	"image-text-to-text": snippetConversational,
	"fill-mask": snippetBasic,
	"sentence-similarity": snippetBasic,
	"automatic-speech-recognition": snippetFile,
	"text-to-image": snippetTextToImage,
	"text-to-speech": snippetTextToAudio,
	"text-to-audio": snippetTextToAudio,
	"audio-to-audio": snippetFile,
	"audio-classification": snippetFile,
	"image-classification": snippetFile,
	"tabular-regression": snippetTabular,
	"tabular-classification": snippetTabular,
	"object-detection": snippetFile,
	"image-segmentation": snippetFile,
	"document-question-answering": snippetDocumentQuestionAnswering,
	"image-to-text": snippetFile,
	"zero-shot-image-classification": snippetZeroShotImageClassification,
};
|
|
286
|
+
|
|
287
|
+
export function getPythonInferenceSnippet(
|
|
288
|
+
model: ModelDataMinimal,
|
|
289
|
+
accessToken: string,
|
|
290
|
+
opts?: Record<string, unknown>
|
|
291
|
+
): InferenceSnippet | InferenceSnippet[] {
|
|
292
|
+
if (model.tags.includes("conversational")) {
|
|
293
|
+
// Conversational model detected, so we display a code snippet that features the Messages API
|
|
294
|
+
return snippetConversational(model, accessToken, opts);
|
|
295
|
+
} else {
|
|
296
|
+
let snippets =
|
|
297
|
+
model.pipeline_tag && model.pipeline_tag in pythonSnippets
|
|
298
|
+
? pythonSnippets[model.pipeline_tag]?.(model, accessToken) ?? { content: "" }
|
|
299
|
+
: { content: "" };
|
|
300
|
+
|
|
301
|
+
snippets = Array.isArray(snippets) ? snippets : [snippets];
|
|
302
|
+
|
|
303
|
+
return snippets.map((snippet) => {
|
|
304
|
+
return {
|
|
305
|
+
...snippet,
|
|
306
|
+
content: snippet.content.includes("requests")
|
|
307
|
+
? `import requests
|
|
308
|
+
|
|
309
|
+
API_URL = "https://api-inference.huggingface.co/models/${model.id}"
|
|
310
|
+
headers = {"Authorization": ${accessToken ? `"Bearer ${accessToken}"` : `f"Bearer {API_TOKEN}"`}}
|
|
311
|
+
|
|
312
|
+
${snippet.content}`
|
|
313
|
+
: snippet.content,
|
|
314
|
+
};
|
|
315
|
+
});
|
|
316
|
+
}
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
export function hasPythonInferenceSnippet(model: ModelDataMinimal): boolean {
|
|
320
|
+
return !!model.pipeline_tag && model.pipeline_tag in pythonSnippets;
|
|
321
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import type { ModelData } from "../model-data.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Minimal model data required for snippets.
 *
 * Add more fields as needed.
 */
export type ModelDataMinimal = Pick<
	ModelData,
	"id" | "pipeline_tag" | "mask_token" | "library_name" | "config" | "tags" | "inference"
>;

/**
 * A single generated code snippet, optionally tagged with the client library it targets.
 */
export interface InferenceSnippet {
	content: string;
	client?: string; // for instance: `client` could be `huggingface_hub` or `openai` client for Python snippets
}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
## Use Cases
|
|
2
|
+
|
|
3
|
+
### Command Recognition
|
|
4
|
+
|
|
5
|
+
Command recognition or keyword spotting classifies utterances into a predefined set of commands. This is often done on-device for fast response time.
|
|
6
|
+
|
|
7
|
+
As an example, using the Google Speech Commands dataset, given an input, a model can classify which of the following commands the user is saying:
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'unknown', 'silence'
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Speechbrain models can easily perform this task with just a couple of lines of code!
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from speechbrain.pretrained import EncoderClassifier
|
|
17
|
+
model = EncoderClassifier.from_hparams(
|
|
18
|
+
"speechbrain/google_speech_command_xvector"
|
|
19
|
+
)
|
|
20
|
+
model.classify_file("file.wav")
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### Language Identification
|
|
24
|
+
|
|
25
|
+
Datasets such as VoxLingua107 allow anyone to train language identification models for up to 107 languages! This can be extremely useful as a preprocessing step for other systems. Here's an example [model](https://huggingface.co/TalTechNLP/voxlingua107-epaca-tdnn) trained on VoxLingua107.
|
|
26
|
+
|
|
27
|
+
### Emotion recognition
|
|
28
|
+
|
|
29
|
+
Emotion recognition is self-explanatory. In addition to trying the widgets, you can use Inference Endpoints to perform audio classification. Here is a simple example that uses a [HuBERT](https://huggingface.co/superb/hubert-large-superb-er) model fine-tuned for this task.
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import json
|
|
33
|
+
import requests
|
|
34
|
+
|
|
35
|
+
headers = {"Authorization": f"Bearer {API_TOKEN}"}
|
|
36
|
+
API_URL = "https://api-inference.huggingface.co/models/superb/hubert-large-superb-er"
|
|
37
|
+
|
|
38
|
+
def query(filename):
|
|
39
|
+
with open(filename, "rb") as f:
|
|
40
|
+
data = f.read()
|
|
41
|
+
response = requests.request("POST", API_URL, headers=headers, data=data)
|
|
42
|
+
return json.loads(response.content.decode("utf-8"))
|
|
43
|
+
|
|
44
|
+
data = query("sample1.flac")
|
|
45
|
+
# [{'label': 'neu', 'score': 0.60},
|
|
46
|
+
# {'label': 'hap', 'score': 0.20},
|
|
47
|
+
# {'label': 'ang', 'score': 0.13},
|
|
48
|
+
# {'label': 'sad', 'score': 0.07}]
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer with audio classification models on Hugging Face Hub.
|
|
52
|
+
|
|
53
|
+
```javascript
|
|
54
|
+
import { HfInference } from "@huggingface/inference";
|
|
55
|
+
|
|
56
|
+
const inference = new HfInference(HF_TOKEN);
|
|
57
|
+
await inference.audioClassification({
|
|
58
|
+
data: await (await fetch("sample.flac")).blob(),
|
|
59
|
+
model: "facebook/mms-lid-126",
|
|
60
|
+
});
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Speaker Identification
|
|
64
|
+
|
|
65
|
+
Speaker Identification is classifying the audio of the person speaking. Speakers are usually predefined. You can try out this task with [this model](https://huggingface.co/superb/wav2vec2-base-superb-sid). A useful dataset for this task is VoxCeleb1.
|
|
66
|
+
|
|
67
|
+
## Solving audio classification for your own data
|
|
68
|
+
|
|
69
|
+
We have some great news! You can do fine-tuning (transfer learning) to train a well-performing model without requiring as much data. Pretrained models such as Wav2Vec2 and HuBERT exist. [Facebook's Wav2Vec2 XLS-R model](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2) is a large multilingual model trained on 128 languages and with 436K hours of speech. Similarly, you can also use [OpenAI's Whisper](https://huggingface.co/docs/transformers/model_doc/whisper) trained on up to 4 Million hours of multilingual speech data for this task too!
|
|
70
|
+
|
|
71
|
+
## Useful Resources
|
|
72
|
+
|
|
73
|
+
Would you like to learn more about the topic? Awesome! Here you can find some curated resources that you may find helpful!
|
|
74
|
+
|
|
75
|
+
### Notebooks
|
|
76
|
+
|
|
77
|
+
- [PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/audio_classification.ipynb)
|
|
78
|
+
|
|
79
|
+
### Scripts for training
|
|
80
|
+
|
|
81
|
+
- [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/audio-classification)
|
|
82
|
+
|
|
83
|
+
### Documentation
|
|
84
|
+
|
|
85
|
+
- [Hugging Face Audio Course](https://huggingface.co/learn/audio-course/chapter4/introduction)
|
|
86
|
+
- [Audio classification task guide](https://huggingface.co/docs/transformers/tasks/audio_classification)
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import type { TaskDataCustom } from "../index.js";
|
|
2
|
+
|
|
3
|
+
// Task page content for the `audio-classification` task: example datasets, a demo
// input/output pair, evaluation metrics, representative models/spaces, and summary copy.
const taskData: TaskDataCustom = {
	datasets: [
		{
			description: "A benchmark of 10 different audio tasks.",
			id: "s3prl/superb",
		},
		{
			description: "A dataset of YouTube clips and their sound categories.",
			id: "agkphysics/AudioSet",
		},
	],
	demo: {
		inputs: [
			{
				filename: "audio.wav",
				type: "audio",
			},
		],
		outputs: [
			{
				// Example label/score pairs rendered as a chart on the task page.
				data: [
					{
						label: "Up",
						score: 0.2,
					},
					{
						label: "Down",
						score: 0.8,
					},
				],
				type: "chart",
			},
		],
	},
	// Metric descriptions are intentionally empty upstream; ids reference standard metrics.
	metrics: [
		{
			description: "",
			id: "accuracy",
		},
		{
			description: "",
			id: "recall",
		},
		{
			description: "",
			id: "precision",
		},
		{
			description: "",
			id: "f1",
		},
	],
	models: [
		{
			description: "An easy-to-use model for command recognition.",
			id: "speechbrain/google_speech_command_xvector",
		},
		{
			description: "An emotion recognition model.",
			id: "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
		},
		{
			description: "A language identification model.",
			id: "facebook/mms-lid-126",
		},
	],
	spaces: [
		{
			description: "An application that can classify music into different genre.",
			id: "kurianbenoy/audioclassification",
		},
	],
	summary:
		"Audio classification is the task of assigning a label or class to a given audio. It can be used for recognizing which command a user is giving or the emotion of a statement, as well as identifying a speaker.",
	widgetModels: ["MIT/ast-finetuned-audioset-10-10-0.4593"],
	youtubeId: "KWwzcmG98Ds",
};

export default taskData;
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inference code generated from the JSON schema spec in ./spec
|
|
3
|
+
*
|
|
4
|
+
* Using src/scripts/inference-codegen
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* Inputs for Audio Classification inference
|
|
8
|
+
*/
|
|
9
|
+
export interface AudioClassificationInput {
|
|
10
|
+
/**
|
|
11
|
+
* The input audio data as a base64-encoded string. If no `parameters` are provided, you can
|
|
12
|
+
* also provide the audio data as a raw bytes payload.
|
|
13
|
+
*/
|
|
14
|
+
inputs: string;
|
|
15
|
+
/**
|
|
16
|
+
* Additional inference parameters
|
|
17
|
+
*/
|
|
18
|
+
parameters?: AudioClassificationParameters;
|
|
19
|
+
[property: string]: unknown;
|
|
20
|
+
}
|
|
21
|
+
/**
|
|
22
|
+
* Additional inference parameters
|
|
23
|
+
*
|
|
24
|
+
* Additional inference parameters for Audio Classification
|
|
25
|
+
*/
|
|
26
|
+
export interface AudioClassificationParameters {
|
|
27
|
+
function_to_apply?: ClassificationOutputTransform;
|
|
28
|
+
/**
|
|
29
|
+
* When specified, limits the output to the top K most probable classes.
|
|
30
|
+
*/
|
|
31
|
+
top_k?: number;
|
|
32
|
+
[property: string]: unknown;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* The function to apply to the model outputs in order to retrieve the scores.
|
|
36
|
+
*/
|
|
37
|
+
export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
|
|
38
|
+
export type AudioClassificationOutput = AudioClassificationOutputElement[];
|
|
39
|
+
/**
|
|
40
|
+
* Outputs for Audio Classification inference
|
|
41
|
+
*/
|
|
42
|
+
export interface AudioClassificationOutputElement {
|
|
43
|
+
/**
|
|
44
|
+
* The predicted class label.
|
|
45
|
+
*/
|
|
46
|
+
label: string;
|
|
47
|
+
/**
|
|
48
|
+
* The corresponding probability.
|
|
49
|
+
*/
|
|
50
|
+
score: number;
|
|
51
|
+
[property: string]: unknown;
|
|
52
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/audio-classification/input.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Inputs for Audio Classification inference",
|
|
5
|
+
"title": "AudioClassificationInput",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"inputs": {
|
|
9
|
+
"description": "The input audio data as a base64-encoded string. If no `parameters` are provided, you can also provide the audio data as a raw bytes payload.",
|
|
10
|
+
"type": "string"
|
|
11
|
+
},
|
|
12
|
+
"parameters": {
|
|
13
|
+
"description": "Additional inference parameters",
|
|
14
|
+
"$ref": "#/$defs/AudioClassificationParameters"
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
"$defs": {
|
|
18
|
+
"AudioClassificationParameters": {
|
|
19
|
+
"title": "AudioClassificationParameters",
|
|
20
|
+
"description": "Additional inference parameters for Audio Classification",
|
|
21
|
+
"type": "object",
|
|
22
|
+
"properties": {
|
|
23
|
+
"function_to_apply": {
|
|
24
|
+
"title": "AudioClassificationOutputTransform",
|
|
25
|
+
"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
|
|
26
|
+
},
|
|
27
|
+
"top_k": {
|
|
28
|
+
"type": "integer",
|
|
29
|
+
"description": "When specified, limits the output to the top K most probable classes."
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
"required": ["inputs"]
|
|
35
|
+
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/audio-classification/output.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"title": "AudioClassificationOutput",
|
|
5
|
+
"description": "Outputs for Audio Classification inference",
|
|
6
|
+
"type": "array",
|
|
7
|
+
"items": {
|
|
8
|
+
"type": "object",
|
|
9
|
+
"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
|
|
10
|
+
}
|
|
11
|
+
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
## Use Cases
|
|
2
|
+
|
|
3
|
+
### Speech Enhancement (Noise removal)
|
|
4
|
+
|
|
5
|
+
Speech Enhancement is a bit self-explanatory: it improves (or enhances) the quality of audio by removing noise. There are multiple libraries to solve this task, such as Speechbrain, Asteroid, and ESPNet. Here is a simple example using Speechbrain:
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from speechbrain.pretrained import SpectralMaskEnhancement
|
|
9
|
+
model = SpectralMaskEnhancement.from_hparams(
|
|
10
|
+
"speechbrain/mtl-mimic-voicebank"
|
|
11
|
+
)
|
|
12
|
+
model.enhance_file("file.wav")
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Alternatively, you can use [Inference Endpoints](https://huggingface.co/inference-endpoints) to solve this task
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
import json
|
|
19
|
+
import requests
|
|
20
|
+
|
|
21
|
+
headers = {"Authorization": f"Bearer {API_TOKEN}"}
|
|
22
|
+
API_URL = "https://api-inference.huggingface.co/models/speechbrain/mtl-mimic-voicebank"
|
|
23
|
+
|
|
24
|
+
def query(filename):
|
|
25
|
+
with open(filename, "rb") as f:
|
|
26
|
+
data = f.read()
|
|
27
|
+
response = requests.request("POST", API_URL, headers=headers, data=data)
|
|
28
|
+
return json.loads(response.content.decode("utf-8"))
|
|
29
|
+
|
|
30
|
+
data = query("sample1.flac")
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer with audio-to-audio models on Hugging Face Hub.
|
|
34
|
+
|
|
35
|
+
```javascript
|
|
36
|
+
import { HfInference } from "@huggingface/inference";
|
|
37
|
+
|
|
38
|
+
const inference = new HfInference(HF_TOKEN);
|
|
39
|
+
await inference.audioToAudio({
|
|
40
|
+
data: await (await fetch("sample.flac")).blob(),
|
|
41
|
+
model: "speechbrain/sepformer-wham",
|
|
42
|
+
});
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Audio Source Separation
|
|
46
|
+
|
|
47
|
+
Audio Source Separation allows you to isolate different sounds from individual sources. For example, if you have an audio file with multiple people speaking, you can get an audio file for each of them. You can then use an Automatic Speech Recognition system to extract the text from each of these sources as an initial step for your system!
|
|
48
|
+
|
|
49
|
+
Audio-to-Audio can also be used to remove noise from audio files: you get one audio for the person speaking and another audio for the noise. This can also be useful when you have multi-person audio with some noise: you can get one audio for each person and then one audio for the noise.
|
|
50
|
+
|
|
51
|
+
## Training a model for your own data
|
|
52
|
+
|
|
53
|
+
If you want to learn how to train models for the Audio-to-Audio task, we recommend the following tutorials:
|
|
54
|
+
|
|
55
|
+
- [Speech Enhancement](https://speechbrain.github.io/tutorial_enhancement.html)
|
|
56
|
+
- [Source Separation](https://speechbrain.github.io/tutorial_separation.html)
|