@huggingface/tasks 0.16.4 → 0.16.6
This diff compares publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- package/dist/commonjs/model-libraries.d.ts +7 -1
- package/dist/commonjs/model-libraries.d.ts.map +1 -1
- package/dist/commonjs/model-libraries.js +6 -0
- package/dist/commonjs/snippets/inputs.d.ts.map +1 -1
- package/dist/commonjs/snippets/inputs.js +2 -0
- package/dist/commonjs/snippets/js.d.ts +1 -0
- package/dist/commonjs/snippets/js.d.ts.map +1 -1
- package/dist/commonjs/snippets/js.js +25 -1
- package/dist/commonjs/snippets/python.d.ts +1 -0
- package/dist/commonjs/snippets/python.d.ts.map +1 -1
- package/dist/commonjs/snippets/python.js +19 -1
- package/dist/commonjs/tasks/depth-estimation/data.js +1 -1
- package/dist/commonjs/tasks/image-text-to-text/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/image-text-to-text/data.js +10 -6
- package/dist/commonjs/tasks/keypoint-detection/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/keypoint-detection/data.js +4 -0
- package/dist/commonjs/tasks/object-detection/data.js +5 -5
- package/dist/commonjs/tasks/text-generation/data.js +1 -1
- package/dist/commonjs/tasks/text-to-speech/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/text-to-speech/data.js +4 -0
- package/dist/commonjs/tasks/text-to-video/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/text-to-video/data.js +6 -2
- package/dist/commonjs/tasks/video-text-to-text/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/video-text-to-text/data.js +8 -0
- package/dist/commonjs/tasks/zero-shot-classification/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/zero-shot-classification/data.js +4 -0
- package/dist/commonjs/tasks/zero-shot-image-classification/data.js +2 -2
- package/dist/esm/model-libraries.d.ts +7 -1
- package/dist/esm/model-libraries.d.ts.map +1 -1
- package/dist/esm/model-libraries.js +6 -0
- package/dist/esm/snippets/inputs.d.ts.map +1 -1
- package/dist/esm/snippets/inputs.js +2 -0
- package/dist/esm/snippets/js.d.ts +1 -0
- package/dist/esm/snippets/js.d.ts.map +1 -1
- package/dist/esm/snippets/js.js +23 -0
- package/dist/esm/snippets/python.d.ts +1 -0
- package/dist/esm/snippets/python.d.ts.map +1 -1
- package/dist/esm/snippets/python.js +17 -0
- package/dist/esm/tasks/depth-estimation/data.js +1 -1
- package/dist/esm/tasks/image-text-to-text/data.d.ts.map +1 -1
- package/dist/esm/tasks/image-text-to-text/data.js +10 -6
- package/dist/esm/tasks/keypoint-detection/data.d.ts.map +1 -1
- package/dist/esm/tasks/keypoint-detection/data.js +4 -0
- package/dist/esm/tasks/object-detection/data.js +5 -5
- package/dist/esm/tasks/text-generation/data.js +1 -1
- package/dist/esm/tasks/text-to-speech/data.d.ts.map +1 -1
- package/dist/esm/tasks/text-to-speech/data.js +4 -0
- package/dist/esm/tasks/text-to-video/data.d.ts.map +1 -1
- package/dist/esm/tasks/text-to-video/data.js +6 -2
- package/dist/esm/tasks/video-text-to-text/data.d.ts.map +1 -1
- package/dist/esm/tasks/video-text-to-text/data.js +8 -0
- package/dist/esm/tasks/zero-shot-classification/data.d.ts.map +1 -1
- package/dist/esm/tasks/zero-shot-classification/data.js +4 -0
- package/dist/esm/tasks/zero-shot-image-classification/data.js +2 -2
- package/package.json +1 -1
- package/src/model-libraries.ts +6 -0
- package/src/snippets/inputs.ts +3 -0
- package/src/snippets/js.ts +28 -0
- package/src/snippets/python.ts +22 -0
- package/src/tasks/depth-estimation/data.ts +1 -1
- package/src/tasks/image-text-to-text/data.ts +10 -6
- package/src/tasks/keypoint-detection/data.ts +4 -0
- package/src/tasks/object-detection/data.ts +5 -5
- package/src/tasks/text-generation/data.ts +1 -1
- package/src/tasks/text-to-speech/data.ts +4 -0
- package/src/tasks/text-to-video/data.ts +6 -2
- package/src/tasks/video-text-to-text/data.ts +8 -0
- package/src/tasks/zero-shot-classification/data.ts +4 -0
- package/src/tasks/zero-shot-image-classification/data.ts +2 -2
package/dist/esm/tasks/image-text-to-text/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cA6Gf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/image-text-to-text/data.js
CHANGED
@@ -45,7 +45,7 @@ const taskData = {
 		},
 		{
 			description: "A screenshot understanding model used to control computers.",
-			id: "
+			id: "microsoft/OmniParser-v2.0",
 		},
 		{
 			description: "Cutting-edge vision language model.",
@@ -60,12 +60,16 @@ const taskData = {
 			id: "Qwen/Qwen2.5-VL-7B-Instruct",
 		},
 		{
-			description: "Image-text-to-text model with
-			id: "
+			description: "Image-text-to-text model with agentic capabilities.",
+			id: "microsoft/Magma-8B",
 		},
 		{
 			description: "Strong image-text-to-text model focused on documents.",
-			id: "
+			id: "allenai/olmOCR-7B-0225-preview",
+		},
+		{
+			description: "Small yet strong image-text-to-text model.",
+			id: "ibm-granite/granite-vision-3.2-2b",
 		},
 	],
 	spaces: [
@@ -82,8 +86,8 @@ const taskData = {
 			id: "akhaliq/Molmo-7B-D-0924",
 		},
 		{
-			description: "
-			id: "
+			description: "Powerful vision language assistant that can understand multiple images.",
+			id: "HuggingFaceTB/SmolVLM2",
 		},
 		{
 			description: "An application for chatting with an image-text-to-text model.",
package/dist/esm/tasks/keypoint-detection/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/keypoint-detection/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/keypoint-detection/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqDf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/keypoint-detection/data.js
CHANGED
@@ -25,6 +25,10 @@ const taskData = {
 			description: "A robust keypoint detection model.",
 			id: "magic-leap-community/superpoint",
 		},
+		{
+			description: "A robust keypoint matching model.",
+			id: "magic-leap-community/superglue_outdoor",
+		},
 		{
 			description: "Strong keypoint detection model used to detect human pose.",
 			id: "facebook/sapiens-pose-1b",
package/dist/esm/tasks/object-detection/data.js
CHANGED
@@ -43,12 +43,12 @@ const taskData = {
 			id: "facebook/detr-resnet-50",
 		},
 		{
-			description: "
-			id: "
+			description: "Accurate object detection model.",
+			id: "IDEA-Research/dab-detr-resnet-50",
 		},
 		{
-			description: "Fast and accurate object detection model
-			id: "PekingU/
+			description: "Fast and accurate object detection model.",
+			id: "PekingU/rtdetr_v2_r50vd",
 		},
 		{
 			description: "Object detection model for low-lying objects.",
@@ -66,7 +66,7 @@ const taskData = {
 		},
 		{
 			description: "A cutting-edge object detection application.",
-			id: "
+			id: "sunsmarterjieleaf/yolov12",
 		},
 		{
 			description: "An object tracking, segmentation and inpainting application.",
package/dist/esm/tasks/text-generation/data.js
CHANGED
@@ -71,7 +71,7 @@ const taskData = {
 		},
 		{
 			description: "A very powerful model with reasoning capabilities.",
-			id: "
+			id: "simplescaling/s1.1-32B",
 		},
 		{
 			description: "Strong conversational model that supports very long instructions.",
package/dist/esm/tasks/text-to-speech/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/text-to-speech/data.js
CHANGED
@@ -74,6 +74,10 @@ const taskData = {
 			description: "An application that synthesizes emotional speech for diverse speaker prompts.",
 			id: "parler-tts/parler-tts-expresso",
 		},
+		{
+			description: "An application that generates podcast episodes.",
+			id: "ngxson/kokoro-podcast-generator",
+		},
 	],
 	summary: "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.",
 	widgetModels: ["suno/bark"],
package/dist/esm/tasks/text-to-video/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-video/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-video/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqGf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/text-to-video/data.js
CHANGED
@@ -71,6 +71,10 @@ const taskData = {
 			description: "A text-to-video model focusing on physics-aware applications like robotics.",
 			id: "nvidia/Cosmos-1.0-Diffusion-7B-Text2World",
 		},
+		{
+			description: "A robust model for video generation.",
+			id: "Wan-AI/Wan2.1-T2V-1.3B",
+		},
 	],
 	spaces: [
 		{
@@ -79,7 +83,7 @@ const taskData = {
 		},
 		{
 			description: "Consistent video generation application.",
-			id: "
+			id: "Wan-AI/Wan2.1",
 		},
 		{
 			description: "A cutting edge video generation application.",
@@ -87,7 +91,7 @@ const taskData = {
 		},
 	],
 	summary: "Text-to-video models can be used in any application that requires generating consistent sequence of images from text. ",
-	widgetModels: [],
+	widgetModels: ["tencent/HunyuanVideo"],
 	youtubeId: undefined,
 };
 export default taskData;
package/dist/esm/tasks/video-text-to-text/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/video-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/video-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/video-text-to-text/data.js
CHANGED
@@ -43,6 +43,10 @@ const taskData = {
 			description: "Strong video-text-to-text model with reasoning capabilities.",
 			id: "GoodiesHere/Apollo-LMMs-Apollo-7B-t32",
 		},
+		{
+			description: "Strong video-text-to-text model.",
+			id: "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+		},
 	],
 	spaces: [
 		{
@@ -53,6 +57,10 @@ const taskData = {
 			description: "A leaderboard for various video-text-to-text models.",
 			id: "opencompass/openvlm_video_leaderboard",
 		},
+		{
+			description: "An application to generate highlights from a video.",
+			id: "HuggingFaceTB/SmolVLM2-HighlightGenerator",
+		},
 	],
 	summary: "Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",
 	widgetModels: [""],
package/dist/esm/tasks/zero-shot-classification/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/zero-shot-classification/data.js
CHANGED
@@ -56,6 +56,10 @@ const taskData = {
 			description: "Cutting-edge zero-shot multilingual text classification model.",
 			id: "MoritzLaurer/ModernBERT-large-zeroshot-v2.0",
 		},
+		{
+			description: "Zero-shot text classification model that can be used for topic and sentiment classification.",
+			id: "knowledgator/gliclass-modern-base-v2.0-init",
+		},
 	],
 	spaces: [],
 	summary: "Zero-shot text classification is a task in natural language processing where a model is trained on a set of labeled examples but is then able to classify new examples from previously unseen classes.",
package/dist/esm/tasks/zero-shot-image-classification/data.js
CHANGED
@@ -51,11 +51,11 @@ const taskData = {
 		},
 		{
 			description: "Strong zero-shot image classification model.",
-			id: "google/
+			id: "google/siglip2-base-patch16-224",
 		},
 		{
 			description: "Robust zero-shot image classification model.",
-			id: "
+			id: "intfloat/mmE5-mllama-11b-instruct",
 		},
 		{
 			description: "Powerful zero-shot image classification model supporting 94 languages.",
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
 	"name": "@huggingface/tasks",
 	"packageManager": "pnpm@8.10.5",
-	"version": "0.16.4",
+	"version": "0.16.6",
 	"description": "List of ML tasks for huggingface.co/tasks",
 	"repository": "https://github.com/huggingface/huggingface.js.git",
 	"publishConfig": {
package/src/model-libraries.ts
CHANGED
@@ -394,6 +394,12 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
 		repoUrl: "https://github.com/Tencent/HunyuanDiT",
 		countDownloads: `path:"pytorch_model_ema.pt" OR path:"pytorch_model_distill.pt"`,
 	},
+	"hunyuan3d-2": {
+		prettyLabel: "Hunyuan3D-2",
+		repoName: "Hunyuan3D-2",
+		repoUrl: "https://github.com/Tencent/Hunyuan3D-2",
+		countDownloads: `path:"model_index.json" OR path:"config.yaml"`,
+	},
 	imstoucan: {
 		prettyLabel: "IMS Toucan",
 		repoName: "IMS-Toucan",
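The new "hunyuan3d-2" entry reuses the countDownloads filter syntax already used by the hunyuandit entry above: an OR of path:"..." clauses naming the files whose requests the Hub counts as downloads. As a minimal sketch of how such a filter can be read, the matcher below is illustrative only; the real evaluation happens Hub-side and is not part of this package:

// Parse a filter like `path:"a" OR path:"b"` into its path clauses
// and test whether a given file path matches any of them.
const matchesCountDownloads = (filter: string, filePath: string): boolean =>
	filter
		.split(" OR ")
		.map((clause) => /path:"([^"]+)"/.exec(clause.trim())?.[1])
		.some((p) => p === filePath);

matchesCountDownloads(`path:"model_index.json" OR path:"config.yaml"`, "config.yaml"); // true
matchesCountDownloads(`path:"model_index.json" OR path:"config.yaml"`, "model.fbx"); // false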
package/src/snippets/inputs.ts
CHANGED
@@ -96,6 +96,8 @@ const inputsAudioClassification = () => `"sample1.flac"`;
 
 const inputsTextToImage = () => `"Astronaut riding a horse"`;
 
+const inputsTextToVideo = () => `"A young man walking on the street"`;
+
 const inputsTextToSpeech = () => `"The answer to the universe is 42"`;
 
 const inputsTextToAudio = () => `"liquid drum and bass, atmospheric synths, airy sounds"`;
@@ -130,6 +132,7 @@ const modelInputSnippets: {
 	"text-generation": inputsTextGeneration,
 	"image-text-to-text": inputsTextGeneration,
 	"text-to-image": inputsTextToImage,
+	"text-to-video": inputsTextToVideo,
 	"text-to-speech": inputsTextToSpeech,
 	"text-to-audio": inputsTextToAudio,
 	"text2text-generation": inputsText2TextGeneration,
package/src/snippets/js.ts
CHANGED
@@ -275,6 +275,33 @@ query({"inputs": ${getModelInputSnippet(model)}}).then((response) => {
 	];
 };
 
+export const snippetTextToVideo = (
+	model: ModelDataMinimal,
+	accessToken: string,
+	provider: SnippetInferenceProvider
+): InferenceSnippet[] => {
+	return ["fal-ai", "replicate"].includes(provider)
+		? [
+				{
+					client: "huggingface.js",
+					content: `\
+import { HfInference } from "@huggingface/inference";
+
+const client = new HfInference("${accessToken || `{API_TOKEN}`}");
+
+const video = await client.textToVideo({
+	model: "${model.id}",
+	provider: "${provider}",
+	inputs: ${getModelInputSnippet(model)},
+	parameters: { num_inference_steps: 5 },
+});
+// Use the generated video (it's a Blob)
+`,
+				},
+		  ]
+		: [];
+};
+
 export const snippetTextToAudio = (
 	model: ModelDataMinimal,
 	accessToken: string,
@@ -420,6 +447,7 @@ export const jsSnippets: Partial<
 	"sentence-similarity": snippetBasic,
 	"automatic-speech-recognition": snippetAutomaticSpeechRecognition,
 	"text-to-image": snippetTextToImage,
+	"text-to-video": snippetTextToVideo,
 	"text-to-speech": snippetTextToAudio,
 	"text-to-audio": snippetTextToAudio,
 	"audio-to-audio": snippetFile,
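For reference, the huggingface.js snippet this new generator emits renders roughly as follows. This is a sketch assuming model.id is "Wan-AI/Wan2.1-T2V-1.3B" (one of the models added in this release), provider "fal-ai", no access token (so the "{API_TOKEN}" placeholder is kept), and the new inputsTextToVideo default input:

import { HfInference } from "@huggingface/inference";

const client = new HfInference("{API_TOKEN}");

// Request a short clip from the text prompt; the call resolves to a Blob.
const video = await client.textToVideo({
	model: "Wan-AI/Wan2.1-T2V-1.3B",
	provider: "fal-ai",
	inputs: "A young man walking on the street",
	parameters: { num_inference_steps: 5 },
});
// Use the generated video (it's a Blob)

For any provider other than "fal-ai" or "replicate", snippetTextToVideo returns an empty array, so no snippet is shown.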
package/src/snippets/python.ts
CHANGED
@@ -308,6 +308,27 @@ image = Image.open(io.BytesIO(image_bytes))`,
 	];
 };
 
+export const snippetTextToVideo = (
+	model: ModelDataMinimal,
+	accessToken: string,
+	provider: SnippetInferenceProvider
+): InferenceSnippet[] => {
+	return ["fal-ai", "replicate"].includes(provider)
+		? [
+				{
+					client: "huggingface_hub",
+					content: `\
+${snippetImportInferenceClient(accessToken, provider)}
+
+video = client.text_to_video(
+	${getModelInputSnippet(model)},
+	model="${model.id}"
+)`,
+				},
+		  ]
+		: [];
+};
+
 export const snippetTabular = (model: ModelDataMinimal): InferenceSnippet[] => {
 	return [
 		{
@@ -412,6 +433,7 @@ export const pythonSnippets: Partial<
 	"sentence-similarity": snippetBasic,
 	"automatic-speech-recognition": snippetFile,
 	"text-to-image": snippetTextToImage,
+	"text-to-video": snippetTextToVideo,
 	"text-to-speech": snippetTextToAudio,
 	"text-to-audio": snippetTextToAudio,
 	"audio-to-audio": snippetFile,
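The Python generator is gated on the same provider list as the JS one. Below is a minimal self-contained TypeScript sketch of that gating; renderPythonTextToVideo is a hypothetical stand-in for the new snippetTextToVideo, and the InferenceClient lines only approximate what snippetImportInferenceClient actually renders:

type InferenceSnippet = { client: string; content: string };

// Only providers known to support text-to-video produce a snippet;
// every other provider yields an empty list, mirroring the diff above.
const renderPythonTextToVideo = (modelId: string, provider: string): InferenceSnippet[] =>
	["fal-ai", "replicate"].includes(provider)
		? [
				{
					client: "huggingface_hub",
					content: `from huggingface_hub import InferenceClient

client = InferenceClient(provider="${provider}", api_key="{API_TOKEN}")

video = client.text_to_video(
    "A young man walking on the street",
    model="${modelId}"
)`,
				},
		  ]
		: [];

console.log(renderPythonTextToVideo("Wan-AI/Wan2.1-T2V-1.3B", "fal-ai").length); // 1
console.log(renderPythonTextToVideo("Wan-AI/Wan2.1-T2V-1.3B", "hf-inference").length); // 0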
package/src/tasks/image-text-to-text/data.ts
CHANGED
@@ -48,7 +48,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A screenshot understanding model used to control computers.",
-			id: "
+			id: "microsoft/OmniParser-v2.0",
 		},
 		{
 			description: "Cutting-edge vision language model.",
@@ -63,12 +63,16 @@ const taskData: TaskDataCustom = {
 			id: "Qwen/Qwen2.5-VL-7B-Instruct",
 		},
 		{
-			description: "Image-text-to-text model with
-			id: "
+			description: "Image-text-to-text model with agentic capabilities.",
+			id: "microsoft/Magma-8B",
 		},
 		{
 			description: "Strong image-text-to-text model focused on documents.",
-			id: "
+			id: "allenai/olmOCR-7B-0225-preview",
+		},
+		{
+			description: "Small yet strong image-text-to-text model.",
+			id: "ibm-granite/granite-vision-3.2-2b",
 		},
 	],
 	spaces: [
@@ -85,8 +89,8 @@ const taskData: TaskDataCustom = {
 			id: "akhaliq/Molmo-7B-D-0924",
 		},
 		{
-			description: "
-			id: "
+			description: "Powerful vision language assistant that can understand multiple images.",
+			id: "HuggingFaceTB/SmolVLM2",
 		},
 		{
 			description: "An application for chatting with an image-text-to-text model.",
package/src/tasks/keypoint-detection/data.ts
CHANGED
@@ -27,6 +27,10 @@ const taskData: TaskDataCustom = {
 			description: "A robust keypoint detection model.",
 			id: "magic-leap-community/superpoint",
 		},
+		{
+			description: "A robust keypoint matching model.",
+			id: "magic-leap-community/superglue_outdoor",
+		},
 		{
 			description: "Strong keypoint detection model used to detect human pose.",
 			id: "facebook/sapiens-pose-1b",
package/src/tasks/object-detection/data.ts
CHANGED
@@ -47,12 +47,12 @@ const taskData: TaskDataCustom = {
 			id: "facebook/detr-resnet-50",
 		},
 		{
-			description: "
-			id: "
+			description: "Accurate object detection model.",
+			id: "IDEA-Research/dab-detr-resnet-50",
 		},
 		{
-			description: "Fast and accurate object detection model
-			id: "PekingU/
+			description: "Fast and accurate object detection model.",
+			id: "PekingU/rtdetr_v2_r50vd",
 		},
 		{
 			description: "Object detection model for low-lying objects.",
@@ -70,7 +70,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A cutting-edge object detection application.",
-			id: "
+			id: "sunsmarterjieleaf/yolov12",
 		},
 		{
 			description: "An object tracking, segmentation and inpainting application.",
package/src/tasks/text-generation/data.ts
CHANGED
@@ -76,7 +76,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A very powerful model with reasoning capabilities.",
-			id: "
+			id: "simplescaling/s1.1-32B",
 		},
 		{
 			description: "Strong conversational model that supports very long instructions.",
package/src/tasks/text-to-speech/data.ts
CHANGED
@@ -76,6 +76,10 @@ const taskData: TaskDataCustom = {
 			description: "An application that synthesizes emotional speech for diverse speaker prompts.",
 			id: "parler-tts/parler-tts-expresso",
 		},
+		{
+			description: "An application that generates podcast episodes.",
+			id: "ngxson/kokoro-podcast-generator",
+		},
 	],
 	summary:
 		"Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.",
package/src/tasks/text-to-video/data.ts
CHANGED
@@ -78,6 +78,10 @@ const taskData: TaskDataCustom = {
 			description: "A text-to-video model focusing on physics-aware applications like robotics.",
 			id: "nvidia/Cosmos-1.0-Diffusion-7B-Text2World",
 		},
+		{
+			description: "A robust model for video generation.",
+			id: "Wan-AI/Wan2.1-T2V-1.3B",
+		},
 	],
 	spaces: [
 		{
@@ -86,7 +90,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "Consistent video generation application.",
-			id: "
+			id: "Wan-AI/Wan2.1",
 		},
 		{
 			description: "A cutting edge video generation application.",
@@ -95,7 +99,7 @@ const taskData: TaskDataCustom = {
 	],
 	summary:
 		"Text-to-video models can be used in any application that requires generating consistent sequence of images from text. ",
-	widgetModels: [],
+	widgetModels: ["tencent/HunyuanVideo"],
 	youtubeId: undefined,
 };
 
package/src/tasks/video-text-to-text/data.ts
CHANGED
@@ -46,6 +46,10 @@ const taskData: TaskDataCustom = {
 			description: "Strong video-text-to-text model with reasoning capabilities.",
 			id: "GoodiesHere/Apollo-LMMs-Apollo-7B-t32",
 		},
+		{
+			description: "Strong video-text-to-text model.",
+			id: "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+		},
 	],
 	spaces: [
 		{
@@ -56,6 +60,10 @@ const taskData: TaskDataCustom = {
 			description: "A leaderboard for various video-text-to-text models.",
 			id: "opencompass/openvlm_video_leaderboard",
 		},
+		{
+			description: "An application to generate highlights from a video.",
+			id: "HuggingFaceTB/SmolVLM2-HighlightGenerator",
+		},
 	],
 	summary:
 		"Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",
package/src/tasks/zero-shot-classification/data.ts
CHANGED
@@ -60,6 +60,10 @@ const taskData: TaskDataCustom = {
 			description: "Cutting-edge zero-shot multilingual text classification model.",
 			id: "MoritzLaurer/ModernBERT-large-zeroshot-v2.0",
 		},
+		{
+			description: "Zero-shot text classification model that can be used for topic and sentiment classification.",
+			id: "knowledgator/gliclass-modern-base-v2.0-init",
+		},
 	],
 	spaces: [],
 	summary:
package/src/tasks/zero-shot-image-classification/data.ts
CHANGED
@@ -53,11 +53,11 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "Strong zero-shot image classification model.",
-			id: "google/
+			id: "google/siglip2-base-patch16-224",
 		},
 		{
 			description: "Robust zero-shot image classification model.",
-			id: "
+			id: "intfloat/mmE5-mllama-11b-instruct",
 		},
 		{
 			description: "Powerful zero-shot image classification model supporting 94 languages.",