@huggingface/tasks 0.16.4 → 0.16.6
This diff compares publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- package/dist/commonjs/model-libraries.d.ts +7 -1
- package/dist/commonjs/model-libraries.d.ts.map +1 -1
- package/dist/commonjs/model-libraries.js +6 -0
- package/dist/commonjs/snippets/inputs.d.ts.map +1 -1
- package/dist/commonjs/snippets/inputs.js +2 -0
- package/dist/commonjs/snippets/js.d.ts +1 -0
- package/dist/commonjs/snippets/js.d.ts.map +1 -1
- package/dist/commonjs/snippets/js.js +25 -1
- package/dist/commonjs/snippets/python.d.ts +1 -0
- package/dist/commonjs/snippets/python.d.ts.map +1 -1
- package/dist/commonjs/snippets/python.js +19 -1
- package/dist/commonjs/tasks/depth-estimation/data.js +1 -1
- package/dist/commonjs/tasks/image-text-to-text/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/image-text-to-text/data.js +10 -6
- package/dist/commonjs/tasks/keypoint-detection/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/keypoint-detection/data.js +4 -0
- package/dist/commonjs/tasks/object-detection/data.js +5 -5
- package/dist/commonjs/tasks/text-generation/data.js +1 -1
- package/dist/commonjs/tasks/text-to-speech/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/text-to-speech/data.js +4 -0
- package/dist/commonjs/tasks/text-to-video/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/text-to-video/data.js +6 -2
- package/dist/commonjs/tasks/video-text-to-text/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/video-text-to-text/data.js +8 -0
- package/dist/commonjs/tasks/zero-shot-classification/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/zero-shot-classification/data.js +4 -0
- package/dist/commonjs/tasks/zero-shot-image-classification/data.js +2 -2
- package/dist/esm/model-libraries.d.ts +7 -1
- package/dist/esm/model-libraries.d.ts.map +1 -1
- package/dist/esm/model-libraries.js +6 -0
- package/dist/esm/snippets/inputs.d.ts.map +1 -1
- package/dist/esm/snippets/inputs.js +2 -0
- package/dist/esm/snippets/js.d.ts +1 -0
- package/dist/esm/snippets/js.d.ts.map +1 -1
- package/dist/esm/snippets/js.js +23 -0
- package/dist/esm/snippets/python.d.ts +1 -0
- package/dist/esm/snippets/python.d.ts.map +1 -1
- package/dist/esm/snippets/python.js +17 -0
- package/dist/esm/tasks/depth-estimation/data.js +1 -1
- package/dist/esm/tasks/image-text-to-text/data.d.ts.map +1 -1
- package/dist/esm/tasks/image-text-to-text/data.js +10 -6
- package/dist/esm/tasks/keypoint-detection/data.d.ts.map +1 -1
- package/dist/esm/tasks/keypoint-detection/data.js +4 -0
- package/dist/esm/tasks/object-detection/data.js +5 -5
- package/dist/esm/tasks/text-generation/data.js +1 -1
- package/dist/esm/tasks/text-to-speech/data.d.ts.map +1 -1
- package/dist/esm/tasks/text-to-speech/data.js +4 -0
- package/dist/esm/tasks/text-to-video/data.d.ts.map +1 -1
- package/dist/esm/tasks/text-to-video/data.js +6 -2
- package/dist/esm/tasks/video-text-to-text/data.d.ts.map +1 -1
- package/dist/esm/tasks/video-text-to-text/data.js +8 -0
- package/dist/esm/tasks/zero-shot-classification/data.d.ts.map +1 -1
- package/dist/esm/tasks/zero-shot-classification/data.js +4 -0
- package/dist/esm/tasks/zero-shot-image-classification/data.js +2 -2
- package/package.json +1 -1
- package/src/model-libraries.ts +6 -0
- package/src/snippets/inputs.ts +3 -0
- package/src/snippets/js.ts +28 -0
- package/src/snippets/python.ts +22 -0
- package/src/tasks/depth-estimation/data.ts +1 -1
- package/src/tasks/image-text-to-text/data.ts +10 -6
- package/src/tasks/keypoint-detection/data.ts +4 -0
- package/src/tasks/object-detection/data.ts +5 -5
- package/src/tasks/text-generation/data.ts +1 -1
- package/src/tasks/text-to-speech/data.ts +4 -0
- package/src/tasks/text-to-video/data.ts +6 -2
- package/src/tasks/video-text-to-text/data.ts +8 -0
- package/src/tasks/zero-shot-classification/data.ts +4 -0
- package/src/tasks/zero-shot-image-classification/data.ts +2 -2
package/dist/esm/tasks/image-text-to-text/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cA6Gf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/image-text-to-text/data.js
CHANGED
@@ -45,7 +45,7 @@ const taskData = {
 		},
 		{
 			description: "A screenshot understanding model used to control computers.",
-			id: "
+			id: "microsoft/OmniParser-v2.0",
 		},
 		{
 			description: "Cutting-edge vision language model.",
@@ -60,12 +60,16 @@ const taskData = {
 			id: "Qwen/Qwen2.5-VL-7B-Instruct",
 		},
 		{
-			description: "Image-text-to-text model with
-			id: "
+			description: "Image-text-to-text model with agentic capabilities.",
+			id: "microsoft/Magma-8B",
 		},
 		{
 			description: "Strong image-text-to-text model focused on documents.",
-			id: "
+			id: "allenai/olmOCR-7B-0225-preview",
+		},
+		{
+			description: "Small yet strong image-text-to-text model.",
+			id: "ibm-granite/granite-vision-3.2-2b",
 		},
 	],
 	spaces: [
@@ -82,8 +86,8 @@ const taskData = {
 			id: "akhaliq/Molmo-7B-D-0924",
 		},
 		{
-			description: "
-			id: "
+			description: "Powerful vision language assistant that can understand multiple images.",
+			id: "HuggingFaceTB/SmolVLM2",
 		},
 		{
 			description: "An application for chatting with an image-text-to-text model.",
package/dist/esm/tasks/keypoint-detection/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/keypoint-detection/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/keypoint-detection/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqDf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/keypoint-detection/data.js
CHANGED
@@ -25,6 +25,10 @@ const taskData = {
 			description: "A robust keypoint detection model.",
 			id: "magic-leap-community/superpoint",
 		},
+		{
+			description: "A robust keypoint matching model.",
+			id: "magic-leap-community/superglue_outdoor",
+		},
 		{
 			description: "Strong keypoint detection model used to detect human pose.",
 			id: "facebook/sapiens-pose-1b",
package/dist/esm/tasks/object-detection/data.js
CHANGED
@@ -43,12 +43,12 @@ const taskData = {
 			id: "facebook/detr-resnet-50",
 		},
 		{
-			description: "
-			id: "
+			description: "Accurate object detection model.",
+			id: "IDEA-Research/dab-detr-resnet-50",
 		},
 		{
-			description: "Fast and accurate object detection model
-			id: "PekingU/
+			description: "Fast and accurate object detection model.",
+			id: "PekingU/rtdetr_v2_r50vd",
 		},
 		{
 			description: "Object detection model for low-lying objects.",
@@ -66,7 +66,7 @@ const taskData = {
 		},
 		{
 			description: "A cutting-edge object detection application.",
-			id: "
+			id: "sunsmarterjieleaf/yolov12",
 		},
 		{
 			description: "An object tracking, segmentation and inpainting application.",
package/dist/esm/tasks/text-generation/data.js
CHANGED
@@ -71,7 +71,7 @@ const taskData = {
 		},
 		{
 			description: "A very powerful model with reasoning capabilities.",
-			id: "
+			id: "simplescaling/s1.1-32B",
 		},
 		{
 			description: "Strong conversational model that supports very long instructions.",
package/dist/esm/tasks/text-to-speech/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/text-to-speech/data.js
CHANGED
@@ -74,6 +74,10 @@ const taskData = {
 			description: "An application that synthesizes emotional speech for diverse speaker prompts.",
 			id: "parler-tts/parler-tts-expresso",
 		},
+		{
+			description: "An application that generates podcast episodes.",
+			id: "ngxson/kokoro-podcast-generator",
+		},
 	],
 	summary: "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.",
 	widgetModels: ["suno/bark"],
package/dist/esm/tasks/text-to-video/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-video/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-video/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqGf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/text-to-video/data.js
CHANGED
@@ -71,6 +71,10 @@ const taskData = {
 			description: "A text-to-video model focusing on physics-aware applications like robotics.",
 			id: "nvidia/Cosmos-1.0-Diffusion-7B-Text2World",
 		},
+		{
+			description: "A robust model for video generation.",
+			id: "Wan-AI/Wan2.1-T2V-1.3B",
+		},
 	],
 	spaces: [
 		{
@@ -79,7 +83,7 @@ const taskData = {
 		},
 		{
 			description: "Consistent video generation application.",
-			id: "
+			id: "Wan-AI/Wan2.1",
 		},
 		{
 			description: "A cutting edge video generation application.",
@@ -87,7 +91,7 @@ const taskData = {
 		},
 	],
 	summary: "Text-to-video models can be used in any application that requires generating consistent sequence of images from text. ",
-	widgetModels: [],
+	widgetModels: ["tencent/HunyuanVideo"],
 	youtubeId: undefined,
 };
 export default taskData;
package/dist/esm/tasks/video-text-to-text/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/video-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/video-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/video-text-to-text/data.js
CHANGED
@@ -43,6 +43,10 @@ const taskData = {
 			description: "Strong video-text-to-text model with reasoning capabilities.",
 			id: "GoodiesHere/Apollo-LMMs-Apollo-7B-t32",
 		},
+		{
+			description: "Strong video-text-to-text model.",
+			id: "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+		},
 	],
 	spaces: [
 		{
@@ -53,6 +57,10 @@ const taskData = {
 			description: "A leaderboard for various video-text-to-text models.",
 			id: "opencompass/openvlm_video_leaderboard",
 		},
+		{
+			description: "An application to generate highlights from a video.",
+			id: "HuggingFaceTB/SmolVLM2-HighlightGenerator",
+		},
 	],
 	summary: "Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",
 	widgetModels: [""],
package/dist/esm/tasks/zero-shot-classification/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/zero-shot-classification/data.js
CHANGED
@@ -56,6 +56,10 @@ const taskData = {
 			description: "Cutting-edge zero-shot multilingual text classification model.",
 			id: "MoritzLaurer/ModernBERT-large-zeroshot-v2.0",
 		},
+		{
+			description: "Zero-shot text classification model that can be used for topic and sentiment classification.",
+			id: "knowledgator/gliclass-modern-base-v2.0-init",
+		},
 	],
 	spaces: [],
 	summary: "Zero-shot text classification is a task in natural language processing where a model is trained on a set of labeled examples but is then able to classify new examples from previously unseen classes.",
package/dist/esm/tasks/zero-shot-image-classification/data.js
CHANGED
@@ -51,11 +51,11 @@ const taskData = {
 		},
 		{
 			description: "Strong zero-shot image classification model.",
-			id: "google/
+			id: "google/siglip2-base-patch16-224",
 		},
 		{
 			description: "Robust zero-shot image classification model.",
-			id: "
+			id: "intfloat/mmE5-mllama-11b-instruct",
 		},
 		{
 			description: "Powerful zero-shot image classification model supporting 94 languages.",
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
 	"name": "@huggingface/tasks",
 	"packageManager": "pnpm@8.10.5",
-	"version": "0.16.4",
+	"version": "0.16.6",
 	"description": "List of ML tasks for huggingface.co/tasks",
 	"repository": "https://github.com/huggingface/huggingface.js.git",
 	"publishConfig": {
package/src/model-libraries.ts
CHANGED
@@ -394,6 +394,12 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
 		repoUrl: "https://github.com/Tencent/HunyuanDiT",
 		countDownloads: `path:"pytorch_model_ema.pt" OR path:"pytorch_model_distill.pt"`,
 	},
+	"hunyuan3d-2": {
+		prettyLabel: "Hunyuan3D-2",
+		repoName: "Hunyuan3D-2",
+		repoUrl: "https://github.com/Tencent/Hunyuan3D-2",
+		countDownloads: `path:"model_index.json" OR path:"config.yaml"`,
+	},
 	imstoucan: {
 		prettyLabel: "IMS Toucan",
 		repoName: "IMS-Toucan",
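The new "hunyuan3d-2" entry reuses the countDownloads filter syntax already used by the hunyuandit entry above: an OR of path:"..." clauses naming the files whose requests the Hub counts as downloads. As a minimal sketch of how such a filter can be read, the matcher below is illustrative only; the real evaluation happens Hub-side and is not part of this package:

// Parse a filter like `path:"a" OR path:"b"` into its path clauses
// and test whether a given file path matches any of them.
const matchesCountDownloads = (filter: string, filePath: string): boolean =>
	filter
		.split(" OR ")
		.map((clause) => /path:"([^"]+)"/.exec(clause.trim())?.[1])
		.some((p) => p === filePath);

matchesCountDownloads(`path:"model_index.json" OR path:"config.yaml"`, "config.yaml"); // true
matchesCountDownloads(`path:"model_index.json" OR path:"config.yaml"`, "model.fbx"); // false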
package/src/snippets/inputs.ts
CHANGED
@@ -96,6 +96,8 @@ const inputsAudioClassification = () => `"sample1.flac"`;
 
 const inputsTextToImage = () => `"Astronaut riding a horse"`;
 
+const inputsTextToVideo = () => `"A young man walking on the street"`;
+
 const inputsTextToSpeech = () => `"The answer to the universe is 42"`;
 
 const inputsTextToAudio = () => `"liquid drum and bass, atmospheric synths, airy sounds"`;
@@ -130,6 +132,7 @@ const modelInputSnippets: {
 	"text-generation": inputsTextGeneration,
 	"image-text-to-text": inputsTextGeneration,
 	"text-to-image": inputsTextToImage,
+	"text-to-video": inputsTextToVideo,
 	"text-to-speech": inputsTextToSpeech,
 	"text-to-audio": inputsTextToAudio,
 	"text2text-generation": inputsText2TextGeneration,
package/src/snippets/js.ts
CHANGED
@@ -275,6 +275,33 @@ query({"inputs": ${getModelInputSnippet(model)}}).then((response) => {
 	];
 };
 
+export const snippetTextToVideo = (
+	model: ModelDataMinimal,
+	accessToken: string,
+	provider: SnippetInferenceProvider
+): InferenceSnippet[] => {
+	return ["fal-ai", "replicate"].includes(provider)
+		? [
+				{
+					client: "huggingface.js",
+					content: `\
+import { HfInference } from "@huggingface/inference";
+
+const client = new HfInference("${accessToken || `{API_TOKEN}`}");
+
+const video = await client.textToVideo({
+	model: "${model.id}",
+	provider: "${provider}",
+	inputs: ${getModelInputSnippet(model)},
+	parameters: { num_inference_steps: 5 },
+});
+// Use the generated video (it's a Blob)
+`,
+				},
+		  ]
+		: [];
+};
+
 export const snippetTextToAudio = (
 	model: ModelDataMinimal,
 	accessToken: string,
@@ -420,6 +447,7 @@ export const jsSnippets: Partial<
 	"sentence-similarity": snippetBasic,
 	"automatic-speech-recognition": snippetAutomaticSpeechRecognition,
 	"text-to-image": snippetTextToImage,
+	"text-to-video": snippetTextToVideo,
 	"text-to-speech": snippetTextToAudio,
 	"text-to-audio": snippetTextToAudio,
 	"audio-to-audio": snippetFile,
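For reference, the huggingface.js snippet this new generator emits renders roughly as follows. This is a sketch assuming model.id is "Wan-AI/Wan2.1-T2V-1.3B" (one of the models added in this release), provider "fal-ai", no access token (so the "{API_TOKEN}" placeholder is kept), and the new inputsTextToVideo default input:

import { HfInference } from "@huggingface/inference";

const client = new HfInference("{API_TOKEN}");

// Request a short clip from the text prompt; the call resolves to a Blob.
const video = await client.textToVideo({
	model: "Wan-AI/Wan2.1-T2V-1.3B",
	provider: "fal-ai",
	inputs: "A young man walking on the street",
	parameters: { num_inference_steps: 5 },
});
// Use the generated video (it's a Blob)

For any provider other than "fal-ai" or "replicate", snippetTextToVideo returns an empty array, so no snippet is shown.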
package/src/snippets/python.ts
CHANGED
@@ -308,6 +308,27 @@ image = Image.open(io.BytesIO(image_bytes))`,
 	];
 };
 
+export const snippetTextToVideo = (
+	model: ModelDataMinimal,
+	accessToken: string,
+	provider: SnippetInferenceProvider
+): InferenceSnippet[] => {
+	return ["fal-ai", "replicate"].includes(provider)
+		? [
+				{
+					client: "huggingface_hub",
+					content: `\
+${snippetImportInferenceClient(accessToken, provider)}
+
+video = client.text_to_video(
+	${getModelInputSnippet(model)},
+	model="${model.id}"
+)`,
+				},
+		  ]
+		: [];
+};
+
 export const snippetTabular = (model: ModelDataMinimal): InferenceSnippet[] => {
 	return [
 		{
@@ -412,6 +433,7 @@ export const pythonSnippets: Partial<
 	"sentence-similarity": snippetBasic,
 	"automatic-speech-recognition": snippetFile,
 	"text-to-image": snippetTextToImage,
+	"text-to-video": snippetTextToVideo,
 	"text-to-speech": snippetTextToAudio,
 	"text-to-audio": snippetTextToAudio,
 	"audio-to-audio": snippetFile,
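The Python generator is gated on the same provider list as the JS one. Below is a minimal self-contained TypeScript sketch of that gating; renderPythonTextToVideo is a hypothetical stand-in for the new snippetTextToVideo, and the InferenceClient lines only approximate what snippetImportInferenceClient actually renders:

type InferenceSnippet = { client: string; content: string };

// Only providers known to support text-to-video produce a snippet;
// every other provider yields an empty list, mirroring the diff above.
const renderPythonTextToVideo = (modelId: string, provider: string): InferenceSnippet[] =>
	["fal-ai", "replicate"].includes(provider)
		? [
				{
					client: "huggingface_hub",
					content: `from huggingface_hub import InferenceClient

client = InferenceClient(provider="${provider}", api_key="{API_TOKEN}")

video = client.text_to_video(
    "A young man walking on the street",
    model="${modelId}"
)`,
				},
		  ]
		: [];

console.log(renderPythonTextToVideo("Wan-AI/Wan2.1-T2V-1.3B", "fal-ai").length); // 1
console.log(renderPythonTextToVideo("Wan-AI/Wan2.1-T2V-1.3B", "hf-inference").length); // 0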
package/src/tasks/image-text-to-text/data.ts
CHANGED
@@ -48,7 +48,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A screenshot understanding model used to control computers.",
-			id: "
+			id: "microsoft/OmniParser-v2.0",
 		},
 		{
 			description: "Cutting-edge vision language model.",
@@ -63,12 +63,16 @@ const taskData: TaskDataCustom = {
 			id: "Qwen/Qwen2.5-VL-7B-Instruct",
 		},
 		{
-			description: "Image-text-to-text model with
-			id: "
+			description: "Image-text-to-text model with agentic capabilities.",
+			id: "microsoft/Magma-8B",
 		},
 		{
 			description: "Strong image-text-to-text model focused on documents.",
-			id: "
+			id: "allenai/olmOCR-7B-0225-preview",
+		},
+		{
+			description: "Small yet strong image-text-to-text model.",
+			id: "ibm-granite/granite-vision-3.2-2b",
 		},
 	],
 	spaces: [
@@ -85,8 +89,8 @@ const taskData: TaskDataCustom = {
 			id: "akhaliq/Molmo-7B-D-0924",
 		},
 		{
-			description: "
-			id: "
+			description: "Powerful vision language assistant that can understand multiple images.",
+			id: "HuggingFaceTB/SmolVLM2",
 		},
 		{
 			description: "An application for chatting with an image-text-to-text model.",
package/src/tasks/keypoint-detection/data.ts
CHANGED
@@ -27,6 +27,10 @@ const taskData: TaskDataCustom = {
 			description: "A robust keypoint detection model.",
 			id: "magic-leap-community/superpoint",
 		},
+		{
+			description: "A robust keypoint matching model.",
+			id: "magic-leap-community/superglue_outdoor",
+		},
 		{
 			description: "Strong keypoint detection model used to detect human pose.",
 			id: "facebook/sapiens-pose-1b",
package/src/tasks/object-detection/data.ts
CHANGED
@@ -47,12 +47,12 @@ const taskData: TaskDataCustom = {
 			id: "facebook/detr-resnet-50",
 		},
 		{
-			description: "
-			id: "
+			description: "Accurate object detection model.",
+			id: "IDEA-Research/dab-detr-resnet-50",
 		},
 		{
-			description: "Fast and accurate object detection model
-			id: "PekingU/
+			description: "Fast and accurate object detection model.",
+			id: "PekingU/rtdetr_v2_r50vd",
 		},
 		{
 			description: "Object detection model for low-lying objects.",
@@ -70,7 +70,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A cutting-edge object detection application.",
-			id: "
+			id: "sunsmarterjieleaf/yolov12",
 		},
 		{
 			description: "An object tracking, segmentation and inpainting application.",
package/src/tasks/text-generation/data.ts
CHANGED
@@ -76,7 +76,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A very powerful model with reasoning capabilities.",
-			id: "
+			id: "simplescaling/s1.1-32B",
 		},
 		{
 			description: "Strong conversational model that supports very long instructions.",
package/src/tasks/text-to-speech/data.ts
CHANGED
@@ -76,6 +76,10 @@ const taskData: TaskDataCustom = {
 			description: "An application that synthesizes emotional speech for diverse speaker prompts.",
 			id: "parler-tts/parler-tts-expresso",
 		},
+		{
+			description: "An application that generates podcast episodes.",
+			id: "ngxson/kokoro-podcast-generator",
+		},
 	],
 	summary:
 		"Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.",
package/src/tasks/text-to-video/data.ts
CHANGED
@@ -78,6 +78,10 @@ const taskData: TaskDataCustom = {
 			description: "A text-to-video model focusing on physics-aware applications like robotics.",
 			id: "nvidia/Cosmos-1.0-Diffusion-7B-Text2World",
 		},
+		{
+			description: "A robust model for video generation.",
+			id: "Wan-AI/Wan2.1-T2V-1.3B",
+		},
 	],
 	spaces: [
 		{
@@ -86,7 +90,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "Consistent video generation application.",
-			id: "
+			id: "Wan-AI/Wan2.1",
 		},
 		{
 			description: "A cutting edge video generation application.",
@@ -95,7 +99,7 @@ const taskData: TaskDataCustom = {
 	],
 	summary:
 		"Text-to-video models can be used in any application that requires generating consistent sequence of images from text. ",
-	widgetModels: [],
+	widgetModels: ["tencent/HunyuanVideo"],
 	youtubeId: undefined,
 };
 
package/src/tasks/video-text-to-text/data.ts
CHANGED
@@ -46,6 +46,10 @@ const taskData: TaskDataCustom = {
 			description: "Strong video-text-to-text model with reasoning capabilities.",
 			id: "GoodiesHere/Apollo-LMMs-Apollo-7B-t32",
 		},
+		{
+			description: "Strong video-text-to-text model.",
+			id: "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+		},
 	],
 	spaces: [
 		{
@@ -56,6 +60,10 @@ const taskData: TaskDataCustom = {
 			description: "A leaderboard for various video-text-to-text models.",
 			id: "opencompass/openvlm_video_leaderboard",
 		},
+		{
+			description: "An application to generate highlights from a video.",
+			id: "HuggingFaceTB/SmolVLM2-HighlightGenerator",
+		},
 	],
 	summary:
 		"Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",
package/src/tasks/zero-shot-classification/data.ts
CHANGED
@@ -60,6 +60,10 @@ const taskData: TaskDataCustom = {
 			description: "Cutting-edge zero-shot multilingual text classification model.",
 			id: "MoritzLaurer/ModernBERT-large-zeroshot-v2.0",
 		},
+		{
+			description: "Zero-shot text classification model that can be used for topic and sentiment classification.",
+			id: "knowledgator/gliclass-modern-base-v2.0-init",
+		},
 	],
 	spaces: [],
 	summary:
package/src/tasks/zero-shot-image-classification/data.ts
CHANGED
@@ -53,11 +53,11 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "Strong zero-shot image classification model.",
-			id: "google/
+			id: "google/siglip2-base-patch16-224",
 		},
 		{
 			description: "Robust zero-shot image classification model.",
-			id: "
+			id: "intfloat/mmE5-mllama-11b-instruct",
 		},
 		{
 			description: "Powerful zero-shot image classification model supporting 94 languages.",