@huggingface/tasks 0.16.4 → 0.16.5

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/dist/commonjs/tasks/depth-estimation/data.js +1 -1
  2. package/dist/commonjs/tasks/image-text-to-text/data.d.ts.map +1 -1
  3. package/dist/commonjs/tasks/image-text-to-text/data.js +10 -6
  4. package/dist/commonjs/tasks/keypoint-detection/data.d.ts.map +1 -1
  5. package/dist/commonjs/tasks/keypoint-detection/data.js +4 -0
  6. package/dist/commonjs/tasks/object-detection/data.js +5 -5
  7. package/dist/commonjs/tasks/text-generation/data.js +1 -1
  8. package/dist/commonjs/tasks/text-to-speech/data.d.ts.map +1 -1
  9. package/dist/commonjs/tasks/text-to-speech/data.js +4 -0
  10. package/dist/commonjs/tasks/text-to-video/data.d.ts.map +1 -1
  11. package/dist/commonjs/tasks/text-to-video/data.js +5 -1
  12. package/dist/commonjs/tasks/video-text-to-text/data.d.ts.map +1 -1
  13. package/dist/commonjs/tasks/video-text-to-text/data.js +8 -0
  14. package/dist/commonjs/tasks/zero-shot-classification/data.d.ts.map +1 -1
  15. package/dist/commonjs/tasks/zero-shot-classification/data.js +4 -0
  16. package/dist/commonjs/tasks/zero-shot-image-classification/data.js +2 -2
  17. package/dist/esm/tasks/depth-estimation/data.js +1 -1
  18. package/dist/esm/tasks/image-text-to-text/data.d.ts.map +1 -1
  19. package/dist/esm/tasks/image-text-to-text/data.js +10 -6
  20. package/dist/esm/tasks/keypoint-detection/data.d.ts.map +1 -1
  21. package/dist/esm/tasks/keypoint-detection/data.js +4 -0
  22. package/dist/esm/tasks/object-detection/data.js +5 -5
  23. package/dist/esm/tasks/text-generation/data.js +1 -1
  24. package/dist/esm/tasks/text-to-speech/data.d.ts.map +1 -1
  25. package/dist/esm/tasks/text-to-speech/data.js +4 -0
  26. package/dist/esm/tasks/text-to-video/data.d.ts.map +1 -1
  27. package/dist/esm/tasks/text-to-video/data.js +5 -1
  28. package/dist/esm/tasks/video-text-to-text/data.d.ts.map +1 -1
  29. package/dist/esm/tasks/video-text-to-text/data.js +8 -0
  30. package/dist/esm/tasks/zero-shot-classification/data.d.ts.map +1 -1
  31. package/dist/esm/tasks/zero-shot-classification/data.js +4 -0
  32. package/dist/esm/tasks/zero-shot-image-classification/data.js +2 -2
  33. package/package.json +1 -1
  34. package/src/tasks/depth-estimation/data.ts +1 -1
  35. package/src/tasks/image-text-to-text/data.ts +10 -6
  36. package/src/tasks/keypoint-detection/data.ts +4 -0
  37. package/src/tasks/object-detection/data.ts +5 -5
  38. package/src/tasks/text-generation/data.ts +1 -1
  39. package/src/tasks/text-to-speech/data.ts +4 -0
  40. package/src/tasks/text-to-video/data.ts +5 -1
  41. package/src/tasks/video-text-to-text/data.ts +8 -0
  42. package/src/tasks/zero-shot-classification/data.ts +4 -0
  43. package/src/tasks/zero-shot-image-classification/data.ts +2 -2
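Every data.js/data.ts change below edits a per-task taskData object, swapping or appending { description, id } entries in its models and spaces arrays (the curated recommendations surfaced on huggingface.co/tasks); the .d.ts.map changes are the regenerated declaration sourcemaps, and package.json carries the version bump. For orientation, here is a minimal TypeScript sketch of that shape, using placeholder type names rather than the package's actual TaskDataCustom definition:

// Minimal illustrative sketch of the structure these diffs modify.
// RecommendedEntry and TaskDataSketch are placeholder names; the package's
// real TaskDataCustom type carries additional fields (summary, widgetModels, ...).
type RecommendedEntry = { description: string; id: string };

type TaskDataSketch = {
  models: RecommendedEntry[];
  spaces: RecommendedEntry[];
};

const depthEstimation: TaskDataSketch = {
  models: [
    {
      description: "A robust depth estimation model.",
      id: "apple/DepthPro-hf", // 0.16.5 replaces the 0.16.4 id "apple/DepthPro"
    },
  ],
  spaces: [],
};

export default depthEstimation;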
package/dist/commonjs/tasks/depth-estimation/data.js CHANGED
@@ -41,7 +41,7 @@ const taskData = {
  },
  {
  description: "A robust depth estimation model.",
- id: "apple/DepthPro",
+ id: "apple/DepthPro-hf",
  },
  ],
  spaces: [
package/dist/commonjs/tasks/image-text-to-text/data.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAyGf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cA6Gf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/commonjs/tasks/image-text-to-text/data.js CHANGED
@@ -47,7 +47,7 @@ const taskData = {
  },
  {
  description: "A screenshot understanding model used to control computers.",
- id: "showlab/ShowUI-2B",
+ id: "microsoft/OmniParser-v2.0",
  },
  {
  description: "Cutting-edge vision language model."
@@ -62,12 +62,16 @@ const taskData = {
  id: "Qwen/Qwen2.5-VL-7B-Instruct",
  },
  {
- description: "Image-text-to-text model with reasoning capabilities.",
- id: "Qwen/QVQ-72B-Preview",
+ description: "Image-text-to-text model with agentic capabilities.",
+ id: "microsoft/Magma-8B",
  },
  {
  description: "Strong image-text-to-text model focused on documents.",
- id: "stepfun-ai/GOT-OCR2_0",
+ id: "allenai/olmOCR-7B-0225-preview",
+ },
+ {
+ description: "Small yet strong image-text-to-text model.",
+ id: "ibm-granite/granite-vision-3.2-2b",
  },
  ],
  spaces: [
@@ -84,8 +88,8 @@ const taskData = {
  id: "akhaliq/Molmo-7B-D-0924",
  },
  {
- description: "An image-text-to-text application focused on documents.",
- id: "stepfun-ai/GOT_official_online_demo",
+ description: "Powerful vision language assistant that can understand multiple images.",
+ id: "HuggingFaceTB/SmolVLM2",
  },
  {
  description: "An application for chatting with an image-text-to-text model."
package/dist/commonjs/tasks/keypoint-detection/data.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/keypoint-detection/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAiDf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/keypoint-detection/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqDf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/commonjs/tasks/keypoint-detection/data.js CHANGED
@@ -27,6 +27,10 @@ const taskData = {
  description: "A robust keypoint detection model.",
  id: "magic-leap-community/superpoint",
  },
+ {
+ description: "A robust keypoint matching model.",
+ id: "magic-leap-community/superglue_outdoor",
+ },
  {
  description: "Strong keypoint detection model used to detect human pose.",
  id: "facebook/sapiens-pose-1b",
package/dist/commonjs/tasks/object-detection/data.js CHANGED
@@ -45,12 +45,12 @@ const taskData = {
  id: "facebook/detr-resnet-50",
  },
  {
- description: "Real-time and accurate object detection model.",
- id: "jameslahm/yolov10x",
+ description: "Accurate object detection model.",
+ id: "IDEA-Research/dab-detr-resnet-50",
  },
  {
- description: "Fast and accurate object detection model trained on COCO and Object365 datasets.",
- id: "PekingU/rtdetr_r18vd_coco_o365",
+ description: "Fast and accurate object detection model.",
+ id: "PekingU/rtdetr_v2_r50vd",
  },
  {
  description: "Object detection model for low-lying objects."
@@ -68,7 +68,7 @@ const taskData = {
  },
  {
  description: "A cutting-edge object detection application.",
- id: "Ultralytics/YOLO11",
+ id: "sunsmarterjieleaf/yolov12",
  },
  {
  description: "An object tracking, segmentation and inpainting application."
package/dist/commonjs/tasks/text-generation/data.js CHANGED
@@ -73,7 +73,7 @@ const taskData = {
  },
  {
  description: "A very powerful model with reasoning capabilities.",
- id: "PowerInfer/SmallThinker-3B-Preview",
+ id: "simplescaling/s1.1-32B",
  },
  {
  description: "Strong conversational model that supports very long instructions."
package/dist/commonjs/tasks/text-to-speech/data.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAiFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/commonjs/tasks/text-to-speech/data.js CHANGED
@@ -76,6 +76,10 @@ const taskData = {
  description: "An application that synthesizes emotional speech for diverse speaker prompts.",
  id: "parler-tts/parler-tts-expresso",
  },
+ {
+ description: "An application that generates podcast episodes.",
+ id: "ngxson/kokoro-podcast-generator",
+ },
  ],
  summary: "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.",
  widgetModels: ["suno/bark"],
package/dist/commonjs/tasks/text-to-video/data.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-video/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAiGf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-video/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqGf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/commonjs/tasks/text-to-video/data.js CHANGED
@@ -73,6 +73,10 @@ const taskData = {
  description: "A text-to-video model focusing on physics-aware applications like robotics.",
  id: "nvidia/Cosmos-1.0-Diffusion-7B-Text2World",
  },
+ {
+ description: "A robust model for video generation.",
+ id: "Wan-AI/Wan2.1-T2V-1.3B",
+ },
  ],
  spaces: [
  {
@@ -81,7 +85,7 @@ const taskData = {
  },
  {
  description: "Consistent video generation application.",
- id: "TIGER-Lab/T2V-Turbo-V2",
+ id: "Wan-AI/Wan2.1",
  },
  {
  description: "A cutting edge video generation application."
package/dist/commonjs/tasks/video-text-to-text/data.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/video-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cA6Df,CAAC;AAEF,eAAe,QAAQ,CAAC"}
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/video-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/commonjs/tasks/video-text-to-text/data.js CHANGED
@@ -45,6 +45,10 @@ const taskData = {
  description: "Strong video-text-to-text model with reasoning capabilities.",
  id: "GoodiesHere/Apollo-LMMs-Apollo-7B-t32",
  },
+ {
+ description: "Strong video-text-to-text model.",
+ id: "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+ },
  ],
  spaces: [
  {
@@ -55,6 +59,10 @@ const taskData = {
  description: "A leaderboard for various video-text-to-text models.",
  id: "opencompass/openvlm_video_leaderboard",
  },
+ {
+ description: "An application to generate highlights from a video.",
+ id: "HuggingFaceTB/SmolVLM2-HighlightGenerator",
+ },
  ],
  summary: "Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",
  widgetModels: [""],
package/dist/commonjs/tasks/zero-shot-classification/data.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAiEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/commonjs/tasks/zero-shot-classification/data.js CHANGED
@@ -58,6 +58,10 @@ const taskData = {
  description: "Cutting-edge zero-shot multilingual text classification model.",
  id: "MoritzLaurer/ModernBERT-large-zeroshot-v2.0",
  },
+ {
+ description: "Zero-shot text classification model that can be used for topic and sentiment classification.",
+ id: "knowledgator/gliclass-modern-base-v2.0-init",
+ },
  ],
  spaces: [],
  summary: "Zero-shot text classification is a task in natural language processing where a model is trained on a set of labeled examples but is then able to classify new examples from previously unseen classes.",
package/dist/commonjs/tasks/zero-shot-image-classification/data.js CHANGED
@@ -53,11 +53,11 @@ const taskData = {
  },
  {
  description: "Strong zero-shot image classification model.",
- id: "google/siglip-so400m-patch14-224",
+ id: "google/siglip2-base-patch16-224",
  },
  {
  description: "Robust zero-shot image classification model.",
- id: "microsoft/LLM2CLIP-EVA02-L-14-336",
+ id: "intfloat/mmE5-mllama-11b-instruct",
  },
  {
  description: "Powerful zero-shot image classification model supporting 94 languages."
package/dist/esm/tasks/depth-estimation/data.js CHANGED
@@ -39,7 +39,7 @@ const taskData = {
  },
  {
  description: "A robust depth estimation model.",
- id: "apple/DepthPro",
+ id: "apple/DepthPro-hf",
  },
  ],
  spaces: [
package/dist/esm/tasks/image-text-to-text/data.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAyGf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cA6Gf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/image-text-to-text/data.js CHANGED
@@ -45,7 +45,7 @@ const taskData = {
  },
  {
  description: "A screenshot understanding model used to control computers.",
- id: "showlab/ShowUI-2B",
+ id: "microsoft/OmniParser-v2.0",
  },
  {
  description: "Cutting-edge vision language model."
@@ -60,12 +60,16 @@ const taskData = {
  id: "Qwen/Qwen2.5-VL-7B-Instruct",
  },
  {
- description: "Image-text-to-text model with reasoning capabilities.",
- id: "Qwen/QVQ-72B-Preview",
+ description: "Image-text-to-text model with agentic capabilities.",
+ id: "microsoft/Magma-8B",
  },
  {
  description: "Strong image-text-to-text model focused on documents.",
- id: "stepfun-ai/GOT-OCR2_0",
+ id: "allenai/olmOCR-7B-0225-preview",
+ },
+ {
+ description: "Small yet strong image-text-to-text model.",
+ id: "ibm-granite/granite-vision-3.2-2b",
  },
  ],
  spaces: [
@@ -82,8 +86,8 @@ const taskData = {
  id: "akhaliq/Molmo-7B-D-0924",
  },
  {
- description: "An image-text-to-text application focused on documents.",
- id: "stepfun-ai/GOT_official_online_demo",
+ description: "Powerful vision language assistant that can understand multiple images.",
+ id: "HuggingFaceTB/SmolVLM2",
  },
  {
  description: "An application for chatting with an image-text-to-text model."
package/dist/esm/tasks/keypoint-detection/data.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/keypoint-detection/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAiDf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/keypoint-detection/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqDf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/keypoint-detection/data.js CHANGED
@@ -25,6 +25,10 @@ const taskData = {
  description: "A robust keypoint detection model.",
  id: "magic-leap-community/superpoint",
  },
+ {
+ description: "A robust keypoint matching model.",
+ id: "magic-leap-community/superglue_outdoor",
+ },
  {
  description: "Strong keypoint detection model used to detect human pose.",
  id: "facebook/sapiens-pose-1b",
package/dist/esm/tasks/object-detection/data.js CHANGED
@@ -43,12 +43,12 @@ const taskData = {
  id: "facebook/detr-resnet-50",
  },
  {
- description: "Real-time and accurate object detection model.",
- id: "jameslahm/yolov10x",
+ description: "Accurate object detection model.",
+ id: "IDEA-Research/dab-detr-resnet-50",
  },
  {
- description: "Fast and accurate object detection model trained on COCO and Object365 datasets.",
- id: "PekingU/rtdetr_r18vd_coco_o365",
+ description: "Fast and accurate object detection model.",
+ id: "PekingU/rtdetr_v2_r50vd",
  },
  {
  description: "Object detection model for low-lying objects."
@@ -66,7 +66,7 @@ const taskData = {
  },
  {
  description: "A cutting-edge object detection application.",
- id: "Ultralytics/YOLO11",
+ id: "sunsmarterjieleaf/yolov12",
  },
  {
  description: "An object tracking, segmentation and inpainting application."
package/dist/esm/tasks/text-generation/data.js CHANGED
@@ -71,7 +71,7 @@ const taskData = {
  },
  {
  description: "A very powerful model with reasoning capabilities.",
- id: "PowerInfer/SmallThinker-3B-Preview",
+ id: "simplescaling/s1.1-32B",
  },
  {
  description: "Strong conversational model that supports very long instructions."
package/dist/esm/tasks/text-to-speech/data.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAiFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/text-to-speech/data.js CHANGED
@@ -74,6 +74,10 @@ const taskData = {
  description: "An application that synthesizes emotional speech for diverse speaker prompts.",
  id: "parler-tts/parler-tts-expresso",
  },
+ {
+ description: "An application that generates podcast episodes.",
+ id: "ngxson/kokoro-podcast-generator",
+ },
  ],
  summary: "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.",
  widgetModels: ["suno/bark"],
package/dist/esm/tasks/text-to-video/data.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-video/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAiGf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-video/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqGf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/text-to-video/data.js CHANGED
@@ -71,6 +71,10 @@ const taskData = {
  description: "A text-to-video model focusing on physics-aware applications like robotics.",
  id: "nvidia/Cosmos-1.0-Diffusion-7B-Text2World",
  },
+ {
+ description: "A robust model for video generation.",
+ id: "Wan-AI/Wan2.1-T2V-1.3B",
+ },
  ],
  spaces: [
  {
@@ -79,7 +83,7 @@ const taskData = {
  },
  {
  description: "Consistent video generation application.",
- id: "TIGER-Lab/T2V-Turbo-V2",
+ id: "Wan-AI/Wan2.1",
  },
  {
  description: "A cutting edge video generation application."
package/dist/esm/tasks/video-text-to-text/data.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/video-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cA6Df,CAAC;AAEF,eAAe,QAAQ,CAAC"}
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/video-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/video-text-to-text/data.js CHANGED
@@ -43,6 +43,10 @@ const taskData = {
  description: "Strong video-text-to-text model with reasoning capabilities.",
  id: "GoodiesHere/Apollo-LMMs-Apollo-7B-t32",
  },
+ {
+ description: "Strong video-text-to-text model.",
+ id: "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+ },
  ],
  spaces: [
  {
@@ -53,6 +57,10 @@ const taskData = {
  description: "A leaderboard for various video-text-to-text models.",
  id: "opencompass/openvlm_video_leaderboard",
  },
+ {
+ description: "An application to generate highlights from a video.",
+ id: "HuggingFaceTB/SmolVLM2-HighlightGenerator",
+ },
  ],
  summary: "Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",
  widgetModels: [""],
package/dist/esm/tasks/zero-shot-classification/data.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAiEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/zero-shot-classification/data.js CHANGED
@@ -56,6 +56,10 @@ const taskData = {
  description: "Cutting-edge zero-shot multilingual text classification model.",
  id: "MoritzLaurer/ModernBERT-large-zeroshot-v2.0",
  },
+ {
+ description: "Zero-shot text classification model that can be used for topic and sentiment classification.",
+ id: "knowledgator/gliclass-modern-base-v2.0-init",
+ },
  ],
  spaces: [],
  summary: "Zero-shot text classification is a task in natural language processing where a model is trained on a set of labeled examples but is then able to classify new examples from previously unseen classes.",
package/dist/esm/tasks/zero-shot-image-classification/data.js CHANGED
@@ -51,11 +51,11 @@ const taskData = {
  },
  {
  description: "Strong zero-shot image classification model.",
- id: "google/siglip-so400m-patch14-224",
+ id: "google/siglip2-base-patch16-224",
  },
  {
  description: "Robust zero-shot image classification model.",
- id: "microsoft/LLM2CLIP-EVA02-L-14-336",
+ id: "intfloat/mmE5-mllama-11b-instruct",
  },
  {
  description: "Powerful zero-shot image classification model supporting 94 languages."
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@huggingface/tasks",
  "packageManager": "pnpm@8.10.5",
- "version": "0.16.4",
+ "version": "0.16.5",
  "description": "List of ML tasks for huggingface.co/tasks",
  "repository": "https://github.com/huggingface/huggingface.js.git",
  "publishConfig": {
package/src/tasks/depth-estimation/data.ts CHANGED
@@ -41,7 +41,7 @@ const taskData: TaskDataCustom = {
  },
  {
  description: "A robust depth estimation model.",
- id: "apple/DepthPro",
+ id: "apple/DepthPro-hf",
  },
  ],
  spaces: [
package/src/tasks/image-text-to-text/data.ts CHANGED
@@ -48,7 +48,7 @@ const taskData: TaskDataCustom = {
  },
  {
  description: "A screenshot understanding model used to control computers.",
- id: "showlab/ShowUI-2B",
+ id: "microsoft/OmniParser-v2.0",
  },
  {
  description: "Cutting-edge vision language model."
@@ -63,12 +63,16 @@ const taskData: TaskDataCustom = {
  id: "Qwen/Qwen2.5-VL-7B-Instruct",
  },
  {
- description: "Image-text-to-text model with reasoning capabilities.",
- id: "Qwen/QVQ-72B-Preview",
+ description: "Image-text-to-text model with agentic capabilities.",
+ id: "microsoft/Magma-8B",
  },
  {
  description: "Strong image-text-to-text model focused on documents.",
- id: "stepfun-ai/GOT-OCR2_0",
+ id: "allenai/olmOCR-7B-0225-preview",
+ },
+ {
+ description: "Small yet strong image-text-to-text model.",
+ id: "ibm-granite/granite-vision-3.2-2b",
  },
  ],
  spaces: [
@@ -85,8 +89,8 @@ const taskData: TaskDataCustom = {
  id: "akhaliq/Molmo-7B-D-0924",
  },
  {
- description: "An image-text-to-text application focused on documents.",
- id: "stepfun-ai/GOT_official_online_demo",
+ description: "Powerful vision language assistant that can understand multiple images.",
+ id: "HuggingFaceTB/SmolVLM2",
  },
  {
  description: "An application for chatting with an image-text-to-text model."
package/src/tasks/keypoint-detection/data.ts CHANGED
@@ -27,6 +27,10 @@ const taskData: TaskDataCustom = {
  description: "A robust keypoint detection model.",
  id: "magic-leap-community/superpoint",
  },
+ {
+ description: "A robust keypoint matching model.",
+ id: "magic-leap-community/superglue_outdoor",
+ },
  {
  description: "Strong keypoint detection model used to detect human pose.",
  id: "facebook/sapiens-pose-1b",
package/src/tasks/object-detection/data.ts CHANGED
@@ -47,12 +47,12 @@ const taskData: TaskDataCustom = {
  id: "facebook/detr-resnet-50",
  },
  {
- description: "Real-time and accurate object detection model.",
- id: "jameslahm/yolov10x",
+ description: "Accurate object detection model.",
+ id: "IDEA-Research/dab-detr-resnet-50",
  },
  {
- description: "Fast and accurate object detection model trained on COCO and Object365 datasets.",
- id: "PekingU/rtdetr_r18vd_coco_o365",
+ description: "Fast and accurate object detection model.",
+ id: "PekingU/rtdetr_v2_r50vd",
  },
  {
  description: "Object detection model for low-lying objects."
@@ -70,7 +70,7 @@ const taskData: TaskDataCustom = {
  },
  {
  description: "A cutting-edge object detection application.",
- id: "Ultralytics/YOLO11",
+ id: "sunsmarterjieleaf/yolov12",
  },
  {
  description: "An object tracking, segmentation and inpainting application."
package/src/tasks/text-generation/data.ts CHANGED
@@ -76,7 +76,7 @@ const taskData: TaskDataCustom = {
  },
  {
  description: "A very powerful model with reasoning capabilities.",
- id: "PowerInfer/SmallThinker-3B-Preview",
+ id: "simplescaling/s1.1-32B",
  },
  {
  description: "Strong conversational model that supports very long instructions."
package/src/tasks/text-to-speech/data.ts CHANGED
@@ -76,6 +76,10 @@ const taskData: TaskDataCustom = {
  description: "An application that synthesizes emotional speech for diverse speaker prompts.",
  id: "parler-tts/parler-tts-expresso",
  },
+ {
+ description: "An application that generates podcast episodes.",
+ id: "ngxson/kokoro-podcast-generator",
+ },
  ],
  summary:
  "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.",
package/src/tasks/text-to-video/data.ts CHANGED
@@ -78,6 +78,10 @@ const taskData: TaskDataCustom = {
  description: "A text-to-video model focusing on physics-aware applications like robotics.",
  id: "nvidia/Cosmos-1.0-Diffusion-7B-Text2World",
  },
+ {
+ description: "A robust model for video generation.",
+ id: "Wan-AI/Wan2.1-T2V-1.3B",
+ },
  ],
  spaces: [
  {
@@ -86,7 +90,7 @@ const taskData: TaskDataCustom = {
  },
  {
  description: "Consistent video generation application.",
- id: "TIGER-Lab/T2V-Turbo-V2",
+ id: "Wan-AI/Wan2.1",
  },
  {
  description: "A cutting edge video generation application."
package/src/tasks/video-text-to-text/data.ts CHANGED
@@ -46,6 +46,10 @@ const taskData: TaskDataCustom = {
  description: "Strong video-text-to-text model with reasoning capabilities.",
  id: "GoodiesHere/Apollo-LMMs-Apollo-7B-t32",
  },
+ {
+ description: "Strong video-text-to-text model.",
+ id: "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+ },
  ],
  spaces: [
  {
@@ -56,6 +60,10 @@ const taskData: TaskDataCustom = {
  description: "A leaderboard for various video-text-to-text models.",
  id: "opencompass/openvlm_video_leaderboard",
  },
+ {
+ description: "An application to generate highlights from a video.",
+ id: "HuggingFaceTB/SmolVLM2-HighlightGenerator",
+ },
  ],
  summary:
  "Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",
package/src/tasks/zero-shot-classification/data.ts CHANGED
@@ -60,6 +60,10 @@ const taskData: TaskDataCustom = {
  description: "Cutting-edge zero-shot multilingual text classification model.",
  id: "MoritzLaurer/ModernBERT-large-zeroshot-v2.0",
  },
+ {
+ description: "Zero-shot text classification model that can be used for topic and sentiment classification.",
+ id: "knowledgator/gliclass-modern-base-v2.0-init",
+ },
  ],
  spaces: [],
  summary:
package/src/tasks/zero-shot-image-classification/data.ts CHANGED
@@ -53,11 +53,11 @@ const taskData: TaskDataCustom = {
  },
  {
  description: "Strong zero-shot image classification model.",
- id: "google/siglip-so400m-patch14-224",
+ id: "google/siglip2-base-patch16-224",
  },
  {
  description: "Robust zero-shot image classification model.",
- id: "microsoft/LLM2CLIP-EVA02-L-14-336",
+ id: "intfloat/mmE5-mllama-11b-instruct",
  },
  {
  description: "Powerful zero-shot image classification model supporting 94 languages."