@huggingface/tasks 0.16.4 → 0.16.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commonjs/tasks/depth-estimation/data.js +1 -1
- package/dist/commonjs/tasks/image-text-to-text/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/image-text-to-text/data.js +10 -6
- package/dist/commonjs/tasks/keypoint-detection/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/keypoint-detection/data.js +4 -0
- package/dist/commonjs/tasks/object-detection/data.js +5 -5
- package/dist/commonjs/tasks/text-generation/data.js +1 -1
- package/dist/commonjs/tasks/text-to-speech/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/text-to-speech/data.js +4 -0
- package/dist/commonjs/tasks/text-to-video/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/text-to-video/data.js +5 -1
- package/dist/commonjs/tasks/video-text-to-text/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/video-text-to-text/data.js +8 -0
- package/dist/commonjs/tasks/zero-shot-classification/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/zero-shot-classification/data.js +4 -0
- package/dist/commonjs/tasks/zero-shot-image-classification/data.js +2 -2
- package/dist/esm/tasks/depth-estimation/data.js +1 -1
- package/dist/esm/tasks/image-text-to-text/data.d.ts.map +1 -1
- package/dist/esm/tasks/image-text-to-text/data.js +10 -6
- package/dist/esm/tasks/keypoint-detection/data.d.ts.map +1 -1
- package/dist/esm/tasks/keypoint-detection/data.js +4 -0
- package/dist/esm/tasks/object-detection/data.js +5 -5
- package/dist/esm/tasks/text-generation/data.js +1 -1
- package/dist/esm/tasks/text-to-speech/data.d.ts.map +1 -1
- package/dist/esm/tasks/text-to-speech/data.js +4 -0
- package/dist/esm/tasks/text-to-video/data.d.ts.map +1 -1
- package/dist/esm/tasks/text-to-video/data.js +5 -1
- package/dist/esm/tasks/video-text-to-text/data.d.ts.map +1 -1
- package/dist/esm/tasks/video-text-to-text/data.js +8 -0
- package/dist/esm/tasks/zero-shot-classification/data.d.ts.map +1 -1
- package/dist/esm/tasks/zero-shot-classification/data.js +4 -0
- package/dist/esm/tasks/zero-shot-image-classification/data.js +2 -2
- package/package.json +1 -1
- package/src/tasks/depth-estimation/data.ts +1 -1
- package/src/tasks/image-text-to-text/data.ts +10 -6
- package/src/tasks/keypoint-detection/data.ts +4 -0
- package/src/tasks/object-detection/data.ts +5 -5
- package/src/tasks/text-generation/data.ts +1 -1
- package/src/tasks/text-to-speech/data.ts +4 -0
- package/src/tasks/text-to-video/data.ts +5 -1
- package/src/tasks/video-text-to-text/data.ts +8 -0
- package/src/tasks/zero-shot-classification/data.ts +4 -0
- package/src/tasks/zero-shot-image-classification/data.ts +2 -2
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
|
|
1
|
+
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cA6Gf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
|
|
@@ -47,7 +47,7 @@ const taskData = {
|
|
|
47
47
|
},
|
|
48
48
|
{
|
|
49
49
|
description: "A screenshot understanding model used to control computers.",
|
|
50
|
-
id: "
|
|
50
|
+
id: "microsoft/OmniParser-v2.0",
|
|
51
51
|
},
|
|
52
52
|
{
|
|
53
53
|
description: "Cutting-edge vision language model.",
|
|
@@ -62,12 +62,16 @@ const taskData = {
|
|
|
62
62
|
id: "Qwen/Qwen2.5-VL-7B-Instruct",
|
|
63
63
|
},
|
|
64
64
|
{
|
|
65
|
-
description: "Image-text-to-text model with
|
|
66
|
-
id: "
|
|
65
|
+
description: "Image-text-to-text model with agentic capabilities.",
|
|
66
|
+
id: "microsoft/Magma-8B",
|
|
67
67
|
},
|
|
68
68
|
{
|
|
69
69
|
description: "Strong image-text-to-text model focused on documents.",
|
|
70
|
-
id: "
|
|
70
|
+
id: "allenai/olmOCR-7B-0225-preview",
|
|
71
|
+
},
|
|
72
|
+
{
|
|
73
|
+
description: "Small yet strong image-text-to-text model.",
|
|
74
|
+
id: "ibm-granite/granite-vision-3.2-2b",
|
|
71
75
|
},
|
|
72
76
|
],
|
|
73
77
|
spaces: [
|
|
@@ -84,8 +88,8 @@ const taskData = {
|
|
|
84
88
|
id: "akhaliq/Molmo-7B-D-0924",
|
|
85
89
|
},
|
|
86
90
|
{
|
|
87
|
-
description: "
|
|
88
|
-
id: "
|
|
91
|
+
description: "Powerful vision language assistant that can understand multiple images.",
|
|
92
|
+
id: "HuggingFaceTB/SmolVLM2",
|
|
89
93
|
},
|
|
90
94
|
{
|
|
91
95
|
description: "An application for chatting with an image-text-to-text model.",
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/keypoint-detection/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
|
|
1
|
+
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/keypoint-detection/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqDf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
|
|
@@ -27,6 +27,10 @@ const taskData = {
|
|
|
27
27
|
description: "A robust keypoint detection model.",
|
|
28
28
|
id: "magic-leap-community/superpoint",
|
|
29
29
|
},
|
|
30
|
+
{
|
|
31
|
+
description: "A robust keypoint matching model.",
|
|
32
|
+
id: "magic-leap-community/superglue_outdoor",
|
|
33
|
+
},
|
|
30
34
|
{
|
|
31
35
|
description: "Strong keypoint detection model used to detect human pose.",
|
|
32
36
|
id: "facebook/sapiens-pose-1b",
|
|
@@ -45,12 +45,12 @@ const taskData = {
|
|
|
45
45
|
id: "facebook/detr-resnet-50",
|
|
46
46
|
},
|
|
47
47
|
{
|
|
48
|
-
description: "
|
|
49
|
-
id: "
|
|
48
|
+
description: "Accurate object detection model.",
|
|
49
|
+
id: "IDEA-Research/dab-detr-resnet-50",
|
|
50
50
|
},
|
|
51
51
|
{
|
|
52
|
-
description: "Fast and accurate object detection model
|
|
53
|
-
id: "PekingU/
|
|
52
|
+
description: "Fast and accurate object detection model.",
|
|
53
|
+
id: "PekingU/rtdetr_v2_r50vd",
|
|
54
54
|
},
|
|
55
55
|
{
|
|
56
56
|
description: "Object detection model for low-lying objects.",
|
|
@@ -68,7 +68,7 @@ const taskData = {
|
|
|
68
68
|
},
|
|
69
69
|
{
|
|
70
70
|
description: "A cutting-edge object detection application.",
|
|
71
|
-
id: "
|
|
71
|
+
id: "sunsmarterjieleaf/yolov12",
|
|
72
72
|
},
|
|
73
73
|
{
|
|
74
74
|
description: "An object tracking, segmentation and inpainting application.",
|
|
@@ -73,7 +73,7 @@ const taskData = {
|
|
|
73
73
|
},
|
|
74
74
|
{
|
|
75
75
|
description: "A very powerful model with reasoning capabilities.",
|
|
76
|
-
id: "
|
|
76
|
+
id: "simplescaling/s1.1-32B",
|
|
77
77
|
},
|
|
78
78
|
{
|
|
79
79
|
description: "Strong conversational model that supports very long instructions.",
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
|
|
1
|
+
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
|
|
@@ -76,6 +76,10 @@ const taskData = {
|
|
|
76
76
|
description: "An application that synthesizes emotional speech for diverse speaker prompts.",
|
|
77
77
|
id: "parler-tts/parler-tts-expresso",
|
|
78
78
|
},
|
|
79
|
+
{
|
|
80
|
+
description: "An application that generates podcast episodes.",
|
|
81
|
+
id: "ngxson/kokoro-podcast-generator",
|
|
82
|
+
},
|
|
79
83
|
],
|
|
80
84
|
summary: "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.",
|
|
81
85
|
widgetModels: ["suno/bark"],
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-video/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
|
|
1
|
+
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-video/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqGf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
|
|
@@ -73,6 +73,10 @@ const taskData = {
|
|
|
73
73
|
description: "A text-to-video model focusing on physics-aware applications like robotics.",
|
|
74
74
|
id: "nvidia/Cosmos-1.0-Diffusion-7B-Text2World",
|
|
75
75
|
},
|
|
76
|
+
{
|
|
77
|
+
description: "A robust model for video generation.",
|
|
78
|
+
id: "Wan-AI/Wan2.1-T2V-1.3B",
|
|
79
|
+
},
|
|
76
80
|
],
|
|
77
81
|
spaces: [
|
|
78
82
|
{
|
|
@@ -81,7 +85,7 @@ const taskData = {
|
|
|
81
85
|
},
|
|
82
86
|
{
|
|
83
87
|
description: "Consistent video generation application.",
|
|
84
|
-
id: "
|
|
88
|
+
id: "Wan-AI/Wan2.1",
|
|
85
89
|
},
|
|
86
90
|
{
|
|
87
91
|
description: "A cutting edge video generation application.",
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/video-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
|
|
1
|
+
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/video-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
|
|
@@ -45,6 +45,10 @@ const taskData = {
|
|
|
45
45
|
description: "Strong video-text-to-text model with reasoning capabilities.",
|
|
46
46
|
id: "GoodiesHere/Apollo-LMMs-Apollo-7B-t32",
|
|
47
47
|
},
|
|
48
|
+
{
|
|
49
|
+
description: "Strong video-text-to-text model.",
|
|
50
|
+
id: "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
|
|
51
|
+
},
|
|
48
52
|
],
|
|
49
53
|
spaces: [
|
|
50
54
|
{
|
|
@@ -55,6 +59,10 @@ const taskData = {
|
|
|
55
59
|
description: "A leaderboard for various video-text-to-text models.",
|
|
56
60
|
id: "opencompass/openvlm_video_leaderboard",
|
|
57
61
|
},
|
|
62
|
+
{
|
|
63
|
+
description: "An application to generate highlights from a video.",
|
|
64
|
+
id: "HuggingFaceTB/SmolVLM2-HighlightGenerator",
|
|
65
|
+
},
|
|
58
66
|
],
|
|
59
67
|
summary: "Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",
|
|
60
68
|
widgetModels: [""],
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
|
|
1
|
+
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
|
|
@@ -58,6 +58,10 @@ const taskData = {
|
|
|
58
58
|
description: "Cutting-edge zero-shot multilingual text classification model.",
|
|
59
59
|
id: "MoritzLaurer/ModernBERT-large-zeroshot-v2.0",
|
|
60
60
|
},
|
|
61
|
+
{
|
|
62
|
+
description: "Zero-shot text classification model that can be used for topic and sentiment classification.",
|
|
63
|
+
id: "knowledgator/gliclass-modern-base-v2.0-init",
|
|
64
|
+
},
|
|
61
65
|
],
|
|
62
66
|
spaces: [],
|
|
63
67
|
summary: "Zero-shot text classification is a task in natural language processing where a model is trained on a set of labeled examples but is then able to classify new examples from previously unseen classes.",
|
|
@@ -53,11 +53,11 @@ const taskData = {
|
|
|
53
53
|
},
|
|
54
54
|
{
|
|
55
55
|
description: "Strong zero-shot image classification model.",
|
|
56
|
-
id: "google/
|
|
56
|
+
id: "google/siglip2-base-patch16-224",
|
|
57
57
|
},
|
|
58
58
|
{
|
|
59
59
|
description: "Robust zero-shot image classification model.",
|
|
60
|
-
id: "
|
|
60
|
+
id: "intfloat/mmE5-mllama-11b-instruct",
|
|
61
61
|
},
|
|
62
62
|
{
|
|
63
63
|
description: "Powerful zero-shot image classification model supporting 94 languages.",
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
|
|
1
|
+
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/image-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cA6Gf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
|
|
@@ -45,7 +45,7 @@ const taskData = {
|
|
|
45
45
|
},
|
|
46
46
|
{
|
|
47
47
|
description: "A screenshot understanding model used to control computers.",
|
|
48
|
-
id: "
|
|
48
|
+
id: "microsoft/OmniParser-v2.0",
|
|
49
49
|
},
|
|
50
50
|
{
|
|
51
51
|
description: "Cutting-edge vision language model.",
|
|
@@ -60,12 +60,16 @@ const taskData = {
|
|
|
60
60
|
id: "Qwen/Qwen2.5-VL-7B-Instruct",
|
|
61
61
|
},
|
|
62
62
|
{
|
|
63
|
-
description: "Image-text-to-text model with
|
|
64
|
-
id: "
|
|
63
|
+
description: "Image-text-to-text model with agentic capabilities.",
|
|
64
|
+
id: "microsoft/Magma-8B",
|
|
65
65
|
},
|
|
66
66
|
{
|
|
67
67
|
description: "Strong image-text-to-text model focused on documents.",
|
|
68
|
-
id: "
|
|
68
|
+
id: "allenai/olmOCR-7B-0225-preview",
|
|
69
|
+
},
|
|
70
|
+
{
|
|
71
|
+
description: "Small yet strong image-text-to-text model.",
|
|
72
|
+
id: "ibm-granite/granite-vision-3.2-2b",
|
|
69
73
|
},
|
|
70
74
|
],
|
|
71
75
|
spaces: [
|
|
@@ -82,8 +86,8 @@ const taskData = {
|
|
|
82
86
|
id: "akhaliq/Molmo-7B-D-0924",
|
|
83
87
|
},
|
|
84
88
|
{
|
|
85
|
-
description: "
|
|
86
|
-
id: "
|
|
89
|
+
description: "Powerful vision language assistant that can understand multiple images.",
|
|
90
|
+
id: "HuggingFaceTB/SmolVLM2",
|
|
87
91
|
},
|
|
88
92
|
{
|
|
89
93
|
description: "An application for chatting with an image-text-to-text model.",
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/keypoint-detection/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
|
|
1
|
+
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/keypoint-detection/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqDf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
|
|
@@ -25,6 +25,10 @@ const taskData = {
|
|
|
25
25
|
description: "A robust keypoint detection model.",
|
|
26
26
|
id: "magic-leap-community/superpoint",
|
|
27
27
|
},
|
|
28
|
+
{
|
|
29
|
+
description: "A robust keypoint matching model.",
|
|
30
|
+
id: "magic-leap-community/superglue_outdoor",
|
|
31
|
+
},
|
|
28
32
|
{
|
|
29
33
|
description: "Strong keypoint detection model used to detect human pose.",
|
|
30
34
|
id: "facebook/sapiens-pose-1b",
|
|
@@ -43,12 +43,12 @@ const taskData = {
|
|
|
43
43
|
id: "facebook/detr-resnet-50",
|
|
44
44
|
},
|
|
45
45
|
{
|
|
46
|
-
description: "
|
|
47
|
-
id: "
|
|
46
|
+
description: "Accurate object detection model.",
|
|
47
|
+
id: "IDEA-Research/dab-detr-resnet-50",
|
|
48
48
|
},
|
|
49
49
|
{
|
|
50
|
-
description: "Fast and accurate object detection model
|
|
51
|
-
id: "PekingU/
|
|
50
|
+
description: "Fast and accurate object detection model.",
|
|
51
|
+
id: "PekingU/rtdetr_v2_r50vd",
|
|
52
52
|
},
|
|
53
53
|
{
|
|
54
54
|
description: "Object detection model for low-lying objects.",
|
|
@@ -66,7 +66,7 @@ const taskData = {
|
|
|
66
66
|
},
|
|
67
67
|
{
|
|
68
68
|
description: "A cutting-edge object detection application.",
|
|
69
|
-
id: "
|
|
69
|
+
id: "sunsmarterjieleaf/yolov12",
|
|
70
70
|
},
|
|
71
71
|
{
|
|
72
72
|
description: "An object tracking, segmentation and inpainting application.",
|
|
@@ -71,7 +71,7 @@ const taskData = {
|
|
|
71
71
|
},
|
|
72
72
|
{
|
|
73
73
|
description: "A very powerful model with reasoning capabilities.",
|
|
74
|
-
id: "
|
|
74
|
+
id: "simplescaling/s1.1-32B",
|
|
75
75
|
},
|
|
76
76
|
{
|
|
77
77
|
description: "Strong conversational model that supports very long instructions.",
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
|
|
1
|
+
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
|
|
@@ -74,6 +74,10 @@ const taskData = {
|
|
|
74
74
|
description: "An application that synthesizes emotional speech for diverse speaker prompts.",
|
|
75
75
|
id: "parler-tts/parler-tts-expresso",
|
|
76
76
|
},
|
|
77
|
+
{
|
|
78
|
+
description: "An application that generates podcast episodes.",
|
|
79
|
+
id: "ngxson/kokoro-podcast-generator",
|
|
80
|
+
},
|
|
77
81
|
],
|
|
78
82
|
summary: "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.",
|
|
79
83
|
widgetModels: ["suno/bark"],
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-video/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
|
|
1
|
+
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-video/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqGf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
|
|
@@ -71,6 +71,10 @@ const taskData = {
|
|
|
71
71
|
description: "A text-to-video model focusing on physics-aware applications like robotics.",
|
|
72
72
|
id: "nvidia/Cosmos-1.0-Diffusion-7B-Text2World",
|
|
73
73
|
},
|
|
74
|
+
{
|
|
75
|
+
description: "A robust model for video generation.",
|
|
76
|
+
id: "Wan-AI/Wan2.1-T2V-1.3B",
|
|
77
|
+
},
|
|
74
78
|
],
|
|
75
79
|
spaces: [
|
|
76
80
|
{
|
|
@@ -79,7 +83,7 @@ const taskData = {
|
|
|
79
83
|
},
|
|
80
84
|
{
|
|
81
85
|
description: "Consistent video generation application.",
|
|
82
|
-
id: "
|
|
86
|
+
id: "Wan-AI/Wan2.1",
|
|
83
87
|
},
|
|
84
88
|
{
|
|
85
89
|
description: "A cutting edge video generation application.",
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/video-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
|
|
1
|
+
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/video-text-to-text/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
|
|
@@ -43,6 +43,10 @@ const taskData = {
|
|
|
43
43
|
description: "Strong video-text-to-text model with reasoning capabilities.",
|
|
44
44
|
id: "GoodiesHere/Apollo-LMMs-Apollo-7B-t32",
|
|
45
45
|
},
|
|
46
|
+
{
|
|
47
|
+
description: "Strong video-text-to-text model.",
|
|
48
|
+
id: "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
|
|
49
|
+
},
|
|
46
50
|
],
|
|
47
51
|
spaces: [
|
|
48
52
|
{
|
|
@@ -53,6 +57,10 @@ const taskData = {
|
|
|
53
57
|
description: "A leaderboard for various video-text-to-text models.",
|
|
54
58
|
id: "opencompass/openvlm_video_leaderboard",
|
|
55
59
|
},
|
|
60
|
+
{
|
|
61
|
+
description: "An application to generate highlights from a video.",
|
|
62
|
+
id: "HuggingFaceTB/SmolVLM2-HighlightGenerator",
|
|
63
|
+
},
|
|
56
64
|
],
|
|
57
65
|
summary: "Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",
|
|
58
66
|
widgetModels: [""],
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
|
|
1
|
+
{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
|
|
@@ -56,6 +56,10 @@ const taskData = {
|
|
|
56
56
|
description: "Cutting-edge zero-shot multilingual text classification model.",
|
|
57
57
|
id: "MoritzLaurer/ModernBERT-large-zeroshot-v2.0",
|
|
58
58
|
},
|
|
59
|
+
{
|
|
60
|
+
description: "Zero-shot text classification model that can be used for topic and sentiment classification.",
|
|
61
|
+
id: "knowledgator/gliclass-modern-base-v2.0-init",
|
|
62
|
+
},
|
|
59
63
|
],
|
|
60
64
|
spaces: [],
|
|
61
65
|
summary: "Zero-shot text classification is a task in natural language processing where a model is trained on a set of labeled examples but is then able to classify new examples from previously unseen classes.",
|
|
@@ -51,11 +51,11 @@ const taskData = {
|
|
|
51
51
|
},
|
|
52
52
|
{
|
|
53
53
|
description: "Strong zero-shot image classification model.",
|
|
54
|
-
id: "google/
|
|
54
|
+
id: "google/siglip2-base-patch16-224",
|
|
55
55
|
},
|
|
56
56
|
{
|
|
57
57
|
description: "Robust zero-shot image classification model.",
|
|
58
|
-
id: "
|
|
58
|
+
id: "intfloat/mmE5-mllama-11b-instruct",
|
|
59
59
|
},
|
|
60
60
|
{
|
|
61
61
|
description: "Powerful zero-shot image classification model supporting 94 languages.",
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@huggingface/tasks",
|
|
3
3
|
"packageManager": "pnpm@8.10.5",
|
|
4
|
-
"version": "0.16.4",
|
|
4
|
+
"version": "0.16.5",
|
|
5
5
|
"description": "List of ML tasks for huggingface.co/tasks",
|
|
6
6
|
"repository": "https://github.com/huggingface/huggingface.js.git",
|
|
7
7
|
"publishConfig": {
|
|
@@ -48,7 +48,7 @@ const taskData: TaskDataCustom = {
|
|
|
48
48
|
},
|
|
49
49
|
{
|
|
50
50
|
description: "A screenshot understanding model used to control computers.",
|
|
51
|
-
id: "
|
|
51
|
+
id: "microsoft/OmniParser-v2.0",
|
|
52
52
|
},
|
|
53
53
|
{
|
|
54
54
|
description: "Cutting-edge vision language model.",
|
|
@@ -63,12 +63,16 @@ const taskData: TaskDataCustom = {
|
|
|
63
63
|
id: "Qwen/Qwen2.5-VL-7B-Instruct",
|
|
64
64
|
},
|
|
65
65
|
{
|
|
66
|
-
description: "Image-text-to-text model with
|
|
67
|
-
id: "
|
|
66
|
+
description: "Image-text-to-text model with agentic capabilities.",
|
|
67
|
+
id: "microsoft/Magma-8B",
|
|
68
68
|
},
|
|
69
69
|
{
|
|
70
70
|
description: "Strong image-text-to-text model focused on documents.",
|
|
71
|
-
id: "
|
|
71
|
+
id: "allenai/olmOCR-7B-0225-preview",
|
|
72
|
+
},
|
|
73
|
+
{
|
|
74
|
+
description: "Small yet strong image-text-to-text model.",
|
|
75
|
+
id: "ibm-granite/granite-vision-3.2-2b",
|
|
72
76
|
},
|
|
73
77
|
],
|
|
74
78
|
spaces: [
|
|
@@ -85,8 +89,8 @@ const taskData: TaskDataCustom = {
|
|
|
85
89
|
id: "akhaliq/Molmo-7B-D-0924",
|
|
86
90
|
},
|
|
87
91
|
{
|
|
88
|
-
description: "
|
|
89
|
-
id: "
|
|
92
|
+
description: "Powerful vision language assistant that can understand multiple images.",
|
|
93
|
+
id: "HuggingFaceTB/SmolVLM2",
|
|
90
94
|
},
|
|
91
95
|
{
|
|
92
96
|
description: "An application for chatting with an image-text-to-text model.",
|
|
@@ -27,6 +27,10 @@ const taskData: TaskDataCustom = {
|
|
|
27
27
|
description: "A robust keypoint detection model.",
|
|
28
28
|
id: "magic-leap-community/superpoint",
|
|
29
29
|
},
|
|
30
|
+
{
|
|
31
|
+
description: "A robust keypoint matching model.",
|
|
32
|
+
id: "magic-leap-community/superglue_outdoor",
|
|
33
|
+
},
|
|
30
34
|
{
|
|
31
35
|
description: "Strong keypoint detection model used to detect human pose.",
|
|
32
36
|
id: "facebook/sapiens-pose-1b",
|
|
@@ -47,12 +47,12 @@ const taskData: TaskDataCustom = {
|
|
|
47
47
|
id: "facebook/detr-resnet-50",
|
|
48
48
|
},
|
|
49
49
|
{
|
|
50
|
-
description: "
|
|
51
|
-
id: "
|
|
50
|
+
description: "Accurate object detection model.",
|
|
51
|
+
id: "IDEA-Research/dab-detr-resnet-50",
|
|
52
52
|
},
|
|
53
53
|
{
|
|
54
|
-
description: "Fast and accurate object detection model
|
|
55
|
-
id: "PekingU/
|
|
54
|
+
description: "Fast and accurate object detection model.",
|
|
55
|
+
id: "PekingU/rtdetr_v2_r50vd",
|
|
56
56
|
},
|
|
57
57
|
{
|
|
58
58
|
description: "Object detection model for low-lying objects.",
|
|
@@ -70,7 +70,7 @@ const taskData: TaskDataCustom = {
|
|
|
70
70
|
},
|
|
71
71
|
{
|
|
72
72
|
description: "A cutting-edge object detection application.",
|
|
73
|
-
id: "
|
|
73
|
+
id: "sunsmarterjieleaf/yolov12",
|
|
74
74
|
},
|
|
75
75
|
{
|
|
76
76
|
description: "An object tracking, segmentation and inpainting application.",
|
|
@@ -76,7 +76,7 @@ const taskData: TaskDataCustom = {
|
|
|
76
76
|
},
|
|
77
77
|
{
|
|
78
78
|
description: "A very powerful model with reasoning capabilities.",
|
|
79
|
-
id: "
|
|
79
|
+
id: "simplescaling/s1.1-32B",
|
|
80
80
|
},
|
|
81
81
|
{
|
|
82
82
|
description: "Strong conversational model that supports very long instructions.",
|
|
@@ -76,6 +76,10 @@ const taskData: TaskDataCustom = {
|
|
|
76
76
|
description: "An application that synthesizes emotional speech for diverse speaker prompts.",
|
|
77
77
|
id: "parler-tts/parler-tts-expresso",
|
|
78
78
|
},
|
|
79
|
+
{
|
|
80
|
+
description: "An application that generates podcast episodes.",
|
|
81
|
+
id: "ngxson/kokoro-podcast-generator",
|
|
82
|
+
},
|
|
79
83
|
],
|
|
80
84
|
summary:
|
|
81
85
|
"Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.",
|
|
@@ -78,6 +78,10 @@ const taskData: TaskDataCustom = {
|
|
|
78
78
|
description: "A text-to-video model focusing on physics-aware applications like robotics.",
|
|
79
79
|
id: "nvidia/Cosmos-1.0-Diffusion-7B-Text2World",
|
|
80
80
|
},
|
|
81
|
+
{
|
|
82
|
+
description: "A robust model for video generation.",
|
|
83
|
+
id: "Wan-AI/Wan2.1-T2V-1.3B",
|
|
84
|
+
},
|
|
81
85
|
],
|
|
82
86
|
spaces: [
|
|
83
87
|
{
|
|
@@ -86,7 +90,7 @@ const taskData: TaskDataCustom = {
|
|
|
86
90
|
},
|
|
87
91
|
{
|
|
88
92
|
description: "Consistent video generation application.",
|
|
89
|
-
id: "
|
|
93
|
+
id: "Wan-AI/Wan2.1",
|
|
90
94
|
},
|
|
91
95
|
{
|
|
92
96
|
description: "A cutting edge video generation application.",
|
|
@@ -46,6 +46,10 @@ const taskData: TaskDataCustom = {
|
|
|
46
46
|
description: "Strong video-text-to-text model with reasoning capabilities.",
|
|
47
47
|
id: "GoodiesHere/Apollo-LMMs-Apollo-7B-t32",
|
|
48
48
|
},
|
|
49
|
+
{
|
|
50
|
+
description: "Strong video-text-to-text model.",
|
|
51
|
+
id: "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
|
|
52
|
+
},
|
|
49
53
|
],
|
|
50
54
|
spaces: [
|
|
51
55
|
{
|
|
@@ -56,6 +60,10 @@ const taskData: TaskDataCustom = {
|
|
|
56
60
|
description: "A leaderboard for various video-text-to-text models.",
|
|
57
61
|
id: "opencompass/openvlm_video_leaderboard",
|
|
58
62
|
},
|
|
63
|
+
{
|
|
64
|
+
description: "An application to generate highlights from a video.",
|
|
65
|
+
id: "HuggingFaceTB/SmolVLM2-HighlightGenerator",
|
|
66
|
+
},
|
|
59
67
|
],
|
|
60
68
|
summary:
|
|
61
69
|
"Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",
|
|
@@ -60,6 +60,10 @@ const taskData: TaskDataCustom = {
|
|
|
60
60
|
description: "Cutting-edge zero-shot multilingual text classification model.",
|
|
61
61
|
id: "MoritzLaurer/ModernBERT-large-zeroshot-v2.0",
|
|
62
62
|
},
|
|
63
|
+
{
|
|
64
|
+
description: "Zero-shot text classification model that can be used for topic and sentiment classification.",
|
|
65
|
+
id: "knowledgator/gliclass-modern-base-v2.0-init",
|
|
66
|
+
},
|
|
63
67
|
],
|
|
64
68
|
spaces: [],
|
|
65
69
|
summary:
|
|
@@ -53,11 +53,11 @@ const taskData: TaskDataCustom = {
|
|
|
53
53
|
},
|
|
54
54
|
{
|
|
55
55
|
description: "Strong zero-shot image classification model.",
|
|
56
|
-
id: "google/
|
|
56
|
+
id: "google/siglip2-base-patch16-224",
|
|
57
57
|
},
|
|
58
58
|
{
|
|
59
59
|
description: "Robust zero-shot image classification model.",
|
|
60
|
-
id: "
|
|
60
|
+
id: "intfloat/mmE5-mllama-11b-instruct",
|
|
61
61
|
},
|
|
62
62
|
{
|
|
63
63
|
description: "Powerful zero-shot image classification model supporting 94 languages.",
|