@huggingface/tasks 0.19.35 → 0.19.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/dist/commonjs/local-apps.d.ts +7 -0
  2. package/dist/commonjs/local-apps.d.ts.map +1 -1
  3. package/dist/commonjs/local-apps.js +31 -0
  4. package/dist/commonjs/model-libraries-snippets.d.ts +1 -0
  5. package/dist/commonjs/model-libraries-snippets.d.ts.map +1 -1
  6. package/dist/commonjs/model-libraries-snippets.js +12 -2
  7. package/dist/commonjs/model-libraries.d.ts +15 -1
  8. package/dist/commonjs/model-libraries.d.ts.map +1 -1
  9. package/dist/commonjs/model-libraries.js +14 -0
  10. package/dist/commonjs/tasks/any-to-any/data.js +3 -3
  11. package/dist/commonjs/tasks/image-segmentation/data.js +2 -2
  12. package/dist/commonjs/tasks/image-text-to-text/data.d.ts.map +1 -1
  13. package/dist/commonjs/tasks/image-text-to-text/data.js +10 -38
  14. package/dist/commonjs/tasks/image-to-3d/data.js +2 -2
  15. package/dist/commonjs/tasks/image-to-image/data.d.ts.map +1 -1
  16. package/dist/commonjs/tasks/image-to-image/data.js +13 -25
  17. package/dist/commonjs/tasks/image-to-text/data.d.ts.map +1 -1
  18. package/dist/commonjs/tasks/image-to-text/data.js +10 -30
  19. package/dist/commonjs/tasks/keypoint-detection/data.js +5 -5
  20. package/dist/commonjs/tasks/object-detection/data.js +2 -2
  21. package/dist/commonjs/tasks/text-generation/data.d.ts.map +1 -1
  22. package/dist/commonjs/tasks/text-generation/data.js +14 -10
  23. package/dist/commonjs/tasks/text-to-image/data.js +6 -6
  24. package/dist/commonjs/tasks/text-to-speech/data.d.ts.map +1 -1
  25. package/dist/commonjs/tasks/text-to-speech/data.js +10 -14
  26. package/dist/commonjs/tasks/text-to-video/data.js +3 -3
  27. package/dist/commonjs/tasks/visual-document-retrieval/data.d.ts.map +1 -1
  28. package/dist/commonjs/tasks/visual-document-retrieval/data.js +6 -2
  29. package/dist/commonjs/tasks/zero-shot-object-detection/data.js +4 -4
  30. package/dist/esm/local-apps.d.ts +7 -0
  31. package/dist/esm/local-apps.d.ts.map +1 -1
  32. package/dist/esm/local-apps.js +31 -0
  33. package/dist/esm/model-libraries-snippets.d.ts +1 -0
  34. package/dist/esm/model-libraries-snippets.d.ts.map +1 -1
  35. package/dist/esm/model-libraries-snippets.js +10 -1
  36. package/dist/esm/model-libraries.d.ts +15 -1
  37. package/dist/esm/model-libraries.d.ts.map +1 -1
  38. package/dist/esm/model-libraries.js +14 -0
  39. package/dist/esm/tasks/any-to-any/data.js +3 -3
  40. package/dist/esm/tasks/image-segmentation/data.js +2 -2
  41. package/dist/esm/tasks/image-text-to-text/data.d.ts.map +1 -1
  42. package/dist/esm/tasks/image-text-to-text/data.js +10 -38
  43. package/dist/esm/tasks/image-to-3d/data.js +2 -2
  44. package/dist/esm/tasks/image-to-image/data.d.ts.map +1 -1
  45. package/dist/esm/tasks/image-to-image/data.js +13 -25
  46. package/dist/esm/tasks/image-to-text/data.d.ts.map +1 -1
  47. package/dist/esm/tasks/image-to-text/data.js +10 -30
  48. package/dist/esm/tasks/keypoint-detection/data.js +5 -5
  49. package/dist/esm/tasks/object-detection/data.js +2 -2
  50. package/dist/esm/tasks/text-generation/data.d.ts.map +1 -1
  51. package/dist/esm/tasks/text-generation/data.js +14 -10
  52. package/dist/esm/tasks/text-to-image/data.js +6 -6
  53. package/dist/esm/tasks/text-to-speech/data.d.ts.map +1 -1
  54. package/dist/esm/tasks/text-to-speech/data.js +10 -14
  55. package/dist/esm/tasks/text-to-video/data.js +3 -3
  56. package/dist/esm/tasks/visual-document-retrieval/data.d.ts.map +1 -1
  57. package/dist/esm/tasks/visual-document-retrieval/data.js +6 -2
  58. package/dist/esm/tasks/zero-shot-object-detection/data.js +4 -4
  59. package/package.json +1 -1
  60. package/src/local-apps.ts +32 -0
  61. package/src/model-libraries-snippets.ts +11 -1
  62. package/src/model-libraries.ts +14 -0
  63. package/src/tasks/any-to-any/data.ts +3 -3
  64. package/src/tasks/image-segmentation/data.ts +2 -2
  65. package/src/tasks/image-text-to-text/data.ts +10 -38
  66. package/src/tasks/image-to-3d/data.ts +2 -2
  67. package/src/tasks/image-to-image/data.ts +13 -25
  68. package/src/tasks/image-to-text/data.ts +10 -30
  69. package/src/tasks/keypoint-detection/data.ts +5 -5
  70. package/src/tasks/object-detection/data.ts +2 -2
  71. package/src/tasks/text-generation/data.ts +14 -10
  72. package/src/tasks/text-to-image/data.ts +6 -6
  73. package/src/tasks/text-to-speech/data.ts +10 -14
  74. package/src/tasks/text-to-video/data.ts +3 -3
  75. package/src/tasks/visual-document-retrieval/data.ts +7 -2
  76. package/src/tasks/zero-shot-object-detection/data.ts +4 -4
@@ -31,11 +31,11 @@ const taskData = {
31
31
  },
32
32
  {
33
33
  description: "Strong keypoint detection model used to detect human pose.",
34
- id: "facebook/sapiens-pose-1b",
34
+ id: "qualcomm/RTMPose-Body2d",
35
35
  },
36
36
  {
37
- description: "Powerful keypoint detection model used to detect human pose.",
38
- id: "usyd-community/vitpose-plus-base",
37
+ description: "Powerful keypoint matching model.",
38
+ id: "ETH-CVG/lightglue_disk",
39
39
  },
40
40
  ],
41
41
  spaces: [
@@ -44,8 +44,8 @@ const taskData = {
44
44
  id: "datasciencedojo/Hand-Keypoint-Detection-Realtime",
45
45
  },
46
46
  {
47
- description: "An application to try a universal keypoint detection model.",
48
- id: "merve/SuperPoint",
47
+ description: "An application for keypoint detection and matching.",
48
+ id: "ETH-CVG/LightGlue",
49
49
  },
50
50
  ],
51
51
  summary: "Keypoint detection is the task of identifying meaningful distinctive points or features in an image.",
@@ -57,8 +57,8 @@ const taskData = {
57
57
  ],
58
58
  spaces: [
59
59
  {
60
- description: "Leaderboard to compare various object detection models across several metrics.",
61
- id: "hf-vision/object_detection_leaderboard",
60
+ description: "Real-time object detection demo.",
61
+ id: "Roboflow/RF-DETR",
62
62
  },
63
63
  {
64
64
  description: "An application that contains various object detection models to try from.",
@@ -1 +1 @@
1
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-generation/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAyHf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
1
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-generation/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cA6Hf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
@@ -58,20 +58,20 @@ const taskData = {
58
58
  models: [
59
59
  { description: "A text-generation model trained to follow instructions.", id: "google/gemma-2-2b-it" },
60
60
  {
61
- description: "Smaller variant of one of the most powerful models.",
62
- id: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
61
+ description: "Powerful text generation model for coding.",
62
+ id: "Qwen/Qwen3-Coder-480B-A35B-Instruct",
63
63
  },
64
64
  {
65
- description: "Very powerful text generation model trained to follow instructions.",
66
- id: "meta-llama/Meta-Llama-3.1-8B-Instruct",
65
+ description: "Great text generation model with top-notch tool calling capabilities.",
66
+ id: "openai/gpt-oss-120b",
67
67
  },
68
68
  {
69
- description: "Powerful text generation model by Microsoft.",
70
- id: "microsoft/phi-4",
69
+ description: "Powerful text generation model.",
70
+ id: "zai-org/GLM-4.5",
71
71
  },
72
72
  {
73
- description: "A very powerful model with reasoning capabilities.",
74
- id: "simplescaling/s1.1-32B",
73
+ description: "A powerful small model with reasoning capabilities.",
74
+ id: "Qwen/Qwen3-4B-Thinking-2507",
75
75
  },
76
76
  {
77
77
  description: "Strong conversational model that supports very long instructions.",
@@ -88,8 +88,12 @@ const taskData = {
88
88
  ],
89
89
  spaces: [
90
90
  {
91
- description: "A leaderboard to compare different open-source text generation models based on various benchmarks.",
92
- id: "open-llm-leaderboard/open_llm_leaderboard",
91
+ description: "An application that writes and executes code from text instructions and supports many models.",
92
+ id: "akhaliq/anycoder",
93
+ },
94
+ {
95
+ description: "An application that builds websites from natural language prompts.",
96
+ id: "enzostvs/deepsite",
93
97
  },
94
98
  {
95
99
  description: "A leaderboard for comparing chain-of-thought performance of models.",
@@ -45,19 +45,19 @@ const taskData = {
45
45
  models: [
46
46
  {
47
47
  description: "One of the most powerful image generation models that can generate realistic outputs.",
48
- id: "black-forest-labs/FLUX.1-dev",
48
+ id: "black-forest-labs/FLUX.1-Krea-dev",
49
49
  },
50
50
  {
51
- description: "A powerful yet fast image generation model.",
52
- id: "latent-consistency/lcm-lora-sdxl",
51
+ description: "A powerful image generation model.",
52
+ id: "Qwen/Qwen-Image",
53
53
  },
54
54
  {
55
- description: "Text-to-image model for photorealistic generation.",
56
- id: "Kwai-Kolors/Kolors",
55
+ description: "Powerful and fast image generation model.",
56
+ id: "ByteDance/SDXL-Lightning",
57
57
  },
58
58
  {
59
59
  description: "A powerful text-to-image model.",
60
- id: "stabilityai/stable-diffusion-3-medium-diffusers",
60
+ id: "ByteDance/Hyper-SD",
61
61
  },
62
62
  ],
63
63
  spaces: [
@@ -1 +1 @@
1
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAqFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
1
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAiFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
@@ -37,24 +37,20 @@ const taskData = {
37
37
  ],
38
38
  models: [
39
39
  {
40
- description: "A prompt based, powerful TTS model.",
41
- id: "parler-tts/parler-tts-large-v1",
40
+ description: "Small yet powerful TTS model.",
41
+ id: "KittenML/kitten-tts-nano-0.1",
42
42
  },
43
43
  {
44
- description: "A powerful TTS model that supports English and Chinese.",
45
- id: "SWivid/F5-TTS",
44
+ description: "Bleeding edge TTS model.",
45
+ id: "ResembleAI/chatterbox",
46
46
  },
47
47
  {
48
48
  description: "A massively multi-lingual TTS model.",
49
49
  id: "fishaudio/fish-speech-1.5",
50
50
  },
51
51
  {
52
- description: "A powerful TTS model.",
53
- id: "OuteAI/OuteTTS-0.1-350M",
54
- },
55
- {
56
- description: "Small yet powerful TTS model.",
57
- id: "hexgrad/Kokoro-82M",
52
+ description: "A text-to-dialogue model.",
53
+ id: "nari-labs/Dia-1.6B-0626",
58
54
  },
59
55
  ],
60
56
  spaces: [
@@ -67,12 +63,12 @@ const taskData = {
67
63
  id: "fishaudio/fish-speech-1",
68
64
  },
69
65
  {
70
- description: "An application that generates speech in different styles in English and Chinese.",
71
- id: "mrfakename/E2-F5-TTS",
66
+ description: "Performant TTS application.",
67
+ id: "ResembleAI/Chatterbox",
72
68
  },
73
69
  {
74
- description: "An application that synthesizes emotional speech for diverse speaker prompts.",
75
- id: "parler-tts/parler-tts-expresso",
70
+ description: "An application to compare different TTS models.",
71
+ id: "TTS-AGI/TTS-Arena-V2",
76
72
  },
77
73
  {
78
74
  description: "An application that generates podcast episodes.",
@@ -72,8 +72,8 @@ const taskData = {
72
72
  id: "nvidia/Cosmos-1.0-Diffusion-7B-Text2World",
73
73
  },
74
74
  {
75
- description: "A robust model for video generation.",
76
- id: "Wan-AI/Wan2.1-T2V-1.3B",
75
+ description: "Very fast model for video generation.",
76
+ id: "Lightricks/LTX-Video-0.9.8-13B-distilled",
77
77
  },
78
78
  ],
79
79
  spaces: [
@@ -91,7 +91,7 @@ const taskData = {
91
91
  },
92
92
  ],
93
93
  summary: "Text-to-video models can be used in any application that requires generating consistent sequence of images from text. ",
94
- widgetModels: ["Wan-AI/Wan2.1-T2V-14B"],
94
+ widgetModels: ["Wan-AI/Wan2.2-TI2V-5B"],
95
95
  youtubeId: undefined,
96
96
  };
97
97
  export default taskData;
@@ -1 +1 @@
1
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/visual-document-retrieval/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAkEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
1
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/visual-document-retrieval/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAuEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
@@ -50,8 +50,8 @@ const taskData = {
50
50
  id: "vidore/colqwen2-v1.0",
51
51
  },
52
52
  {
53
- description: "Very fast and efficient visual document retrieval model that works on five languages.",
54
- id: "marco/mcdse-2b-v1",
53
+ description: "Very fast and efficient visual document retrieval model that can also take in other modalities like audio.",
54
+ id: "Tevatron/OmniEmbed-v0.1",
55
55
  },
56
56
  ],
57
57
  spaces: [
@@ -59,6 +59,10 @@ const taskData = {
59
59
  description: "A leaderboard of visual document retrieval models.",
60
60
  id: "vidore/vidore-leaderboard",
61
61
  },
62
+ {
63
+ description: "Visual retrieval augmented generation demo based on ColQwen2 model.",
64
+ id: "vidore/visual-rag-tool",
65
+ },
62
66
  ],
63
67
  summary: "Visual document retrieval is the task of searching for relevant image-based documents, such as PDFs. These models take a text query and multiple documents as input and return the top-most relevant documents and relevancy scores as output.",
64
68
  widgetModels: [""],
@@ -36,17 +36,17 @@ const taskData = {
36
36
  models: [
37
37
  {
38
38
  description: "Solid zero-shot object detection model.",
39
- id: "IDEA-Research/grounding-dino-base",
39
+ id: "openmmlab-community/mm_grounding_dino_large_all",
40
40
  },
41
41
  {
42
42
  description: "Cutting-edge zero-shot object detection model.",
43
- id: "google/owlv2-base-patch16-ensemble",
43
+ id: "fushh7/LLMDet",
44
44
  },
45
45
  ],
46
46
  spaces: [
47
47
  {
48
- description: "A demo to try the state-of-the-art zero-shot object detection model, OWLv2.",
49
- id: "merve/owlv2",
48
+ description: "A demo to compare different zero-shot object detection models per output and latency.",
49
+ id: "ariG23498/zero-shot-od",
50
50
  },
51
51
  {
52
52
  description: "A demo that combines a zero-shot object detection and mask generation model for zero-shot segmentation.",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@huggingface/tasks",
3
- "version": "0.19.35",
3
+ "version": "0.19.37",
4
4
  "description": "List of ML tasks for huggingface.co/tasks",
5
5
  "repository": "https://github.com/huggingface/huggingface.js.git",
6
6
  "publishConfig": {
package/src/local-apps.ts CHANGED
@@ -315,6 +315,31 @@ const snippetDockerModelRunner = (model: ModelData, filepath?: string): string =
315
315
  return `docker model run hf.co/${model.id}${getQuantTag(filepath)}`;
316
316
  };
317
317
 
318
+ const snippetLemonade = (model: ModelData, filepath?: string): LocalAppSnippet[] => {
319
+ const tagName = getQuantTag(filepath);
320
+ const modelName = model.id.split("/")[1];
321
+ return [
322
+ {
323
+ title: "Pull the model",
324
+ setup: "# Download Lemonade from https://lemonade-server.ai/",
325
+ content: [
326
+ `lemonade-server pull user.${modelName} \\
327
+ --checkpoint ${model.id}${tagName} \\
328
+ --recipe llamacpp`,
329
+ "# Note: If you installed from source, use the lemonade-server-dev command instead.",
330
+ ].join("\n"),
331
+ },
332
+ {
333
+ title: "Run and chat with the model",
334
+ content: `lemonade-server run user.${modelName}`,
335
+ },
336
+ {
337
+ title: "List all available models",
338
+ content: "lemonade-server list",
339
+ },
340
+ ];
341
+ };
342
+
318
343
  /**
319
344
  * Add your new local app here.
320
345
  *
@@ -492,6 +517,13 @@ export const LOCAL_APPS = {
492
517
  displayOnModelPage: isLlamaCppGgufModel,
493
518
  snippet: snippetDockerModelRunner,
494
519
  },
520
+ lemonade: {
521
+ prettyLabel: "Lemonade",
522
+ docsUrl: "https://lemonade-server.ai",
523
+ mainTask: "text-generation",
524
+ displayOnModelPage: isLlamaCppGgufModel,
525
+ snippet: snippetLemonade,
526
+ },
495
527
  } satisfies Record<string, LocalApp>;
496
528
 
497
529
  export type LocalAppKey = keyof typeof LOCAL_APPS;
@@ -1523,7 +1523,7 @@ export const transformers = (model: ModelData): string[] => {
1523
1523
  autoSnippet.push(
1524
1524
  "# Load model directly",
1525
1525
  `from transformers import ${info.auto_model}`,
1526
- `model = ${info.auto_model}.from_pretrained("${model.id}"` + remote_code_snippet + ', torch_dtype="auto"),'
1526
+ `model = ${info.auto_model}.from_pretrained("${model.id}"` + remote_code_snippet + ', torch_dtype="auto")'
1527
1527
  );
1528
1528
  }
1529
1529
 
@@ -1703,6 +1703,16 @@ export const vfimamba = (model: ModelData): string[] => [
1703
1703
  model = Model.from_pretrained("${model.id}")`,
1704
1704
  ];
1705
1705
 
1706
+ export const lvface = (model: ModelData): string[] => [
1707
+ `from huggingface_hub import hf_hub_download
1708
+ from inference_onnx import LVFaceONNXInferencer
1709
+
1710
+ model_path = hf_hub_download("${model.id}", "LVFace-L_Glint360K/LVFace-L_Glint360K.onnx")
1711
+ inferencer = LVFaceONNXInferencer(model_path, use_gpu=True, timeout=300)
1712
+ img_path = 'path/to/image1.jpg'
1713
+ embedding = inferencer.infer_from_image(img_path)`,
1714
+ ];
1715
+
1706
1716
  export const voicecraft = (model: ModelData): string[] => [
1707
1717
  `from voicecraft import VoiceCraft
1708
1718
 
@@ -130,6 +130,13 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
130
130
  filter: false,
131
131
  countDownloads: `path:"llm_config.json"`,
132
132
  },
133
+ bboxmaskpose: {
134
+ prettyLabel: "BBoxMaskPose",
135
+ repoName: "BBoxMaskPose",
136
+ repoUrl: "https://github.com/MiraPurkrabek/BBoxMaskPose",
137
+ filter: false,
138
+ countDownloads: `path_extension:"pth"`,
139
+ },
133
140
  ben2: {
134
141
  prettyLabel: "BEN2",
135
142
  repoName: "BEN2",
@@ -1156,6 +1163,13 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
1156
1163
  countDownloads: `path_extension:"pkl"`,
1157
1164
  snippets: snippets.vfimamba,
1158
1165
  },
1166
+ lvface: {
1167
+ prettyLabel: "LVFace",
1168
+ repoName: "LVFace",
1169
+ repoUrl: "https://github.com/bytedance/LVFace",
1170
+ countDownloads: `path_extension:"pt" OR path_extension:"onnx"`,
1171
+ snippets: snippets.lvface,
1172
+ },
1159
1173
  voicecraft: {
1160
1174
  prettyLabel: "VoiceCraft",
1161
1175
  repoName: "VoiceCraft",
@@ -40,7 +40,7 @@ const taskData: TaskDataCustom = {
40
40
  },
41
41
  {
42
42
  description: "Robust model that can take in image and text and generate image and text.",
43
- id: "deepseek-ai/Janus-Pro-7B",
43
+ id: "OmniGen2/OmniGen2",
44
44
  },
45
45
  {
46
46
  description: "Any-to-any model with speech, video, audio, image and text understanding capabilities.",
@@ -48,13 +48,13 @@ const taskData: TaskDataCustom = {
48
48
  },
49
49
  {
50
50
  description: "A model that can understand image and text and generate image and text.",
51
- id: "EPFL-VILAB/4M-21_XL",
51
+ id: "ByteDance-Seed/BAGEL-7B-MoT",
52
52
  },
53
53
  ],
54
54
  spaces: [
55
55
  {
56
56
  description: "An application to chat with an any-to-any (image & text) model.",
57
- id: "deepseek-ai/Janus-Pro-7B",
57
+ id: "OmniGen2/OmniGen2",
58
58
  },
59
59
  ],
60
60
  summary: "Any-to-any models can understand two or more modalities and output two or more modalities.",
@@ -44,8 +44,8 @@ const taskData: TaskDataCustom = {
44
44
  models: [
45
45
  {
46
46
  // TO DO: write description
47
- description: "Solid semantic segmentation model trained on ADE20k.",
48
- id: "openmmlab/upernet-convnext-small",
47
+ description: "Solid panoptic segmentation model trained on COCO.",
48
+ id: "tue-mps/coco_panoptic_eomt_large_640",
49
49
  },
50
50
  {
51
51
  description: "Background removal model.",
@@ -47,33 +47,21 @@ const taskData: TaskDataCustom = {
47
47
  id: "HuggingFaceTB/SmolVLM-Instruct",
48
48
  },
49
49
  {
50
- description: "A screenshot understanding model used to control computers.",
51
- id: "microsoft/OmniParser-v2.0",
50
+ description: "Cutting-edge reasoning vision language model.",
51
+ id: "zai-org/GLM-4.5V",
52
52
  },
53
53
  {
54
- description: "Cutting-edge vision language model.",
55
- id: "allenai/Molmo-7B-D-0924",
54
+ description: "Cutting-edge small vision language model to convert documents to text.",
55
+ id: "rednote-hilab/dots.ocr",
56
56
  },
57
57
  {
58
58
  description: "Small yet powerful model.",
59
- id: "vikhyatk/moondream2",
60
- },
61
- {
62
- description: "Strong image-text-to-text model.",
63
- id: "Qwen/Qwen2.5-VL-7B-Instruct",
59
+ id: "Qwen/Qwen2.5-VL-3B-Instruct",
64
60
  },
65
61
  {
66
62
  description: "Image-text-to-text model with agentic capabilities.",
67
63
  id: "microsoft/Magma-8B",
68
64
  },
69
- {
70
- description: "Strong image-text-to-text model focused on documents.",
71
- id: "allenai/olmOCR-7B-0225-preview",
72
- },
73
- {
74
- description: "Small yet strong image-text-to-text model.",
75
- id: "ibm-granite/granite-vision-3.2-2b",
76
- },
77
65
  ],
78
66
  spaces: [
79
67
  {
@@ -81,33 +69,17 @@ const taskData: TaskDataCustom = {
81
69
  id: "opencompass/open_vlm_leaderboard",
82
70
  },
83
71
  {
84
- description: "Vision language models arena, where models are ranked by votes of users.",
85
- id: "WildVision/vision-arena",
86
- },
87
- {
88
- description: "Powerful vision-language model assistant.",
89
- id: "akhaliq/Molmo-7B-D-0924",
90
- },
91
- {
92
- description: "Powerful vision language assistant that can understand multiple images.",
93
- id: "HuggingFaceTB/SmolVLM2",
94
- },
95
- {
96
- description: "An application for chatting with an image-text-to-text model.",
97
- id: "GanymedeNil/Qwen2-VL-7B",
98
- },
99
- {
100
- description: "An application that parses screenshots into actions.",
101
- id: "showlab/ShowUI",
72
+ description: "An application that compares object detection capabilities of different vision language models.",
73
+ id: "sergiopaniego/vlm_object_understanding",
102
74
  },
103
75
  {
104
- description: "An application that detects gaze.",
105
- id: "moondream/gaze-demo",
76
+ description: "An application to compare different OCR models.",
77
+ id: "prithivMLmods/Multimodal-OCR",
106
78
  },
107
79
  ],
108
80
  summary:
109
81
  "Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input.",
110
- widgetModels: ["Qwen/Qwen2-VL-7B-Instruct"],
82
+ widgetModels: ["zai-org/GLM-4.5V"],
111
83
  youtubeId: "IoGaGfU1CIg",
112
84
  };
113
85
 
@@ -33,8 +33,8 @@ const taskData: TaskDataCustom = {
33
33
  id: "TencentARC/InstantMesh",
34
34
  },
35
35
  {
36
- description: "Fast image-to-3D mesh model by StabilityAI",
37
- id: "stabilityai/TripoSR",
36
+ description: "3D world generation model.",
37
+ id: "tencent/HunyuanWorld-1",
38
38
  },
39
39
  {
40
40
  description: "A scaled up image-to-3D mesh model derived from TripoSR.",
@@ -53,16 +53,16 @@ const taskData: TaskDataCustom = {
53
53
  id: "fal/AuraSR-v2",
54
54
  },
55
55
  {
56
- description: "A model that increases the resolution of an image.",
57
- id: "keras-io/super-resolution",
56
+ description: "Powerful image editing model.",
57
+ id: "black-forest-labs/FLUX.1-Kontext-dev",
58
58
  },
59
59
  {
60
- description: "A model for applying edits to images through image controls.",
61
- id: "Yuanshi/OminiControl",
60
+ description: "Virtual try-on model.",
61
+ id: "yisol/IDM-VTON",
62
62
  },
63
63
  {
64
- description: "A model that generates images based on segments in the input image and the text prompt.",
65
- id: "mfidabel/controlnet-segment-anything",
64
+ description: "Image re-lighting model.",
65
+ id: "kontext-community/relighting-kontext-dev-lora-v3",
66
66
  },
67
67
  {
68
68
  description: "Strong model for inpainting and outpainting.",
@@ -75,33 +75,21 @@ const taskData: TaskDataCustom = {
75
75
  ],
76
76
  spaces: [
77
77
  {
78
- description: "Image enhancer application for low light.",
79
- id: "keras-io/low-light-image-enhancement",
78
+ description: "Image editing application.",
79
+ id: "black-forest-labs/FLUX.1-Kontext-Dev",
80
80
  },
81
81
  {
82
- description: "Style transfer application.",
83
- id: "keras-io/neural-style-transfer",
82
+ description: "Image relighting application.",
83
+ id: "lllyasviel/iclight-v2-vary",
84
84
  },
85
85
  {
86
- description: "An application that generates images based on segment control.",
87
- id: "mfidabel/controlnet-segment-anything",
88
- },
89
- {
90
- description: "Image generation application that takes image control and text prompt.",
91
- id: "hysts/ControlNet",
92
- },
93
- {
94
- description: "Colorize any image using this app.",
95
- id: "ioclab/brightness-controlnet",
96
- },
97
- {
98
- description: "Edit images with instructions.",
99
- id: "timbrooks/instruct-pix2pix",
86
+ description: "An application for image upscaling.",
87
+ id: "jasperai/Flux.1-dev-Controlnet-Upscaler",
100
88
  },
101
89
  ],
102
90
  summary:
103
91
  "Image-to-image is the task of transforming an input image through a variety of possible manipulations and enhancements, such as super-resolution, image inpainting, colorization, and more.",
104
- widgetModels: ["stabilityai/stable-diffusion-2-inpainting"],
92
+ widgetModels: ["Qwen/Qwen-Image"],
105
93
  youtubeId: "",
106
94
  };
107
95
 
@@ -31,46 +31,26 @@ const taskData: TaskDataCustom = {
31
31
  metrics: [],
32
32
  models: [
33
33
  {
34
- description: "A robust image captioning model.",
35
- id: "Salesforce/blip2-opt-2.7b",
34
+ description: "Strong OCR model.",
35
+ id: "allenai/olmOCR-7B-0725",
36
36
  },
37
37
  {
38
- description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
39
- id: "microsoft/kosmos-2-patch14-224",
40
- },
41
- {
42
- description: "A strong optical character recognition model.",
43
- id: "facebook/nougat-base",
44
- },
45
- {
46
- description: "A powerful model that lets you have a conversation with the image.",
47
- id: "llava-hf/llava-1.5-7b-hf",
38
+ description: "Powerful image captioning model.",
39
+ id: "fancyfeast/llama-joycaption-beta-one-hf-llava",
48
40
  },
49
41
  ],
50
42
  spaces: [
51
43
  {
52
- description: "An application that compares various image captioning models.",
53
- id: "nielsr/comparing-captioning-models",
54
- },
55
- {
56
- description: "A robust image captioning application.",
57
- id: "flax-community/image-captioning",
58
- },
59
- {
60
- description: "An application that transcribes handwritings into text.",
61
- id: "nielsr/TrOCR-handwritten",
62
- },
63
- {
64
- description: "An application that can caption images and answer questions about a given image.",
65
- id: "Salesforce/BLIP",
44
+ description: "SVG generator app from images.",
45
+ id: "multimodalart/OmniSVG-3B",
66
46
  },
67
47
  {
68
- description: "An application that can caption images and answer questions with a conversational agent.",
69
- id: "Salesforce/BLIP2",
48
+ description: "An application that converts documents to markdown.",
49
+ id: "numind/NuMarkdown-8B-Thinking",
70
50
  },
71
51
  {
72
- description: "An image captioning application that demonstrates the effect of noise on captions.",
73
- id: "johko/capdec-image-captioning",
52
+ description: "An application that can caption images.",
53
+ id: "fancyfeast/joy-caption-beta-one",
74
54
  },
75
55
  ],
76
56
  summary:
@@ -33,11 +33,11 @@ const taskData: TaskDataCustom = {
33
33
  },
34
34
  {
35
35
  description: "Strong keypoint detection model used to detect human pose.",
36
- id: "facebook/sapiens-pose-1b",
36
+ id: "qualcomm/RTMPose-Body2d",
37
37
  },
38
38
  {
39
- description: "Powerful keypoint detection model used to detect human pose.",
40
- id: "usyd-community/vitpose-plus-base",
39
+ description: "Powerful keypoint matching model.",
40
+ id: "ETH-CVG/lightglue_disk",
41
41
  },
42
42
  ],
43
43
  spaces: [
@@ -46,8 +46,8 @@ const taskData: TaskDataCustom = {
46
46
  id: "datasciencedojo/Hand-Keypoint-Detection-Realtime",
47
47
  },
48
48
  {
49
- description: "An application to try a universal keypoint detection model.",
50
- id: "merve/SuperPoint",
49
+ description: "An application for keypoint detection and matching.",
50
+ id: "ETH-CVG/LightGlue",
51
51
  },
52
52
  ],
53
53
  summary: "Keypoint detection is the task of identifying meaningful distinctive points or features in an image.",
@@ -61,8 +61,8 @@ const taskData: TaskDataCustom = {
61
61
  ],
62
62
  spaces: [
63
63
  {
64
- description: "Leaderboard to compare various object detection models across several metrics.",
65
- id: "hf-vision/object_detection_leaderboard",
64
+ description: "Real-time object detection demo.",
65
+ id: "Roboflow/RF-DETR",
66
66
  },
67
67
  {
68
68
  description: "An application that contains various object detection models to try from.",