@huggingface/tasks 0.13.15 → 0.13.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. package/dist/commonjs/local-apps.js +9 -9
  2. package/dist/commonjs/local-apps.spec.js +2 -8
  3. package/dist/commonjs/model-data.d.ts +3 -0
  4. package/dist/commonjs/model-data.d.ts.map +1 -1
  5. package/dist/commonjs/model-libraries-snippets.d.ts +3 -1
  6. package/dist/commonjs/model-libraries-snippets.d.ts.map +1 -1
  7. package/dist/commonjs/model-libraries-snippets.js +134 -22
  8. package/dist/commonjs/model-libraries.d.ts +12 -8
  9. package/dist/commonjs/model-libraries.d.ts.map +1 -1
  10. package/dist/commonjs/model-libraries.js +15 -8
  11. package/dist/commonjs/tasks/audio-to-audio/data.d.ts.map +1 -1
  12. package/dist/commonjs/tasks/audio-to-audio/data.js +0 -4
  13. package/dist/commonjs/tasks/fill-mask/data.js +2 -2
  14. package/dist/commonjs/tasks/image-classification/data.d.ts.map +1 -1
  15. package/dist/commonjs/tasks/image-classification/data.js +2 -3
  16. package/dist/commonjs/tasks/image-feature-extraction/data.d.ts.map +1 -1
  17. package/dist/commonjs/tasks/image-feature-extraction/data.js +8 -3
  18. package/dist/commonjs/tasks/image-text-to-text/data.d.ts.map +1 -1
  19. package/dist/commonjs/tasks/image-text-to-text/data.js +19 -11
  20. package/dist/commonjs/tasks/image-to-3d/data.js +4 -4
  21. package/dist/commonjs/tasks/image-to-image/data.d.ts.map +1 -1
  22. package/dist/commonjs/tasks/image-to-image/data.js +12 -4
  23. package/dist/commonjs/tasks/index.js +1 -1
  24. package/dist/commonjs/tasks/keypoint-detection/data.d.ts.map +1 -1
  25. package/dist/commonjs/tasks/keypoint-detection/data.js +4 -0
  26. package/dist/commonjs/tasks/object-detection/data.d.ts.map +1 -1
  27. package/dist/commonjs/tasks/object-detection/data.js +6 -2
  28. package/dist/commonjs/tasks/sentence-similarity/data.d.ts.map +1 -1
  29. package/dist/commonjs/tasks/sentence-similarity/data.js +5 -1
  30. package/dist/commonjs/tasks/text-generation/data.d.ts.map +1 -1
  31. package/dist/commonjs/tasks/text-generation/data.js +17 -13
  32. package/dist/commonjs/tasks/text-to-image/data.d.ts.map +1 -1
  33. package/dist/commonjs/tasks/text-to-image/data.js +4 -0
  34. package/dist/commonjs/tasks/text-to-speech/data.d.ts.map +1 -1
  35. package/dist/commonjs/tasks/text-to-speech/data.js +9 -9
  36. package/dist/commonjs/tasks/text-to-video/data.js +5 -5
  37. package/dist/commonjs/tasks/video-text-to-text/data.js +4 -4
  38. package/dist/commonjs/tasks/zero-shot-classification/data.js +2 -2
  39. package/dist/commonjs/tasks/zero-shot-image-classification/data.d.ts.map +1 -1
  40. package/dist/commonjs/tasks/zero-shot-image-classification/data.js +8 -4
  41. package/dist/esm/local-apps.js +9 -9
  42. package/dist/esm/local-apps.spec.js +2 -8
  43. package/dist/esm/model-data.d.ts +3 -0
  44. package/dist/esm/model-data.d.ts.map +1 -1
  45. package/dist/esm/model-libraries-snippets.d.ts +3 -1
  46. package/dist/esm/model-libraries-snippets.d.ts.map +1 -1
  47. package/dist/esm/model-libraries-snippets.js +129 -19
  48. package/dist/esm/model-libraries.d.ts +12 -8
  49. package/dist/esm/model-libraries.d.ts.map +1 -1
  50. package/dist/esm/model-libraries.js +15 -8
  51. package/dist/esm/tasks/audio-to-audio/data.d.ts.map +1 -1
  52. package/dist/esm/tasks/audio-to-audio/data.js +0 -4
  53. package/dist/esm/tasks/fill-mask/data.js +2 -2
  54. package/dist/esm/tasks/image-classification/data.d.ts.map +1 -1
  55. package/dist/esm/tasks/image-classification/data.js +2 -3
  56. package/dist/esm/tasks/image-feature-extraction/data.d.ts.map +1 -1
  57. package/dist/esm/tasks/image-feature-extraction/data.js +8 -3
  58. package/dist/esm/tasks/image-text-to-text/data.d.ts.map +1 -1
  59. package/dist/esm/tasks/image-text-to-text/data.js +19 -11
  60. package/dist/esm/tasks/image-to-3d/data.js +4 -4
  61. package/dist/esm/tasks/image-to-image/data.d.ts.map +1 -1
  62. package/dist/esm/tasks/image-to-image/data.js +12 -4
  63. package/dist/esm/tasks/index.js +1 -1
  64. package/dist/esm/tasks/keypoint-detection/data.d.ts.map +1 -1
  65. package/dist/esm/tasks/keypoint-detection/data.js +4 -0
  66. package/dist/esm/tasks/object-detection/data.d.ts.map +1 -1
  67. package/dist/esm/tasks/object-detection/data.js +6 -2
  68. package/dist/esm/tasks/sentence-similarity/data.d.ts.map +1 -1
  69. package/dist/esm/tasks/sentence-similarity/data.js +5 -1
  70. package/dist/esm/tasks/text-generation/data.d.ts.map +1 -1
  71. package/dist/esm/tasks/text-generation/data.js +17 -13
  72. package/dist/esm/tasks/text-to-image/data.d.ts.map +1 -1
  73. package/dist/esm/tasks/text-to-image/data.js +4 -0
  74. package/dist/esm/tasks/text-to-speech/data.d.ts.map +1 -1
  75. package/dist/esm/tasks/text-to-speech/data.js +9 -9
  76. package/dist/esm/tasks/text-to-video/data.js +5 -5
  77. package/dist/esm/tasks/video-text-to-text/data.js +4 -4
  78. package/dist/esm/tasks/zero-shot-classification/data.js +2 -2
  79. package/dist/esm/tasks/zero-shot-image-classification/data.d.ts.map +1 -1
  80. package/dist/esm/tasks/zero-shot-image-classification/data.js +8 -4
  81. package/package.json +1 -1
  82. package/src/local-apps.spec.ts +2 -8
  83. package/src/local-apps.ts +9 -9
  84. package/src/model-data.ts +3 -0
  85. package/src/model-libraries-snippets.ts +141 -19
  86. package/src/model-libraries.ts +15 -8
  87. package/src/tasks/audio-to-audio/data.ts +0 -4
  88. package/src/tasks/fill-mask/data.ts +2 -2
  89. package/src/tasks/image-classification/data.ts +2 -3
  90. package/src/tasks/image-feature-extraction/data.ts +8 -3
  91. package/src/tasks/image-text-to-text/about.md +8 -3
  92. package/src/tasks/image-text-to-text/data.ts +19 -11
  93. package/src/tasks/image-to-3d/data.ts +4 -4
  94. package/src/tasks/image-to-image/data.ts +12 -5
  95. package/src/tasks/index.ts +1 -1
  96. package/src/tasks/keypoint-detection/data.ts +4 -0
  97. package/src/tasks/object-detection/data.ts +6 -2
  98. package/src/tasks/sentence-similarity/data.ts +5 -1
  99. package/src/tasks/text-generation/data.ts +17 -14
  100. package/src/tasks/text-to-image/data.ts +4 -0
  101. package/src/tasks/text-to-speech/data.ts +9 -10
  102. package/src/tasks/text-to-video/data.ts +5 -5
  103. package/src/tasks/video-text-to-text/data.ts +4 -4
  104. package/src/tasks/zero-shot-classification/data.ts +2 -2
  105. package/src/tasks/zero-shot-image-classification/data.ts +8 -4
@@ -46,16 +46,20 @@ const taskData = {
46
46
  ],
47
47
  models: [
48
48
  {
49
- description: "Robust image classification model trained on publicly available image-caption data.",
50
- id: "openai/clip-vit-base-patch16",
49
+ description: "Multilingual image classification model for 80 languages.",
50
+ id: "visheratin/mexma-siglip",
51
51
  },
52
52
  {
53
53
  description: "Strong zero-shot image classification model.",
54
54
  id: "google/siglip-so400m-patch14-224",
55
55
  },
56
56
  {
57
- description: "Small yet powerful zero-shot image classification model that can run on edge devices.",
58
- id: "apple/MobileCLIP-S1-OpenCLIP",
57
+ description: "Robust zero-shot image classification model.",
58
+ id: "microsoft/LLM2CLIP-EVA02-L-14-336",
59
+ },
60
+ {
61
+ description: "Powerful zero-shot image classification model supporting 94 languages.",
62
+ id: "jinaai/jina-clip-v2",
59
63
  },
60
64
  {
61
65
  description: "Strong image classification model for biomedical domain.",
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@huggingface/tasks",
3
3
  "packageManager": "pnpm@8.10.5",
4
- "version": "0.13.15",
4
+ "version": "0.13.17",
5
5
  "description": "List of ML tasks for huggingface.co/tasks",
6
6
  "repository": "https://github.com/huggingface/huggingface.js.git",
7
7
  "publishConfig": {
@@ -13,11 +13,7 @@ describe("local-apps", () => {
13
13
  const snippet = snippetFunc(model);
14
14
 
15
15
  expect(snippet[0].content).toEqual(`# Load and run the model:
16
- llama-cli \\
17
- --hf-repo "bartowski/Llama-3.2-3B-Instruct-GGUF" \\
18
- --hf-file {{GGUF_FILE}} \\
19
- -p "You are a helpful assistant" \\
20
- --conversation`);
16
+ llama-cli -hf bartowski/Llama-3.2-3B-Instruct-GGUF`);
21
17
  });
22
18
 
23
19
  it("llama.cpp non-conversational", async () => {
@@ -30,9 +26,7 @@ llama-cli \\
30
26
  const snippet = snippetFunc(model);
31
27
 
32
28
  expect(snippet[0].content).toEqual(`# Load and run the model:
33
- llama-cli \\
34
- --hf-repo "mlabonne/gemma-2b-GGUF" \\
35
- --hf-file {{GGUF_FILE}} \\
29
+ llama-cli -hf mlabonne/gemma-2b-GGUF \\
36
30
  -p "Once upon a time,"`);
37
31
  });
38
32
 
package/src/local-apps.ts CHANGED
@@ -95,17 +95,17 @@ function isMlxModel(model: ModelData) {
95
95
  }
96
96
 
97
97
  const snippetLlamacpp = (model: ModelData, filepath?: string): LocalAppSnippet[] => {
98
+ let tagName = "";
99
+ if (filepath) {
100
+ const quantLabel = parseGGUFQuantLabel(filepath);
101
+ tagName = quantLabel ? `:${quantLabel}` : "";
102
+ }
98
103
  const command = (binary: string) => {
99
- const snippet = [
100
- "# Load and run the model:",
101
- `${binary} \\`,
102
- ` --hf-repo "${model.id}" \\`,
103
- ` --hf-file ${filepath ?? "{{GGUF_FILE}}"} \\`,
104
- ` -p "${model.tags.includes("conversational") ? "You are a helpful assistant" : "Once upon a time,"}"`,
105
- ];
106
- if (model.tags.includes("conversational")) {
104
+ const snippet = ["# Load and run the model:", `${binary} -hf ${model.id}${tagName}`];
105
+ if (!model.tags.includes("conversational")) {
106
+ // for non-conversational models, add a prompt
107
107
  snippet[snippet.length - 1] += " \\";
108
- snippet.push(" --conversation");
108
+ snippet.push(' -p "Once upon a time,"');
109
109
  }
110
110
  return snippet.join("\n");
111
111
  };
package/src/model-data.ts CHANGED
@@ -66,6 +66,9 @@ export interface ModelData {
66
66
  base_model_name_or_path?: string;
67
67
  task_type?: string;
68
68
  };
69
+ keras_hub?: {
70
+ tasks?: string[];
71
+ };
69
72
  };
70
73
  /**
71
74
  * all the model tags
@@ -95,6 +95,29 @@ export const bm25s = (model: ModelData): string[] => [
95
95
  retriever = BM25HF.load_from_hub("${model.id}")`,
96
96
  ];
97
97
 
98
+ export const cxr_foundation = (model: ModelData): string[] => [
99
+ `!git clone https://github.com/Google-Health/cxr-foundation.git
100
+ import tensorflow as tf, sys, requests
101
+ sys.path.append('cxr-foundation/python/')
102
+
103
+ # Install dependencies
104
+ major_version = tf.__version__.rsplit(".", 1)[0]
105
+ !pip install tensorflow-text=={major_version} pypng && pip install --no-deps pydicom hcls_imaging_ml_toolkit retrying
106
+
107
+ # Load image (Stillwaterising, CC0, via Wikimedia Commons)
108
+ from PIL import Image
109
+ from io import BytesIO
110
+ image_url = "https://upload.wikimedia.org/wikipedia/commons/c/c8/Chest_Xray_PA_3-8-2010.png"
111
+ response = requests.get(image_url, headers={'User-Agent': 'Demo'}, stream=True)
112
+ response.raw.decode_content = True # Ensure correct decoding
113
+ img = Image.open(BytesIO(response.content)).convert('L') # Convert to grayscale
114
+
115
+ # Run inference
116
+ from clientside.clients import make_hugging_face_client
117
+ cxr_client = make_hugging_face_client('cxr_model')
118
+ print(cxr_client.get_image_embeddings_from_images([img]))`,
119
+ ];
120
+
98
121
  export const depth_anything_v2 = (model: ModelData): string[] => {
99
122
  let encoder: string;
100
123
  let features: string;
@@ -168,6 +191,28 @@ focallength_px = prediction["focallength_px"]`;
168
191
  return [installSnippet, inferenceSnippet];
169
192
  };
170
193
 
194
+ export const derm_foundation = (model: ModelData): string[] => [
195
+ `from huggingface_hub import from_pretrained_keras
196
+ import tensorflow as tf, requests
197
+
198
+ # Load and format input
199
+ IMAGE_URL = "https://storage.googleapis.com/dx-scin-public-data/dataset/images/3445096909671059178.png"
200
+ input_tensor = tf.train.Example(
201
+ features=tf.train.Features(
202
+ feature={
203
+ "image/encoded": tf.train.Feature(
204
+ bytes_list=tf.train.BytesList(value=[requests.get(IMAGE_URL, stream=True).content])
205
+ )
206
+ }
207
+ )
208
+ ).SerializeToString()
209
+
210
+ # Load model and run inference
211
+ loaded_model = from_pretrained_keras("google/derm-foundation")
212
+ infer = loaded_model.signatures["serving_default"]
213
+ print(infer(inputs=tf.constant([input_tensor])))`,
214
+ ]
215
+
171
216
  const diffusersDefaultPrompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k";
172
217
 
173
218
  const diffusers_default = (model: ModelData) => [
@@ -394,32 +439,103 @@ model = keras.saving.load_model("hf://${model.id}")
394
439
  `,
395
440
  ];
396
441
 
397
- export const keras_nlp = (model: ModelData): string[] => [
398
- `# Available backend options are: "jax", "torch", "tensorflow".
399
- import os
400
- os.environ["KERAS_BACKEND"] = "jax"
442
+ const _keras_hub_causal_lm = (modelId: string): string => `
443
+ import keras_hub
401
444
 
402
- import keras_nlp
445
+ # Load CausalLM model (optional: use half precision for inference)
446
+ causal_lm = keras_hub.models.CausalLM.from_preset(${modelId}, dtype="bfloat16")
447
+ causal_lm.compile(sampler="greedy") # (optional) specify a sampler
403
448
 
404
- tokenizer = keras_nlp.models.Tokenizer.from_preset("hf://${model.id}")
405
- backbone = keras_nlp.models.Backbone.from_preset("hf://${model.id}")
406
- `,
407
- ];
449
+ # Generate text
450
+ causal_lm.generate("Keras: deep learning for", max_length=64)
451
+ `;
408
452
 
409
- export const keras_hub = (model: ModelData): string[] => [
410
- `# Available backend options are: "jax", "torch", "tensorflow".
411
- import os
412
- os.environ["KERAS_BACKEND"] = "jax"
453
+ const _keras_hub_text_to_image = (modelId: string): string => `
454
+ import keras_hub
455
+
456
+ # Load TextToImage model (optional: use half precision for inference)
457
+ text_to_image = keras_hub.models.TextToImage.from_preset(${modelId}, dtype="bfloat16")
413
458
 
459
+ # Generate images with a TextToImage model.
460
+ text_to_image.generate("Astronaut in a jungle")
461
+ `;
462
+
463
+ const _keras_hub_text_classifier = (modelId: string): string => `
414
464
  import keras_hub
415
465
 
416
- # Load a task-specific model (*replace CausalLM with your task*)
417
- model = keras_hub.models.CausalLM.from_preset("hf://${model.id}", dtype="bfloat16")
466
+ # Load TextClassifier model
467
+ text_classifier = keras_hub.models.TextClassifier.from_preset(
468
+ ${modelId},
469
+ num_classes=2,
470
+ )
471
+ # Fine-tune
472
+ text_classifier.fit(x=["Thilling adventure!", "Total snoozefest."], y=[1, 0])
473
+ # Classify text
474
+ text_classifier.predict(["Not my cup of tea."])
475
+ `;
418
476
 
419
- # Possible tasks are CausalLM, TextToImage, ImageClassifier, ...
420
- # full list here: https://keras.io/api/keras_hub/models/#api-documentation
421
- `,
422
- ];
477
+ const _keras_hub_image_classifier = (modelId: string): string => `
478
+ import keras_hub
479
+ import keras
480
+
481
+ # Load ImageClassifier model
482
+ image_classifier = keras_hub.models.ImageClassifier.from_preset(
483
+ ${modelId},
484
+ num_classes=2,
485
+ )
486
+ # Fine-tune
487
+ image_classifier.fit(
488
+ x=keras.random.randint((32, 64, 64, 3), 0, 256),
489
+ y=keras.random.randint((32, 1), 0, 2),
490
+ )
491
+ # Classify image
492
+ image_classifier.predict(keras.random.randint((1, 64, 64, 3), 0, 256))
493
+ `;
494
+
495
+ const _keras_hub_tasks_with_example = {
496
+ CausalLM: _keras_hub_causal_lm,
497
+ TextToImage: _keras_hub_text_to_image,
498
+ TextClassifier: _keras_hub_text_classifier,
499
+ ImageClassifier: _keras_hub_image_classifier,
500
+ };
501
+
502
+ const _keras_hub_task_without_example = (task: string, modelId: string): string => `
503
+ import keras_hub
504
+
505
+ # Create a ${task} model
506
+ task = keras_hub.models.${task}.from_preset(${modelId})
507
+ `;
508
+
509
+ const _keras_hub_generic_backbone = (modelId: string): string => `
510
+ import keras_hub
511
+
512
+ # Create a Backbone model unspecialized for any task
513
+ backbone = keras_hub.models.Backbone.from_preset(${modelId})
514
+ `;
515
+
516
+ export const keras_hub = (model: ModelData): string[] => {
517
+ const modelId = model.id;
518
+ const tasks = model.config?.keras_hub?.tasks ?? [];
519
+
520
+ const snippets: string[] = [];
521
+
522
+ // First, generate tasks with examples
523
+ for (const [task, snippet] of Object.entries(_keras_hub_tasks_with_example)) {
524
+ if (tasks.includes(task)) {
525
+ snippets.push(snippet(modelId));
526
+ }
527
+ }
528
+ // Then, add remaining tasks
529
+ for (const task in tasks) {
530
+ if (!Object.keys(_keras_hub_tasks_with_example).includes(task)) {
531
+ snippets.push(_keras_hub_task_without_example(task, modelId));
532
+ }
533
+ }
534
+ // Finally, add generic backbone snippet
535
+ snippets.push(_keras_hub_generic_backbone(modelId));
536
+
537
+ return snippets;
538
+ };
423
539
 
424
540
  export const llama_cpp_python = (model: ModelData): string[] => {
425
541
  const snippets = [
@@ -845,6 +961,12 @@ model.${speechbrainMethod}("file.wav")`,
845
961
  ];
846
962
  };
847
963
 
964
+ export const terratorch = (model: ModelData): string[] => [
965
+ `from terratorch.registry import BACKBONE_REGISTRY
966
+
967
+ model = BACKBONE_REGISTRY.build("${model.id}")`,
968
+ ];
969
+
848
970
  export const transformers = (model: ModelData): string[] => {
849
971
  const info = model.transformersInfo;
850
972
  if (!info) {
@@ -176,6 +176,7 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
176
176
  prettyLabel: "CXR Foundation",
177
177
  repoName: "cxr-foundation",
178
178
  repoUrl: "https://github.com/google-health/cxr-foundation",
179
+ snippets: snippets.cxr_foundation,
179
180
  filter: false,
180
181
  countDownloads: `path:"precomputed_embeddings/embeddings.npz" OR path:"pax-elixr-b-text/saved_model.pb"`,
181
182
  },
@@ -206,6 +207,7 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
206
207
  prettyLabel: "Derm Foundation",
207
208
  repoName: "derm-foundation",
208
209
  repoUrl: "https://github.com/google-health/derm-foundation",
210
+ snippets: snippets.derm_foundation,
209
211
  filter: false,
210
212
  countDownloads: `path:"scin_dataset_precomputed_embeddings.npz" OR path:"saved_model.pb"`,
211
213
  },
@@ -404,13 +406,6 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
404
406
  snippets: snippets.tf_keras,
405
407
  countDownloads: `path:"saved_model.pb"`,
406
408
  },
407
- "keras-nlp": {
408
- prettyLabel: "KerasNLP",
409
- repoName: "KerasNLP",
410
- repoUrl: "https://github.com/keras-team/keras-nlp",
411
- docsUrl: "https://keras.io/keras_nlp/",
412
- snippets: snippets.keras_nlp,
413
- },
414
409
  "keras-hub": {
415
410
  prettyLabel: "KerasHub",
416
411
  repoName: "KerasHub",
@@ -544,7 +539,10 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
544
539
  repoUrl: "https://github.com/mlfoundations/open_clip",
545
540
  snippets: snippets.open_clip,
546
541
  filter: true,
547
- countDownloads: `path_extension:"bin" AND path_filename:*pytorch_model`,
542
+ countDownloads: `path:"open_clip_model.safetensors"
543
+ OR path:"model.safetensors"
544
+ OR path:"open_clip_pytorch_model.bin"
545
+ OR path:"pytorch_model.bin"`,
548
546
  },
549
547
  paddlenlp: {
550
548
  prettyLabel: "paddlenlp",
@@ -780,6 +778,15 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
780
778
  repoName: "TabPFN",
781
779
  repoUrl: "https://github.com/PriorLabs/TabPFN",
782
780
  },
781
+ terratorch: {
782
+ prettyLabel: "TerraTorch",
783
+ repoName: "TerraTorch",
784
+ repoUrl: "https://github.com/IBM/terratorch",
785
+ docsUrl: "https://ibm.github.io/terratorch/",
786
+ filter: false,
787
+ countDownloads: `path_extension:"pt"`,
788
+ snippets: snippets.terratorch,
789
+ },
783
790
  "tic-clip": {
784
791
  prettyLabel: "TiC-CLIP",
785
792
  repoName: "TiC-CLIP",
@@ -38,10 +38,6 @@ const taskData: TaskDataCustom = {
38
38
  },
39
39
  ],
40
40
  models: [
41
- {
42
- description: "A solid model of audio source separation.",
43
- id: "speechbrain/sepformer-wham",
44
- },
45
41
  {
46
42
  description: "A speech enhancement model.",
47
43
  id: "ResembleAI/resemble-enhance",
@@ -61,8 +61,8 @@ const taskData: TaskDataCustom = {
61
61
  ],
62
62
  models: [
63
63
  {
64
- description: "The famous BERT model.",
65
- id: "google-bert/bert-base-uncased",
64
+ description: "State-of-the-art masked language model.",
65
+ id: "answerdotai/ModernBERT-large",
66
66
  },
67
67
  {
68
68
  description: "A multilingual model trained on 100 languages.",
@@ -74,9 +74,8 @@ const taskData: TaskDataCustom = {
74
74
  ],
75
75
  spaces: [
76
76
  {
77
- // TO DO: write description
78
- description: "An application that classifies what a given image is about.",
79
- id: "nielsr/perceiver-image-classification",
77
+ description: "A leaderboard to evaluate different image classification models.",
78
+ id: "timm/leaderboard",
80
79
  },
81
80
  ],
82
81
  summary:
@@ -43,15 +43,20 @@ const taskData: TaskDataCustom = {
43
43
  id: "facebook/dino-vitb16",
44
44
  },
45
45
  {
46
- description: "Strong image feature extraction model made for information retrieval from documents.",
47
- id: "vidore/colpali",
46
+ description: "Cutting-edge image feature extraction model.",
47
+ id: "apple/aimv2-large-patch14-336-distilled",
48
48
  },
49
49
  {
50
50
  description: "Strong image feature extraction model that can be used on images and documents.",
51
51
  id: "OpenGVLab/InternViT-6B-448px-V1-2",
52
52
  },
53
53
  ],
54
- spaces: [],
54
+ spaces: [
55
+ {
56
+ description: "A leaderboard to evaluate different image-feature-extraction models on classification performances",
57
+ id: "timm/leaderboard",
58
+ },
59
+ ],
55
60
  summary: "Image feature extraction is the task of extracting features learnt in a computer vision model.",
56
61
  widgetModels: [],
57
62
  };
@@ -24,12 +24,16 @@ Vision language models trained on image-text pairs can be used for visual questi
24
24
 
25
25
  ### Document Question Answering and Retrieval
26
26
 
27
- Documents often consist of different layouts, charts, tables, images, and more. Vision language models trained on formatted documents can extract information from them. This is an OCR-free approach; the inputs skip OCR, and documents are directly fed to vision language models.
27
+ Documents often consist of different layouts, charts, tables, images, and more. Vision language models trained on formatted documents can extract information from them. This is an OCR-free approach; the inputs skip OCR, and documents are directly fed to vision language models. To find the relevant documents to be fed, models like [ColPali](https://huggingface.co/blog/manu/colpali) are used. An example workflow can be found [here](https://github.com/merveenoyan/smol-vision/blob/main/ColPali_%2B_Qwen2_VL.ipynb).
28
28
 
29
29
  ### Image Recognition with Instructions
30
30
 
31
31
  Vision language models can recognize images through descriptions. When given detailed descriptions of specific entities, it can classify the entities in an image.
32
32
 
33
+ ### Computer Use
34
+
35
+ Image-text-to-text models can be used to control computers with agentic workflows. Models like [ShowUI](https://huggingface.co/showlab/ShowUI-2B) and [OmniParser](https://huggingface.co/microsoft/OmniParser) are used to parse screenshots to later take actions on the computer autonomously.
36
+
33
37
  ## Inference
34
38
 
35
39
  You can use the Transformers library to interact with [vision-language models](https://huggingface.co/models?pipeline_tag=image-text-to-text&transformers). Specifically, `pipeline` makes it easy to infer models.
@@ -82,7 +86,8 @@ curl https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-11B-Vision
82
86
  ## Useful Resources
83
87
 
84
88
  - [Vision Language Models Explained](https://huggingface.co/blog/vlms)
85
- - [Open-source Multimodality and How to Achieve it using Hugging Face](https://www.youtube.com/watch?v=IoGaGfU1CIg&t=601s)
86
- - [Introducing Idefics2: A Powerful 8B Vision-Language Model for the community](https://huggingface.co/blog/idefics2)
89
+ - [Welcome PaliGemma 2 – New vision language models by Google](https://huggingface.co/blog/paligemma2)
90
+ - [SmolVLM - small yet mighty Vision Language Model](https://huggingface.co/blog/smolvlm)
91
+ - [Multimodal RAG using ColPali and Qwen2-VL](https://github.com/merveenoyan/smol-vision/blob/main/ColPali_%2B_Qwen2_VL.ipynb)
87
92
  - [Image-text-to-text task guide](https://huggingface.co/tasks/image-text-to-text)
88
93
  - [Preference Optimization for Vision Language Models with TRL](https://huggingface.co/blog/dpo_vlm)
@@ -7,8 +7,8 @@ const taskData: TaskDataCustom = {
7
7
  id: "liuhaotian/LLaVA-Instruct-150K",
8
8
  },
9
9
  {
10
- description: "Conversation turns where questions involve image and text.",
11
- id: "liuhaotian/LLaVA-Pretrain",
10
+ description: "Collection of image-text pairs on scientific topics.",
11
+ id: "DAMO-NLP-SG/multimodal_textbook",
12
12
  },
13
13
  {
14
14
  description: "A collection of datasets made for model fine-tuning.",
@@ -43,11 +43,15 @@ const taskData: TaskDataCustom = {
43
43
  metrics: [],
44
44
  models: [
45
45
  {
46
- description: "Powerful vision language model with great visual understanding and reasoning capabilities.",
47
- id: "meta-llama/Llama-3.2-11B-Vision-Instruct",
46
+ description: "Small and efficient yet powerful vision language model.",
47
+ id: "HuggingFaceTB/SmolVLM-Instruct",
48
48
  },
49
49
  {
50
- description: "Cutting-edge vision language models.",
50
+ description: "A screenshot understanding model used to control computers.",
51
+ id: "showlab/ShowUI-2B",
52
+ },
53
+ {
54
+ description: "Cutting-edge vision language model.",
51
55
  id: "allenai/Molmo-7B-D-0924",
52
56
  },
53
57
  {
@@ -59,8 +63,8 @@ const taskData: TaskDataCustom = {
59
63
  id: "Qwen/Qwen2-VL-7B-Instruct",
60
64
  },
61
65
  {
62
- description: "Strong image-text-to-text model.",
63
- id: "mistralai/Pixtral-12B-2409",
66
+ description: "Image-text-to-text model with reasoning capabilities.",
67
+ id: "Qwen/QVQ-72B-Preview",
64
68
  },
65
69
  {
66
70
  description: "Strong image-text-to-text model focused on documents.",
@@ -84,14 +88,18 @@ const taskData: TaskDataCustom = {
84
88
  description: "An image-text-to-text application focused on documents.",
85
89
  id: "stepfun-ai/GOT_official_online_demo",
86
90
  },
87
- {
88
- description: "An application to compare outputs of different vision language models.",
89
- id: "merve/compare_VLMs",
90
- },
91
91
  {
92
92
  description: "An application for chatting with an image-text-to-text model.",
93
93
  id: "GanymedeNil/Qwen2-VL-7B",
94
94
  },
95
+ {
96
+ description: "An application that parses screenshots into actions.",
97
+ id: "showlab/ShowUI",
98
+ },
99
+ {
100
+ description: "An application that detects gaze.",
101
+ id: "smoondream/gaze-demo",
102
+ },
95
103
  ],
96
104
  summary:
97
105
  "Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input.",
@@ -41,8 +41,8 @@ const taskData: TaskDataCustom = {
41
41
  id: "hwjiang/Real3D",
42
42
  },
43
43
  {
44
- description: "Generative 3D gaussian splatting model.",
45
- id: "ashawkey/LGM",
44
+ description: "Consistent image-to-3d generation model.",
45
+ id: "stabilityai/stable-point-aware-3d",
46
46
  },
47
47
  ],
48
48
  spaces: [
@@ -55,8 +55,8 @@ const taskData: TaskDataCustom = {
55
55
  id: "TencentARC/InstantMesh",
56
56
  },
57
57
  {
58
- description: "Image-to-3D demo with mesh outputs.",
59
- id: "stabilityai/TripoSR",
58
+ description: "Image-to-3D demo.",
59
+ id: "stabilityai/stable-point-aware-3d",
60
60
  },
61
61
  {
62
62
  description: "Image-to-3D demo with mesh outputs.",
@@ -10,6 +10,10 @@ const taskData: TaskDataCustom = {
10
10
  description: "Multiple images of celebrities, used for facial expression translation",
11
11
  id: "huggan/CelebA-faces",
12
12
  },
13
+ {
14
+ description: "12M image-caption pairs.",
15
+ id: "Spawning/PD12M",
16
+ },
13
17
  ],
14
18
  demo: {
15
19
  inputs: [
@@ -53,17 +57,20 @@ const taskData: TaskDataCustom = {
53
57
  id: "keras-io/super-resolution",
54
58
  },
55
59
  {
56
- description:
57
- "A model that creates a set of variations of the input image in the style of DALL-E using Stable Diffusion.",
58
- id: "lambdalabs/sd-image-variations-diffusers",
60
+ description: "A model for applying edits to images through image controls.",
61
+ id: "Yuanshi/OminiControl",
59
62
  },
60
63
  {
61
64
  description: "A model that generates images based on segments in the input image and the text prompt.",
62
65
  id: "mfidabel/controlnet-segment-anything",
63
66
  },
64
67
  {
65
- description: "A model that takes an image and an instruction to edit the image.",
66
- id: "timbrooks/instruct-pix2pix",
68
+ description: "Strong model for inpainting and outpainting.",
69
+ id: "black-forest-labs/FLUX.1-Fill-dev",
70
+ },
71
+ {
72
+ description: "Strong model for image editing using depth maps.",
73
+ id: "black-forest-labs/FLUX.1-Depth-dev-lora",
67
74
  },
68
75
  ],
69
76
  spaces: [
@@ -132,7 +132,7 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
132
132
  "video-classification": ["transformers"],
133
133
  "mask-generation": ["transformers"],
134
134
  "multiple-choice": ["transformers"],
135
- "object-detection": ["transformers", "transformers.js"],
135
+ "object-detection": ["transformers", "transformers.js", "ultralytics"],
136
136
  other: [],
137
137
  "question-answering": ["adapter-transformers", "allennlp", "transformers", "transformers.js"],
138
138
  robotics: [],
@@ -31,6 +31,10 @@ const taskData: TaskDataCustom = {
31
31
  description: "Strong keypoint detection model used to detect human pose.",
32
32
  id: "facebook/sapiens-pose-1b",
33
33
  },
34
+ {
35
+ description: "Powerful keypoint detection model used to detect human pose.",
36
+ id: "usyd-community/vitpose-plus-base",
37
+ },
34
38
  ],
35
39
  spaces: [
36
40
  {
@@ -54,6 +54,10 @@ const taskData: TaskDataCustom = {
54
54
  description: "Fast and accurate object detection model trained on COCO and Object365 datasets.",
55
55
  id: "PekingU/rtdetr_r18vd_coco_o365",
56
56
  },
57
+ {
58
+ description: "Object detection model for low-lying objects.",
59
+ id: "StephanST/WALDO30",
60
+ },
57
61
  ],
58
62
  spaces: [
59
63
  {
@@ -65,8 +69,8 @@ const taskData: TaskDataCustom = {
65
69
  id: "Gradio-Blocks/Object-Detection-With-DETR-and-YOLOS",
66
70
  },
67
71
  {
68
- description: "An application that shows multiple cutting edge techniques for object detection and tracking.",
69
- id: "kadirnar/torchyolo",
72
+ description: "A cutting-edge object detection application.",
73
+ id: "Ultralytics/YOLO11",
70
74
  },
71
75
  {
72
76
  description: "An object tracking, segmentation and inpainting application.",
@@ -69,9 +69,13 @@ const taskData: TaskDataCustom = {
69
69
  id: "sentence-transformers/all-mpnet-base-v2",
70
70
  },
71
71
  {
72
- description: "A multilingual robust sentence similarity model..",
72
+ description: "A multilingual robust sentence similarity model.",
73
73
  id: "BAAI/bge-m3",
74
74
  },
75
+ {
76
+ description: "A robust sentence similarity model.",
77
+ id: "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5",
78
+ },
75
79
  ],
76
80
  spaces: [
77
81
  {