@huggingface/tasks 0.13.15 → 0.13.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commonjs/local-apps.js +9 -9
- package/dist/commonjs/local-apps.spec.js +2 -8
- package/dist/commonjs/model-data.d.ts +3 -0
- package/dist/commonjs/model-data.d.ts.map +1 -1
- package/dist/commonjs/model-libraries-snippets.d.ts +3 -1
- package/dist/commonjs/model-libraries-snippets.d.ts.map +1 -1
- package/dist/commonjs/model-libraries-snippets.js +134 -22
- package/dist/commonjs/model-libraries.d.ts +12 -8
- package/dist/commonjs/model-libraries.d.ts.map +1 -1
- package/dist/commonjs/model-libraries.js +15 -8
- package/dist/commonjs/tasks/audio-to-audio/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/audio-to-audio/data.js +0 -4
- package/dist/commonjs/tasks/fill-mask/data.js +2 -2
- package/dist/commonjs/tasks/image-classification/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/image-classification/data.js +2 -3
- package/dist/commonjs/tasks/image-feature-extraction/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/image-feature-extraction/data.js +8 -3
- package/dist/commonjs/tasks/image-text-to-text/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/image-text-to-text/data.js +19 -11
- package/dist/commonjs/tasks/image-to-3d/data.js +4 -4
- package/dist/commonjs/tasks/image-to-image/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/image-to-image/data.js +12 -4
- package/dist/commonjs/tasks/index.js +1 -1
- package/dist/commonjs/tasks/keypoint-detection/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/keypoint-detection/data.js +4 -0
- package/dist/commonjs/tasks/object-detection/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/object-detection/data.js +6 -2
- package/dist/commonjs/tasks/sentence-similarity/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/sentence-similarity/data.js +5 -1
- package/dist/commonjs/tasks/text-generation/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/text-generation/data.js +17 -13
- package/dist/commonjs/tasks/text-to-image/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/text-to-image/data.js +4 -0
- package/dist/commonjs/tasks/text-to-speech/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/text-to-speech/data.js +9 -9
- package/dist/commonjs/tasks/text-to-video/data.js +5 -5
- package/dist/commonjs/tasks/video-text-to-text/data.js +4 -4
- package/dist/commonjs/tasks/zero-shot-classification/data.js +2 -2
- package/dist/commonjs/tasks/zero-shot-image-classification/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/zero-shot-image-classification/data.js +8 -4
- package/dist/esm/local-apps.js +9 -9
- package/dist/esm/local-apps.spec.js +2 -8
- package/dist/esm/model-data.d.ts +3 -0
- package/dist/esm/model-data.d.ts.map +1 -1
- package/dist/esm/model-libraries-snippets.d.ts +3 -1
- package/dist/esm/model-libraries-snippets.d.ts.map +1 -1
- package/dist/esm/model-libraries-snippets.js +129 -19
- package/dist/esm/model-libraries.d.ts +12 -8
- package/dist/esm/model-libraries.d.ts.map +1 -1
- package/dist/esm/model-libraries.js +15 -8
- package/dist/esm/tasks/audio-to-audio/data.d.ts.map +1 -1
- package/dist/esm/tasks/audio-to-audio/data.js +0 -4
- package/dist/esm/tasks/fill-mask/data.js +2 -2
- package/dist/esm/tasks/image-classification/data.d.ts.map +1 -1
- package/dist/esm/tasks/image-classification/data.js +2 -3
- package/dist/esm/tasks/image-feature-extraction/data.d.ts.map +1 -1
- package/dist/esm/tasks/image-feature-extraction/data.js +8 -3
- package/dist/esm/tasks/image-text-to-text/data.d.ts.map +1 -1
- package/dist/esm/tasks/image-text-to-text/data.js +19 -11
- package/dist/esm/tasks/image-to-3d/data.js +4 -4
- package/dist/esm/tasks/image-to-image/data.d.ts.map +1 -1
- package/dist/esm/tasks/image-to-image/data.js +12 -4
- package/dist/esm/tasks/index.js +1 -1
- package/dist/esm/tasks/keypoint-detection/data.d.ts.map +1 -1
- package/dist/esm/tasks/keypoint-detection/data.js +4 -0
- package/dist/esm/tasks/object-detection/data.d.ts.map +1 -1
- package/dist/esm/tasks/object-detection/data.js +6 -2
- package/dist/esm/tasks/sentence-similarity/data.d.ts.map +1 -1
- package/dist/esm/tasks/sentence-similarity/data.js +5 -1
- package/dist/esm/tasks/text-generation/data.d.ts.map +1 -1
- package/dist/esm/tasks/text-generation/data.js +17 -13
- package/dist/esm/tasks/text-to-image/data.d.ts.map +1 -1
- package/dist/esm/tasks/text-to-image/data.js +4 -0
- package/dist/esm/tasks/text-to-speech/data.d.ts.map +1 -1
- package/dist/esm/tasks/text-to-speech/data.js +9 -9
- package/dist/esm/tasks/text-to-video/data.js +5 -5
- package/dist/esm/tasks/video-text-to-text/data.js +4 -4
- package/dist/esm/tasks/zero-shot-classification/data.js +2 -2
- package/dist/esm/tasks/zero-shot-image-classification/data.d.ts.map +1 -1
- package/dist/esm/tasks/zero-shot-image-classification/data.js +8 -4
- package/package.json +1 -1
- package/src/local-apps.spec.ts +2 -8
- package/src/local-apps.ts +9 -9
- package/src/model-data.ts +3 -0
- package/src/model-libraries-snippets.ts +141 -19
- package/src/model-libraries.ts +15 -8
- package/src/tasks/audio-to-audio/data.ts +0 -4
- package/src/tasks/fill-mask/data.ts +2 -2
- package/src/tasks/image-classification/data.ts +2 -3
- package/src/tasks/image-feature-extraction/data.ts +8 -3
- package/src/tasks/image-text-to-text/about.md +8 -3
- package/src/tasks/image-text-to-text/data.ts +19 -11
- package/src/tasks/image-to-3d/data.ts +4 -4
- package/src/tasks/image-to-image/data.ts +12 -5
- package/src/tasks/index.ts +1 -1
- package/src/tasks/keypoint-detection/data.ts +4 -0
- package/src/tasks/object-detection/data.ts +6 -2
- package/src/tasks/sentence-similarity/data.ts +5 -1
- package/src/tasks/text-generation/data.ts +17 -14
- package/src/tasks/text-to-image/data.ts +4 -0
- package/src/tasks/text-to-speech/data.ts +9 -10
- package/src/tasks/text-to-video/data.ts +5 -5
- package/src/tasks/video-text-to-text/data.ts +4 -4
- package/src/tasks/zero-shot-classification/data.ts +2 -2
- package/src/tasks/zero-shot-image-classification/data.ts +8 -4
|
@@ -46,16 +46,20 @@ const taskData = {
|
|
|
46
46
|
],
|
|
47
47
|
models: [
|
|
48
48
|
{
|
|
49
|
-
description: "
|
|
50
|
-
id: "
|
|
49
|
+
description: "Multilingual image classification model for 80 languages.",
|
|
50
|
+
id: "visheratin/mexma-siglip",
|
|
51
51
|
},
|
|
52
52
|
{
|
|
53
53
|
description: "Strong zero-shot image classification model.",
|
|
54
54
|
id: "google/siglip-so400m-patch14-224",
|
|
55
55
|
},
|
|
56
56
|
{
|
|
57
|
-
description: "
|
|
58
|
-
id: "
|
|
57
|
+
description: "Robust zero-shot image classification model.",
|
|
58
|
+
id: "microsoft/LLM2CLIP-EVA02-L-14-336",
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
description: "Powerful zero-shot image classification model supporting 94 languages.",
|
|
62
|
+
id: "jinaai/jina-clip-v2",
|
|
59
63
|
},
|
|
60
64
|
{
|
|
61
65
|
description: "Strong image classification model for biomedical domain.",
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@huggingface/tasks",
|
|
3
3
|
"packageManager": "pnpm@8.10.5",
|
|
4
|
-
"version": "0.13.
|
|
4
|
+
"version": "0.13.17",
|
|
5
5
|
"description": "List of ML tasks for huggingface.co/tasks",
|
|
6
6
|
"repository": "https://github.com/huggingface/huggingface.js.git",
|
|
7
7
|
"publishConfig": {
|
package/src/local-apps.spec.ts
CHANGED
|
@@ -13,11 +13,7 @@ describe("local-apps", () => {
|
|
|
13
13
|
const snippet = snippetFunc(model);
|
|
14
14
|
|
|
15
15
|
expect(snippet[0].content).toEqual(`# Load and run the model:
|
|
16
|
-
llama-cli
|
|
17
|
-
--hf-repo "bartowski/Llama-3.2-3B-Instruct-GGUF" \\
|
|
18
|
-
--hf-file {{GGUF_FILE}} \\
|
|
19
|
-
-p "You are a helpful assistant" \\
|
|
20
|
-
--conversation`);
|
|
16
|
+
llama-cli -hf bartowski/Llama-3.2-3B-Instruct-GGUF`);
|
|
21
17
|
});
|
|
22
18
|
|
|
23
19
|
it("llama.cpp non-conversational", async () => {
|
|
@@ -30,9 +26,7 @@ llama-cli \\
|
|
|
30
26
|
const snippet = snippetFunc(model);
|
|
31
27
|
|
|
32
28
|
expect(snippet[0].content).toEqual(`# Load and run the model:
|
|
33
|
-
llama-cli \\
|
|
34
|
-
--hf-repo "mlabonne/gemma-2b-GGUF" \\
|
|
35
|
-
--hf-file {{GGUF_FILE}} \\
|
|
29
|
+
llama-cli -hf mlabonne/gemma-2b-GGUF \\
|
|
36
30
|
-p "Once upon a time,"`);
|
|
37
31
|
});
|
|
38
32
|
|
package/src/local-apps.ts
CHANGED
|
@@ -95,17 +95,17 @@ function isMlxModel(model: ModelData) {
|
|
|
95
95
|
}
|
|
96
96
|
|
|
97
97
|
const snippetLlamacpp = (model: ModelData, filepath?: string): LocalAppSnippet[] => {
|
|
98
|
+
let tagName = "";
|
|
99
|
+
if (filepath) {
|
|
100
|
+
const quantLabel = parseGGUFQuantLabel(filepath);
|
|
101
|
+
tagName = quantLabel ? `:${quantLabel}` : "";
|
|
102
|
+
}
|
|
98
103
|
const command = (binary: string) => {
|
|
99
|
-
const snippet = [
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
` --hf-repo "${model.id}" \\`,
|
|
103
|
-
` --hf-file ${filepath ?? "{{GGUF_FILE}}"} \\`,
|
|
104
|
-
` -p "${model.tags.includes("conversational") ? "You are a helpful assistant" : "Once upon a time,"}"`,
|
|
105
|
-
];
|
|
106
|
-
if (model.tags.includes("conversational")) {
|
|
104
|
+
const snippet = ["# Load and run the model:", `${binary} -hf ${model.id}${tagName}`];
|
|
105
|
+
if (!model.tags.includes("conversational")) {
|
|
106
|
+
// for non-conversational models, add a prompt
|
|
107
107
|
snippet[snippet.length - 1] += " \\";
|
|
108
|
-
snippet.push(
|
|
108
|
+
snippet.push(' -p "Once upon a time,"');
|
|
109
109
|
}
|
|
110
110
|
return snippet.join("\n");
|
|
111
111
|
};
|
package/src/model-data.ts
CHANGED
|
@@ -95,6 +95,29 @@ export const bm25s = (model: ModelData): string[] => [
|
|
|
95
95
|
retriever = BM25HF.load_from_hub("${model.id}")`,
|
|
96
96
|
];
|
|
97
97
|
|
|
98
|
+
/**
 * Usage snippet for CXR Foundation models.
 *
 * Returns a one-element array holding a notebook-style Python script that clones
 * the `cxr-foundation` repository, installs its dependencies, downloads a sample
 * chest X-ray, converts it to grayscale and prints the image embeddings obtained
 * through `make_hugging_face_client('cxr_model')`.
 *
 * NOTE(review): `model` is accepted but not referenced by the snippet text.
 *
 * @param model - Model metadata (unused by the generated snippet).
 * @returns A single-element list with the Python snippet.
 */
export const cxr_foundation = (model: ModelData): string[] => [
|
|
99
|
+
`!git clone https://github.com/Google-Health/cxr-foundation.git
|
|
100
|
+
import tensorflow as tf, sys, requests
|
|
101
|
+
sys.path.append('cxr-foundation/python/')
|
|
102
|
+
|
|
103
|
+
# Install dependencies
|
|
104
|
+
major_version = tf.__version__.rsplit(".", 1)[0]
|
|
105
|
+
!pip install tensorflow-text=={major_version} pypng && pip install --no-deps pydicom hcls_imaging_ml_toolkit retrying
|
|
106
|
+
|
|
107
|
+
# Load image (Stillwaterising, CC0, via Wikimedia Commons)
|
|
108
|
+
from PIL import Image
|
|
109
|
+
from io import BytesIO
|
|
110
|
+
image_url = "https://upload.wikimedia.org/wikipedia/commons/c/c8/Chest_Xray_PA_3-8-2010.png"
|
|
111
|
+
response = requests.get(image_url, headers={'User-Agent': 'Demo'}, stream=True)
|
|
112
|
+
response.raw.decode_content = True # Ensure correct decoding
|
|
113
|
+
img = Image.open(BytesIO(response.content)).convert('L') # Convert to grayscale
|
|
114
|
+
|
|
115
|
+
# Run inference
|
|
116
|
+
from clientside.clients import make_hugging_face_client
|
|
117
|
+
cxr_client = make_hugging_face_client('cxr_model')
|
|
118
|
+
print(cxr_client.get_image_embeddings_from_images([img]))`,
|
|
119
|
+
];
|
|
120
|
+
|
|
98
121
|
export const depth_anything_v2 = (model: ModelData): string[] => {
|
|
99
122
|
let encoder: string;
|
|
100
123
|
let features: string;
|
|
@@ -168,6 +191,28 @@ focallength_px = prediction["focallength_px"]`;
|
|
|
168
191
|
return [installSnippet, inferenceSnippet];
|
|
169
192
|
};
|
|
170
193
|
|
|
194
|
+
/**
 * Usage snippet for Derm Foundation models.
 *
 * Returns a one-element array holding a Python script that downloads a sample
 * image, wraps its bytes in a serialized `tf.train.Example` under the
 * `"image/encoded"` feature, loads the model with `from_pretrained_keras` and
 * prints the output of its `serving_default` signature.
 *
 * NOTE(review): the snippet always loads `"google/derm-foundation"`; `model` is
 * not referenced. Confirm this is intended for other repos tagged with this library.
 *
 * @param model - Model metadata (unused by the generated snippet).
 * @returns A single-element list with the Python snippet.
 */
export const derm_foundation = (model: ModelData): string[] => [
|
|
195
|
+
`from huggingface_hub import from_pretrained_keras
|
|
196
|
+
import tensorflow as tf, requests
|
|
197
|
+
|
|
198
|
+
# Load and format input
|
|
199
|
+
IMAGE_URL = "https://storage.googleapis.com/dx-scin-public-data/dataset/images/3445096909671059178.png"
|
|
200
|
+
input_tensor = tf.train.Example(
|
|
201
|
+
features=tf.train.Features(
|
|
202
|
+
feature={
|
|
203
|
+
"image/encoded": tf.train.Feature(
|
|
204
|
+
bytes_list=tf.train.BytesList(value=[requests.get(IMAGE_URL, stream=True).content])
|
|
205
|
+
)
|
|
206
|
+
}
|
|
207
|
+
)
|
|
208
|
+
).SerializeToString()
|
|
209
|
+
|
|
210
|
+
# Load model and run inference
|
|
211
|
+
loaded_model = from_pretrained_keras("google/derm-foundation")
|
|
212
|
+
infer = loaded_model.signatures["serving_default"]
|
|
213
|
+
print(infer(inputs=tf.constant([input_tensor])))`,
|
|
214
|
+
]
|
|
215
|
+
|
|
171
216
|
const diffusersDefaultPrompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k";
|
|
172
217
|
|
|
173
218
|
const diffusers_default = (model: ModelData) => [
|
|
@@ -394,32 +439,103 @@ model = keras.saving.load_model("hf://${model.id}")
|
|
|
394
439
|
`,
|
|
395
440
|
];
|
|
396
441
|
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
import os
|
|
400
|
-
os.environ["KERAS_BACKEND"] = "jax"
|
|
442
|
+
/**
 * Python snippet for a KerasHub `CausalLM` preset hosted on the Hub.
 *
 * @param modelId - Hub repo id (`<namespace>/<name>`); emitted as a quoted `hf://` preset handle.
 * @returns The snippet text (starts and ends with a newline).
 */
const _keras_hub_causal_lm = (modelId: string): string => `
import keras_hub

# Load CausalLM model (optional: use half precision for inference)
causal_lm = keras_hub.models.CausalLM.from_preset("hf://${modelId}", dtype="bfloat16")
causal_lm.compile(sampler="greedy") # (optional) specify a sampler

# Generate text
causal_lm.generate("Keras: deep learning for", max_length=64)
`;

/**
 * Python snippet for a KerasHub `TextToImage` preset hosted on the Hub.
 *
 * @param modelId - Hub repo id; emitted as a quoted `hf://` preset handle.
 * @returns The snippet text.
 */
const _keras_hub_text_to_image = (modelId: string): string => `
import keras_hub

# Load TextToImage model (optional: use half precision for inference)
text_to_image = keras_hub.models.TextToImage.from_preset("hf://${modelId}", dtype="bfloat16")

# Generate images with a TextToImage model.
text_to_image.generate("Astronaut in a jungle")
`;

/**
 * Python snippet for a KerasHub `TextClassifier` preset (load, fine-tune, predict).
 *
 * @param modelId - Hub repo id; emitted as a quoted `hf://` preset handle.
 * @returns The snippet text.
 */
const _keras_hub_text_classifier = (modelId: string): string => `
import keras_hub

# Load TextClassifier model
text_classifier = keras_hub.models.TextClassifier.from_preset(
"hf://${modelId}",
num_classes=2,
)
# Fine-tune
text_classifier.fit(x=["Thilling adventure!", "Total snoozefest."], y=[1, 0])
# Classify text
text_classifier.predict(["Not my cup of tea."])
`;

/**
 * Python snippet for a KerasHub `ImageClassifier` preset (load, fine-tune, predict).
 *
 * @param modelId - Hub repo id; emitted as a quoted `hf://` preset handle.
 * @returns The snippet text.
 */
const _keras_hub_image_classifier = (modelId: string): string => `
import keras_hub
import keras

# Load ImageClassifier model
image_classifier = keras_hub.models.ImageClassifier.from_preset(
"hf://${modelId}",
num_classes=2,
)
# Fine-tune
image_classifier.fit(
x=keras.random.randint((32, 64, 64, 3), 0, 256),
y=keras.random.randint((32, 1), 0, 2),
)
# Classify image
image_classifier.predict(keras.random.randint((1, 64, 64, 3), 0, 256))
`;

/**
 * KerasHub task names that have a dedicated example snippet.
 * Key order is significant: it is the order in which example snippets are emitted.
 */
const _keras_hub_tasks_with_example = {
	CausalLM: _keras_hub_causal_lm,
	TextToImage: _keras_hub_text_to_image,
	TextClassifier: _keras_hub_text_classifier,
	ImageClassifier: _keras_hub_image_classifier,
};

/**
 * Minimal Python snippet for a KerasHub task that has no dedicated example.
 *
 * @param task - KerasHub task class name, used verbatim as `keras_hub.models.<task>`.
 * @param modelId - Hub repo id; emitted as a quoted `hf://` preset handle.
 * @returns The snippet text.
 */
const _keras_hub_task_without_example = (task: string, modelId: string): string => `
import keras_hub

# Create a ${task} model
task = keras_hub.models.${task}.from_preset("hf://${modelId}")
`;

/**
 * Python snippet that loads the preset as a task-agnostic `Backbone`.
 *
 * @param modelId - Hub repo id; emitted as a quoted `hf://` preset handle.
 * @returns The snippet text.
 */
const _keras_hub_generic_backbone = (modelId: string): string => `
import keras_hub

# Create a Backbone model unspecialized for any task
backbone = keras_hub.models.Backbone.from_preset("hf://${modelId}")
`;

/**
 * Builds the list of KerasHub usage snippets for a model.
 *
 * Tasks are read from `model.config?.keras_hub?.tasks` (treated as empty when absent).
 * Output order:
 *  1. one example snippet per listed task that has a dedicated example, in the
 *     key order of `_keras_hub_tasks_with_example`;
 *  2. one minimal snippet per remaining listed task, in the order listed;
 *  3. the generic `Backbone` snippet, always last.
 *
 * @param model - Model metadata; only `id` and `config.keras_hub.tasks` are read.
 * @returns The snippets, never empty (the backbone snippet is always present).
 */
export const keras_hub = (model: ModelData): string[] => {
	const modelId = model.id;
	const tasks = model.config?.keras_hub?.tasks ?? [];
	// Hoisted so the key list is not rebuilt for every task.
	const tasksWithExample = Object.keys(_keras_hub_tasks_with_example);

	const snippets: string[] = [];

	// First, generate tasks with examples
	for (const [task, snippet] of Object.entries(_keras_hub_tasks_with_example)) {
		if (tasks.includes(task)) {
			snippets.push(snippet(modelId));
		}
	}
	// Then, add remaining tasks.
	// `for...of` yields the task names; `for...in` would yield the array indices
	// ("0", "1", ...) and emit a bogus `keras_hub.models.0` snippet per entry.
	for (const task of tasks) {
		if (!tasksWithExample.includes(task)) {
			snippets.push(_keras_hub_task_without_example(task, modelId));
		}
	}
	// Finally, add generic backbone snippet
	snippets.push(_keras_hub_generic_backbone(modelId));

	return snippets;
};
|
|
423
539
|
|
|
424
540
|
export const llama_cpp_python = (model: ModelData): string[] => {
|
|
425
541
|
const snippets = [
|
|
@@ -845,6 +961,12 @@ model.${speechbrainMethod}("file.wav")`,
|
|
|
845
961
|
];
|
|
846
962
|
};
|
|
847
963
|
|
|
964
|
+
/**
 * Usage snippet for TerraTorch models.
 *
 * Returns a one-element array holding a Python script that imports
 * `BACKBONE_REGISTRY` from `terratorch.registry` and calls
 * `BACKBONE_REGISTRY.build` with the model's id.
 *
 * @param model - Model metadata; only `id` is interpolated into the snippet.
 * @returns A single-element list with the Python snippet.
 */
export const terratorch = (model: ModelData): string[] => [
|
|
965
|
+
`from terratorch.registry import BACKBONE_REGISTRY
|
|
966
|
+
|
|
967
|
+
model = BACKBONE_REGISTRY.build("${model.id}")`,
|
|
968
|
+
];
|
|
969
|
+
|
|
848
970
|
export const transformers = (model: ModelData): string[] => {
|
|
849
971
|
const info = model.transformersInfo;
|
|
850
972
|
if (!info) {
|
package/src/model-libraries.ts
CHANGED
|
@@ -176,6 +176,7 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
|
|
|
176
176
|
prettyLabel: "CXR Foundation",
|
|
177
177
|
repoName: "cxr-foundation",
|
|
178
178
|
repoUrl: "https://github.com/google-health/cxr-foundation",
|
|
179
|
+
snippets: snippets.cxr_foundation,
|
|
179
180
|
filter: false,
|
|
180
181
|
countDownloads: `path:"precomputed_embeddings/embeddings.npz" OR path:"pax-elixr-b-text/saved_model.pb"`,
|
|
181
182
|
},
|
|
@@ -206,6 +207,7 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
|
|
|
206
207
|
prettyLabel: "Derm Foundation",
|
|
207
208
|
repoName: "derm-foundation",
|
|
208
209
|
repoUrl: "https://github.com/google-health/derm-foundation",
|
|
210
|
+
snippets: snippets.derm_foundation,
|
|
209
211
|
filter: false,
|
|
210
212
|
countDownloads: `path:"scin_dataset_precomputed_embeddings.npz" OR path:"saved_model.pb"`,
|
|
211
213
|
},
|
|
@@ -404,13 +406,6 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
|
|
|
404
406
|
snippets: snippets.tf_keras,
|
|
405
407
|
countDownloads: `path:"saved_model.pb"`,
|
|
406
408
|
},
|
|
407
|
-
"keras-nlp": {
|
|
408
|
-
prettyLabel: "KerasNLP",
|
|
409
|
-
repoName: "KerasNLP",
|
|
410
|
-
repoUrl: "https://github.com/keras-team/keras-nlp",
|
|
411
|
-
docsUrl: "https://keras.io/keras_nlp/",
|
|
412
|
-
snippets: snippets.keras_nlp,
|
|
413
|
-
},
|
|
414
409
|
"keras-hub": {
|
|
415
410
|
prettyLabel: "KerasHub",
|
|
416
411
|
repoName: "KerasHub",
|
|
@@ -544,7 +539,10 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
|
|
|
544
539
|
repoUrl: "https://github.com/mlfoundations/open_clip",
|
|
545
540
|
snippets: snippets.open_clip,
|
|
546
541
|
filter: true,
|
|
547
|
-
countDownloads: `
|
|
542
|
+
countDownloads: `path:"open_clip_model.safetensors"
|
|
543
|
+
OR path:"model.safetensors"
|
|
544
|
+
OR path:"open_clip_pytorch_model.bin"
|
|
545
|
+
OR path:"pytorch_model.bin"`,
|
|
548
546
|
},
|
|
549
547
|
paddlenlp: {
|
|
550
548
|
prettyLabel: "paddlenlp",
|
|
@@ -780,6 +778,15 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
|
|
|
780
778
|
repoName: "TabPFN",
|
|
781
779
|
repoUrl: "https://github.com/PriorLabs/TabPFN",
|
|
782
780
|
},
|
|
781
|
+
terratorch: {
|
|
782
|
+
prettyLabel: "TerraTorch",
|
|
783
|
+
repoName: "TerraTorch",
|
|
784
|
+
repoUrl: "https://github.com/IBM/terratorch",
|
|
785
|
+
docsUrl: "https://ibm.github.io/terratorch/",
|
|
786
|
+
filter: false,
|
|
787
|
+
countDownloads: `path_extension:"pt"`,
|
|
788
|
+
snippets: snippets.terratorch,
|
|
789
|
+
},
|
|
783
790
|
"tic-clip": {
|
|
784
791
|
prettyLabel: "TiC-CLIP",
|
|
785
792
|
repoName: "TiC-CLIP",
|
|
@@ -38,10 +38,6 @@ const taskData: TaskDataCustom = {
|
|
|
38
38
|
},
|
|
39
39
|
],
|
|
40
40
|
models: [
|
|
41
|
-
{
|
|
42
|
-
description: "A solid model of audio source separation.",
|
|
43
|
-
id: "speechbrain/sepformer-wham",
|
|
44
|
-
},
|
|
45
41
|
{
|
|
46
42
|
description: "A speech enhancement model.",
|
|
47
43
|
id: "ResembleAI/resemble-enhance",
|
|
@@ -61,8 +61,8 @@ const taskData: TaskDataCustom = {
|
|
|
61
61
|
],
|
|
62
62
|
models: [
|
|
63
63
|
{
|
|
64
|
-
description: "
|
|
65
|
-
id: "
|
|
64
|
+
description: "State-of-the-art masked language model.",
|
|
65
|
+
id: "answerdotai/ModernBERT-large",
|
|
66
66
|
},
|
|
67
67
|
{
|
|
68
68
|
description: "A multilingual model trained on 100 languages.",
|
|
@@ -74,9 +74,8 @@ const taskData: TaskDataCustom = {
|
|
|
74
74
|
],
|
|
75
75
|
spaces: [
|
|
76
76
|
{
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
id: "nielsr/perceiver-image-classification",
|
|
77
|
+
description: "A leaderboard to evaluate different image classification models.",
|
|
78
|
+
id: "timm/leaderboard",
|
|
80
79
|
},
|
|
81
80
|
],
|
|
82
81
|
summary:
|
|
@@ -43,15 +43,20 @@ const taskData: TaskDataCustom = {
|
|
|
43
43
|
id: "facebook/dino-vitb16",
|
|
44
44
|
},
|
|
45
45
|
{
|
|
46
|
-
description: "
|
|
47
|
-
id: "
|
|
46
|
+
description: "Cutting-edge image feature extraction model.",
|
|
47
|
+
id: "apple/aimv2-large-patch14-336-distilled",
|
|
48
48
|
},
|
|
49
49
|
{
|
|
50
50
|
description: "Strong image feature extraction model that can be used on images and documents.",
|
|
51
51
|
id: "OpenGVLab/InternViT-6B-448px-V1-2",
|
|
52
52
|
},
|
|
53
53
|
],
|
|
54
|
-
spaces: [
|
|
54
|
+
spaces: [
|
|
55
|
+
{
|
|
56
|
+
description: "A leaderboard to evaluate different image-feature-extraction models on classification performances",
|
|
57
|
+
id: "timm/leaderboard",
|
|
58
|
+
},
|
|
59
|
+
],
|
|
55
60
|
summary: "Image feature extraction is the task of extracting features learnt in a computer vision model.",
|
|
56
61
|
widgetModels: [],
|
|
57
62
|
};
|
|
@@ -24,12 +24,16 @@ Vision language models trained on image-text pairs can be used for visual questi
|
|
|
24
24
|
|
|
25
25
|
### Document Question Answering and Retrieval
|
|
26
26
|
|
|
27
|
-
Documents often consist of different layouts, charts, tables, images, and more. Vision language models trained on formatted documents can extract information from them. This is an OCR-free approach; the inputs skip OCR, and documents are directly fed to vision language models.
|
|
27
|
+
Documents often consist of different layouts, charts, tables, images, and more. Vision language models trained on formatted documents can extract information from them. This is an OCR-free approach; the inputs skip OCR, and documents are directly fed to vision language models. To find the relevant documents to be fed, models like [ColPali](https://huggingface.co/blog/manu/colpali) are used. An example workflow can be found [here](https://github.com/merveenoyan/smol-vision/blob/main/ColPali_%2B_Qwen2_VL.ipynb).
|
|
28
28
|
|
|
29
29
|
### Image Recognition with Instructions
|
|
30
30
|
|
|
31
31
|
Vision language models can recognize images through descriptions. When given detailed descriptions of specific entities, it can classify the entities in an image.
|
|
32
32
|
|
|
33
|
+
### Computer Use
|
|
34
|
+
|
|
35
|
+
Image-text-to-text models can be used to control computers with agentic workflows. Models like [ShowUI](https://huggingface.co/showlab/ShowUI-2B) and [OmniParser](https://huggingface.co/microsoft/OmniParser) are used to parse screenshots to later take actions on the computer autonomously.
|
|
36
|
+
|
|
33
37
|
## Inference
|
|
34
38
|
|
|
35
39
|
You can use the Transformers library to interact with [vision-language models](https://huggingface.co/models?pipeline_tag=image-text-to-text&transformers). Specifically, `pipeline` makes it easy to infer models.
|
|
@@ -82,7 +86,8 @@ curl https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-11B-Vision
|
|
|
82
86
|
## Useful Resources
|
|
83
87
|
|
|
84
88
|
- [Vision Language Models Explained](https://huggingface.co/blog/vlms)
|
|
85
|
-
- [
|
|
86
|
-
- [
|
|
89
|
+
- [Welcome PaliGemma 2 – New vision language models by Google](https://huggingface.co/blog/paligemma2)
|
|
90
|
+
- [SmolVLM - small yet mighty Vision Language Model](https://huggingface.co/blog/smolvlm)
|
|
91
|
+
- [Multimodal RAG using ColPali and Qwen2-VL](https://github.com/merveenoyan/smol-vision/blob/main/ColPali_%2B_Qwen2_VL.ipynb)
|
|
87
92
|
- [Image-text-to-text task guide](https://huggingface.co/tasks/image-text-to-text)
|
|
88
93
|
- [Preference Optimization for Vision Language Models with TRL](https://huggingface.co/blog/dpo_vlm)
|
|
@@ -7,8 +7,8 @@ const taskData: TaskDataCustom = {
|
|
|
7
7
|
id: "liuhaotian/LLaVA-Instruct-150K",
|
|
8
8
|
},
|
|
9
9
|
{
|
|
10
|
-
description: "
|
|
11
|
-
id: "
|
|
10
|
+
description: "Collection of image-text pairs on scientific topics.",
|
|
11
|
+
id: "DAMO-NLP-SG/multimodal_textbook",
|
|
12
12
|
},
|
|
13
13
|
{
|
|
14
14
|
description: "A collection of datasets made for model fine-tuning.",
|
|
@@ -43,11 +43,15 @@ const taskData: TaskDataCustom = {
|
|
|
43
43
|
metrics: [],
|
|
44
44
|
models: [
|
|
45
45
|
{
|
|
46
|
-
description: "
|
|
47
|
-
id: "
|
|
46
|
+
description: "Small and efficient yet powerful vision language model.",
|
|
47
|
+
id: "HuggingFaceTB/SmolVLM-Instruct",
|
|
48
48
|
},
|
|
49
49
|
{
|
|
50
|
-
description: "
|
|
50
|
+
description: "A screenshot understanding model used to control computers.",
|
|
51
|
+
id: "showlab/ShowUI-2B",
|
|
52
|
+
},
|
|
53
|
+
{
|
|
54
|
+
description: "Cutting-edge vision language model.",
|
|
51
55
|
id: "allenai/Molmo-7B-D-0924",
|
|
52
56
|
},
|
|
53
57
|
{
|
|
@@ -59,8 +63,8 @@ const taskData: TaskDataCustom = {
|
|
|
59
63
|
id: "Qwen/Qwen2-VL-7B-Instruct",
|
|
60
64
|
},
|
|
61
65
|
{
|
|
62
|
-
description: "
|
|
63
|
-
id: "
|
|
66
|
+
description: "Image-text-to-text model with reasoning capabilities.",
|
|
67
|
+
id: "Qwen/QVQ-72B-Preview",
|
|
64
68
|
},
|
|
65
69
|
{
|
|
66
70
|
description: "Strong image-text-to-text model focused on documents.",
|
|
@@ -84,14 +88,18 @@ const taskData: TaskDataCustom = {
|
|
|
84
88
|
description: "An image-text-to-text application focused on documents.",
|
|
85
89
|
id: "stepfun-ai/GOT_official_online_demo",
|
|
86
90
|
},
|
|
87
|
-
{
|
|
88
|
-
description: "An application to compare outputs of different vision language models.",
|
|
89
|
-
id: "merve/compare_VLMs",
|
|
90
|
-
},
|
|
91
91
|
{
|
|
92
92
|
description: "An application for chatting with an image-text-to-text model.",
|
|
93
93
|
id: "GanymedeNil/Qwen2-VL-7B",
|
|
94
94
|
},
|
|
95
|
+
{
|
|
96
|
+
description: "An application that parses screenshots into actions.",
|
|
97
|
+
id: "showlab/ShowUI",
|
|
98
|
+
},
|
|
99
|
+
{
|
|
100
|
+
description: "An application that detects gaze.",
|
|
101
|
+
id: "smoondream/gaze-demo",
|
|
102
|
+
},
|
|
95
103
|
],
|
|
96
104
|
summary:
|
|
97
105
|
"Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input.",
|
|
@@ -41,8 +41,8 @@ const taskData: TaskDataCustom = {
|
|
|
41
41
|
id: "hwjiang/Real3D",
|
|
42
42
|
},
|
|
43
43
|
{
|
|
44
|
-
description: "
|
|
45
|
-
id: "
|
|
44
|
+
description: "Consistent image-to-3d generation model.",
|
|
45
|
+
id: "stabilityai/stable-point-aware-3d",
|
|
46
46
|
},
|
|
47
47
|
],
|
|
48
48
|
spaces: [
|
|
@@ -55,8 +55,8 @@ const taskData: TaskDataCustom = {
|
|
|
55
55
|
id: "TencentARC/InstantMesh",
|
|
56
56
|
},
|
|
57
57
|
{
|
|
58
|
-
description: "Image-to-3D demo
|
|
59
|
-
id: "stabilityai/
|
|
58
|
+
description: "Image-to-3D demo.",
|
|
59
|
+
id: "stabilityai/stable-point-aware-3d",
|
|
60
60
|
},
|
|
61
61
|
{
|
|
62
62
|
description: "Image-to-3D demo with mesh outputs.",
|
|
@@ -10,6 +10,10 @@ const taskData: TaskDataCustom = {
|
|
|
10
10
|
description: "Multiple images of celebrities, used for facial expression translation",
|
|
11
11
|
id: "huggan/CelebA-faces",
|
|
12
12
|
},
|
|
13
|
+
{
|
|
14
|
+
description: "12M image-caption pairs.",
|
|
15
|
+
id: "Spawning/PD12M",
|
|
16
|
+
},
|
|
13
17
|
],
|
|
14
18
|
demo: {
|
|
15
19
|
inputs: [
|
|
@@ -53,17 +57,20 @@ const taskData: TaskDataCustom = {
|
|
|
53
57
|
id: "keras-io/super-resolution",
|
|
54
58
|
},
|
|
55
59
|
{
|
|
56
|
-
description:
|
|
57
|
-
|
|
58
|
-
id: "lambdalabs/sd-image-variations-diffusers",
|
|
60
|
+
description: "A model for applying edits to images through image controls.",
|
|
61
|
+
id: "Yuanshi/OminiControl",
|
|
59
62
|
},
|
|
60
63
|
{
|
|
61
64
|
description: "A model that generates images based on segments in the input image and the text prompt.",
|
|
62
65
|
id: "mfidabel/controlnet-segment-anything",
|
|
63
66
|
},
|
|
64
67
|
{
|
|
65
|
-
description: "
|
|
66
|
-
id: "
|
|
68
|
+
description: "Strong model for inpainting and outpainting.",
|
|
69
|
+
id: "black-forest-labs/FLUX.1-Fill-dev",
|
|
70
|
+
},
|
|
71
|
+
{
|
|
72
|
+
description: "Strong model for image editing using depth maps.",
|
|
73
|
+
id: "black-forest-labs/FLUX.1-Depth-dev-lora",
|
|
67
74
|
},
|
|
68
75
|
],
|
|
69
76
|
spaces: [
|
package/src/tasks/index.ts
CHANGED
|
@@ -132,7 +132,7 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
|
|
|
132
132
|
"video-classification": ["transformers"],
|
|
133
133
|
"mask-generation": ["transformers"],
|
|
134
134
|
"multiple-choice": ["transformers"],
|
|
135
|
-
"object-detection": ["transformers", "transformers.js"],
|
|
135
|
+
"object-detection": ["transformers", "transformers.js", "ultralytics"],
|
|
136
136
|
other: [],
|
|
137
137
|
"question-answering": ["adapter-transformers", "allennlp", "transformers", "transformers.js"],
|
|
138
138
|
robotics: [],
|
|
@@ -31,6 +31,10 @@ const taskData: TaskDataCustom = {
|
|
|
31
31
|
description: "Strong keypoint detection model used to detect human pose.",
|
|
32
32
|
id: "facebook/sapiens-pose-1b",
|
|
33
33
|
},
|
|
34
|
+
{
|
|
35
|
+
description: "Powerful keypoint detection model used to detect human pose.",
|
|
36
|
+
id: "usyd-community/vitpose-plus-base",
|
|
37
|
+
},
|
|
34
38
|
],
|
|
35
39
|
spaces: [
|
|
36
40
|
{
|
|
@@ -54,6 +54,10 @@ const taskData: TaskDataCustom = {
|
|
|
54
54
|
description: "Fast and accurate object detection model trained on COCO and Object365 datasets.",
|
|
55
55
|
id: "PekingU/rtdetr_r18vd_coco_o365",
|
|
56
56
|
},
|
|
57
|
+
{
|
|
58
|
+
description: "Object detection model for low-lying objects.",
|
|
59
|
+
id: "StephanST/WALDO30",
|
|
60
|
+
},
|
|
57
61
|
],
|
|
58
62
|
spaces: [
|
|
59
63
|
{
|
|
@@ -65,8 +69,8 @@ const taskData: TaskDataCustom = {
|
|
|
65
69
|
id: "Gradio-Blocks/Object-Detection-With-DETR-and-YOLOS",
|
|
66
70
|
},
|
|
67
71
|
{
|
|
68
|
-
description: "
|
|
69
|
-
id: "
|
|
72
|
+
description: "A cutting-edge object detection application.",
|
|
73
|
+
id: "Ultralytics/YOLO11",
|
|
70
74
|
},
|
|
71
75
|
{
|
|
72
76
|
description: "An object tracking, segmentation and inpainting application.",
|
|
@@ -69,9 +69,13 @@ const taskData: TaskDataCustom = {
|
|
|
69
69
|
id: "sentence-transformers/all-mpnet-base-v2",
|
|
70
70
|
},
|
|
71
71
|
{
|
|
72
|
-
description: "A multilingual robust sentence similarity model
|
|
72
|
+
description: "A multilingual robust sentence similarity model.",
|
|
73
73
|
id: "BAAI/bge-m3",
|
|
74
74
|
},
|
|
75
|
+
{
|
|
76
|
+
description: "A robust sentence similarity model.",
|
|
77
|
+
id: "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5",
|
|
78
|
+
},
|
|
75
79
|
],
|
|
76
80
|
spaces: [
|
|
77
81
|
{
|