@huggingface/tasks 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +2 -1
- package/dist/index.js +64 -34
- package/dist/index.mjs +64 -34
- package/package.json +1 -1
- package/src/library-ui-elements.ts +14 -1
- package/src/model-data.ts +1 -1
- package/src/model-libraries.ts +1 -0
- package/src/tasks/depth-estimation/data.ts +5 -3
- package/src/tasks/document-question-answering/data.ts +8 -0
- package/src/tasks/image-to-text/about.md +13 -0
- package/src/tasks/image-to-text/data.ts +10 -14
- package/src/tasks/index.ts +2 -2
- package/src/tasks/object-detection/data.ts +5 -2
- package/src/tasks/table-question-answering/about.md +1 -1
- package/src/tasks/text-generation/about.md +4 -0
- package/src/tasks/text-to-image/data.ts +13 -11
- package/src/tasks/text-to-video/data.ts +3 -3
- package/src/tasks/visual-question-answering/data.ts +4 -0
package/dist/index.d.ts
CHANGED
@@ -16,6 +16,7 @@ declare enum ModelLibrary {
     "flair" = "Flair",
     "keras" = "Keras",
     "k2" = "K2",
+    "mlx" = "mlx",
     "nemo" = "NeMo",
     "open_clip" = "OpenCLIP",
     "paddlenlp" = "PaddleNLP",
@@ -623,7 +624,7 @@ interface ModelData {
         inference?: boolean | {
             parameters?: Record<string, unknown>;
         };
-        base_model?: string;
+        base_model?: string | string[];
     };
     /**
      * Library name
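The `base_model` card field is widened here from `string` to `string | string[]`, and the snippet helper (`get_base_diffusers_model` in `index.js`, `index.mjs`, and `library-ui-elements.ts` below) now normalizes it with `toString()`. A minimal sketch of that behavior, with illustrative card-data values that are not part of the package:

```ts
// Sketch only: mirrors the toString() normalization introduced in this release.
type CardData = { base_model?: string | string[] };

function getBaseDiffusersModel(cardData?: CardData): string {
  // Array.prototype.toString() joins entries with commas, so a one-element
  // array behaves exactly like the plain string it replaces.
  return cardData?.base_model?.toString() ?? "fill-in-base-model";
}

console.log(getBaseDiffusersModel({ base_model: "runwayml/stable-diffusion-v1-5" }));
// "runwayml/stable-diffusion-v1-5"
console.log(getBaseDiffusersModel({ base_model: ["stabilityai/stable-diffusion-xl-base-1.0"] }));
// "stabilityai/stable-diffusion-xl-base-1.0"
console.log(getBaseDiffusersModel({}));
// "fill-in-base-model"
```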
package/dist/index.js
CHANGED
@@ -119,7 +119,7 @@ var asteroid = (model) => [
 model = BaseModel.from_pretrained("${model.id}")`
 ];
 function get_base_diffusers_model(model) {
-  return model.cardData?.base_model ?? "fill-in-base-model";
+  return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }
 var bertopic = (model) => [
   `from bertopic import BERTopic
@@ -531,6 +531,12 @@ transcriptions = asr_model.transcribe(["file.wav"])`
   }
 };
 var mlAgents = (model) => [`mlagents-load-from-hf --repo-id="${model.id}" --local-dir="./downloads"`];
+var mlx = (model) => [
+  `pip install huggingface_hub hf_transfer
+
+export HF_HUB_ENABLE_HF_TRANSFER=1
+huggingface-cli download --local-dir ${nameWithoutNamespace(model.id)} ${model.id}`
+];
 var nemo = (model) => {
   let command = void 0;
   if (model.tags?.includes("automatic-speech-recognition")) {
@@ -605,6 +611,12 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
     docsUrl: "https://huggingface.co/docs/hub/keras",
     snippets: keras
   },
+  mlx: {
+    btnLabel: "MLX",
+    repoName: "MLX",
+    repoUrl: "https://github.com/ml-explore/mlx-examples/tree/main",
+    snippets: mlx
+  },
   nemo: {
     btnLabel: "NeMo",
     repoName: "NeMo",
@@ -2356,6 +2368,10 @@ var taskData5 = {
     {
       description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
       id: "naver-clova-ix/donut-base-finetuned-docvqa"
+    },
+    {
+      description: "A powerful model for document question answering.",
+      id: "google/pix2struct-docvqa-large"
     }
   ],
   spaces: [
@@ -2366,6 +2382,10 @@ var taskData5 = {
     {
       description: "An application that can answer questions from invoices.",
       id: "impira/invoices"
+    },
+    {
+      description: "An application to compare different document question answering models.",
+      id: "merve/compare_docvqa_models"
     }
   ],
   summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -2709,30 +2729,26 @@ var taskData10 = {
   models: [
     {
       description: "A robust image captioning model.",
-      id: "Salesforce/
+      id: "Salesforce/blip2-opt-2.7b"
     },
     {
-      description: "A
-      id: "
+      description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+      id: "microsoft/kosmos-2-patch14-224"
     },
     {
       description: "A strong optical character recognition model.",
-      id: "
-    },
-    {
-      description: "A strong visual question answering model for scientific diagrams.",
-      id: "google/pix2struct-ai2d-base"
+      id: "facebook/nougat-base"
     },
     {
-      description: "A
-      id: "
-    },
-    {
-      description: "A captioning model for images that contain text.",
-      id: "google/pix2struct-textcaps-base"
+      description: "A powerful model that lets you have a conversation with the image.",
+      id: "llava-hf/llava-1.5-7b-hf"
     }
   ],
   spaces: [
+    {
+      description: "An application that compares various image captioning models.",
+      id: "nielsr/comparing-captioning-models"
+    },
     {
       description: "A robust image captioning application.",
       id: "flax-community/image-captioning"
@@ -2894,7 +2910,6 @@ var taskData12 = {
   ],
   models: [
     {
-      // TO DO: write description
       description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
       id: "facebook/detr-resnet-50"
     },
@@ -2904,9 +2919,13 @@ var taskData12 = {
     }
   ],
   spaces: [
+    {
+      description: "Leaderboard to compare various object detection models across several metrics.",
+      id: "hf-vision/object_detection_leaderboard"
+    },
     {
       description: "An object detection application that can detect unseen objects out of the box.",
-      id: "
+      id: "merve/owlv2"
     },
     {
       description: "An application that contains various object detection models to try from.",
@@ -2952,14 +2971,16 @@ var taskData13 = {
   metrics: [],
   models: [
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on 1.4 million images.",
       id: "Intel/dpt-large"
     },
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on the KITTI dataset.",
-      id: "
+      id: "facebook/dpt-dinov2-large-kitti"
+    },
+    {
+      description: "A strong monocular depth estimation model.",
+      id: "Bingxin/Marigold"
     }
   ],
   spaces: [
@@ -3513,12 +3534,12 @@ var taskData22 = {
   ],
   models: [
     {
-      description: "
-      id: "
+      description: "One of the most powerful image generation models that can generate realistic outputs.",
+      id: "stabilityai/stable-diffusion-xl-base-1.0"
     },
     {
-      description: "A
-      id: "
+      description: "A powerful yet fast image generation model.",
+      id: "latent-consistency/lcm-lora-sdxl"
     },
     {
       description: "A text-to-image model that can generate coherent text inside image.",
@@ -3535,19 +3556,23 @@ var taskData22 = {
       id: "stabilityai/stable-diffusion"
     },
     {
-      description: "
+      description: "A text-to-image application to generate comics.",
+      id: "jbilcke-hf/ai-comic-factory"
+    },
+    {
+      description: "A text-to-image application that can generate coherent text inside the image.",
       id: "DeepFloyd/IF"
     },
     {
-      description: "
-      id: "
+      description: "A powerful yet very fast image generation application.",
+      id: "latent-consistency/lcm-lora-for-sdxl"
     },
     {
-      description: "
+      description: "A powerful text-to-image application that can generate 3D representations.",
       id: "hysts/Shap-E"
     },
     {
-      description: "
+      description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
       id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI"
     }
   ],
@@ -4039,7 +4064,7 @@ var taskData28 = {
   models: [
     {
       description: "A strong model for video generation.",
-      id: "
+      id: "Vchitect/LaVie"
     },
     {
       description: "A robust model for text-to-video generation.",
@@ -4047,7 +4072,7 @@ var taskData28 = {
     },
     {
       description: "A text-to-video generation model with high quality and smooth outputs.",
-      id: "
+      id: "hotshotco/Hotshot-XL"
     }
   ],
   spaces: [
@@ -4057,7 +4082,7 @@ var taskData28 = {
     },
     {
       description: "An application that generates video from image and text.",
-      id: "
+      id: "Vchitect/LaVie"
     },
     {
       description: "An application that generates videos from text and provides multi-model support.",
@@ -4287,6 +4312,10 @@ var taskData31 = {
     }
   ],
   spaces: [
+    {
+      description: "An application that compares visual question answering models across different tasks.",
+      id: "merve/pix2struct"
+    },
     {
       description: "An application that can answer questions based on images.",
       id: "nielsr/vilt-vqa"
@@ -4454,8 +4483,8 @@ var TASKS_MODEL_LIBRARIES = {
   "graph-ml": ["transformers"],
   "image-classification": ["keras", "timm", "transformers", "transformers.js"],
   "image-segmentation": ["transformers", "transformers.js"],
-  "image-to-image": ["diffusers", "transformers.js"],
-  "image-to-text": ["transformers.js"],
+  "image-to-image": ["diffusers", "transformers", "transformers.js"],
+  "image-to-text": ["transformers", "transformers.js"],
   "image-to-video": ["diffusers"],
   "video-classification": ["transformers"],
   "mask-generation": ["transformers"],
@@ -4572,6 +4601,7 @@ var ModelLibrary = /* @__PURE__ */ ((ModelLibrary2) => {
   ModelLibrary2["flair"] = "Flair";
   ModelLibrary2["keras"] = "Keras";
   ModelLibrary2["k2"] = "K2";
+  ModelLibrary2["mlx"] = "mlx";
   ModelLibrary2["nemo"] = "NeMo";
   ModelLibrary2["open_clip"] = "OpenCLIP";
   ModelLibrary2["paddlenlp"] = "PaddleNLP";
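The new `mlx` snippet builds a `huggingface-cli` download command from the model id. A minimal sketch of how it expands for an example repository; `nameWithoutNamespace` is assumed here to strip the namespace prefix, and the model id is only illustrative:

```ts
// Sketch: the mlx snippet from this diff applied to an example model id.
// nameWithoutNamespace is assumed to drop everything before the last "/".
const nameWithoutNamespace = (id: string): string => id.split("/").pop() ?? id;

const mlx = (model: { id: string }) => [
  `pip install huggingface_hub hf_transfer

export HF_HUB_ENABLE_HF_TRANSFER=1
huggingface-cli download --local-dir ${nameWithoutNamespace(model.id)} ${model.id}`,
];

console.log(mlx({ id: "mlx-community/example-model" })[0]);
// pip install huggingface_hub hf_transfer
//
// export HF_HUB_ENABLE_HF_TRANSFER=1
// huggingface-cli download --local-dir example-model mlx-community/example-model
```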
package/dist/index.mjs
CHANGED
@@ -81,7 +81,7 @@ var asteroid = (model) => [
 model = BaseModel.from_pretrained("${model.id}")`
 ];
 function get_base_diffusers_model(model) {
-  return model.cardData?.base_model ?? "fill-in-base-model";
+  return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }
 var bertopic = (model) => [
   `from bertopic import BERTopic
@@ -493,6 +493,12 @@ transcriptions = asr_model.transcribe(["file.wav"])`
   }
 };
 var mlAgents = (model) => [`mlagents-load-from-hf --repo-id="${model.id}" --local-dir="./downloads"`];
+var mlx = (model) => [
+  `pip install huggingface_hub hf_transfer
+
+export HF_HUB_ENABLE_HF_TRANSFER=1
+huggingface-cli download --local-dir ${nameWithoutNamespace(model.id)} ${model.id}`
+];
 var nemo = (model) => {
   let command = void 0;
   if (model.tags?.includes("automatic-speech-recognition")) {
@@ -567,6 +573,12 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
     docsUrl: "https://huggingface.co/docs/hub/keras",
     snippets: keras
   },
+  mlx: {
+    btnLabel: "MLX",
+    repoName: "MLX",
+    repoUrl: "https://github.com/ml-explore/mlx-examples/tree/main",
+    snippets: mlx
+  },
   nemo: {
     btnLabel: "NeMo",
     repoName: "NeMo",
@@ -2318,6 +2330,10 @@ var taskData5 = {
     {
       description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
       id: "naver-clova-ix/donut-base-finetuned-docvqa"
+    },
+    {
+      description: "A powerful model for document question answering.",
+      id: "google/pix2struct-docvqa-large"
     }
   ],
   spaces: [
@@ -2328,6 +2344,10 @@ var taskData5 = {
     {
       description: "An application that can answer questions from invoices.",
      id: "impira/invoices"
+    },
+    {
+      description: "An application to compare different document question answering models.",
+      id: "merve/compare_docvqa_models"
     }
   ],
   summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -2671,30 +2691,26 @@ var taskData10 = {
   models: [
     {
       description: "A robust image captioning model.",
-      id: "Salesforce/
+      id: "Salesforce/blip2-opt-2.7b"
     },
     {
-      description: "A
-      id: "
+      description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+      id: "microsoft/kosmos-2-patch14-224"
     },
     {
       description: "A strong optical character recognition model.",
-      id: "
-    },
-    {
-      description: "A strong visual question answering model for scientific diagrams.",
-      id: "google/pix2struct-ai2d-base"
+      id: "facebook/nougat-base"
     },
     {
-      description: "A
-      id: "
-    },
-    {
-      description: "A captioning model for images that contain text.",
-      id: "google/pix2struct-textcaps-base"
+      description: "A powerful model that lets you have a conversation with the image.",
+      id: "llava-hf/llava-1.5-7b-hf"
     }
   ],
   spaces: [
+    {
+      description: "An application that compares various image captioning models.",
+      id: "nielsr/comparing-captioning-models"
+    },
     {
       description: "A robust image captioning application.",
       id: "flax-community/image-captioning"
@@ -2856,7 +2872,6 @@ var taskData12 = {
   ],
   models: [
     {
-      // TO DO: write description
       description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
       id: "facebook/detr-resnet-50"
     },
@@ -2866,9 +2881,13 @@ var taskData12 = {
     }
   ],
   spaces: [
+    {
+      description: "Leaderboard to compare various object detection models across several metrics.",
+      id: "hf-vision/object_detection_leaderboard"
+    },
     {
       description: "An object detection application that can detect unseen objects out of the box.",
-      id: "
+      id: "merve/owlv2"
     },
     {
       description: "An application that contains various object detection models to try from.",
@@ -2914,14 +2933,16 @@ var taskData13 = {
   metrics: [],
   models: [
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on 1.4 million images.",
       id: "Intel/dpt-large"
     },
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on the KITTI dataset.",
-      id: "
+      id: "facebook/dpt-dinov2-large-kitti"
+    },
+    {
+      description: "A strong monocular depth estimation model.",
+      id: "Bingxin/Marigold"
     }
   ],
   spaces: [
@@ -3475,12 +3496,12 @@ var taskData22 = {
   ],
   models: [
     {
-      description: "
-      id: "
+      description: "One of the most powerful image generation models that can generate realistic outputs.",
+      id: "stabilityai/stable-diffusion-xl-base-1.0"
     },
     {
-      description: "A
-      id: "
+      description: "A powerful yet fast image generation model.",
+      id: "latent-consistency/lcm-lora-sdxl"
     },
     {
       description: "A text-to-image model that can generate coherent text inside image.",
@@ -3497,19 +3518,23 @@ var taskData22 = {
       id: "stabilityai/stable-diffusion"
     },
     {
-      description: "
+      description: "A text-to-image application to generate comics.",
+      id: "jbilcke-hf/ai-comic-factory"
+    },
+    {
+      description: "A text-to-image application that can generate coherent text inside the image.",
       id: "DeepFloyd/IF"
     },
     {
-      description: "
-      id: "
+      description: "A powerful yet very fast image generation application.",
+      id: "latent-consistency/lcm-lora-for-sdxl"
     },
     {
-      description: "
+      description: "A powerful text-to-image application that can generate 3D representations.",
       id: "hysts/Shap-E"
     },
     {
-      description: "
+      description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
       id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI"
     }
   ],
@@ -4001,7 +4026,7 @@ var taskData28 = {
   models: [
     {
       description: "A strong model for video generation.",
-      id: "
+      id: "Vchitect/LaVie"
     },
     {
       description: "A robust model for text-to-video generation.",
@@ -4009,7 +4034,7 @@ var taskData28 = {
     },
     {
       description: "A text-to-video generation model with high quality and smooth outputs.",
-      id: "
+      id: "hotshotco/Hotshot-XL"
     }
   ],
   spaces: [
@@ -4019,7 +4044,7 @@ var taskData28 = {
     },
     {
       description: "An application that generates video from image and text.",
-      id: "
+      id: "Vchitect/LaVie"
     },
     {
       description: "An application that generates videos from text and provides multi-model support.",
@@ -4249,6 +4274,10 @@ var taskData31 = {
     }
   ],
   spaces: [
+    {
+      description: "An application that compares visual question answering models across different tasks.",
+      id: "merve/pix2struct"
+    },
     {
       description: "An application that can answer questions based on images.",
       id: "nielsr/vilt-vqa"
@@ -4416,8 +4445,8 @@ var TASKS_MODEL_LIBRARIES = {
   "graph-ml": ["transformers"],
   "image-classification": ["keras", "timm", "transformers", "transformers.js"],
   "image-segmentation": ["transformers", "transformers.js"],
-  "image-to-image": ["diffusers", "transformers.js"],
-  "image-to-text": ["transformers.js"],
+  "image-to-image": ["diffusers", "transformers", "transformers.js"],
+  "image-to-text": ["transformers", "transformers.js"],
   "image-to-video": ["diffusers"],
   "video-classification": ["transformers"],
   "mask-generation": ["transformers"],
@@ -4534,6 +4563,7 @@ var ModelLibrary = /* @__PURE__ */ ((ModelLibrary2) => {
   ModelLibrary2["flair"] = "Flair";
   ModelLibrary2["keras"] = "Keras";
   ModelLibrary2["k2"] = "K2";
+  ModelLibrary2["mlx"] = "mlx";
   ModelLibrary2["nemo"] = "NeMo";
   ModelLibrary2["open_clip"] = "OpenCLIP";
   ModelLibrary2["paddlenlp"] = "PaddleNLP";
package/package.json
CHANGED
package/src/library-ui-elements.ts
CHANGED

@@ -72,7 +72,7 @@ model = BaseModel.from_pretrained("${model.id}")`,
 ];
 
 function get_base_diffusers_model(model: ModelData): string {
-	return model.cardData?.base_model ?? "fill-in-base-model";
+	return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }
 
 const bertopic = (model: ModelData) => [
@@ -541,6 +541,13 @@ transcriptions = asr_model.transcribe(["file.wav"])`,
 
 const mlAgents = (model: ModelData) => [`mlagents-load-from-hf --repo-id="${model.id}" --local-dir="./downloads"`];
 
+const mlx = (model: ModelData) => [
+	`pip install huggingface_hub hf_transfer
+
+export HF_HUB_ENABLE_HF_TRANSFER=1
+huggingface-cli download --local-dir ${nameWithoutNamespace(model.id)} ${model.id}`,
+];
+
 const nemo = (model: ModelData) => {
 	let command: string[] | undefined = undefined;
 	// Resolve the tag to a nemo domain/sub-domain
@@ -621,6 +628,12 @@ export const MODEL_LIBRARIES_UI_ELEMENTS: Partial<Record<ModelLibraryKey, Librar
 		docsUrl: "https://huggingface.co/docs/hub/keras",
 		snippets: keras,
 	},
+	mlx: {
+		btnLabel: "MLX",
+		repoName: "MLX",
+		repoUrl: "https://github.com/ml-explore/mlx-examples/tree/main",
+		snippets: mlx,
+	},
 	nemo: {
 		btnLabel: "NeMo",
 		repoName: "NeMo",
package/src/model-data.ts
CHANGED
package/src/model-libraries.ts
CHANGED

package/src/tasks/depth-estimation/data.ts
CHANGED

@@ -24,14 +24,16 @@ const taskData: TaskDataCustom = {
 	metrics: [],
 	models: [
 		{
-			// TO DO: write description
 			description: "Strong Depth Estimation model trained on 1.4 million images.",
 			id: "Intel/dpt-large",
 		},
 		{
-			// TO DO: write description
 			description: "Strong Depth Estimation model trained on the KITTI dataset.",
-			id: "
+			id: "facebook/dpt-dinov2-large-kitti",
+		},
+		{
+			description: "A strong monocular depth estimation model.",
+			id: "Bingxin/Marigold",
 		},
 	],
 	spaces: [

package/src/tasks/document-question-answering/data.ts
CHANGED

@@ -50,6 +50,10 @@ const taskData: TaskDataCustom = {
 			description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
 			id: "naver-clova-ix/donut-base-finetuned-docvqa",
 		},
+		{
+			description: "A powerful model for document question answering.",
+			id: "google/pix2struct-docvqa-large",
+		},
 	],
 	spaces: [
 		{
@@ -60,6 +64,10 @@ const taskData: TaskDataCustom = {
 			description: "An application that can answer questions from invoices.",
 			id: "impira/invoices",
 		},
+		{
+			description: "An application to compare different document question answering models.",
+			id: "merve/compare_docvqa_models",
+		},
 	],
 	summary:
 		"Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",

package/src/tasks/image-to-text/about.md
CHANGED

@@ -27,6 +27,19 @@ captioner("https://huggingface.co/datasets/Narsil/image_dummy/resolve/main/parro
 ## [{'generated_text': 'two birds are standing next to each other '}]
 ```
 
+### Conversation about the Image
+
+Some text generation models also take image inputs. These are called vision language models. You can use `image-to-text` pipeline to use these models like below.
+
+```python
+from transformers import pipeline
+
+mm_pipeline = pipeline("image-to-text",model="llava-hf/llava-1.5-7b-hf")
+mm_pipeline("https://huggingface.co/spaces/llava-hf/llava-4bit/resolve/main/examples/baklava.png", "How to make this pastry?")
+
+## [{'generated_text': 'To create these pastries, you will need a few key ingredients and tools. Firstly, gather the dough by combining flour with water in your mixing bowl until it forms into an elastic ball that can be easily rolled out on top of another surface or table without breaking apart (like pizza).'}]
+```
+
 ### OCR
 
 This code snippet uses Microsoft’s TrOCR, an encoder-decoder model consisting of an image Transformer encoder and a text Transformer decoder for state-of-the-art optical character recognition (OCR) on single-text line images.

package/src/tasks/image-to-text/data.ts
CHANGED

@@ -32,30 +32,26 @@ const taskData: TaskDataCustom = {
 	models: [
 		{
 			description: "A robust image captioning model.",
-			id: "Salesforce/
+			id: "Salesforce/blip2-opt-2.7b",
 		},
 		{
-			description: "A
-			id: "
+			description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+			id: "microsoft/kosmos-2-patch14-224",
 		},
 		{
 			description: "A strong optical character recognition model.",
-			id: "
+			id: "facebook/nougat-base",
 		},
 		{
-			description: "A
-			id: "
-		},
-		{
-			description: "A strong captioning model for UI components.",
-			id: "google/pix2struct-widget-captioning-base",
-		},
-		{
-			description: "A captioning model for images that contain text.",
-			id: "google/pix2struct-textcaps-base",
+			description: "A powerful model that lets you have a conversation with the image.",
+			id: "llava-hf/llava-1.5-7b-hf",
 		},
 	],
 	spaces: [
+		{
+			description: "An application that compares various image captioning models.",
+			id: "nielsr/comparing-captioning-models",
+		},
 		{
 			description: "A robust image captioning application.",
 			id: "flax-community/image-captioning",
package/src/tasks/index.ts
CHANGED
@@ -51,8 +51,8 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
 	"graph-ml": ["transformers"],
 	"image-classification": ["keras", "timm", "transformers", "transformers.js"],
 	"image-segmentation": ["transformers", "transformers.js"],
-	"image-to-image": ["diffusers", "transformers.js"],
-	"image-to-text": ["transformers.js"],
+	"image-to-image": ["diffusers", "transformers", "transformers.js"],
+	"image-to-text": ["transformers", "transformers.js"],
 	"image-to-video": ["diffusers"],
 	"video-classification": ["transformers"],
 	"mask-generation": ["transformers"],
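This change adds `transformers` to the libraries suggested for the `image-to-image` and `image-to-text` tasks. A minimal sketch of how such a mapping can be queried; the helper is illustrative and only the two updated entries are reproduced:

```ts
// Sketch: look up which libraries to surface for a pipeline type.
type ModelLibraryKey = "diffusers" | "transformers" | "transformers.js";

const TASKS_MODEL_LIBRARIES: Record<string, ModelLibraryKey[]> = {
  "image-to-image": ["diffusers", "transformers", "transformers.js"],
  "image-to-text": ["transformers", "transformers.js"],
};

function librariesForTask(task: string): ModelLibraryKey[] {
  return TASKS_MODEL_LIBRARIES[task] ?? [];
}

console.log(librariesForTask("image-to-text")); // ["transformers", "transformers.js"]
```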

package/src/tasks/object-detection/data.ts
CHANGED

@@ -40,7 +40,6 @@ const taskData: TaskDataCustom = {
 	],
 	models: [
 		{
-			// TO DO: write description
 			description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
 			id: "facebook/detr-resnet-50",
 		},
@@ -50,9 +49,13 @@ const taskData: TaskDataCustom = {
 		},
 	],
 	spaces: [
+		{
+			description: "Leaderboard to compare various object detection models across several metrics.",
+			id: "hf-vision/object_detection_leaderboard",
+		},
 		{
 			description: "An object detection application that can detect unseen objects out of the box.",
-			id: "
+			id: "merve/owlv2",
 		},
 		{
 			description: "An application that contains various object detection models to try from.",

package/src/tasks/text-generation/about.md
CHANGED

@@ -42,6 +42,10 @@ When it comes to text generation, the underlying language model can come in sev
 
 - **Human feedback models:** these models extend base and instruction-trained models by incorporating human feedback that rates the quality of the generated text according to criteria like [helpfulness, honesty, and harmlessness](https://arxiv.org/abs/2112.00861). The human feedback is then combined with an optimization technique like reinforcement learning to align the original model to be closer with human preferences. The overall methodology is often called [Reinforcement Learning from Human Feedback](https://huggingface.co/blog/rlhf), or RLHF for short. [Llama2-Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) is an open-source model aligned through human feedback.
 
+## Text Generation from Image and Text
+
+There are language models that can input both text and image and output text, called vision language models. [LLaVA](https://huggingface.co/llava-hf/llava-1.5-7b-hf) and [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b) are good examples. Although they work just like other language models by means of input parameters for generation, since they also take input images, you can use them with `image-to-text` pipeline. You can find information about the pipeline in [image-to-text](https://huggingface.co/tasks/image-to-text) task page.
+
 ## Inference
 
 You can use the 🤗 Transformers library `text-generation` pipeline to do inference with Text Generation models. It takes an incomplete text and returns multiple outputs with which the text can be completed.
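The added section points readers to the `image-to-text` pipeline for vision language models. As a rough companion from the JavaScript side, a hedged sketch using the `@huggingface/inference` client's `imageToText` method rather than the Python pipeline; it assumes a Node 18+ ES module (global `fetch`/`Blob`), an `HF_TOKEN` environment variable, and that the chosen model is served by the Inference API (the model id and image URL are only illustrative):

```ts
import { HfInference } from "@huggingface/inference";

// Sketch only: caption an image with a hosted image-to-text model.
const hf = new HfInference(process.env.HF_TOKEN);

const imageUrl =
  "https://huggingface.co/datasets/Narsil/image_dummy/resolve/main/parrots.png";
const image = await (await fetch(imageUrl)).blob();

const result = await hf.imageToText({
  model: "Salesforce/blip-image-captioning-base", // illustrative model id
  data: image,
});
console.log(result.generated_text);
```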

package/src/tasks/text-to-image/data.ts
CHANGED

@@ -45,14 +45,12 @@ const taskData: TaskDataCustom = {
 	],
 	models: [
 		{
-			description:
-
-			id: "CompVis/stable-diffusion-v1-4",
+			description: "One of the most powerful image generation models that can generate realistic outputs.",
+			id: "stabilityai/stable-diffusion-xl-base-1.0",
 		},
 		{
-			description:
-
-			id: "dalle-mini/dalle-mega",
+			description: "A powerful yet fast image generation model.",
+			id: "latent-consistency/lcm-lora-sdxl",
 		},
 		{
 			description: "A text-to-image model that can generate coherent text inside image.",
@@ -69,19 +67,23 @@ const taskData: TaskDataCustom = {
 			id: "stabilityai/stable-diffusion",
 		},
 		{
-			description: "
+			description: "A text-to-image application to generate comics.",
+			id: "jbilcke-hf/ai-comic-factory",
+		},
+		{
+			description: "A text-to-image application that can generate coherent text inside the image.",
 			id: "DeepFloyd/IF",
 		},
 		{
-			description: "
-			id: "
+			description: "A powerful yet very fast image generation application.",
+			id: "latent-consistency/lcm-lora-for-sdxl",
 		},
 		{
-			description: "
+			description: "A powerful text-to-image application that can generate 3D representations.",
 			id: "hysts/Shap-E",
 		},
 		{
-			description: "
+			description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
 			id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI",
 		},
 	],

package/src/tasks/text-to-video/data.ts
CHANGED

@@ -68,7 +68,7 @@ const taskData: TaskDataCustom = {
 	models: [
 		{
 			description: "A strong model for video generation.",
-			id: "
+			id: "Vchitect/LaVie",
 		},
 		{
 			description: "A robust model for text-to-video generation.",
@@ -76,7 +76,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A text-to-video generation model with high quality and smooth outputs.",
-			id: "
+			id: "hotshotco/Hotshot-XL",
 		},
 	],
 	spaces: [
@@ -86,7 +86,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "An application that generates video from image and text.",
-			id: "
+			id: "Vchitect/LaVie",
 		},
 		{
 			description: "An application that generates videos from text and provides multi-model support.",

package/src/tasks/visual-question-answering/data.ts
CHANGED

@@ -71,6 +71,10 @@ const taskData: TaskDataCustom = {
 		},
 	],
 	spaces: [
+		{
+			description: "An application that compares visual question answering models across different tasks.",
+			id: "merve/pix2struct",
+		},
 		{
 			description: "An application that can answer questions based on images.",
 			id: "nielsr/vilt-vqa",
|