@huggingface/tasks 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +1 -1
- package/dist/index.js +51 -34
- package/dist/index.mjs +51 -34
- package/package.json +1 -1
- package/src/library-ui-elements.ts +1 -1
- package/src/model-data.ts +1 -1
- package/src/tasks/depth-estimation/data.ts +5 -3
- package/src/tasks/document-question-answering/data.ts +8 -0
- package/src/tasks/image-to-text/about.md +13 -0
- package/src/tasks/image-to-text/data.ts +10 -14
- package/src/tasks/index.ts +2 -2
- package/src/tasks/object-detection/data.ts +5 -2
- package/src/tasks/table-question-answering/about.md +1 -1
- package/src/tasks/text-generation/about.md +4 -0
- package/src/tasks/text-to-image/data.ts +13 -11
- package/src/tasks/text-to-video/data.ts +3 -3
- package/src/tasks/visual-question-answering/data.ts +4 -0
package/dist/index.d.ts
CHANGED
package/dist/index.js
CHANGED
@@ -119,7 +119,7 @@ var asteroid = (model) => [
 model = BaseModel.from_pretrained("${model.id}")`
 ];
 function get_base_diffusers_model(model) {
-return model.cardData?.base_model ?? "fill-in-base-model";
+return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }
 var bertopic = (model) => [
 `from bertopic import BERTopic
@@ -2356,6 +2356,10 @@ var taskData5 = {
 {
 description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
 id: "naver-clova-ix/donut-base-finetuned-docvqa"
+},
+{
+description: "A powerful model for document question answering.",
+id: "google/pix2struct-docvqa-large"
 }
 ],
 spaces: [
@@ -2366,6 +2370,10 @@ var taskData5 = {
 {
 description: "An application that can answer questions from invoices.",
 id: "impira/invoices"
+},
+{
+description: "An application to compare different document question answering models.",
+id: "merve/compare_docvqa_models"
 }
 ],
 summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -2709,30 +2717,26 @@ var taskData10 = {
 models: [
 {
 description: "A robust image captioning model.",
-id: "Salesforce/
+id: "Salesforce/blip2-opt-2.7b"
 },
 {
-description: "A
-id: "
+description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+id: "microsoft/kosmos-2-patch14-224"
 },
 {
 description: "A strong optical character recognition model.",
-id: "
-},
-{
-description: "A strong visual question answering model for scientific diagrams.",
-id: "google/pix2struct-ai2d-base"
+id: "facebook/nougat-base"
 },
 {
-description: "A
-id: "
-},
-{
-description: "A captioning model for images that contain text.",
-id: "google/pix2struct-textcaps-base"
+description: "A powerful model that lets you have a conversation with the image.",
+id: "llava-hf/llava-1.5-7b-hf"
 }
 ],
 spaces: [
+{
+description: "An application that compares various image captioning models.",
+id: "nielsr/comparing-captioning-models"
+},
 {
 description: "A robust image captioning application.",
 id: "flax-community/image-captioning"
@@ -2894,7 +2898,6 @@ var taskData12 = {
 ],
 models: [
 {
-// TO DO: write description
 description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
 id: "facebook/detr-resnet-50"
 },
@@ -2904,9 +2907,13 @@ var taskData12 = {
 }
 ],
 spaces: [
+{
+description: "Leaderboard to compare various object detection models across several metrics.",
+id: "hf-vision/object_detection_leaderboard"
+},
 {
 description: "An object detection application that can detect unseen objects out of the box.",
-id: "
+id: "merve/owlv2"
 },
 {
 description: "An application that contains various object detection models to try from.",
@@ -2952,14 +2959,16 @@ var taskData13 = {
 metrics: [],
 models: [
 {
-// TO DO: write description
 description: "Strong Depth Estimation model trained on 1.4 million images.",
 id: "Intel/dpt-large"
 },
 {
-// TO DO: write description
 description: "Strong Depth Estimation model trained on the KITTI dataset.",
-id: "
+id: "facebook/dpt-dinov2-large-kitti"
+},
+{
+description: "A strong monocular depth estimation model.",
+id: "Bingxin/Marigold"
 }
 ],
 spaces: [
@@ -3513,12 +3522,12 @@ var taskData22 = {
 ],
 models: [
 {
-description: "
-id: "
+description: "One of the most powerful image generation models that can generate realistic outputs.",
+id: "stabilityai/stable-diffusion-xl-base-1.0"
 },
 {
-description: "A
-id: "
+description: "A powerful yet fast image generation model.",
+id: "latent-consistency/lcm-lora-sdxl"
 },
 {
 description: "A text-to-image model that can generate coherent text inside image.",
@@ -3535,19 +3544,23 @@ var taskData22 = {
 id: "stabilityai/stable-diffusion"
 },
 {
-description: "
+description: "A text-to-image application to generate comics.",
+id: "jbilcke-hf/ai-comic-factory"
+},
+{
+description: "A text-to-image application that can generate coherent text inside the image.",
 id: "DeepFloyd/IF"
 },
 {
-description: "
-id: "
+description: "A powerful yet very fast image generation application.",
+id: "latent-consistency/lcm-lora-for-sdxl"
 },
 {
-description: "
+description: "A powerful text-to-image application that can generate 3D representations.",
 id: "hysts/Shap-E"
 },
 {
-description: "
+description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
 id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI"
 }
 ],
@@ -4039,7 +4052,7 @@ var taskData28 = {
 models: [
 {
 description: "A strong model for video generation.",
-id: "
+id: "Vchitect/LaVie"
 },
 {
 description: "A robust model for text-to-video generation.",
@@ -4047,7 +4060,7 @@ var taskData28 = {
 },
 {
 description: "A text-to-video generation model with high quality and smooth outputs.",
-id: "
+id: "hotshotco/Hotshot-XL"
 }
 ],
 spaces: [
@@ -4057,7 +4070,7 @@ var taskData28 = {
 },
 {
 description: "An application that generates video from image and text.",
-id: "
+id: "Vchitect/LaVie"
 },
 {
 description: "An application that generates videos from text and provides multi-model support.",
@@ -4287,6 +4300,10 @@ var taskData31 = {
 }
 ],
 spaces: [
+{
+description: "An application that compares visual question answering models across different tasks.",
+id: "merve/pix2struct"
+},
 {
 description: "An application that can answer questions based on images.",
 id: "nielsr/vilt-vqa"
@@ -4454,8 +4471,8 @@ var TASKS_MODEL_LIBRARIES = {
 "graph-ml": ["transformers"],
 "image-classification": ["keras", "timm", "transformers", "transformers.js"],
 "image-segmentation": ["transformers", "transformers.js"],
-"image-to-image": ["diffusers", "transformers.js"],
-"image-to-text": ["transformers.js"],
+"image-to-image": ["diffusers", "transformers", "transformers.js"],
+"image-to-text": ["transformers", "transformers.js"],
 "image-to-video": ["diffusers"],
 "video-classification": ["transformers"],
 "mask-generation": ["transformers"],
package/dist/index.mjs
CHANGED
@@ -81,7 +81,7 @@ var asteroid = (model) => [
 model = BaseModel.from_pretrained("${model.id}")`
 ];
 function get_base_diffusers_model(model) {
-return model.cardData?.base_model ?? "fill-in-base-model";
+return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }
 var bertopic = (model) => [
 `from bertopic import BERTopic
@@ -2318,6 +2318,10 @@ var taskData5 = {
 {
 description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
 id: "naver-clova-ix/donut-base-finetuned-docvqa"
+},
+{
+description: "A powerful model for document question answering.",
+id: "google/pix2struct-docvqa-large"
 }
 ],
 spaces: [
@@ -2328,6 +2332,10 @@ var taskData5 = {
 {
 description: "An application that can answer questions from invoices.",
 id: "impira/invoices"
+},
+{
+description: "An application to compare different document question answering models.",
+id: "merve/compare_docvqa_models"
 }
 ],
 summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -2671,30 +2679,26 @@ var taskData10 = {
 models: [
 {
 description: "A robust image captioning model.",
-id: "Salesforce/
+id: "Salesforce/blip2-opt-2.7b"
 },
 {
-description: "A
-id: "
+description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+id: "microsoft/kosmos-2-patch14-224"
 },
 {
 description: "A strong optical character recognition model.",
-id: "
-},
-{
-description: "A strong visual question answering model for scientific diagrams.",
-id: "google/pix2struct-ai2d-base"
+id: "facebook/nougat-base"
 },
 {
-description: "A
-id: "
-},
-{
-description: "A captioning model for images that contain text.",
-id: "google/pix2struct-textcaps-base"
+description: "A powerful model that lets you have a conversation with the image.",
+id: "llava-hf/llava-1.5-7b-hf"
 }
 ],
 spaces: [
+{
+description: "An application that compares various image captioning models.",
+id: "nielsr/comparing-captioning-models"
+},
 {
 description: "A robust image captioning application.",
 id: "flax-community/image-captioning"
@@ -2856,7 +2860,6 @@ var taskData12 = {
 ],
 models: [
 {
-// TO DO: write description
 description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
 id: "facebook/detr-resnet-50"
 },
@@ -2866,9 +2869,13 @@ var taskData12 = {
 }
 ],
 spaces: [
+{
+description: "Leaderboard to compare various object detection models across several metrics.",
+id: "hf-vision/object_detection_leaderboard"
+},
 {
 description: "An object detection application that can detect unseen objects out of the box.",
-id: "
+id: "merve/owlv2"
 },
 {
 description: "An application that contains various object detection models to try from.",
@@ -2914,14 +2921,16 @@ var taskData13 = {
 metrics: [],
 models: [
 {
-// TO DO: write description
 description: "Strong Depth Estimation model trained on 1.4 million images.",
 id: "Intel/dpt-large"
 },
 {
-// TO DO: write description
 description: "Strong Depth Estimation model trained on the KITTI dataset.",
-id: "
+id: "facebook/dpt-dinov2-large-kitti"
+},
+{
+description: "A strong monocular depth estimation model.",
+id: "Bingxin/Marigold"
 }
 ],
 spaces: [
@@ -3475,12 +3484,12 @@ var taskData22 = {
 ],
 models: [
 {
-description: "
-id: "
+description: "One of the most powerful image generation models that can generate realistic outputs.",
+id: "stabilityai/stable-diffusion-xl-base-1.0"
 },
 {
-description: "A
-id: "
+description: "A powerful yet fast image generation model.",
+id: "latent-consistency/lcm-lora-sdxl"
 },
 {
 description: "A text-to-image model that can generate coherent text inside image.",
@@ -3497,19 +3506,23 @@ var taskData22 = {
 id: "stabilityai/stable-diffusion"
 },
 {
-description: "
+description: "A text-to-image application to generate comics.",
+id: "jbilcke-hf/ai-comic-factory"
+},
+{
+description: "A text-to-image application that can generate coherent text inside the image.",
 id: "DeepFloyd/IF"
 },
 {
-description: "
-id: "
+description: "A powerful yet very fast image generation application.",
+id: "latent-consistency/lcm-lora-for-sdxl"
 },
 {
-description: "
+description: "A powerful text-to-image application that can generate 3D representations.",
 id: "hysts/Shap-E"
 },
 {
-description: "
+description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
 id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI"
 }
 ],
@@ -4001,7 +4014,7 @@ var taskData28 = {
 models: [
 {
 description: "A strong model for video generation.",
-id: "
+id: "Vchitect/LaVie"
 },
 {
 description: "A robust model for text-to-video generation.",
@@ -4009,7 +4022,7 @@ var taskData28 = {
 },
 {
 description: "A text-to-video generation model with high quality and smooth outputs.",
-id: "
+id: "hotshotco/Hotshot-XL"
 }
 ],
 spaces: [
@@ -4019,7 +4032,7 @@ var taskData28 = {
 },
 {
 description: "An application that generates video from image and text.",
-id: "
+id: "Vchitect/LaVie"
 },
 {
 description: "An application that generates videos from text and provides multi-model support.",
@@ -4249,6 +4262,10 @@ var taskData31 = {
 }
 ],
 spaces: [
+{
+description: "An application that compares visual question answering models across different tasks.",
+id: "merve/pix2struct"
+},
 {
 description: "An application that can answer questions based on images.",
 id: "nielsr/vilt-vqa"
@@ -4416,8 +4433,8 @@ var TASKS_MODEL_LIBRARIES = {
 "graph-ml": ["transformers"],
 "image-classification": ["keras", "timm", "transformers", "transformers.js"],
 "image-segmentation": ["transformers", "transformers.js"],
-"image-to-image": ["diffusers", "transformers.js"],
-"image-to-text": ["transformers.js"],
+"image-to-image": ["diffusers", "transformers", "transformers.js"],
+"image-to-text": ["transformers", "transformers.js"],
 "image-to-video": ["diffusers"],
 "video-classification": ["transformers"],
 "mask-generation": ["transformers"],
package/package.json
CHANGED
package/src/library-ui-elements.ts
CHANGED

@@ -72,7 +72,7 @@ model = BaseModel.from_pretrained("${model.id}")`,
 ];

 function get_base_diffusers_model(model: ModelData): string {
-return model.cardData?.base_model ?? "fill-in-base-model";
+return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }

 const bertopic = (model: ModelData) => [
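The `?.toString()` coercion above is the only behavioral code change in this release. A minimal sketch of what it does, assuming (an inference from this diff, not something it states) that `cardData.base_model` may hold either a single string or an array of strings:

```ts
// Hypothetical card-data shape for illustration only; the real ModelData type
// lives in package/src/model-data.ts and is not reproduced in this diff.
type CardData = { base_model?: string | string[] };

// Mirrors get_base_diffusers_model: toString() returns a string unchanged,
// joins an array with commas, and the nullish fallback keeps the placeholder.
function baseDiffusersModel(cardData?: CardData): string {
	return cardData?.base_model?.toString() ?? "fill-in-base-model";
}

console.log(baseDiffusersModel({ base_model: "stabilityai/stable-diffusion-xl-base-1.0" }));
// -> "stabilityai/stable-diffusion-xl-base-1.0"
console.log(baseDiffusersModel({ base_model: ["stabilityai/stable-diffusion-xl-base-1.0", "latent-consistency/lcm-lora-sdxl"] }));
// -> "stabilityai/stable-diffusion-xl-base-1.0,latent-consistency/lcm-lora-sdxl"
console.log(baseDiffusersModel(undefined));
// -> "fill-in-base-model"
```

Under that assumption, returning the raw value would no longer satisfy the declared `string` return type, which would explain why the snippet builder coerces it before interpolating it into the generated code sample.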
package/src/model-data.ts
CHANGED
package/src/tasks/depth-estimation/data.ts
CHANGED

@@ -24,14 +24,16 @@ const taskData: TaskDataCustom = {
 metrics: [],
 models: [
 {
-// TO DO: write description
 description: "Strong Depth Estimation model trained on 1.4 million images.",
 id: "Intel/dpt-large",
 },
 {
-// TO DO: write description
 description: "Strong Depth Estimation model trained on the KITTI dataset.",
-id: "
+id: "facebook/dpt-dinov2-large-kitti",
+},
+{
+description: "A strong monocular depth estimation model.",
+id: "Bingxin/Marigold",
 },
 ],
 spaces: [

package/src/tasks/document-question-answering/data.ts
CHANGED

@@ -50,6 +50,10 @@ const taskData: TaskDataCustom = {
 description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
 id: "naver-clova-ix/donut-base-finetuned-docvqa",
 },
+{
+description: "A powerful model for document question answering.",
+id: "google/pix2struct-docvqa-large",
+},
 ],
 spaces: [
 {
@@ -60,6 +64,10 @@ const taskData: TaskDataCustom = {
 description: "An application that can answer questions from invoices.",
 id: "impira/invoices",
 },
+{
+description: "An application to compare different document question answering models.",
+id: "merve/compare_docvqa_models",
+},
 ],
 summary:
 "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",

package/src/tasks/image-to-text/about.md
CHANGED

@@ -27,6 +27,19 @@ captioner("https://huggingface.co/datasets/Narsil/image_dummy/resolve/main/parro
 ## [{'generated_text': 'two birds are standing next to each other '}]
 ```

+### Conversation about the Image
+
+Some text generation models also take image inputs. These are called vision language models. You can use `image-to-text` pipeline to use these models like below.
+
+```python
+from transformers import pipeline
+
+mm_pipeline = pipeline("image-to-text",model="llava-hf/llava-1.5-7b-hf")
+mm_pipeline("https://huggingface.co/spaces/llava-hf/llava-4bit/resolve/main/examples/baklava.png", "How to make this pastry?")
+
+## [{'generated_text': 'To create these pastries, you will need a few key ingredients and tools. Firstly, gather the dough by combining flour with water in your mixing bowl until it forms into an elastic ball that can be easily rolled out on top of another surface or table without breaking apart (like pizza).'}]
+```
+
 ### OCR

 This code snippet uses Microsoft’s TrOCR, an encoder-decoder model consisting of an image Transformer encoder and a text Transformer decoder for state-of-the-art optical character recognition (OCR) on single-text line images.

package/src/tasks/image-to-text/data.ts
CHANGED

@@ -32,30 +32,26 @@ const taskData: TaskDataCustom = {
 models: [
 {
 description: "A robust image captioning model.",
-id: "Salesforce/
+id: "Salesforce/blip2-opt-2.7b",
 },
 {
-description: "A
-id: "
+description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+id: "microsoft/kosmos-2-patch14-224",
 },
 {
 description: "A strong optical character recognition model.",
-id: "
+id: "facebook/nougat-base",
 },
 {
-description: "A
-id: "
-},
-{
-description: "A strong captioning model for UI components.",
-id: "google/pix2struct-widget-captioning-base",
-},
-{
-description: "A captioning model for images that contain text.",
-id: "google/pix2struct-textcaps-base",
+description: "A powerful model that lets you have a conversation with the image.",
+id: "llava-hf/llava-1.5-7b-hf",
 },
 ],
 spaces: [
+{
+description: "An application that compares various image captioning models.",
+id: "nielsr/comparing-captioning-models",
+},
 {
 description: "A robust image captioning application.",
 id: "flax-community/image-captioning",
package/src/tasks/index.ts
CHANGED
@@ -51,8 +51,8 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
 "graph-ml": ["transformers"],
 "image-classification": ["keras", "timm", "transformers", "transformers.js"],
 "image-segmentation": ["transformers", "transformers.js"],
-"image-to-image": ["diffusers", "transformers.js"],
-"image-to-text": ["transformers.js"],
+"image-to-image": ["diffusers", "transformers", "transformers.js"],
+"image-to-text": ["transformers", "transformers.js"],
 "image-to-video": ["diffusers"],
 "video-classification": ["transformers"],
 "mask-generation": ["transformers"],
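This hunk adds `transformers` to the `image-to-image` and `image-to-text` entries. A hypothetical consumer-side lookup showing the effect, assuming `TASKS_MODEL_LIBRARIES` is exported from the package entry point (the export wiring is not part of this diff):

```ts
import { TASKS_MODEL_LIBRARIES } from "@huggingface/tasks";

// With 0.1.1, transformers appears in the library list for both tasks.
console.log(TASKS_MODEL_LIBRARIES["image-to-text"]);
// -> ["transformers", "transformers.js"]
console.log(TASKS_MODEL_LIBRARIES["image-to-image"]);
// -> ["diffusers", "transformers", "transformers.js"]
```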
package/src/tasks/object-detection/data.ts
CHANGED

@@ -40,7 +40,6 @@ const taskData: TaskDataCustom = {
 ],
 models: [
 {
-// TO DO: write description
 description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
 id: "facebook/detr-resnet-50",
 },
@@ -50,9 +49,13 @@ const taskData: TaskDataCustom = {
 },
 ],
 spaces: [
+{
+description: "Leaderboard to compare various object detection models across several metrics.",
+id: "hf-vision/object_detection_leaderboard",
+},
 {
 description: "An object detection application that can detect unseen objects out of the box.",
-id: "
+id: "merve/owlv2",
 },
 {
 description: "An application that contains various object detection models to try from.",

package/src/tasks/text-generation/about.md
CHANGED

@@ -42,6 +42,10 @@ When it comes to text generation, the underlying language model can come in seve

 - **Human feedback models:** these models extend base and instruction-trained models by incorporating human feedback that rates the quality of the generated text according to criteria like [helpfulness, honesty, and harmlessness](https://arxiv.org/abs/2112.00861). The human feedback is then combined with an optimization technique like reinforcement learning to align the original model to be closer with human preferences. The overall methodology is often called [Reinforcement Learning from Human Feedback](https://huggingface.co/blog/rlhf), or RLHF for short. [Llama2-Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) is an open-source model aligned through human feedback.

+## Text Generation from Image and Text
+
+There are language models that can input both text and image and output text, called vision language models. [LLaVA](https://huggingface.co/llava-hf/llava-1.5-7b-hf) and [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b) are good examples. Although they work just like other language models by means of input parameters for generation, since they also take input images, you can use them with `image-to-text` pipeline. You can find information about the pipeline in [image-to-text](https://huggingface.co/tasks/image-to-text) task page.
+
 ## Inference

 You can use the 🤗 Transformers library `text-generation` pipeline to do inference with Text Generation models. It takes an incomplete text and returns multiple outputs with which the text can be completed.

package/src/tasks/text-to-image/data.ts
CHANGED

@@ -45,14 +45,12 @@ const taskData: TaskDataCustom = {
 ],
 models: [
 {
-description:
-
-id: "CompVis/stable-diffusion-v1-4",
+description: "One of the most powerful image generation models that can generate realistic outputs.",
+id: "stabilityai/stable-diffusion-xl-base-1.0",
 },
 {
-description:
-
-id: "dalle-mini/dalle-mega",
+description: "A powerful yet fast image generation model.",
+id: "latent-consistency/lcm-lora-sdxl",
 },
 {
 description: "A text-to-image model that can generate coherent text inside image.",
@@ -69,19 +67,23 @@ const taskData: TaskDataCustom = {
 id: "stabilityai/stable-diffusion",
 },
 {
-description: "
+description: "A text-to-image application to generate comics.",
+id: "jbilcke-hf/ai-comic-factory",
+},
+{
+description: "A text-to-image application that can generate coherent text inside the image.",
 id: "DeepFloyd/IF",
 },
 {
-description: "
-id: "
+description: "A powerful yet very fast image generation application.",
+id: "latent-consistency/lcm-lora-for-sdxl",
 },
 {
-description: "
+description: "A powerful text-to-image application that can generate 3D representations.",
 id: "hysts/Shap-E",
 },
 {
-description: "
+description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
 id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI",
 },
 ],

package/src/tasks/text-to-video/data.ts
CHANGED

@@ -68,7 +68,7 @@ const taskData: TaskDataCustom = {
 models: [
 {
 description: "A strong model for video generation.",
-id: "
+id: "Vchitect/LaVie",
 },
 {
 description: "A robust model for text-to-video generation.",
@@ -76,7 +76,7 @@ const taskData: TaskDataCustom = {
 },
 {
 description: "A text-to-video generation model with high quality and smooth outputs.",
-id: "
+id: "hotshotco/Hotshot-XL",
 },
 ],
 spaces: [
@@ -86,7 +86,7 @@ const taskData: TaskDataCustom = {
 },
 {
 description: "An application that generates video from image and text.",
-id: "
+id: "Vchitect/LaVie",
 },
 {
 description: "An application that generates videos from text and provides multi-model support.",

package/src/tasks/visual-question-answering/data.ts
CHANGED

@@ -71,6 +71,10 @@ const taskData: TaskDataCustom = {
 },
 ],
 spaces: [
+{
+description: "An application that compares visual question answering models across different tasks.",
+id: "merve/pix2struct",
+},
 {
 description: "An application that can answer questions based on images.",
 id: "nielsr/vilt-vqa",