@huggingface/tasks 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -623,7 +623,7 @@ interface ModelData {
     inference?: boolean | {
       parameters?: Record<string, unknown>;
     };
-    base_model?: string;
+    base_model?: string | string[];
   };
   /**
    * Library name
package/dist/index.js CHANGED
@@ -119,7 +119,7 @@ var asteroid = (model) => [
   model = BaseModel.from_pretrained("${model.id}")`
 ];
 function get_base_diffusers_model(model) {
-  return model.cardData?.base_model ?? "fill-in-base-model";
+  return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }
 var bertopic = (model) => [
   `from bertopic import BERTopic
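The widened `base_model` field (see the `index.d.ts` hunk above) means a model card can carry either a single base model or a list of them, and the snippet helper now normalizes both shapes with `?.toString()`. Below is a minimal standalone sketch of that behavior; the card values are made-up examples and the helper is re-declared locally rather than imported from the package.

```ts
// Standalone sketch of the normalization done by get_base_diffusers_model in 0.1.1.
// The card data values below are hypothetical, not real package data.
type CardData = { base_model?: string | string[] };

function baseModelOf(cardData?: CardData): string {
  // String.prototype.toString() returns the string unchanged;
  // Array.prototype.toString() joins the elements with commas.
  return cardData?.base_model?.toString() ?? "fill-in-base-model";
}

console.log(baseModelOf({ base_model: "some-org/base-model" }));        // "some-org/base-model"
console.log(baseModelOf({ base_model: ["org/base-a", "org/base-b"] })); // "org/base-a,org/base-b"
console.log(baseModelOf({}));                                           // "fill-in-base-model"
```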
@@ -2356,6 +2356,10 @@ var taskData5 = {
     {
       description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
       id: "naver-clova-ix/donut-base-finetuned-docvqa"
+    },
+    {
+      description: "A powerful model for document question answering.",
+      id: "google/pix2struct-docvqa-large"
     }
   ],
   spaces: [
@@ -2366,6 +2370,10 @@ var taskData5 = {
     {
       description: "An application that can answer questions from invoices.",
       id: "impira/invoices"
+    },
+    {
+      description: "An application to compare different document question answering models.",
+      id: "merve/compare_docvqa_models"
     }
   ],
   summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -2709,30 +2717,26 @@ var taskData10 = {
   models: [
     {
       description: "A robust image captioning model.",
-      id: "Salesforce/blip-image-captioning-large"
+      id: "Salesforce/blip2-opt-2.7b"
     },
     {
-      description: "A strong image captioning model.",
-      id: "nlpconnect/vit-gpt2-image-captioning"
+      description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+      id: "microsoft/kosmos-2-patch14-224"
     },
     {
       description: "A strong optical character recognition model.",
-      id: "microsoft/trocr-base-printed"
-    },
-    {
-      description: "A strong visual question answering model for scientific diagrams.",
-      id: "google/pix2struct-ai2d-base"
+      id: "facebook/nougat-base"
     },
     {
-      description: "A strong captioning model for UI components.",
-      id: "google/pix2struct-widget-captioning-base"
-    },
-    {
-      description: "A captioning model for images that contain text.",
-      id: "google/pix2struct-textcaps-base"
+      description: "A powerful model that lets you have a conversation with the image.",
+      id: "llava-hf/llava-1.5-7b-hf"
     }
   ],
   spaces: [
+    {
+      description: "An application that compares various image captioning models.",
+      id: "nielsr/comparing-captioning-models"
+    },
     {
       description: "A robust image captioning application.",
       id: "flax-community/image-captioning"
@@ -2894,7 +2898,6 @@ var taskData12 = {
   ],
   models: [
     {
-      // TO DO: write description
       description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
       id: "facebook/detr-resnet-50"
     },
@@ -2904,9 +2907,13 @@ var taskData12 = {
     }
   ],
   spaces: [
+    {
+      description: "Leaderboard to compare various object detection models across several metrics.",
+      id: "hf-vision/object_detection_leaderboard"
+    },
     {
       description: "An object detection application that can detect unseen objects out of the box.",
-      id: "adirik/OWL-ViT"
+      id: "merve/owlv2"
     },
     {
       description: "An application that contains various object detection models to try from.",
@@ -2952,14 +2959,16 @@ var taskData13 = {
   metrics: [],
   models: [
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on 1.4 million images.",
       id: "Intel/dpt-large"
     },
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on the KITTI dataset.",
-      id: "vinvino02/glpn-kitti"
+      id: "facebook/dpt-dinov2-large-kitti"
+    },
+    {
+      description: "A strong monocular depth estimation model.",
+      id: "Bingxin/Marigold"
     }
   ],
   spaces: [
@@ -3513,12 +3522,12 @@ var taskData22 = {
   ],
   models: [
     {
-      description: "A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.",
-      id: "CompVis/stable-diffusion-v1-4"
+      description: "One of the most powerful image generation models that can generate realistic outputs.",
+      id: "stabilityai/stable-diffusion-xl-base-1.0"
     },
     {
-      description: "A model that can be used to generate images based on text prompts. The DALL\xB7E Mega model is the largest version of DALLE Mini.",
-      id: "dalle-mini/dalle-mega"
+      description: "A powerful yet fast image generation model.",
+      id: "latent-consistency/lcm-lora-sdxl"
     },
     {
       description: "A text-to-image model that can generate coherent text inside image.",
@@ -3535,19 +3544,23 @@ var taskData22 = {
       id: "stabilityai/stable-diffusion"
     },
     {
-      description: "An text-to-image application that can generate coherent text inside the image.",
+      description: "A text-to-image application to generate comics.",
+      id: "jbilcke-hf/ai-comic-factory"
+    },
+    {
+      description: "A text-to-image application that can generate coherent text inside the image.",
       id: "DeepFloyd/IF"
     },
     {
-      description: "An powerful text-to-image application that can generate images.",
-      id: "kakaobrain/karlo"
+      description: "A powerful yet very fast image generation application.",
+      id: "latent-consistency/lcm-lora-for-sdxl"
     },
     {
-      description: "An powerful text-to-image application that can generates 3D representations.",
+      description: "A powerful text-to-image application that can generate 3D representations.",
       id: "hysts/Shap-E"
     },
     {
-      description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.",
+      description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
       id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI"
     }
   ],
@@ -4039,7 +4052,7 @@ var taskData28 = {
   models: [
     {
       description: "A strong model for video generation.",
-      id: "PAIR/text2video-zero-controlnet-canny-arcane"
+      id: "Vchitect/LaVie"
     },
     {
       description: "A robust model for text-to-video generation.",
@@ -4047,7 +4060,7 @@ var taskData28 = {
     },
     {
       description: "A text-to-video generation model with high quality and smooth outputs.",
-      id: "cerspense/zeroscope_v2_576w"
+      id: "hotshotco/Hotshot-XL"
     }
   ],
   spaces: [
@@ -4057,7 +4070,7 @@ var taskData28 = {
     },
     {
       description: "An application that generates video from image and text.",
-      id: "TempoFunk/makeavid-sd-jax"
+      id: "Vchitect/LaVie"
     },
     {
       description: "An application that generates videos from text and provides multi-model support.",
@@ -4287,6 +4300,10 @@ var taskData31 = {
     }
   ],
   spaces: [
+    {
+      description: "An application that compares visual question answering models across different tasks.",
+      id: "merve/pix2struct"
+    },
     {
       description: "An application that can answer questions based on images.",
       id: "nielsr/vilt-vqa"
@@ -4454,8 +4471,8 @@ var TASKS_MODEL_LIBRARIES = {
   "graph-ml": ["transformers"],
   "image-classification": ["keras", "timm", "transformers", "transformers.js"],
   "image-segmentation": ["transformers", "transformers.js"],
-  "image-to-image": ["diffusers", "transformers.js"],
-  "image-to-text": ["transformers.js"],
+  "image-to-image": ["diffusers", "transformers", "transformers.js"],
+  "image-to-text": ["transformers", "transformers.js"],
   "image-to-video": ["diffusers"],
   "video-classification": ["transformers"],
   "mask-generation": ["transformers"],
package/dist/index.mjs CHANGED
@@ -81,7 +81,7 @@ var asteroid = (model) => [
   model = BaseModel.from_pretrained("${model.id}")`
 ];
 function get_base_diffusers_model(model) {
-  return model.cardData?.base_model ?? "fill-in-base-model";
+  return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }
 var bertopic = (model) => [
   `from bertopic import BERTopic
@@ -2318,6 +2318,10 @@ var taskData5 = {
     {
       description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
       id: "naver-clova-ix/donut-base-finetuned-docvqa"
+    },
+    {
+      description: "A powerful model for document question answering.",
+      id: "google/pix2struct-docvqa-large"
     }
   ],
   spaces: [
@@ -2328,6 +2332,10 @@ var taskData5 = {
     {
       description: "An application that can answer questions from invoices.",
       id: "impira/invoices"
+    },
+    {
+      description: "An application to compare different document question answering models.",
+      id: "merve/compare_docvqa_models"
     }
   ],
   summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -2671,30 +2679,26 @@ var taskData10 = {
   models: [
     {
       description: "A robust image captioning model.",
-      id: "Salesforce/blip-image-captioning-large"
+      id: "Salesforce/blip2-opt-2.7b"
     },
     {
-      description: "A strong image captioning model.",
-      id: "nlpconnect/vit-gpt2-image-captioning"
+      description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+      id: "microsoft/kosmos-2-patch14-224"
     },
     {
       description: "A strong optical character recognition model.",
-      id: "microsoft/trocr-base-printed"
-    },
-    {
-      description: "A strong visual question answering model for scientific diagrams.",
-      id: "google/pix2struct-ai2d-base"
+      id: "facebook/nougat-base"
     },
     {
-      description: "A strong captioning model for UI components.",
-      id: "google/pix2struct-widget-captioning-base"
-    },
-    {
-      description: "A captioning model for images that contain text.",
-      id: "google/pix2struct-textcaps-base"
+      description: "A powerful model that lets you have a conversation with the image.",
+      id: "llava-hf/llava-1.5-7b-hf"
     }
   ],
   spaces: [
+    {
+      description: "An application that compares various image captioning models.",
+      id: "nielsr/comparing-captioning-models"
+    },
     {
       description: "A robust image captioning application.",
       id: "flax-community/image-captioning"
@@ -2856,7 +2860,6 @@ var taskData12 = {
   ],
   models: [
     {
-      // TO DO: write description
       description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
       id: "facebook/detr-resnet-50"
     },
@@ -2866,9 +2869,13 @@ var taskData12 = {
     }
   ],
   spaces: [
+    {
+      description: "Leaderboard to compare various object detection models across several metrics.",
+      id: "hf-vision/object_detection_leaderboard"
+    },
     {
       description: "An object detection application that can detect unseen objects out of the box.",
-      id: "adirik/OWL-ViT"
+      id: "merve/owlv2"
     },
     {
       description: "An application that contains various object detection models to try from.",
@@ -2914,14 +2921,16 @@ var taskData13 = {
   metrics: [],
   models: [
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on 1.4 million images.",
       id: "Intel/dpt-large"
     },
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on the KITTI dataset.",
-      id: "vinvino02/glpn-kitti"
+      id: "facebook/dpt-dinov2-large-kitti"
+    },
+    {
+      description: "A strong monocular depth estimation model.",
+      id: "Bingxin/Marigold"
     }
   ],
   spaces: [
@@ -3475,12 +3484,12 @@ var taskData22 = {
   ],
   models: [
     {
-      description: "A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.",
-      id: "CompVis/stable-diffusion-v1-4"
+      description: "One of the most powerful image generation models that can generate realistic outputs.",
+      id: "stabilityai/stable-diffusion-xl-base-1.0"
     },
     {
-      description: "A model that can be used to generate images based on text prompts. The DALL\xB7E Mega model is the largest version of DALLE Mini.",
-      id: "dalle-mini/dalle-mega"
+      description: "A powerful yet fast image generation model.",
+      id: "latent-consistency/lcm-lora-sdxl"
     },
     {
       description: "A text-to-image model that can generate coherent text inside image.",
@@ -3497,19 +3506,23 @@ var taskData22 = {
       id: "stabilityai/stable-diffusion"
     },
     {
-      description: "An text-to-image application that can generate coherent text inside the image.",
+      description: "A text-to-image application to generate comics.",
+      id: "jbilcke-hf/ai-comic-factory"
+    },
+    {
+      description: "A text-to-image application that can generate coherent text inside the image.",
       id: "DeepFloyd/IF"
     },
     {
-      description: "An powerful text-to-image application that can generate images.",
-      id: "kakaobrain/karlo"
+      description: "A powerful yet very fast image generation application.",
+      id: "latent-consistency/lcm-lora-for-sdxl"
     },
     {
-      description: "An powerful text-to-image application that can generates 3D representations.",
+      description: "A powerful text-to-image application that can generate 3D representations.",
       id: "hysts/Shap-E"
     },
     {
-      description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.",
+      description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
       id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI"
     }
   ],
@@ -4001,7 +4014,7 @@ var taskData28 = {
   models: [
     {
       description: "A strong model for video generation.",
-      id: "PAIR/text2video-zero-controlnet-canny-arcane"
+      id: "Vchitect/LaVie"
     },
     {
       description: "A robust model for text-to-video generation.",
@@ -4009,7 +4022,7 @@ var taskData28 = {
     },
     {
       description: "A text-to-video generation model with high quality and smooth outputs.",
-      id: "cerspense/zeroscope_v2_576w"
+      id: "hotshotco/Hotshot-XL"
     }
   ],
   spaces: [
@@ -4019,7 +4032,7 @@ var taskData28 = {
     },
     {
       description: "An application that generates video from image and text.",
-      id: "TempoFunk/makeavid-sd-jax"
+      id: "Vchitect/LaVie"
     },
     {
       description: "An application that generates videos from text and provides multi-model support.",
@@ -4249,6 +4262,10 @@ var taskData31 = {
     }
   ],
   spaces: [
+    {
+      description: "An application that compares visual question answering models across different tasks.",
+      id: "merve/pix2struct"
+    },
     {
       description: "An application that can answer questions based on images.",
       id: "nielsr/vilt-vqa"
@@ -4416,8 +4433,8 @@ var TASKS_MODEL_LIBRARIES = {
   "graph-ml": ["transformers"],
   "image-classification": ["keras", "timm", "transformers", "transformers.js"],
   "image-segmentation": ["transformers", "transformers.js"],
-  "image-to-image": ["diffusers", "transformers.js"],
-  "image-to-text": ["transformers.js"],
+  "image-to-image": ["diffusers", "transformers", "transformers.js"],
+  "image-to-text": ["transformers", "transformers.js"],
   "image-to-video": ["diffusers"],
   "video-classification": ["transformers"],
   "mask-generation": ["transformers"],
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@huggingface/tasks",
   "packageManager": "pnpm@8.10.5",
-  "version": "0.1.0",
+  "version": "0.1.1",
   "description": "List of ML tasks for huggingface.co/tasks",
   "repository": "https://github.com/huggingface/huggingface.js.git",
   "publishConfig": {
@@ -72,7 +72,7 @@ model = BaseModel.from_pretrained("${model.id}")`,
 ];

 function get_base_diffusers_model(model: ModelData): string {
-  return model.cardData?.base_model ?? "fill-in-base-model";
+  return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }

 const bertopic = (model: ModelData) => [
package/src/model-data.ts CHANGED
@@ -93,7 +93,7 @@ export interface ModelData {
     | {
         parameters?: Record<string, unknown>;
       };
-    base_model?: string;
+    base_model?: string | string[];
   };
   /**
    * Library name
@@ -24,14 +24,16 @@ const taskData: TaskDataCustom = {
   metrics: [],
   models: [
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on 1.4 million images.",
       id: "Intel/dpt-large",
     },
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on the KITTI dataset.",
-      id: "vinvino02/glpn-kitti",
+      id: "facebook/dpt-dinov2-large-kitti",
+    },
+    {
+      description: "A strong monocular depth estimation model.",
+      id: "Bingxin/Marigold",
     },
   ],
   spaces: [
@@ -50,6 +50,10 @@ const taskData: TaskDataCustom = {
       description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
       id: "naver-clova-ix/donut-base-finetuned-docvqa",
     },
+    {
+      description: "A powerful model for document question answering.",
+      id: "google/pix2struct-docvqa-large",
+    },
   ],
   spaces: [
     {
@@ -60,6 +64,10 @@ const taskData: TaskDataCustom = {
       description: "An application that can answer questions from invoices.",
       id: "impira/invoices",
     },
+    {
+      description: "An application to compare different document question answering models.",
+      id: "merve/compare_docvqa_models",
+    },
   ],
   summary:
     "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -27,6 +27,19 @@ captioner("https://huggingface.co/datasets/Narsil/image_dummy/resolve/main/parro
 ## [{'generated_text': 'two birds are standing next to each other '}]
 ```

+### Conversation about the Image
+
+Some text generation models also take image inputs. These are called vision language models. You can use `image-to-text` pipeline to use these models like below.
+
+```python
+from transformers import pipeline
+
+mm_pipeline = pipeline("image-to-text",model="llava-hf/llava-1.5-7b-hf")
+mm_pipeline("https://huggingface.co/spaces/llava-hf/llava-4bit/resolve/main/examples/baklava.png", "How to make this pastry?")
+
+## [{'generated_text': 'To create these pastries, you will need a few key ingredients and tools. Firstly, gather the dough by combining flour with water in your mixing bowl until it forms into an elastic ball that can be easily rolled out on top of another surface or table without breaking apart (like pizza).'}]
+```
+
 ### OCR

 This code snippet uses Microsoft’s TrOCR, an encoder-decoder model consisting of an image Transformer encoder and a text Transformer decoder for state-of-the-art optical character recognition (OCR) on single-text line images.
@@ -32,30 +32,26 @@ const taskData: TaskDataCustom = {
   models: [
     {
       description: "A robust image captioning model.",
-      id: "Salesforce/blip-image-captioning-large",
+      id: "Salesforce/blip2-opt-2.7b",
     },
     {
-      description: "A strong image captioning model.",
-      id: "nlpconnect/vit-gpt2-image-captioning",
+      description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+      id: "microsoft/kosmos-2-patch14-224",
     },
     {
       description: "A strong optical character recognition model.",
-      id: "microsoft/trocr-base-printed",
+      id: "facebook/nougat-base",
     },
     {
-      description: "A strong visual question answering model for scientific diagrams.",
-      id: "google/pix2struct-ai2d-base",
-    },
-    {
-      description: "A strong captioning model for UI components.",
-      id: "google/pix2struct-widget-captioning-base",
-    },
-    {
-      description: "A captioning model for images that contain text.",
-      id: "google/pix2struct-textcaps-base",
+      description: "A powerful model that lets you have a conversation with the image.",
+      id: "llava-hf/llava-1.5-7b-hf",
     },
   ],
   spaces: [
+    {
+      description: "An application that compares various image captioning models.",
+      id: "nielsr/comparing-captioning-models",
+    },
     {
       description: "A robust image captioning application.",
       id: "flax-community/image-captioning",
@@ -51,8 +51,8 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
   "graph-ml": ["transformers"],
   "image-classification": ["keras", "timm", "transformers", "transformers.js"],
   "image-segmentation": ["transformers", "transformers.js"],
-  "image-to-image": ["diffusers", "transformers.js"],
-  "image-to-text": ["transformers.js"],
+  "image-to-image": ["diffusers", "transformers", "transformers.js"],
+  "image-to-text": ["transformers", "transformers.js"],
   "image-to-video": ["diffusers"],
   "video-classification": ["transformers"],
   "mask-generation": ["transformers"],
@@ -40,7 +40,6 @@ const taskData: TaskDataCustom = {
   ],
   models: [
     {
-      // TO DO: write description
       description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
       id: "facebook/detr-resnet-50",
     },
@@ -50,9 +49,13 @@ const taskData: TaskDataCustom = {
     },
   ],
   spaces: [
+    {
+      description: "Leaderboard to compare various object detection models across several metrics.",
+      id: "hf-vision/object_detection_leaderboard",
+    },
     {
       description: "An object detection application that can detect unseen objects out of the box.",
-      id: "adirik/OWL-ViT",
+      id: "merve/owlv2",
     },
     {
       description: "An application that contains various object detection models to try from.",
@@ -31,7 +31,7 @@ tqa = pipeline(task="table-question-answering", model="google/tapas-large-finetu

 # result

-print(tqa(table=table, query=query)['cells'][0])
+print(tqa(table=table, query=question)['cells'][0])
 #53

 ```
@@ -42,6 +42,10 @@ When it comes to text generation, the underlying language model can come in seve

 - **Human feedback models:** these models extend base and instruction-trained models by incorporating human feedback that rates the quality of the generated text according to criteria like [helpfulness, honesty, and harmlessness](https://arxiv.org/abs/2112.00861). The human feedback is then combined with an optimization technique like reinforcement learning to align the original model to be closer with human preferences. The overall methodology is often called [Reinforcement Learning from Human Feedback](https://huggingface.co/blog/rlhf), or RLHF for short. [Llama2-Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) is an open-source model aligned through human feedback.

+## Text Generation from Image and Text
+
+There are language models that can input both text and image and output text, called vision language models. [LLaVA](https://huggingface.co/llava-hf/llava-1.5-7b-hf) and [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b) are good examples. Although they work just like other language models by means of input parameters for generation, since they also take input images, you can use them with `image-to-text` pipeline. You can find information about the pipeline in [image-to-text](https://huggingface.co/tasks/image-to-text) task page.
+
 ## Inference

 You can use the 🤗 Transformers library `text-generation` pipeline to do inference with Text Generation models. It takes an incomplete text and returns multiple outputs with which the text can be completed.
@@ -45,14 +45,12 @@ const taskData: TaskDataCustom = {
   ],
   models: [
     {
-      description:
-        "A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.",
-      id: "CompVis/stable-diffusion-v1-4",
+      description: "One of the most powerful image generation models that can generate realistic outputs.",
+      id: "stabilityai/stable-diffusion-xl-base-1.0",
     },
     {
-      description:
-        "A model that can be used to generate images based on text prompts. The DALL·E Mega model is the largest version of DALLE Mini.",
-      id: "dalle-mini/dalle-mega",
+      description: "A powerful yet fast image generation model.",
+      id: "latent-consistency/lcm-lora-sdxl",
     },
     {
       description: "A text-to-image model that can generate coherent text inside image.",
@@ -69,19 +67,23 @@ const taskData: TaskDataCustom = {
       id: "stabilityai/stable-diffusion",
     },
     {
-      description: "An text-to-image application that can generate coherent text inside the image.",
+      description: "A text-to-image application to generate comics.",
+      id: "jbilcke-hf/ai-comic-factory",
+    },
+    {
+      description: "A text-to-image application that can generate coherent text inside the image.",
       id: "DeepFloyd/IF",
     },
     {
-      description: "An powerful text-to-image application that can generate images.",
-      id: "kakaobrain/karlo",
+      description: "A powerful yet very fast image generation application.",
+      id: "latent-consistency/lcm-lora-for-sdxl",
     },
     {
-      description: "An powerful text-to-image application that can generates 3D representations.",
+      description: "A powerful text-to-image application that can generate 3D representations.",
       id: "hysts/Shap-E",
     },
     {
-      description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.",
+      description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
       id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI",
     },
   ],
@@ -68,7 +68,7 @@ const taskData: TaskDataCustom = {
   models: [
     {
       description: "A strong model for video generation.",
-      id: "PAIR/text2video-zero-controlnet-canny-arcane",
+      id: "Vchitect/LaVie",
     },
     {
       description: "A robust model for text-to-video generation.",
@@ -76,7 +76,7 @@ const taskData: TaskDataCustom = {
     },
     {
       description: "A text-to-video generation model with high quality and smooth outputs.",
-      id: "cerspense/zeroscope_v2_576w",
+      id: "hotshotco/Hotshot-XL",
     },
   ],
   spaces: [
@@ -86,7 +86,7 @@ const taskData: TaskDataCustom = {
     },
     {
       description: "An application that generates video from image and text.",
-      id: "TempoFunk/makeavid-sd-jax",
+      id: "Vchitect/LaVie",
     },
     {
       description: "An application that generates videos from text and provides multi-model support.",
@@ -71,6 +71,10 @@ const taskData: TaskDataCustom = {
     },
   ],
   spaces: [
+    {
+      description: "An application that compares visual question answering models across different tasks.",
+      id: "merve/pix2struct",
+    },
     {
       description: "An application that can answer questions based on images.",
       id: "nielsr/vilt-vqa",
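The `TASKS_MODEL_LIBRARIES` hunks above (in both the dist bundles and the src source) now list `transformers` for `image-to-image` and `image-to-text`. Below is a small consumer-side sketch of how such a mapping is typically read; the map is restated locally with only the two changed entries, since this diff alone does not show whether the constant is re-exported from the package entry point.

```ts
// Sketch only: a local copy of the two entries changed in 0.1.1, not the package's own export.
type ModelLibraryKey = "diffusers" | "transformers" | "transformers.js";

const tasksModelLibraries: Record<string, ModelLibraryKey[]> = {
  "image-to-image": ["diffusers", "transformers", "transformers.js"], // "transformers" added in 0.1.1
  "image-to-text": ["transformers", "transformers.js"], // "transformers" added in 0.1.1
};

// A caller (for example, a task page UI) could use the mapping to decide
// which library code snippets to surface for a given pipeline tag.
console.log(tasksModelLibraries["image-to-text"].includes("transformers")); // true as of 0.1.1
```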