@huggingface/tasks 0.0.9 → 0.1.1

This diff shows the changes between publicly released versions of the package as published to the supported public registries. It is provided for informational purposes only.
package/dist/index.d.ts CHANGED
@@ -23,6 +23,7 @@ declare enum ModelLibrary {
  "pyannote-audio" = "pyannote.audio",
  "sample-factory" = "Sample Factory",
  "sentence-transformers" = "Sentence Transformers",
+ "setfit" = "SetFit",
  "sklearn" = "Scikit-learn",
  "spacy" = "spaCy",
  "span-marker" = "SpanMarker",
@@ -452,6 +453,9 @@ declare const PIPELINE_TYPES_SET: Set<"other" | "text-classification" | "token-c
  */
  declare const LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS: Partial<Record<ModelLibraryKey, PipelineType[]>>;

+ /**
+ * See default-widget-inputs.ts for the default widget inputs, this files only contains the types
+ */
  type TableData = Record<string, (string | number)[]>;
  type WidgetExampleOutputLabels = Array<{
  label: string;
@@ -526,6 +530,7 @@ interface WidgetExampleSentenceSimilarityInput<TOutput = WidgetExampleOutput> ex
  type WidgetExample<TOutput = WidgetExampleOutput> = WidgetExampleTextInput<TOutput> | WidgetExampleTextAndContextInput<TOutput> | WidgetExampleTextAndTableInput<TOutput> | WidgetExampleAssetInput<TOutput> | WidgetExampleAssetAndPromptInput<TOutput> | WidgetExampleAssetAndTextInput<TOutput> | WidgetExampleAssetAndZeroShotInput<TOutput> | WidgetExampleStructuredDataInput<TOutput> | WidgetExampleTableDataInput<TOutput> | WidgetExampleZeroShotTextInput<TOutput> | WidgetExampleSentenceSimilarityInput<TOutput>;
  type KeysOfUnion<T> = T extends unknown ? keyof T : never;
  type WidgetExampleAttribute = KeysOfUnion<WidgetExample>;
+
  declare enum InferenceDisplayability {
  /**
  * Yes
@@ -618,7 +623,7 @@ interface ModelData {
  inference?: boolean | {
  parameters?: Record<string, unknown>;
  };
- base_model?: string;
+ base_model?: string | string[];
  };
  /**
  * Library name
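The `cardData.base_model` field in `ModelData` is widened from `string` to `string | string[]`, so model cards can list several base models. A minimal TypeScript sketch of how consuming code might normalize the field; the `CardDataLike` shape and `normalizeBaseModel` helper are hypothetical and only illustrate the widened type:

```ts
// Hypothetical helper: normalize base_model to an array so callers can treat
// single and multiple base models uniformly.
interface CardDataLike {
	base_model?: string | string[];
}

function normalizeBaseModel(cardData?: CardDataLike): string[] {
	const base = cardData?.base_model;
	if (base === undefined) {
		return [];
	}
	return Array.isArray(base) ? base : [base];
}

console.log(normalizeBaseModel({ base_model: "runwayml/stable-diffusion-v1-5" })); // ["runwayml/stable-diffusion-v1-5"]
console.log(normalizeBaseModel({ base_model: ["model-a", "model-b"] })); // ["model-a", "model-b"]
```

The bundled `get_base_diffusers_model` below takes the simpler route of `?.toString()`, which leaves a single id unchanged and joins an array into a comma-separated string.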
package/dist/index.js CHANGED
@@ -66,6 +66,7 @@ var LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS = {
  peft: ["text-generation"],
  "pyannote-audio": ["automatic-speech-recognition"],
  "sentence-transformers": ["feature-extraction", "sentence-similarity"],
+ setfit: ["text-classification"],
  sklearn: ["tabular-classification", "tabular-regression", "text-classification"],
  spacy: ["token-classification", "text-classification", "sentence-similarity"],
  "span-marker": ["token-classification"],
@@ -118,7 +119,7 @@ var asteroid = (model) => [
  model = BaseModel.from_pretrained("${model.id}")`
  ];
  function get_base_diffusers_model(model) {
- return model.cardData?.base_model ?? "fill-in-base-model";
+ return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
  }
  var bertopic = (model) => [
  `from bertopic import BERTopic
@@ -218,8 +219,8 @@ var paddlenlp = (model) => {
  [
  `from paddlenlp.transformers import AutoTokenizer, ${architecture}`,
  "",
- `tokenizer = AutoTokenizer.from_pretrained("${model.id}"${model.private ? ", use_auth_token=True" : ""}, from_hf_hub=True)`,
- `model = ${architecture}.from_pretrained("${model.id}"${model.private ? ", use_auth_token=True" : ""}, from_hf_hub=True)`
+ `tokenizer = AutoTokenizer.from_pretrained("${model.id}", from_hf_hub=True)`,
+ `model = ${architecture}.from_pretrained("${model.id}", from_hf_hub=True)`
  ].join("\n")
  ];
  } else {
@@ -228,8 +229,8 @@ var paddlenlp = (model) => {
  `# \u26A0\uFE0F Type of model unknown`,
  `from paddlenlp.transformers import AutoTokenizer, AutoModel`,
  "",
- `tokenizer = AutoTokenizer.from_pretrained("${model.id}"${model.private ? ", use_auth_token=True" : ""}, from_hf_hub=True)`,
- `model = AutoModel.from_pretrained("${model.id}"${model.private ? ", use_auth_token=True" : ""}, from_hf_hub=True)`
+ `tokenizer = AutoTokenizer.from_pretrained("${model.id}", from_hf_hub=True)`,
+ `model = AutoModel.from_pretrained("${model.id}", from_hf_hub=True)`
  ].join("\n")
  ];
  }
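The paddlenlp snippet builders no longer branch on `model.private` to inject `use_auth_token=True`; the generated code is now identical for public and private repos. A small sketch of the simplified pattern (the `paddlenlpSnippet` name and `PaddleModel` type are illustrative, not the package's internal API):

```ts
// Illustrative re-creation of the simplified builder: the output no longer
// depends on whether the repo is private; authentication is left to the
// user's local environment instead of a hardcoded flag.
interface PaddleModel {
	id: string;
	private?: boolean;
}

const paddlenlpSnippet = (model: PaddleModel, architecture = "AutoModel"): string =>
	[
		`from paddlenlp.transformers import AutoTokenizer, ${architecture}`,
		"",
		`tokenizer = AutoTokenizer.from_pretrained("${model.id}", from_hf_hub=True)`,
		`model = ${architecture}.from_pretrained("${model.id}", from_hf_hub=True)`,
	].join("\n");

// Same snippet text whether or not the repo is private.
console.log(paddlenlpSnippet({ id: "PaddlePaddle/ernie-3.0-medium-zh", private: true }));
```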
@@ -365,6 +366,11 @@ var sentenceTransformers = (model) => [

  model = SentenceTransformer("${model.id}")`
  ];
+ var setfit = (model) => [
+ `from setfit import SetFitModel
+
+ model = SetFitModel.from_pretrained("${model.id}")`
+ ];
  var spacy = (model) => [
  `!pip install https://huggingface.co/${model.id}/resolve/main/${nameWithoutNamespace(model.id)}-any-py3-none-any.whl

@@ -637,6 +643,13 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
  docsUrl: "https://huggingface.co/docs/hub/sentence-transformers",
  snippets: sentenceTransformers
  },
+ setfit: {
+ btnLabel: "setfit",
+ repoName: "setfit",
+ repoUrl: "https://github.com/huggingface/setfit",
+ docsUrl: "https://huggingface.co/docs/hub/setfit",
+ snippets: setfit
+ },
  sklearn: {
  btnLabel: "Scikit-learn",
  repoName: "Scikit-learn",
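This release registers SetFit end to end: a `ModelLibrary` enum entry, a snippet generator, a `LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS` entry for `text-classification`, and a `MODEL_LIBRARIES_UI_ELEMENTS` entry. A self-contained sketch that mirrors the structure added above (local names only, not the package's exported API) and renders the setfit code snippet for a model id:

```ts
// Local mirror of the setfit entry added to MODEL_LIBRARIES_UI_ELEMENTS above.
type SnippetFn = (model: { id: string }) => string[];

const setfitSnippet: SnippetFn = (model) => [
	`from setfit import SetFitModel

model = SetFitModel.from_pretrained("${model.id}")`,
];

const setfitUiElement = {
	btnLabel: "setfit",
	repoName: "setfit",
	repoUrl: "https://github.com/huggingface/setfit",
	docsUrl: "https://huggingface.co/docs/hub/setfit",
	snippets: setfitSnippet,
};

// Each snippets function returns one or more code blocks to show on a model page.
for (const block of setfitUiElement.snippets({ id: "user/my-setfit-model" })) {
	console.log(block);
}
```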
@@ -2343,6 +2356,10 @@ var taskData5 = {
  {
  description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
  id: "naver-clova-ix/donut-base-finetuned-docvqa"
+ },
+ {
+ description: "A powerful model for document question answering.",
+ id: "google/pix2struct-docvqa-large"
  }
  ],
  spaces: [
@@ -2353,6 +2370,10 @@ var taskData5 = {
  {
  description: "An application that can answer questions from invoices.",
  id: "impira/invoices"
+ },
+ {
+ description: "An application to compare different document question answering models.",
+ id: "merve/compare_docvqa_models"
  }
  ],
  summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -2696,30 +2717,26 @@ var taskData10 = {
  models: [
  {
  description: "A robust image captioning model.",
- id: "Salesforce/blip-image-captioning-large"
+ id: "Salesforce/blip2-opt-2.7b"
  },
  {
- description: "A strong image captioning model.",
- id: "nlpconnect/vit-gpt2-image-captioning"
+ description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+ id: "microsoft/kosmos-2-patch14-224"
  },
  {
  description: "A strong optical character recognition model.",
- id: "microsoft/trocr-base-printed"
- },
- {
- description: "A strong visual question answering model for scientific diagrams.",
- id: "google/pix2struct-ai2d-base"
+ id: "facebook/nougat-base"
  },
  {
- description: "A strong captioning model for UI components.",
- id: "google/pix2struct-widget-captioning-base"
- },
- {
- description: "A captioning model for images that contain text.",
- id: "google/pix2struct-textcaps-base"
+ description: "A powerful model that lets you have a conversation with the image.",
+ id: "llava-hf/llava-1.5-7b-hf"
  }
  ],
  spaces: [
+ {
+ description: "An application that compares various image captioning models.",
+ id: "nielsr/comparing-captioning-models"
+ },
  {
  description: "A robust image captioning application.",
  id: "flax-community/image-captioning"
@@ -2881,7 +2898,6 @@ var taskData12 = {
  ],
  models: [
  {
- // TO DO: write description
  description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
  id: "facebook/detr-resnet-50"
  },
@@ -2891,9 +2907,13 @@ var taskData12 = {
  }
  ],
  spaces: [
+ {
+ description: "Leaderboard to compare various object detection models across several metrics.",
+ id: "hf-vision/object_detection_leaderboard"
+ },
  {
  description: "An object detection application that can detect unseen objects out of the box.",
- id: "adirik/OWL-ViT"
+ id: "merve/owlv2"
  },
  {
  description: "An application that contains various object detection models to try from.",
@@ -2939,14 +2959,16 @@ var taskData13 = {
  metrics: [],
  models: [
  {
- // TO DO: write description
  description: "Strong Depth Estimation model trained on 1.4 million images.",
  id: "Intel/dpt-large"
  },
  {
- // TO DO: write description
  description: "Strong Depth Estimation model trained on the KITTI dataset.",
- id: "vinvino02/glpn-kitti"
+ id: "facebook/dpt-dinov2-large-kitti"
+ },
+ {
+ description: "A strong monocular depth estimation model.",
+ id: "Bingxin/Marigold"
  }
  ],
  spaces: [
@@ -3500,12 +3522,12 @@ var taskData22 = {
  ],
  models: [
  {
- description: "A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.",
- id: "CompVis/stable-diffusion-v1-4"
+ description: "One of the most powerful image generation models that can generate realistic outputs.",
+ id: "stabilityai/stable-diffusion-xl-base-1.0"
  },
  {
- description: "A model that can be used to generate images based on text prompts. The DALL\xB7E Mega model is the largest version of DALLE Mini.",
- id: "dalle-mini/dalle-mega"
+ description: "A powerful yet fast image generation model.",
+ id: "latent-consistency/lcm-lora-sdxl"
  },
  {
  description: "A text-to-image model that can generate coherent text inside image.",
@@ -3522,19 +3544,23 @@ var taskData22 = {
  id: "stabilityai/stable-diffusion"
  },
  {
- description: "An text-to-image application that can generate coherent text inside the image.",
+ description: "A text-to-image application to generate comics.",
+ id: "jbilcke-hf/ai-comic-factory"
+ },
+ {
+ description: "A text-to-image application that can generate coherent text inside the image.",
  id: "DeepFloyd/IF"
  },
  {
- description: "An powerful text-to-image application that can generate images.",
- id: "kakaobrain/karlo"
+ description: "A powerful yet very fast image generation application.",
+ id: "latent-consistency/lcm-lora-for-sdxl"
  },
  {
- description: "An powerful text-to-image application that can generates 3D representations.",
+ description: "A powerful text-to-image application that can generate 3D representations.",
  id: "hysts/Shap-E"
  },
  {
- description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.",
+ description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
  id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI"
  }
  ],
@@ -4026,7 +4052,7 @@ var taskData28 = {
  models: [
  {
  description: "A strong model for video generation.",
- id: "PAIR/text2video-zero-controlnet-canny-arcane"
+ id: "Vchitect/LaVie"
  },
  {
  description: "A robust model for text-to-video generation.",
@@ -4034,7 +4060,7 @@ var taskData28 = {
  },
  {
  description: "A text-to-video generation model with high quality and smooth outputs.",
- id: "cerspense/zeroscope_v2_576w"
+ id: "hotshotco/Hotshot-XL"
  }
  ],
  spaces: [
@@ -4044,7 +4070,7 @@ var taskData28 = {
  },
  {
  description: "An application that generates video from image and text.",
- id: "TempoFunk/makeavid-sd-jax"
+ id: "Vchitect/LaVie"
  },
  {
  description: "An application that generates videos from text and provides multi-model support.",
@@ -4274,6 +4300,10 @@ var taskData31 = {
  }
  ],
  spaces: [
+ {
+ description: "An application that compares visual question answering models across different tasks.",
+ id: "merve/pix2struct"
+ },
  {
  description: "An application that can answer questions based on images.",
  id: "nielsr/vilt-vqa"
@@ -4441,8 +4471,8 @@ var TASKS_MODEL_LIBRARIES = {
  "graph-ml": ["transformers"],
  "image-classification": ["keras", "timm", "transformers", "transformers.js"],
  "image-segmentation": ["transformers", "transformers.js"],
- "image-to-image": ["diffusers", "transformers.js"],
- "image-to-text": ["transformers.js"],
+ "image-to-image": ["diffusers", "transformers", "transformers.js"],
+ "image-to-text": ["transformers", "transformers.js"],
  "image-to-video": ["diffusers"],
  "video-classification": ["transformers"],
  "mask-generation": ["transformers"],
@@ -4459,7 +4489,7 @@ var TASKS_MODEL_LIBRARIES = {
  "tabular-classification": ["sklearn"],
  "tabular-regression": ["sklearn"],
  "tabular-to-text": ["transformers"],
- "text-classification": ["adapter-transformers", "spacy", "transformers", "transformers.js"],
+ "text-classification": ["adapter-transformers", "setfit", "spacy", "transformers", "transformers.js"],
  "text-generation": ["transformers", "transformers.js"],
  "text-retrieval": [],
  "text-to-image": ["diffusers"],
@@ -4566,6 +4596,7 @@ var ModelLibrary = /* @__PURE__ */ ((ModelLibrary2) => {
  ModelLibrary2["pyannote-audio"] = "pyannote.audio";
  ModelLibrary2["sample-factory"] = "Sample Factory";
  ModelLibrary2["sentence-transformers"] = "Sentence Transformers";
+ ModelLibrary2["setfit"] = "SetFit";
  ModelLibrary2["sklearn"] = "Scikit-learn";
  ModelLibrary2["spacy"] = "spaCy";
  ModelLibrary2["span-marker"] = "SpanMarker";
package/dist/index.mjs CHANGED
@@ -28,6 +28,7 @@ var LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS = {
  peft: ["text-generation"],
  "pyannote-audio": ["automatic-speech-recognition"],
  "sentence-transformers": ["feature-extraction", "sentence-similarity"],
+ setfit: ["text-classification"],
  sklearn: ["tabular-classification", "tabular-regression", "text-classification"],
  spacy: ["token-classification", "text-classification", "sentence-similarity"],
  "span-marker": ["token-classification"],
@@ -80,7 +81,7 @@ var asteroid = (model) => [
  model = BaseModel.from_pretrained("${model.id}")`
  ];
  function get_base_diffusers_model(model) {
- return model.cardData?.base_model ?? "fill-in-base-model";
+ return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
  }
  var bertopic = (model) => [
  `from bertopic import BERTopic
@@ -180,8 +181,8 @@ var paddlenlp = (model) => {
  [
  `from paddlenlp.transformers import AutoTokenizer, ${architecture}`,
  "",
- `tokenizer = AutoTokenizer.from_pretrained("${model.id}"${model.private ? ", use_auth_token=True" : ""}, from_hf_hub=True)`,
- `model = ${architecture}.from_pretrained("${model.id}"${model.private ? ", use_auth_token=True" : ""}, from_hf_hub=True)`
+ `tokenizer = AutoTokenizer.from_pretrained("${model.id}", from_hf_hub=True)`,
+ `model = ${architecture}.from_pretrained("${model.id}", from_hf_hub=True)`
  ].join("\n")
  ];
  } else {
@@ -190,8 +191,8 @@ var paddlenlp = (model) => {
  `# \u26A0\uFE0F Type of model unknown`,
  `from paddlenlp.transformers import AutoTokenizer, AutoModel`,
  "",
- `tokenizer = AutoTokenizer.from_pretrained("${model.id}"${model.private ? ", use_auth_token=True" : ""}, from_hf_hub=True)`,
- `model = AutoModel.from_pretrained("${model.id}"${model.private ? ", use_auth_token=True" : ""}, from_hf_hub=True)`
+ `tokenizer = AutoTokenizer.from_pretrained("${model.id}", from_hf_hub=True)`,
+ `model = AutoModel.from_pretrained("${model.id}", from_hf_hub=True)`
  ].join("\n")
  ];
  }
@@ -327,6 +328,11 @@ var sentenceTransformers = (model) => [

  model = SentenceTransformer("${model.id}")`
  ];
+ var setfit = (model) => [
+ `from setfit import SetFitModel
+
+ model = SetFitModel.from_pretrained("${model.id}")`
+ ];
  var spacy = (model) => [
  `!pip install https://huggingface.co/${model.id}/resolve/main/${nameWithoutNamespace(model.id)}-any-py3-none-any.whl

@@ -599,6 +605,13 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
  docsUrl: "https://huggingface.co/docs/hub/sentence-transformers",
  snippets: sentenceTransformers
  },
+ setfit: {
+ btnLabel: "setfit",
+ repoName: "setfit",
+ repoUrl: "https://github.com/huggingface/setfit",
+ docsUrl: "https://huggingface.co/docs/hub/setfit",
+ snippets: setfit
+ },
  sklearn: {
  btnLabel: "Scikit-learn",
  repoName: "Scikit-learn",
@@ -2305,6 +2318,10 @@ var taskData5 = {
  {
  description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
  id: "naver-clova-ix/donut-base-finetuned-docvqa"
+ },
+ {
+ description: "A powerful model for document question answering.",
+ id: "google/pix2struct-docvqa-large"
  }
  ],
  spaces: [
@@ -2315,6 +2332,10 @@ var taskData5 = {
  {
  description: "An application that can answer questions from invoices.",
  id: "impira/invoices"
+ },
+ {
+ description: "An application to compare different document question answering models.",
+ id: "merve/compare_docvqa_models"
  }
  ],
  summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -2658,30 +2679,26 @@ var taskData10 = {
  models: [
  {
  description: "A robust image captioning model.",
- id: "Salesforce/blip-image-captioning-large"
+ id: "Salesforce/blip2-opt-2.7b"
  },
  {
- description: "A strong image captioning model.",
- id: "nlpconnect/vit-gpt2-image-captioning"
+ description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+ id: "microsoft/kosmos-2-patch14-224"
  },
  {
  description: "A strong optical character recognition model.",
- id: "microsoft/trocr-base-printed"
- },
- {
- description: "A strong visual question answering model for scientific diagrams.",
- id: "google/pix2struct-ai2d-base"
+ id: "facebook/nougat-base"
  },
  {
- description: "A strong captioning model for UI components.",
- id: "google/pix2struct-widget-captioning-base"
- },
- {
- description: "A captioning model for images that contain text.",
- id: "google/pix2struct-textcaps-base"
+ description: "A powerful model that lets you have a conversation with the image.",
+ id: "llava-hf/llava-1.5-7b-hf"
  }
  ],
  spaces: [
+ {
+ description: "An application that compares various image captioning models.",
+ id: "nielsr/comparing-captioning-models"
+ },
  {
  description: "A robust image captioning application.",
  id: "flax-community/image-captioning"
@@ -2843,7 +2860,6 @@ var taskData12 = {
  ],
  models: [
  {
- // TO DO: write description
  description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
  id: "facebook/detr-resnet-50"
  },
@@ -2853,9 +2869,13 @@ var taskData12 = {
  }
  ],
  spaces: [
+ {
+ description: "Leaderboard to compare various object detection models across several metrics.",
+ id: "hf-vision/object_detection_leaderboard"
+ },
  {
  description: "An object detection application that can detect unseen objects out of the box.",
- id: "adirik/OWL-ViT"
+ id: "merve/owlv2"
  },
  {
  description: "An application that contains various object detection models to try from.",
@@ -2901,14 +2921,16 @@ var taskData13 = {
  metrics: [],
  models: [
  {
- // TO DO: write description
  description: "Strong Depth Estimation model trained on 1.4 million images.",
  id: "Intel/dpt-large"
  },
  {
- // TO DO: write description
  description: "Strong Depth Estimation model trained on the KITTI dataset.",
- id: "vinvino02/glpn-kitti"
+ id: "facebook/dpt-dinov2-large-kitti"
+ },
+ {
+ description: "A strong monocular depth estimation model.",
+ id: "Bingxin/Marigold"
  }
  ],
  spaces: [
@@ -3462,12 +3484,12 @@ var taskData22 = {
  ],
  models: [
  {
- description: "A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.",
- id: "CompVis/stable-diffusion-v1-4"
+ description: "One of the most powerful image generation models that can generate realistic outputs.",
+ id: "stabilityai/stable-diffusion-xl-base-1.0"
  },
  {
- description: "A model that can be used to generate images based on text prompts. The DALL\xB7E Mega model is the largest version of DALLE Mini.",
- id: "dalle-mini/dalle-mega"
+ description: "A powerful yet fast image generation model.",
+ id: "latent-consistency/lcm-lora-sdxl"
  },
  {
  description: "A text-to-image model that can generate coherent text inside image.",
@@ -3484,19 +3506,23 @@ var taskData22 = {
  id: "stabilityai/stable-diffusion"
  },
  {
- description: "An text-to-image application that can generate coherent text inside the image.",
+ description: "A text-to-image application to generate comics.",
+ id: "jbilcke-hf/ai-comic-factory"
+ },
+ {
+ description: "A text-to-image application that can generate coherent text inside the image.",
  id: "DeepFloyd/IF"
  },
  {
- description: "An powerful text-to-image application that can generate images.",
- id: "kakaobrain/karlo"
+ description: "A powerful yet very fast image generation application.",
+ id: "latent-consistency/lcm-lora-for-sdxl"
  },
  {
- description: "An powerful text-to-image application that can generates 3D representations.",
+ description: "A powerful text-to-image application that can generate 3D representations.",
  id: "hysts/Shap-E"
  },
  {
- description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.",
+ description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
  id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI"
  }
  ],
@@ -3988,7 +4014,7 @@ var taskData28 = {
  models: [
  {
  description: "A strong model for video generation.",
- id: "PAIR/text2video-zero-controlnet-canny-arcane"
+ id: "Vchitect/LaVie"
  },
  {
  description: "A robust model for text-to-video generation.",
@@ -3996,7 +4022,7 @@ var taskData28 = {
  },
  {
  description: "A text-to-video generation model with high quality and smooth outputs.",
- id: "cerspense/zeroscope_v2_576w"
+ id: "hotshotco/Hotshot-XL"
  }
  ],
  spaces: [
@@ -4006,7 +4032,7 @@ var taskData28 = {
  },
  {
  description: "An application that generates video from image and text.",
- id: "TempoFunk/makeavid-sd-jax"
+ id: "Vchitect/LaVie"
  },
  {
  description: "An application that generates videos from text and provides multi-model support.",
@@ -4236,6 +4262,10 @@ var taskData31 = {
  }
  ],
  spaces: [
+ {
+ description: "An application that compares visual question answering models across different tasks.",
+ id: "merve/pix2struct"
+ },
  {
  description: "An application that can answer questions based on images.",
  id: "nielsr/vilt-vqa"
@@ -4403,8 +4433,8 @@ var TASKS_MODEL_LIBRARIES = {
  "graph-ml": ["transformers"],
  "image-classification": ["keras", "timm", "transformers", "transformers.js"],
  "image-segmentation": ["transformers", "transformers.js"],
- "image-to-image": ["diffusers", "transformers.js"],
- "image-to-text": ["transformers.js"],
+ "image-to-image": ["diffusers", "transformers", "transformers.js"],
+ "image-to-text": ["transformers", "transformers.js"],
  "image-to-video": ["diffusers"],
  "video-classification": ["transformers"],
  "mask-generation": ["transformers"],
@@ -4421,7 +4451,7 @@ var TASKS_MODEL_LIBRARIES = {
  "tabular-classification": ["sklearn"],
  "tabular-regression": ["sklearn"],
  "tabular-to-text": ["transformers"],
- "text-classification": ["adapter-transformers", "spacy", "transformers", "transformers.js"],
+ "text-classification": ["adapter-transformers", "setfit", "spacy", "transformers", "transformers.js"],
  "text-generation": ["transformers", "transformers.js"],
  "text-retrieval": [],
  "text-to-image": ["diffusers"],
@@ -4528,6 +4558,7 @@ var ModelLibrary = /* @__PURE__ */ ((ModelLibrary2) => {
  ModelLibrary2["pyannote-audio"] = "pyannote.audio";
  ModelLibrary2["sample-factory"] = "Sample Factory";
  ModelLibrary2["sentence-transformers"] = "Sentence Transformers";
+ ModelLibrary2["setfit"] = "SetFit";
  ModelLibrary2["sklearn"] = "Scikit-learn";
  ModelLibrary2["spacy"] = "spaCy";
  ModelLibrary2["span-marker"] = "SpanMarker";
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@huggingface/tasks",
  "packageManager": "pnpm@8.10.5",
- "version": "0.0.9",
+ "version": "0.1.1",
  "description": "List of ML tasks for huggingface.co/tasks",
  "repository": "https://github.com/huggingface/huggingface.js.git",
  "publishConfig": {
@@ -1,4 +1,4 @@
- import type { WidgetExample } from "./model-data";
+ import type { WidgetExample } from "./widget-example";
  import type { PipelineType } from "./pipelines";

  type LanguageCode = string;
package/src/index.ts CHANGED
@@ -16,9 +16,8 @@ export {
  } from "./pipelines";
  export { ModelLibrary, ALL_DISPLAY_MODEL_LIBRARY_KEYS } from "./model-libraries";
  export type { ModelLibraryKey } from "./model-libraries";
- export {
- ModelData,
- TransformersInfo,
+ export type { ModelData, TransformersInfo } from "./model-data";
+ export type {
  WidgetExample,
  WidgetExampleAttribute,
  WidgetExampleAssetAndPromptInput,
@@ -37,7 +36,7 @@ export {
  WidgetExampleOutputLabels,
  WidgetExampleOutputAnswerScore,
  WidgetExampleOutputText,
- } from "./model-data";
+ } from "./widget-example";
  export { InferenceDisplayability } from "./model-data";

  export { TAG_NFAA_CONTENT, OTHER_TAGS_SUGGESTIONS, TAG_TEXT_GENERATION_INFERENCE, TAG_CUSTOM_CODE } from "./tags";
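`src/index.ts` now re-exports `ModelData`, `TransformersInfo`, and the widget example types with `export type`, so they are erased at compile time and stay compatible with `isolatedModules`. Consumer code is unchanged; a minimal hypothetical consumer module:

```ts
// Hypothetical consumer: these are type-only imports resolved through the
// re-exports above and disappear from the emitted JavaScript.
import type { ModelData, WidgetExample } from "@huggingface/tasks";

// WidgetExample is a union; a plain text input is one valid member.
const example: WidgetExample = { text: "I love this movie!", example_title: "Positive review" };

// Partial<ModelData> keeps the sketch independent of which ModelData fields are required.
const model: Partial<ModelData> = { id: "user/demo-model" };

console.log(example.example_title, model.id);
```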
@@ -31,6 +31,7 @@ export const LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS: Partial<Record<ModelLi
  peft: ["text-generation"],
  "pyannote-audio": ["automatic-speech-recognition"],
  "sentence-transformers": ["feature-extraction", "sentence-similarity"],
+ setfit: ["text-classification"],
  sklearn: ["tabular-classification", "tabular-regression", "text-classification"],
  spacy: ["token-classification", "text-classification", "sentence-similarity"],
  "span-marker": ["token-classification"],
@@ -72,7 +72,7 @@ model = BaseModel.from_pretrained("${model.id}")`,
  ];

  function get_base_diffusers_model(model: ModelData): string {
- return model.cardData?.base_model ?? "fill-in-base-model";
+ return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
  }

  const bertopic = (model: ModelData) => [
@@ -187,12 +187,8 @@ const paddlenlp = (model: ModelData) => {
  [
  `from paddlenlp.transformers import AutoTokenizer, ${architecture}`,
  "",
- `tokenizer = AutoTokenizer.from_pretrained("${model.id}"${
- model.private ? ", use_auth_token=True" : ""
- }, from_hf_hub=True)`,
- `model = ${architecture}.from_pretrained("${model.id}"${
- model.private ? ", use_auth_token=True" : ""
- }, from_hf_hub=True)`,
+ `tokenizer = AutoTokenizer.from_pretrained("${model.id}", from_hf_hub=True)`,
+ `model = ${architecture}.from_pretrained("${model.id}", from_hf_hub=True)`,
  ].join("\n"),
  ];
  } else {
@@ -201,12 +197,8 @@ const paddlenlp = (model: ModelData) => {
  `# ⚠️ Type of model unknown`,
  `from paddlenlp.transformers import AutoTokenizer, AutoModel`,
  "",
- `tokenizer = AutoTokenizer.from_pretrained("${model.id}"${
- model.private ? ", use_auth_token=True" : ""
- }, from_hf_hub=True)`,
- `model = AutoModel.from_pretrained("${model.id}"${
- model.private ? ", use_auth_token=True" : ""
- }, from_hf_hub=True)`,
+ `tokenizer = AutoTokenizer.from_pretrained("${model.id}", from_hf_hub=True)`,
+ `model = AutoModel.from_pretrained("${model.id}", from_hf_hub=True)`,
  ].join("\n"),
  ];
  }
@@ -358,6 +350,12 @@ const sentenceTransformers = (model: ModelData) => [
  model = SentenceTransformer("${model.id}")`,
  ];

+ const setfit = (model: ModelData) => [
+ `from setfit import SetFitModel
+
+ model = SetFitModel.from_pretrained("${model.id}")`,
+ ];
+
  const spacy = (model: ModelData) => [
  `!pip install https://huggingface.co/${model.id}/resolve/main/${nameWithoutNamespace(model.id)}-any-py3-none-any.whl

@@ -661,6 +659,13 @@ export const MODEL_LIBRARIES_UI_ELEMENTS: Partial<Record<ModelLibraryKey, Librar
  docsUrl: "https://huggingface.co/docs/hub/sentence-transformers",
  snippets: sentenceTransformers,
  },
+ setfit: {
+ btnLabel: "setfit",
+ repoName: "setfit",
+ repoUrl: "https://github.com/huggingface/setfit",
+ docsUrl: "https://huggingface.co/docs/hub/setfit",
+ snippets: setfit,
+ },
  sklearn: {
  btnLabel: "Scikit-learn",
  repoName: "Scikit-learn",
package/src/model-data.ts CHANGED
@@ -1,119 +1,5 @@
  import type { PipelineType } from "./pipelines";
-
- type TableData = Record<string, (string | number)[]>;
-
- //#region outputs
- export type WidgetExampleOutputLabels = Array<{ label: string; score: number }>;
- export interface WidgetExampleOutputAnswerScore {
- answer: string;
- score: number;
- }
- export interface WidgetExampleOutputText {
- text: string;
- }
- export interface WidgetExampleOutputUrl {
- url: string;
- }
-
- export type WidgetExampleOutput =
- | WidgetExampleOutputLabels
- | WidgetExampleOutputAnswerScore
- | WidgetExampleOutputText
- | WidgetExampleOutputUrl;
- //#endregion
-
- export interface WidgetExampleBase<TOutput> {
- example_title?: string;
- group?: string;
- /**
- * Potential overrides to API parameters for this specific example
- * (takes precedences over the model card metadata's inference.parameters)
- */
- parameters?: {
- /// token-classification
- aggregation_strategy?: string;
- /// text-generation
- top_k?: number;
- top_p?: number;
- temperature?: number;
- max_new_tokens?: number;
- do_sample?: boolean;
- /// text-to-image
- negative_prompt?: string;
- guidance_scale?: number;
- num_inference_steps?: number;
- };
- /**
- * Optional output
- */
- output?: TOutput;
- }
-
- export interface WidgetExampleTextInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
- text: string;
- }
-
- export interface WidgetExampleTextAndContextInput<TOutput = WidgetExampleOutput>
- extends WidgetExampleTextInput<TOutput> {
- context: string;
- }
-
- export interface WidgetExampleTextAndTableInput<TOutput = WidgetExampleOutput> extends WidgetExampleTextInput<TOutput> {
- table: TableData;
- }
-
- export interface WidgetExampleAssetInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
- src: string;
- }
- export interface WidgetExampleAssetAndPromptInput<TOutput = WidgetExampleOutput>
- extends WidgetExampleAssetInput<TOutput> {
- prompt: string;
- }
-
- export type WidgetExampleAssetAndTextInput<TOutput = WidgetExampleOutput> = WidgetExampleAssetInput<TOutput> &
- WidgetExampleTextInput<TOutput>;
-
- export type WidgetExampleAssetAndZeroShotInput<TOutput = WidgetExampleOutput> = WidgetExampleAssetInput<TOutput> &
- WidgetExampleZeroShotTextInput<TOutput>;
-
- export interface WidgetExampleStructuredDataInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
- structured_data: TableData;
- }
-
- export interface WidgetExampleTableDataInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
- table: TableData;
- }
-
- export interface WidgetExampleZeroShotTextInput<TOutput = WidgetExampleOutput> extends WidgetExampleTextInput<TOutput> {
- text: string;
- candidate_labels: string;
- multi_class: boolean;
- }
-
- export interface WidgetExampleSentenceSimilarityInput<TOutput = WidgetExampleOutput>
- extends WidgetExampleBase<TOutput> {
- source_sentence: string;
- sentences: string[];
- }
-
- //#endregion
-
- export type WidgetExample<TOutput = WidgetExampleOutput> =
- | WidgetExampleTextInput<TOutput>
- | WidgetExampleTextAndContextInput<TOutput>
- | WidgetExampleTextAndTableInput<TOutput>
- | WidgetExampleAssetInput<TOutput>
- | WidgetExampleAssetAndPromptInput<TOutput>
- | WidgetExampleAssetAndTextInput<TOutput>
- | WidgetExampleAssetAndZeroShotInput<TOutput>
- | WidgetExampleStructuredDataInput<TOutput>
- | WidgetExampleTableDataInput<TOutput>
- | WidgetExampleZeroShotTextInput<TOutput>
- | WidgetExampleSentenceSimilarityInput<TOutput>;
-
- type KeysOfUnion<T> = T extends unknown ? keyof T : never;
-
- export type WidgetExampleAttribute = KeysOfUnion<WidgetExample>;
+ import type { WidgetExample } from "./widget-example";

  export enum InferenceDisplayability {
  /**
@@ -207,7 +93,7 @@ export interface ModelData {
  | {
  parameters?: Record<string, unknown>;
  };
- base_model?: string;
+ base_model?: string | string[];
  };
  /**
  * Library name
@@ -23,6 +23,7 @@ export enum ModelLibrary {
  "pyannote-audio" = "pyannote.audio",
  "sample-factory" = "Sample Factory",
  "sentence-transformers" = "Sentence Transformers",
+ "setfit" = "SetFit",
  "sklearn" = "Scikit-learn",
  "spacy" = "spaCy",
  "span-marker" = "SpanMarker",
@@ -24,14 +24,16 @@ const taskData: TaskDataCustom = {
  metrics: [],
  models: [
  {
- // TO DO: write description
  description: "Strong Depth Estimation model trained on 1.4 million images.",
  id: "Intel/dpt-large",
  },
  {
- // TO DO: write description
  description: "Strong Depth Estimation model trained on the KITTI dataset.",
- id: "vinvino02/glpn-kitti",
+ id: "facebook/dpt-dinov2-large-kitti",
+ },
+ {
+ description: "A strong monocular depth estimation model.",
+ id: "Bingxin/Marigold",
  },
  ],
  spaces: [
@@ -50,6 +50,10 @@ const taskData: TaskDataCustom = {
  description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
  id: "naver-clova-ix/donut-base-finetuned-docvqa",
  },
+ {
+ description: "A powerful model for document question answering.",
+ id: "google/pix2struct-docvqa-large",
+ },
  ],
  spaces: [
  {
@@ -60,6 +64,10 @@ const taskData: TaskDataCustom = {
  description: "An application that can answer questions from invoices.",
  id: "impira/invoices",
  },
+ {
+ description: "An application to compare different document question answering models.",
+ id: "merve/compare_docvqa_models",
+ },
  ],
  summary:
  "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -27,6 +27,19 @@ captioner("https://huggingface.co/datasets/Narsil/image_dummy/resolve/main/parro
  ## [{'generated_text': 'two birds are standing next to each other '}]
  ```

+ ### Conversation about the Image
+
+ Some text generation models also take image inputs. These are called vision language models. You can use `image-to-text` pipeline to use these models like below.
+
+ ```python
+ from transformers import pipeline
+
+ mm_pipeline = pipeline("image-to-text",model="llava-hf/llava-1.5-7b-hf")
+ mm_pipeline("https://huggingface.co/spaces/llava-hf/llava-4bit/resolve/main/examples/baklava.png", "How to make this pastry?")
+
+ ## [{'generated_text': 'To create these pastries, you will need a few key ingredients and tools. Firstly, gather the dough by combining flour with water in your mixing bowl until it forms into an elastic ball that can be easily rolled out on top of another surface or table without breaking apart (like pizza).'}]
+ ```
+
  ### OCR

  This code snippet uses Microsoft’s TrOCR, an encoder-decoder model consisting of an image Transformer encoder and a text Transformer decoder for state-of-the-art optical character recognition (OCR) on single-text line images.
@@ -32,30 +32,26 @@ const taskData: TaskDataCustom = {
  models: [
  {
  description: "A robust image captioning model.",
- id: "Salesforce/blip-image-captioning-large",
+ id: "Salesforce/blip2-opt-2.7b",
  },
  {
- description: "A strong image captioning model.",
- id: "nlpconnect/vit-gpt2-image-captioning",
+ description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+ id: "microsoft/kosmos-2-patch14-224",
  },
  {
  description: "A strong optical character recognition model.",
- id: "microsoft/trocr-base-printed",
+ id: "facebook/nougat-base",
  },
  {
- description: "A strong visual question answering model for scientific diagrams.",
- id: "google/pix2struct-ai2d-base",
- },
- {
- description: "A strong captioning model for UI components.",
- id: "google/pix2struct-widget-captioning-base",
- },
- {
- description: "A captioning model for images that contain text.",
- id: "google/pix2struct-textcaps-base",
+ description: "A powerful model that lets you have a conversation with the image.",
+ id: "llava-hf/llava-1.5-7b-hf",
  },
  ],
  spaces: [
+ {
+ description: "An application that compares various image captioning models.",
+ id: "nielsr/comparing-captioning-models",
+ },
  {
  description: "A robust image captioning application.",
  id: "flax-community/image-captioning",
@@ -51,8 +51,8 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
  "graph-ml": ["transformers"],
  "image-classification": ["keras", "timm", "transformers", "transformers.js"],
  "image-segmentation": ["transformers", "transformers.js"],
- "image-to-image": ["diffusers", "transformers.js"],
- "image-to-text": ["transformers.js"],
+ "image-to-image": ["diffusers", "transformers", "transformers.js"],
+ "image-to-text": ["transformers", "transformers.js"],
  "image-to-video": ["diffusers"],
  "video-classification": ["transformers"],
  "mask-generation": ["transformers"],
@@ -69,7 +69,7 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
  "tabular-classification": ["sklearn"],
  "tabular-regression": ["sklearn"],
  "tabular-to-text": ["transformers"],
- "text-classification": ["adapter-transformers", "spacy", "transformers", "transformers.js"],
+ "text-classification": ["adapter-transformers", "setfit", "spacy", "transformers", "transformers.js"],
  "text-generation": ["transformers", "transformers.js"],
  "text-retrieval": [],
  "text-to-image": ["diffusers"],
@@ -40,7 +40,6 @@ const taskData: TaskDataCustom = {
  ],
  models: [
  {
- // TO DO: write description
  description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
  id: "facebook/detr-resnet-50",
  },
@@ -50,9 +49,13 @@ const taskData: TaskDataCustom = {
  },
  ],
  spaces: [
+ {
+ description: "Leaderboard to compare various object detection models across several metrics.",
+ id: "hf-vision/object_detection_leaderboard",
+ },
  {
  description: "An object detection application that can detect unseen objects out of the box.",
- id: "adirik/OWL-ViT",
+ id: "merve/owlv2",
  },
  {
  description: "An application that contains various object detection models to try from.",
@@ -31,7 +31,7 @@ tqa = pipeline(task="table-question-answering", model="google/tapas-large-finetu

  # result

- print(tqa(table=table, query=query)['cells'][0])
+ print(tqa(table=table, query=question)['cells'][0])
  #53

  ```
@@ -42,6 +42,10 @@ When it comes to text generation, the underlying language model can come in seve

  - **Human feedback models:** these models extend base and instruction-trained models by incorporating human feedback that rates the quality of the generated text according to criteria like [helpfulness, honesty, and harmlessness](https://arxiv.org/abs/2112.00861). The human feedback is then combined with an optimization technique like reinforcement learning to align the original model to be closer with human preferences. The overall methodology is often called [Reinforcement Learning from Human Feedback](https://huggingface.co/blog/rlhf), or RLHF for short. [Llama2-Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) is an open-source model aligned through human feedback.

+ ## Text Generation from Image and Text
+
+ There are language models that can input both text and image and output text, called vision language models. [LLaVA](https://huggingface.co/llava-hf/llava-1.5-7b-hf) and [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b) are good examples. Although they work just like other language models by means of input parameters for generation, since they also take input images, you can use them with `image-to-text` pipeline. You can find information about the pipeline in [image-to-text](https://huggingface.co/tasks/image-to-text) task page.
+
  ## Inference

  You can use the 🤗 Transformers library `text-generation` pipeline to do inference with Text Generation models. It takes an incomplete text and returns multiple outputs with which the text can be completed.
@@ -45,14 +45,12 @@ const taskData: TaskDataCustom = {
  ],
  models: [
  {
- description:
- "A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.",
- id: "CompVis/stable-diffusion-v1-4",
+ description: "One of the most powerful image generation models that can generate realistic outputs.",
+ id: "stabilityai/stable-diffusion-xl-base-1.0",
  },
  {
- description:
- "A model that can be used to generate images based on text prompts. The DALL·E Mega model is the largest version of DALLE Mini.",
- id: "dalle-mini/dalle-mega",
+ description: "A powerful yet fast image generation model.",
+ id: "latent-consistency/lcm-lora-sdxl",
  },
  {
  description: "A text-to-image model that can generate coherent text inside image.",
@@ -69,19 +67,23 @@ const taskData: TaskDataCustom = {
  id: "stabilityai/stable-diffusion",
  },
  {
- description: "An text-to-image application that can generate coherent text inside the image.",
+ description: "A text-to-image application to generate comics.",
+ id: "jbilcke-hf/ai-comic-factory",
+ },
+ {
+ description: "A text-to-image application that can generate coherent text inside the image.",
  id: "DeepFloyd/IF",
  },
  {
- description: "An powerful text-to-image application that can generate images.",
- id: "kakaobrain/karlo",
+ description: "A powerful yet very fast image generation application.",
+ id: "latent-consistency/lcm-lora-for-sdxl",
  },
  {
- description: "An powerful text-to-image application that can generates 3D representations.",
+ description: "A powerful text-to-image application that can generate 3D representations.",
  id: "hysts/Shap-E",
  },
  {
- description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.",
+ description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
  id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI",
  },
  ],
@@ -68,7 +68,7 @@ const taskData: TaskDataCustom = {
  models: [
  {
  description: "A strong model for video generation.",
- id: "PAIR/text2video-zero-controlnet-canny-arcane",
+ id: "Vchitect/LaVie",
  },
  {
  description: "A robust model for text-to-video generation.",
@@ -76,7 +76,7 @@ const taskData: TaskDataCustom = {
  },
  {
  description: "A text-to-video generation model with high quality and smooth outputs.",
- id: "cerspense/zeroscope_v2_576w",
+ id: "hotshotco/Hotshot-XL",
  },
  ],
  spaces: [
@@ -86,7 +86,7 @@ const taskData: TaskDataCustom = {
  },
  {
  description: "An application that generates video from image and text.",
- id: "TempoFunk/makeavid-sd-jax",
+ id: "Vchitect/LaVie",
  },
  {
  description: "An application that generates videos from text and provides multi-model support.",
@@ -71,6 +71,10 @@ const taskData: TaskDataCustom = {
  },
  ],
  spaces: [
+ {
+ description: "An application that compares visual question answering models across different tasks.",
+ id: "merve/pix2struct",
+ },
  {
  description: "An application that can answer questions based on images.",
  id: "nielsr/vilt-vqa",
@@ -0,0 +1,118 @@
+ /**
+ * See default-widget-inputs.ts for the default widget inputs, this files only contains the types
+ */
+
+ type TableData = Record<string, (string | number)[]>;
+
+ //#region outputs
+ export type WidgetExampleOutputLabels = Array<{ label: string; score: number }>;
+ export interface WidgetExampleOutputAnswerScore {
+ answer: string;
+ score: number;
+ }
+ export interface WidgetExampleOutputText {
+ text: string;
+ }
+ export interface WidgetExampleOutputUrl {
+ url: string;
+ }
+
+ export type WidgetExampleOutput =
+ | WidgetExampleOutputLabels
+ | WidgetExampleOutputAnswerScore
+ | WidgetExampleOutputText
+ | WidgetExampleOutputUrl;
+ //#endregion
+
+ export interface WidgetExampleBase<TOutput> {
+ example_title?: string;
+ group?: string;
+ /**
+ * Potential overrides to API parameters for this specific example
+ * (takes precedences over the model card metadata's inference.parameters)
+ */
+ parameters?: {
+ /// token-classification
+ aggregation_strategy?: string;
+ /// text-generation
+ top_k?: number;
+ top_p?: number;
+ temperature?: number;
+ max_new_tokens?: number;
+ do_sample?: boolean;
+ /// text-to-image
+ negative_prompt?: string;
+ guidance_scale?: number;
+ num_inference_steps?: number;
+ };
+ /**
+ * Optional output
+ */
+ output?: TOutput;
+ }
+
+ export interface WidgetExampleTextInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
+ text: string;
+ }
+
+ export interface WidgetExampleTextAndContextInput<TOutput = WidgetExampleOutput>
+ extends WidgetExampleTextInput<TOutput> {
+ context: string;
+ }
+
+ export interface WidgetExampleTextAndTableInput<TOutput = WidgetExampleOutput> extends WidgetExampleTextInput<TOutput> {
+ table: TableData;
+ }
+
+ export interface WidgetExampleAssetInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
+ src: string;
+ }
+ export interface WidgetExampleAssetAndPromptInput<TOutput = WidgetExampleOutput>
+ extends WidgetExampleAssetInput<TOutput> {
+ prompt: string;
+ }
+
+ export type WidgetExampleAssetAndTextInput<TOutput = WidgetExampleOutput> = WidgetExampleAssetInput<TOutput> &
+ WidgetExampleTextInput<TOutput>;
+
+ export type WidgetExampleAssetAndZeroShotInput<TOutput = WidgetExampleOutput> = WidgetExampleAssetInput<TOutput> &
+ WidgetExampleZeroShotTextInput<TOutput>;
+
+ export interface WidgetExampleStructuredDataInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
+ structured_data: TableData;
+ }
+
+ export interface WidgetExampleTableDataInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
+ table: TableData;
+ }
+
+ export interface WidgetExampleZeroShotTextInput<TOutput = WidgetExampleOutput> extends WidgetExampleTextInput<TOutput> {
+ text: string;
+ candidate_labels: string;
+ multi_class: boolean;
+ }
+
+ export interface WidgetExampleSentenceSimilarityInput<TOutput = WidgetExampleOutput>
+ extends WidgetExampleBase<TOutput> {
+ source_sentence: string;
+ sentences: string[];
+ }
+
+ //#endregion
+
+ export type WidgetExample<TOutput = WidgetExampleOutput> =
+ | WidgetExampleTextInput<TOutput>
+ | WidgetExampleTextAndContextInput<TOutput>
+ | WidgetExampleTextAndTableInput<TOutput>
+ | WidgetExampleAssetInput<TOutput>
+ | WidgetExampleAssetAndPromptInput<TOutput>
+ | WidgetExampleAssetAndTextInput<TOutput>
+ | WidgetExampleAssetAndZeroShotInput<TOutput>
+ | WidgetExampleStructuredDataInput<TOutput>
+ | WidgetExampleTableDataInput<TOutput>
+ | WidgetExampleZeroShotTextInput<TOutput>
+ | WidgetExampleSentenceSimilarityInput<TOutput>;
+
+ type KeysOfUnion<T> = T extends unknown ? keyof T : never;
+
+ export type WidgetExampleAttribute = KeysOfUnion<WidgetExample>;
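The widget example types now live in their own module (`widget-example.ts`), and `model-data.ts` simply imports `WidgetExample` from it; the public type surface is unchanged. A short sketch of values satisfying two members of the `WidgetExample` union (the concrete objects are invented for illustration, and the import assumes the names re-exported from the package entry point as shown in `src/index.ts` above):

```ts
import type { WidgetExample, WidgetExampleOutputLabels } from "@huggingface/tasks";

// A text-input example with an expected label/score output (invented values).
const sentiment: WidgetExample<WidgetExampleOutputLabels> = {
	example_title: "Review sentiment",
	text: "The battery life is fantastic.",
	output: [{ label: "POSITIVE", score: 0.98 }],
};

// A zero-shot text example: `candidate_labels` is a comma-separated string.
const zeroShot: WidgetExample = {
	text: "The new phone ships next month.",
	candidate_labels: "technology, sports, politics",
	multi_class: false,
};

console.log(sentiment.example_title, zeroShot);
```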