@huggingface/tasks 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -16,6 +16,7 @@ declare enum ModelLibrary {
     "flair" = "Flair",
     "keras" = "Keras",
     "k2" = "K2",
+    "mlx" = "mlx",
     "nemo" = "NeMo",
     "open_clip" = "OpenCLIP",
     "paddlenlp" = "PaddleNLP",
@@ -623,7 +624,7 @@ interface ModelData {
         inference?: boolean | {
             parameters?: Record<string, unknown>;
         };
-        base_model?: string;
+        base_model?: string | string[];
     };
     /**
      * Library name
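
The type-level change here is `base_model` widening from `string` to `string | string[]`, so a model card can now declare several base models. A minimal consumer-side sketch of handling both shapes, assuming only the field shown in the diff (the `CardDataSlice` type and `baseModels` helper are illustrative, not exports of the package):

```ts
// Illustrative only: a local stand-in for the relevant slice of ModelData["cardData"].
interface CardDataSlice {
  base_model?: string | string[];
}

// Normalize to an array whether the card declares one base model or several.
function baseModels(cardData?: CardDataSlice): string[] {
  const value = cardData?.base_model;
  if (value === undefined) return [];
  return Array.isArray(value) ? value : [value];
}

console.log(baseModels({ base_model: "some-org/base-model" })); // ["some-org/base-model"]
console.log(baseModels({ base_model: ["model-a", "model-b"] })); // ["model-a", "model-b"]
```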
package/dist/index.js CHANGED
@@ -119,7 +119,7 @@ var asteroid = (model) => [
 model = BaseModel.from_pretrained("${model.id}")`
 ];
 function get_base_diffusers_model(model) {
-  return model.cardData?.base_model ?? "fill-in-base-model";
+  return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }
 var bertopic = (model) => [
   `from bertopic import BERTopic
@@ -531,6 +531,12 @@ transcriptions = asr_model.transcribe(["file.wav"])`
   }
 };
 var mlAgents = (model) => [`mlagents-load-from-hf --repo-id="${model.id}" --local-dir="./downloads"`];
+var mlx = (model) => [
+  `pip install huggingface_hub hf_transfer
+
+export HF_HUB_ENABLE_HF_TRANSFER=1
+huggingface-cli download --local-dir ${nameWithoutNamespace(model.id)} ${model.id}`
+];
 var nemo = (model) => {
   let command = void 0;
   if (model.tags?.includes("automatic-speech-recognition")) {
@@ -605,6 +611,12 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
     docsUrl: "https://huggingface.co/docs/hub/keras",
     snippets: keras
   },
+  mlx: {
+    btnLabel: "MLX",
+    repoName: "MLX",
+    repoUrl: "https://github.com/ml-explore/mlx-examples/tree/main",
+    snippets: mlx
+  },
   nemo: {
     btnLabel: "NeMo",
     repoName: "NeMo",
@@ -2356,6 +2368,10 @@ var taskData5 = {
     {
       description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
       id: "naver-clova-ix/donut-base-finetuned-docvqa"
+    },
+    {
+      description: "A powerful model for document question answering.",
+      id: "google/pix2struct-docvqa-large"
     }
   ],
   spaces: [
@@ -2366,6 +2382,10 @@ var taskData5 = {
     {
       description: "An application that can answer questions from invoices.",
       id: "impira/invoices"
+    },
+    {
+      description: "An application to compare different document question answering models.",
+      id: "merve/compare_docvqa_models"
     }
   ],
   summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -2709,30 +2729,26 @@ var taskData10 = {
   models: [
     {
       description: "A robust image captioning model.",
-      id: "Salesforce/blip-image-captioning-large"
+      id: "Salesforce/blip2-opt-2.7b"
     },
     {
-      description: "A strong image captioning model.",
-      id: "nlpconnect/vit-gpt2-image-captioning"
+      description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+      id: "microsoft/kosmos-2-patch14-224"
     },
     {
       description: "A strong optical character recognition model.",
-      id: "microsoft/trocr-base-printed"
-    },
-    {
-      description: "A strong visual question answering model for scientific diagrams.",
-      id: "google/pix2struct-ai2d-base"
+      id: "facebook/nougat-base"
     },
     {
-      description: "A strong captioning model for UI components.",
-      id: "google/pix2struct-widget-captioning-base"
-    },
-    {
-      description: "A captioning model for images that contain text.",
-      id: "google/pix2struct-textcaps-base"
+      description: "A powerful model that lets you have a conversation with the image.",
+      id: "llava-hf/llava-1.5-7b-hf"
     }
   ],
   spaces: [
+    {
+      description: "An application that compares various image captioning models.",
+      id: "nielsr/comparing-captioning-models"
+    },
     {
       description: "A robust image captioning application.",
       id: "flax-community/image-captioning"
@@ -2894,7 +2910,6 @@ var taskData12 = {
   ],
   models: [
     {
-      // TO DO: write description
       description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
       id: "facebook/detr-resnet-50"
     },
@@ -2904,9 +2919,13 @@ var taskData12 = {
     }
   ],
   spaces: [
+    {
+      description: "Leaderboard to compare various object detection models across several metrics.",
+      id: "hf-vision/object_detection_leaderboard"
+    },
     {
       description: "An object detection application that can detect unseen objects out of the box.",
-      id: "adirik/OWL-ViT"
+      id: "merve/owlv2"
     },
     {
       description: "An application that contains various object detection models to try from.",
@@ -2952,14 +2971,16 @@ var taskData13 = {
   metrics: [],
   models: [
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on 1.4 million images.",
       id: "Intel/dpt-large"
     },
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on the KITTI dataset.",
-      id: "vinvino02/glpn-kitti"
+      id: "facebook/dpt-dinov2-large-kitti"
+    },
+    {
+      description: "A strong monocular depth estimation model.",
+      id: "Bingxin/Marigold"
     }
   ],
   spaces: [
@@ -3513,12 +3534,12 @@ var taskData22 = {
   ],
   models: [
     {
-      description: "A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.",
-      id: "CompVis/stable-diffusion-v1-4"
+      description: "One of the most powerful image generation models that can generate realistic outputs.",
+      id: "stabilityai/stable-diffusion-xl-base-1.0"
     },
     {
-      description: "A model that can be used to generate images based on text prompts. The DALL\xB7E Mega model is the largest version of DALLE Mini.",
-      id: "dalle-mini/dalle-mega"
+      description: "A powerful yet fast image generation model.",
+      id: "latent-consistency/lcm-lora-sdxl"
    },
     {
       description: "A text-to-image model that can generate coherent text inside image.",
@@ -3535,19 +3556,23 @@ var taskData22 = {
       id: "stabilityai/stable-diffusion"
     },
     {
-      description: "An text-to-image application that can generate coherent text inside the image.",
+      description: "A text-to-image application to generate comics.",
+      id: "jbilcke-hf/ai-comic-factory"
+    },
+    {
+      description: "A text-to-image application that can generate coherent text inside the image.",
       id: "DeepFloyd/IF"
     },
     {
-      description: "An powerful text-to-image application that can generate images.",
-      id: "kakaobrain/karlo"
+      description: "A powerful yet very fast image generation application.",
+      id: "latent-consistency/lcm-lora-for-sdxl"
     },
     {
-      description: "An powerful text-to-image application that can generates 3D representations.",
+      description: "A powerful text-to-image application that can generate 3D representations.",
       id: "hysts/Shap-E"
     },
     {
-      description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.",
+      description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
       id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI"
     }
   ],
@@ -4039,7 +4064,7 @@ var taskData28 = {
   models: [
     {
       description: "A strong model for video generation.",
-      id: "PAIR/text2video-zero-controlnet-canny-arcane"
+      id: "Vchitect/LaVie"
     },
     {
       description: "A robust model for text-to-video generation.",
@@ -4047,7 +4072,7 @@ var taskData28 = {
     },
     {
       description: "A text-to-video generation model with high quality and smooth outputs.",
-      id: "cerspense/zeroscope_v2_576w"
+      id: "hotshotco/Hotshot-XL"
     }
   ],
   spaces: [
@@ -4057,7 +4082,7 @@ var taskData28 = {
     },
     {
       description: "An application that generates video from image and text.",
-      id: "TempoFunk/makeavid-sd-jax"
+      id: "Vchitect/LaVie"
     },
     {
       description: "An application that generates videos from text and provides multi-model support.",
@@ -4287,6 +4312,10 @@ var taskData31 = {
     }
   ],
   spaces: [
+    {
+      description: "An application that compares visual question answering models across different tasks.",
+      id: "merve/pix2struct"
+    },
     {
       description: "An application that can answer questions based on images.",
       id: "nielsr/vilt-vqa"
@@ -4454,8 +4483,8 @@ var TASKS_MODEL_LIBRARIES = {
   "graph-ml": ["transformers"],
   "image-classification": ["keras", "timm", "transformers", "transformers.js"],
   "image-segmentation": ["transformers", "transformers.js"],
-  "image-to-image": ["diffusers", "transformers.js"],
-  "image-to-text": ["transformers.js"],
+  "image-to-image": ["diffusers", "transformers", "transformers.js"],
+  "image-to-text": ["transformers", "transformers.js"],
   "image-to-video": ["diffusers"],
   "video-classification": ["transformers"],
   "mask-generation": ["transformers"],
@@ -4572,6 +4601,7 @@ var ModelLibrary = /* @__PURE__ */ ((ModelLibrary2) => {
   ModelLibrary2["flair"] = "Flair";
   ModelLibrary2["keras"] = "Keras";
   ModelLibrary2["k2"] = "K2";
+  ModelLibrary2["mlx"] = "mlx";
   ModelLibrary2["nemo"] = "NeMo";
   ModelLibrary2["open_clip"] = "OpenCLIP";
   ModelLibrary2["paddlenlp"] = "PaddleNLP";
package/dist/index.mjs CHANGED
@@ -81,7 +81,7 @@ var asteroid = (model) => [
 model = BaseModel.from_pretrained("${model.id}")`
 ];
 function get_base_diffusers_model(model) {
-  return model.cardData?.base_model ?? "fill-in-base-model";
+  return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }
 var bertopic = (model) => [
   `from bertopic import BERTopic
@@ -493,6 +493,12 @@ transcriptions = asr_model.transcribe(["file.wav"])`
   }
 };
 var mlAgents = (model) => [`mlagents-load-from-hf --repo-id="${model.id}" --local-dir="./downloads"`];
+var mlx = (model) => [
+  `pip install huggingface_hub hf_transfer
+
+export HF_HUB_ENABLE_HF_TRANSFER=1
+huggingface-cli download --local-dir ${nameWithoutNamespace(model.id)} ${model.id}`
+];
 var nemo = (model) => {
   let command = void 0;
   if (model.tags?.includes("automatic-speech-recognition")) {
@@ -567,6 +573,12 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
     docsUrl: "https://huggingface.co/docs/hub/keras",
     snippets: keras
   },
+  mlx: {
+    btnLabel: "MLX",
+    repoName: "MLX",
+    repoUrl: "https://github.com/ml-explore/mlx-examples/tree/main",
+    snippets: mlx
+  },
   nemo: {
     btnLabel: "NeMo",
     repoName: "NeMo",
@@ -2318,6 +2330,10 @@ var taskData5 = {
     {
       description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
       id: "naver-clova-ix/donut-base-finetuned-docvqa"
+    },
+    {
+      description: "A powerful model for document question answering.",
+      id: "google/pix2struct-docvqa-large"
     }
   ],
   spaces: [
@@ -2328,6 +2344,10 @@ var taskData5 = {
     {
       description: "An application that can answer questions from invoices.",
       id: "impira/invoices"
+    },
+    {
+      description: "An application to compare different document question answering models.",
+      id: "merve/compare_docvqa_models"
     }
   ],
   summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -2671,30 +2691,26 @@ var taskData10 = {
   models: [
     {
       description: "A robust image captioning model.",
-      id: "Salesforce/blip-image-captioning-large"
+      id: "Salesforce/blip2-opt-2.7b"
     },
     {
-      description: "A strong image captioning model.",
-      id: "nlpconnect/vit-gpt2-image-captioning"
+      description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+      id: "microsoft/kosmos-2-patch14-224"
     },
     {
       description: "A strong optical character recognition model.",
-      id: "microsoft/trocr-base-printed"
-    },
-    {
-      description: "A strong visual question answering model for scientific diagrams.",
-      id: "google/pix2struct-ai2d-base"
+      id: "facebook/nougat-base"
     },
     {
-      description: "A strong captioning model for UI components.",
-      id: "google/pix2struct-widget-captioning-base"
-    },
-    {
-      description: "A captioning model for images that contain text.",
-      id: "google/pix2struct-textcaps-base"
+      description: "A powerful model that lets you have a conversation with the image.",
+      id: "llava-hf/llava-1.5-7b-hf"
     }
   ],
   spaces: [
+    {
+      description: "An application that compares various image captioning models.",
+      id: "nielsr/comparing-captioning-models"
+    },
     {
       description: "A robust image captioning application.",
       id: "flax-community/image-captioning"
@@ -2856,7 +2872,6 @@ var taskData12 = {
   ],
   models: [
     {
-      // TO DO: write description
       description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
       id: "facebook/detr-resnet-50"
     },
@@ -2866,9 +2881,13 @@ var taskData12 = {
     }
   ],
   spaces: [
+    {
+      description: "Leaderboard to compare various object detection models across several metrics.",
+      id: "hf-vision/object_detection_leaderboard"
+    },
     {
       description: "An object detection application that can detect unseen objects out of the box.",
-      id: "adirik/OWL-ViT"
+      id: "merve/owlv2"
     },
     {
       description: "An application that contains various object detection models to try from.",
@@ -2914,14 +2933,16 @@ var taskData13 = {
   metrics: [],
   models: [
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on 1.4 million images.",
       id: "Intel/dpt-large"
     },
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on the KITTI dataset.",
-      id: "vinvino02/glpn-kitti"
+      id: "facebook/dpt-dinov2-large-kitti"
+    },
+    {
+      description: "A strong monocular depth estimation model.",
+      id: "Bingxin/Marigold"
     }
   ],
   spaces: [
@@ -3475,12 +3496,12 @@ var taskData22 = {
   ],
   models: [
     {
-      description: "A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.",
-      id: "CompVis/stable-diffusion-v1-4"
+      description: "One of the most powerful image generation models that can generate realistic outputs.",
+      id: "stabilityai/stable-diffusion-xl-base-1.0"
     },
     {
-      description: "A model that can be used to generate images based on text prompts. The DALL\xB7E Mega model is the largest version of DALLE Mini.",
-      id: "dalle-mini/dalle-mega"
+      description: "A powerful yet fast image generation model.",
+      id: "latent-consistency/lcm-lora-sdxl"
     },
     {
       description: "A text-to-image model that can generate coherent text inside image.",
@@ -3497,19 +3518,23 @@ var taskData22 = {
       id: "stabilityai/stable-diffusion"
     },
     {
-      description: "An text-to-image application that can generate coherent text inside the image.",
+      description: "A text-to-image application to generate comics.",
+      id: "jbilcke-hf/ai-comic-factory"
+    },
+    {
+      description: "A text-to-image application that can generate coherent text inside the image.",
       id: "DeepFloyd/IF"
     },
     {
-      description: "An powerful text-to-image application that can generate images.",
-      id: "kakaobrain/karlo"
+      description: "A powerful yet very fast image generation application.",
+      id: "latent-consistency/lcm-lora-for-sdxl"
     },
     {
-      description: "An powerful text-to-image application that can generates 3D representations.",
+      description: "A powerful text-to-image application that can generate 3D representations.",
       id: "hysts/Shap-E"
     },
     {
-      description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.",
+      description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
       id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI"
     }
   ],
@@ -4001,7 +4026,7 @@ var taskData28 = {
   models: [
     {
       description: "A strong model for video generation.",
-      id: "PAIR/text2video-zero-controlnet-canny-arcane"
+      id: "Vchitect/LaVie"
     },
     {
       description: "A robust model for text-to-video generation.",
@@ -4009,7 +4034,7 @@
     },
     {
       description: "A text-to-video generation model with high quality and smooth outputs.",
-      id: "cerspense/zeroscope_v2_576w"
+      id: "hotshotco/Hotshot-XL"
     }
   ],
   spaces: [
@@ -4019,7 +4044,7 @@
     },
     {
       description: "An application that generates video from image and text.",
-      id: "TempoFunk/makeavid-sd-jax"
+      id: "Vchitect/LaVie"
     },
     {
       description: "An application that generates videos from text and provides multi-model support.",
@@ -4249,6 +4274,10 @@ var taskData31 = {
     }
   ],
   spaces: [
+    {
+      description: "An application that compares visual question answering models across different tasks.",
+      id: "merve/pix2struct"
+    },
     {
       description: "An application that can answer questions based on images.",
       id: "nielsr/vilt-vqa"
@@ -4416,8 +4445,8 @@ var TASKS_MODEL_LIBRARIES = {
   "graph-ml": ["transformers"],
   "image-classification": ["keras", "timm", "transformers", "transformers.js"],
   "image-segmentation": ["transformers", "transformers.js"],
-  "image-to-image": ["diffusers", "transformers.js"],
-  "image-to-text": ["transformers.js"],
+  "image-to-image": ["diffusers", "transformers", "transformers.js"],
+  "image-to-text": ["transformers", "transformers.js"],
   "image-to-video": ["diffusers"],
   "video-classification": ["transformers"],
   "mask-generation": ["transformers"],
@@ -4534,6 +4563,7 @@ var ModelLibrary = /* @__PURE__ */ ((ModelLibrary2) => {
   ModelLibrary2["flair"] = "Flair";
   ModelLibrary2["keras"] = "Keras";
   ModelLibrary2["k2"] = "K2";
+  ModelLibrary2["mlx"] = "mlx";
   ModelLibrary2["nemo"] = "NeMo";
   ModelLibrary2["open_clip"] = "OpenCLIP";
   ModelLibrary2["paddlenlp"] = "PaddleNLP";
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@huggingface/tasks",
   "packageManager": "pnpm@8.10.5",
-  "version": "0.1.0",
+  "version": "0.1.2",
   "description": "List of ML tasks for huggingface.co/tasks",
   "repository": "https://github.com/huggingface/huggingface.js.git",
   "publishConfig": {
@@ -72,7 +72,7 @@ model = BaseModel.from_pretrained("${model.id}")`,
 ];

 function get_base_diffusers_model(model: ModelData): string {
-  return model.cardData?.base_model ?? "fill-in-base-model";
+  return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }

 const bertopic = (model: ModelData) => [
@@ -541,6 +541,13 @@ transcriptions = asr_model.transcribe(["file.wav"])`,

 const mlAgents = (model: ModelData) => [`mlagents-load-from-hf --repo-id="${model.id}" --local-dir="./downloads"`];

+const mlx = (model: ModelData) => [
+  `pip install huggingface_hub hf_transfer
+
+export HF_HUB_ENABLE_HF_TRANSFER=1
+huggingface-cli download --local-dir ${nameWithoutNamespace(model.id)} ${model.id}`,
+];
+
 const nemo = (model: ModelData) => {
   let command: string[] | undefined = undefined;
   // Resolve the tag to a nemo domain/sub-domain
@@ -621,6 +628,12 @@ export const MODEL_LIBRARIES_UI_ELEMENTS: Partial<Record<ModelLibraryKey, Librar
     docsUrl: "https://huggingface.co/docs/hub/keras",
     snippets: keras,
   },
+  mlx: {
+    btnLabel: "MLX",
+    repoName: "MLX",
+    repoUrl: "https://github.com/ml-explore/mlx-examples/tree/main",
+    snippets: mlx,
+  },
   nemo: {
     btnLabel: "NeMo",
     repoName: "NeMo",
package/src/model-data.ts CHANGED
@@ -93,7 +93,7 @@ export interface ModelData {
       | {
           parameters?: Record<string, unknown>;
         };
-    base_model?: string;
+    base_model?: string | string[];
   };
   /**
    * Library name
@@ -16,6 +16,7 @@ export enum ModelLibrary {
   "flair" = "Flair",
   "keras" = "Keras",
   "k2" = "K2",
+  "mlx" = "mlx",
   "nemo" = "NeMo",
   "open_clip" = "OpenCLIP",
   "paddlenlp" = "PaddleNLP",
@@ -24,14 +24,16 @@ const taskData: TaskDataCustom = {
   metrics: [],
   models: [
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on 1.4 million images.",
       id: "Intel/dpt-large",
     },
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on the KITTI dataset.",
-      id: "vinvino02/glpn-kitti",
+      id: "facebook/dpt-dinov2-large-kitti",
+    },
+    {
+      description: "A strong monocular depth estimation model.",
+      id: "Bingxin/Marigold",
     },
   ],
   spaces: [
@@ -50,6 +50,10 @@ const taskData: TaskDataCustom = {
       description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
       id: "naver-clova-ix/donut-base-finetuned-docvqa",
     },
+    {
+      description: "A powerful model for document question answering.",
+      id: "google/pix2struct-docvqa-large",
+    },
   ],
   spaces: [
     {
@@ -60,6 +64,10 @@ const taskData: TaskDataCustom = {
       description: "An application that can answer questions from invoices.",
       id: "impira/invoices",
     },
+    {
+      description: "An application to compare different document question answering models.",
+      id: "merve/compare_docvqa_models",
+    },
   ],
   summary:
     "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -27,6 +27,19 @@ captioner("https://huggingface.co/datasets/Narsil/image_dummy/resolve/main/parro
 ## [{'generated_text': 'two birds are standing next to each other '}]
 ```

+### Conversation about the Image
+
+Some text generation models also take image inputs. These are called vision language models. You can use `image-to-text` pipeline to use these models like below.
+
+```python
+from transformers import pipeline
+
+mm_pipeline = pipeline("image-to-text",model="llava-hf/llava-1.5-7b-hf")
+mm_pipeline("https://huggingface.co/spaces/llava-hf/llava-4bit/resolve/main/examples/baklava.png", "How to make this pastry?")
+
+## [{'generated_text': 'To create these pastries, you will need a few key ingredients and tools. Firstly, gather the dough by combining flour with water in your mixing bowl until it forms into an elastic ball that can be easily rolled out on top of another surface or table without breaking apart (like pizza).'}]
+```
+
 ### OCR

 This code snippet uses Microsoft’s TrOCR, an encoder-decoder model consisting of an image Transformer encoder and a text Transformer decoder for state-of-the-art optical character recognition (OCR) on single-text line images.
@@ -32,30 +32,26 @@ const taskData: TaskDataCustom = {
   models: [
     {
       description: "A robust image captioning model.",
-      id: "Salesforce/blip-image-captioning-large",
+      id: "Salesforce/blip2-opt-2.7b",
     },
     {
-      description: "A strong image captioning model.",
-      id: "nlpconnect/vit-gpt2-image-captioning",
+      description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+      id: "microsoft/kosmos-2-patch14-224",
     },
     {
       description: "A strong optical character recognition model.",
-      id: "microsoft/trocr-base-printed",
+      id: "facebook/nougat-base",
     },
     {
-      description: "A strong visual question answering model for scientific diagrams.",
-      id: "google/pix2struct-ai2d-base",
-    },
-    {
-      description: "A strong captioning model for UI components.",
-      id: "google/pix2struct-widget-captioning-base",
-    },
-    {
-      description: "A captioning model for images that contain text.",
-      id: "google/pix2struct-textcaps-base",
+      description: "A powerful model that lets you have a conversation with the image.",
+      id: "llava-hf/llava-1.5-7b-hf",
     },
   ],
   spaces: [
+    {
+      description: "An application that compares various image captioning models.",
+      id: "nielsr/comparing-captioning-models",
+    },
     {
       description: "A robust image captioning application.",
       id: "flax-community/image-captioning",
@@ -51,8 +51,8 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
   "graph-ml": ["transformers"],
   "image-classification": ["keras", "timm", "transformers", "transformers.js"],
   "image-segmentation": ["transformers", "transformers.js"],
-  "image-to-image": ["diffusers", "transformers.js"],
-  "image-to-text": ["transformers.js"],
+  "image-to-image": ["diffusers", "transformers", "transformers.js"],
+  "image-to-text": ["transformers", "transformers.js"],
   "image-to-video": ["diffusers"],
   "video-classification": ["transformers"],
   "mask-generation": ["transformers"],
@@ -40,7 +40,6 @@ const taskData: TaskDataCustom = {
   ],
   models: [
     {
-      // TO DO: write description
       description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
       id: "facebook/detr-resnet-50",
     },
@@ -50,9 +49,13 @@ const taskData: TaskDataCustom = {
     },
   ],
   spaces: [
+    {
+      description: "Leaderboard to compare various object detection models across several metrics.",
+      id: "hf-vision/object_detection_leaderboard",
+    },
     {
       description: "An object detection application that can detect unseen objects out of the box.",
-      id: "adirik/OWL-ViT",
+      id: "merve/owlv2",
     },
     {
       description: "An application that contains various object detection models to try from.",
@@ -31,7 +31,7 @@ tqa = pipeline(task="table-question-answering", model="google/tapas-large-finetu

 # result

-print(tqa(table=table, query=query)['cells'][0])
+print(tqa(table=table, query=question)['cells'][0])
 #53

 ```
@@ -42,6 +42,10 @@ When it comes to text generation, the underlying language model can come in seve

 - **Human feedback models:** these models extend base and instruction-trained models by incorporating human feedback that rates the quality of the generated text according to criteria like [helpfulness, honesty, and harmlessness](https://arxiv.org/abs/2112.00861). The human feedback is then combined with an optimization technique like reinforcement learning to align the original model to be closer with human preferences. The overall methodology is often called [Reinforcement Learning from Human Feedback](https://huggingface.co/blog/rlhf), or RLHF for short. [Llama2-Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) is an open-source model aligned through human feedback.

+## Text Generation from Image and Text
+
+There are language models that can input both text and image and output text, called vision language models. [LLaVA](https://huggingface.co/llava-hf/llava-1.5-7b-hf) and [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b) are good examples. Although they work just like other language models by means of input parameters for generation, since they also take input images, you can use them with `image-to-text` pipeline. You can find information about the pipeline in [image-to-text](https://huggingface.co/tasks/image-to-text) task page.
+
 ## Inference

 You can use the 🤗 Transformers library `text-generation` pipeline to do inference with Text Generation models. It takes an incomplete text and returns multiple outputs with which the text can be completed.
@@ -45,14 +45,12 @@ const taskData: TaskDataCustom = {
   ],
   models: [
     {
-      description:
-        "A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.",
-      id: "CompVis/stable-diffusion-v1-4",
+      description: "One of the most powerful image generation models that can generate realistic outputs.",
+      id: "stabilityai/stable-diffusion-xl-base-1.0",
     },
     {
-      description:
-        "A model that can be used to generate images based on text prompts. The DALL·E Mega model is the largest version of DALLE Mini.",
-      id: "dalle-mini/dalle-mega",
+      description: "A powerful yet fast image generation model.",
+      id: "latent-consistency/lcm-lora-sdxl",
     },
     {
       description: "A text-to-image model that can generate coherent text inside image.",
@@ -69,19 +67,23 @@ const taskData: TaskDataCustom = {
       id: "stabilityai/stable-diffusion",
     },
     {
-      description: "An text-to-image application that can generate coherent text inside the image.",
+      description: "A text-to-image application to generate comics.",
+      id: "jbilcke-hf/ai-comic-factory",
+    },
+    {
+      description: "A text-to-image application that can generate coherent text inside the image.",
       id: "DeepFloyd/IF",
     },
     {
-      description: "An powerful text-to-image application that can generate images.",
-      id: "kakaobrain/karlo",
+      description: "A powerful yet very fast image generation application.",
+      id: "latent-consistency/lcm-lora-for-sdxl",
     },
     {
-      description: "An powerful text-to-image application that can generates 3D representations.",
+      description: "A powerful text-to-image application that can generate 3D representations.",
       id: "hysts/Shap-E",
     },
     {
-      description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.",
+      description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
       id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI",
     },
   ],
@@ -68,7 +68,7 @@ const taskData: TaskDataCustom = {
   models: [
     {
       description: "A strong model for video generation.",
-      id: "PAIR/text2video-zero-controlnet-canny-arcane",
+      id: "Vchitect/LaVie",
     },
     {
       description: "A robust model for text-to-video generation.",
@@ -76,7 +76,7 @@ const taskData: TaskDataCustom = {
     },
     {
       description: "A text-to-video generation model with high quality and smooth outputs.",
-      id: "cerspense/zeroscope_v2_576w",
+      id: "hotshotco/Hotshot-XL",
     },
   ],
   spaces: [
@@ -86,7 +86,7 @@ const taskData: TaskDataCustom = {
     },
     {
       description: "An application that generates video from image and text.",
-      id: "TempoFunk/makeavid-sd-jax",
+      id: "Vchitect/LaVie",
     },
     {
       description: "An application that generates videos from text and provides multi-model support.",
@@ -71,6 +71,10 @@ const taskData: TaskDataCustom = {
     },
   ],
   spaces: [
+    {
+      description: "An application that compares visual question answering models across different tasks.",
+      id: "merve/pix2struct",
+    },
     {
       description: "An application that can answer questions based on images.",
       id: "nielsr/vilt-vqa",