@huggingface/tasks 0.11.7 → 0.11.9

This diff shows the changes between publicly available package versions that have been released to one of the supported registries, as they appear in those public registries. It is provided for informational purposes only.
package/dist/index.cjs CHANGED
@@ -1388,6 +1388,12 @@ var PIPELINE_DATA = {
  modality: "cv",
  color: "indigo"
  },
+ "video-text-to-text": {
+ name: "Video-Text-to-Text",
+ modality: "multimodal",
+ color: "blue",
+ hideInDatasets: true
+ },
  other: {
  name: "Other",
  modality: "other",
@@ -1731,8 +1737,12 @@ var taskData5 = {
  ],
  spaces: [
  {
- description: "A leaderboard to rank best feature extraction models..",
+ description: "A leaderboard to rank text feature extraction models based on a benchmark.",
  id: "mteb/leaderboard"
+ },
+ {
+ description: "A leaderboard to rank best feature extraction models based on human feedback.",
+ id: "mteb/arena"
  }
  ],
  summary: "Feature extraction is the task of extracting features learnt in a model.",
@@ -1937,15 +1947,19 @@ var taskData8 = {
  },
  {
  description: "A strong image feature extraction model.",
- id: "google/vit-base-patch16-224-in21k"
+ id: "nvidia/MambaVision-T-1K"
  },
  {
- description: "A robust image feature extraction models.",
+ description: "A robust image feature extraction model.",
  id: "facebook/dino-vitb16"
  },
  {
- description: "Strong image-text-to-text model made for information retrieval from documents.",
+ description: "Strong image feature extraction model made for information retrieval from documents.",
  id: "vidore/colpali"
+ },
+ {
+ description: "Strong image feature extraction model that can be used on images and documents.",
+ id: "OpenGVLab/InternViT-6B-448px-V1-2"
  }
  ],
  spaces: [],
@@ -1997,8 +2011,8 @@ var taskData9 = {
  ],
  models: [
  {
- description: "A model that enhances images captured in low light conditions.",
- id: "keras-io/low-light-image-enhancement"
+ description: "An image-to-image model to improve image resolution.",
+ id: "fal/AuraSR-v2"
  },
  {
  description: "A model that increases the resolution of an image.",
@@ -2216,7 +2230,7 @@ var taskData11 = {
  ],
  summary: "Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input.",
  widgetModels: ["microsoft/kosmos-2-patch14-224"],
- youtubeId: ""
+ youtubeId: "IoGaGfU1CIg"
  };
  var data_default11 = taskData11;

@@ -2267,16 +2281,16 @@ var taskData12 = {
  id: "facebook/detr-resnet-50-panoptic"
  },
  {
- description: "Semantic segmentation model trained on ADE20k benchmark dataset.",
- id: "microsoft/beit-large-finetuned-ade-640-640"
+ description: "Background removal model.",
+ id: "briaai/RMBG-1.4"
  },
  {
  description: "Semantic segmentation model trained on ADE20k benchmark dataset with 512x512 resolution.",
  id: "nvidia/segformer-b0-finetuned-ade-512-512"
  },
  {
- description: "Semantic segmentation model trained Cityscapes dataset.",
- id: "facebook/mask2former-swin-large-cityscapes-semantic"
+ description: "A multipurpose image segmentation model for high resolution images.",
+ id: "ZhengPeng7/BiRefNet"
  },
  {
  description: "Panoptic segmentation model trained COCO (common objects) dataset.",
@@ -2340,13 +2354,13 @@ var taskData13 = {
  },
  {
  description: "Very strong mask generation model.",
- id: "facebook/sam-vit-huge"
+ id: "facebook/sam2-hiera-large"
  }
  ],
  spaces: [
  {
- description: "An application that combines a mask generation model with an image embedding model for open-vocabulary image segmentation.",
- id: "SkalskiP/SAM_and_MetaCLIP"
+ description: "An application that combines a mask generation model with a zero-shot object detection model for text-guided image segmentation.",
+ id: "merve/OWLSAM2"
  },
  {
  description: "An application that compares the performance of a large and a small mask generation model.",
@@ -2354,7 +2368,7 @@ var taskData13 = {
  },
  {
  description: "An application based on an improved mask generation model.",
- id: "linfanluntan/Grounded-SAM"
+ id: "SkalskiP/segment-anything-model-2"
  },
  {
  description: "An application to remove objects from videos using mask generation models.",
@@ -3050,15 +3064,15 @@ var taskData24 = {
  models: [
  {
  description: "One of the most powerful image generation models that can generate realistic outputs.",
- id: "stabilityai/stable-diffusion-xl-base-1.0"
+ id: "black-forest-labs/FLUX.1-dev"
  },
  {
  description: "A powerful yet fast image generation model.",
  id: "latent-consistency/lcm-lora-sdxl"
  },
  {
- description: "A very fast text-to-image model.",
- id: "ByteDance/SDXL-Lightning"
+ description: "Text-to-image model for photorealistic generation.",
+ id: "Kwai-Kolors/Kolors"
  },
  {
  description: "A powerful text-to-image model.",
@@ -3419,6 +3433,10 @@ var taskData29 = {
  {
  description: "An instruction dataset with preference ratings on responses.",
  id: "openbmb/UltraFeedback"
+ },
+ {
+ description: "A large synthetic dataset for alignment of text generation models.",
+ id: "argilla/magpie-ultra-v0.1"
  }
  ],
  demo: {
@@ -3449,32 +3467,32 @@ var taskData29 = {
  ],
  models: [
  {
- description: "A large language model trained for text generation.",
- id: "bigscience/bloom-560m"
+ description: "A text-generation model trained to follow instructions.",
+ id: "google/gemma-2-2b-it"
  },
  {
- description: "A large code generation model that can generate code in 80+ languages.",
+ description: "A code generation model that can generate code in 80+ languages.",
  id: "bigcode/starcoder"
  },
  {
- description: "A very powerful text generation model.",
- id: "mistralai/Mixtral-8x7B-Instruct-v0.1"
+ description: "Very powerful text generation model trained to follow instructions.",
+ id: "meta-llama/Meta-Llama-3.1-8B-Instruct"
  },
  {
  description: "Small yet powerful text generation model.",
- id: "microsoft/phi-2"
+ id: "microsoft/Phi-3-mini-4k-instruct"
  },
  {
- description: "A very powerful model that can chat, do mathematical reasoning and write code.",
- id: "openchat/openchat-3.5-0106"
+ description: "A very powerful model that can solve mathematical problems.",
+ id: "AI-MO/NuminaMath-7B-TIR"
  },
  {
- description: "Very strong yet small assistant model.",
- id: "HuggingFaceH4/zephyr-7b-beta"
+ description: "Strong coding assistant model.",
+ id: "HuggingFaceH4/starchat2-15b-v0.1"
  },
  {
  description: "Very strong open-source large language model.",
- id: "meta-llama/Llama-2-70b-hf"
+ id: "mistralai/Mistral-Nemo-Instruct-2407"
  }
  ],
  spaces: [
@@ -3501,7 +3519,7 @@ var taskData29 = {
  ],
  summary: "Generating text is the task of generating new text given another text. These models can, for example, fill in incomplete text or paraphrase.",
  widgetModels: ["HuggingFaceH4/zephyr-7b-beta"],
- youtubeId: "Vpjb1lu0MDk"
+ youtubeId: "e9gNEAlsOvU"
  };
  var data_default29 = taskData29;

@@ -4226,6 +4244,7 @@ var TASKS_MODEL_LIBRARIES = {
  ],
  translation: ["transformers", "transformers.js"],
  "unconditional-image-generation": ["diffusers"],
+ "video-text-to-text": ["transformers"],
  "visual-question-answering": ["transformers", "transformers.js"],
  "voice-activity-detection": [],
  "zero-shot-classification": ["transformers", "transformers.js"],
@@ -4285,6 +4304,7 @@ var TASKS_DATA = {
  "token-classification": getData("token-classification", data_default26),
  translation: getData("translation", data_default27),
  "unconditional-image-generation": getData("unconditional-image-generation", data_default31),
+ "video-text-to-text": getData("video-text-to-text", data_default16),
  "visual-question-answering": getData("visual-question-answering", data_default33),
  "voice-activity-detection": void 0,
  "zero-shot-classification": getData("zero-shot-classification", data_default34),
@@ -4522,6 +4542,23 @@ tokenizer = keras_nlp.models.Tokenizer.from_preset("hf://${model.id}")
  backbone = keras_nlp.models.Backbone.from_preset("hf://${model.id}")
  `
  ];
+ var llama_cpp_python = (model) => [
+ `from llama_cpp import Llama
+
+ llm = Llama.from_pretrained(
+ repo_id="${model.id}",
+ filename="{{GGUF_FILE}}",
+ )
+
+ llm.create_chat_completion(
+ messages = [
+ {
+ "role": "user",
+ "content": "What is the capital of France?"
+ }
+ ]
+ )`
+ ];
  var tf_keras = (model) => [
  `# Note: 'keras<3.x' or 'tf_keras' must be installed (legacy)
  # See https://github.com/keras-team/tf-keras for more details.
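The hunk above adds a llama_cpp_python snippet generator that renders a Python usage example from a model descriptor. A self-contained sketch of the same shape, assuming only that the generator receives an object carrying an `id` field as the template string suggests; the local re-creation and the repo id below are placeholders for illustration, not the package's API:

type ModelData = { id: string };

// Local re-creation of the generator's shape: one Python snippet string per model.
const llamaCppPythonSnippet = (model: ModelData): string[] => [
  `from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="${model.id}",
    filename="{{GGUF_FILE}}",
)`,
];

console.log(llamaCppPythonSnippet({ id: "someuser/some-model-GGUF" })[0]);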
@@ -4747,6 +4784,33 @@ var fastai = (model) => [

  learn = from_pretrained_fastai("${model.id}")`
  ];
+ var sam2 = (model) => {
+ const image_predictor = `# Use SAM2 with images
+ import torch
+ from sam2.sam2_image_predictor import SAM2ImagePredictor
+
+ predictor = SAM2ImagePredictor.from_pretrained(${model.id})
+
+ with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+ predictor.set_image(<your_image>)
+ masks, _, _ = predictor.predict(<input_prompts>)`;
+ const video_predictor = `# Use SAM2 with videos
+ import torch
+ from sam2.sam2_video_predictor import SAM2VideoPredictor
+
+ predictor = SAM2VideoPredictor.from_pretrained(${model.id})
+
+ with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+ state = predictor.init_state(<your_video>)
+
+ # add new prompts and instantly get the output on the same frame
+ frame_idx, object_ids, masks = predictor.add_new_points(state, <your_prompts>):
+
+ # propagate the prompts to get masklets throughout the video
+ for frame_idx, object_ids, masks in predictor.propagate_in_video(state):
+ ...`;
+ return [image_predictor, video_predictor];
+ };
  var sampleFactory = (model) => [
  `python -m sample_factory.huggingface.load_from_hub -r ${model.id} -d ./train_dir`
  ];
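The sam2 generator above is notable for returning two snippets, one per predictor (image and video), which downstream UIs can render as separate code tabs. A minimal sketch of that two-snippet shape, mirroring rather than reproducing the added function; the abbreviated snippet bodies are illustrative, and the model id is one that appears elsewhere in this diff:

type ModelData = { id: string };

// Two snippets per model: one for SAM2ImagePredictor, one for SAM2VideoPredictor.
const sam2Snippets = (model: ModelData): string[] => [
  `# Use SAM2 with images
from sam2.sam2_image_predictor import SAM2ImagePredictor
predictor = SAM2ImagePredictor.from_pretrained("${model.id}")`,
  `# Use SAM2 with videos
from sam2.sam2_video_predictor import SAM2VideoPredictor
predictor = SAM2VideoPredictor.from_pretrained("${model.id}")`,
];

console.log(sam2Snippets({ id: "facebook/sam2-hiera-large" }).length); // 2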
@@ -5292,6 +5356,12 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
  filter: false,
  countDownloads: `path:"liveportrait/landmark.onnx"`
  },
+ "llama-cpp-python": {
+ prettyLabel: "llama-cpp-python",
+ repoName: "llama-cpp-python",
+ repoUrl: "https://github.com/abetlen/llama-cpp-python",
+ snippets: llama_cpp_python
+ },
  mindspore: {
  prettyLabel: "MindSpore",
  repoName: "mindspore",
@@ -5407,6 +5477,14 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
  filter: false,
  countDownloads: `path:"tokenizer.model"`
  },
+ refiners: {
+ prettyLabel: "Refiners",
+ repoName: "Refiners",
+ repoUrl: "https://github.com/finegrain-ai/refiners",
+ docsUrl: "https://refine.rs/",
+ filter: false,
+ countDownloads: `path:"model.safetensors"`
+ },
  saelens: {
  prettyLabel: "SAELens",
  repoName: "SAELens",
@@ -5414,6 +5492,14 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
  snippets: saelens,
  filter: false
  },
+ sam2: {
+ prettyLabel: "sam2",
+ repoName: "sam2",
+ repoUrl: "https://github.com/facebookresearch/segment-anything-2",
+ filter: false,
+ snippets: sam2,
+ countDownloads: `path_extension:"pt"`
+ },
  "sample-factory": {
  prettyLabel: "sample-factory",
  repoName: "sample-factory",
@@ -6555,6 +6641,27 @@ var snippetLlamacpp = (model, filepath) => {
  }
  ];
  };
+ var snippetLocalAI = (model, filepath) => {
+ const command = (binary) => ["# Load and run the model:", `${binary} huggingface://${model.id}/${filepath ?? "{{GGUF_FILE}}"}`].join("\n");
+ return [
+ {
+ title: "Install from binary",
+ setup: "curl https://localai.io/install.sh | sh",
+ content: command("local-ai run")
+ },
+ {
+ title: "Use Docker images",
+ setup: [
+ // prettier-ignore
+ "# Pull the image:",
+ "docker pull localai/localai:latest-cpu"
+ ].join("\n"),
+ content: command(
+ "docker run -p 8080:8080 --name localai -v $PWD/models:/build/models localai/localai:latest-cpu"
+ )
+ }
+ ];
+ };
  var LOCAL_APPS = {
  "llama.cpp": {
  prettyLabel: "llama.cpp",
@@ -6570,6 +6677,13 @@ var LOCAL_APPS = {
  displayOnModelPage: isGgufModel,
  deeplink: (model, filepath) => new URL(`lmstudio://open_from_hf?model=${model.id}${filepath ? `&file=${filepath}` : ""}`)
  },
+ localai: {
+ prettyLabel: "LocalAI",
+ docsUrl: "https://github.com/mudler/LocalAI",
+ mainTask: "text-generation",
+ displayOnModelPage: isGgufModel,
+ snippet: snippetLocalAI
+ },
  jan: {
  prettyLabel: "Jan",
  docsUrl: "https://jan.ai",
@@ -6640,9 +6754,23 @@ var LOCAL_APPS = {
  docsUrl: "https://diffusionbee.com",
  mainTask: "text-to-image",
  macOSOnly: true,
- comingSoon: true,
  displayOnModelPage: (model) => model.library_name === "diffusers" && model.pipeline_tag === "text-to-image",
  deeplink: (model) => new URL(`diffusionbee://open_from_hf?model=${model.id}`)
+ },
+ joyfusion: {
+ prettyLabel: "JoyFusion",
+ docsUrl: "https://joyfusion.app",
+ mainTask: "text-to-image",
+ macOSOnly: true,
+ displayOnModelPage: (model) => model.tags.includes("coreml") && model.pipeline_tag === "text-to-image",
+ deeplink: (model) => new URL(`https://joyfusion.app/import_from_hf?repo_id=${model.id}`)
+ },
+ invoke: {
+ prettyLabel: "Invoke",
+ docsUrl: "https://github.com/invoke-ai/InvokeAI",
+ mainTask: "text-to-image",
+ displayOnModelPage: (model) => model.library_name === "diffusers" && model.pipeline_tag === "text-to-image",
+ deeplink: (model) => new URL(`https://models.invoke.ai/huggingface/${model.id}`)
  }
  };

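The final hunk drops DiffusionBee's comingSoon flag and registers two new text-to-image local apps, JoyFusion and Invoke, both of which expose a deeplink built from the model id. A sketch of how such a deeplink resolves, using a placeholder model id:

const modelId = "someuser/some-diffusers-model"; // placeholder repo id, not from this diff

// Invoke's deeplink is a plain URL templated on the repo id, per the hunk above.
const invokeLink = new URL(`https://models.invoke.ai/huggingface/${modelId}`);
console.log(invokeLink.toString()); // https://models.invoke.ai/huggingface/someuser/some-diffusers-model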