@huggingface/tasks 0.11.7 → 0.11.9

This diff shows the changes between publicly available package versions that have been released to one of the supported registries, as they appear in those public registries. It is provided for informational purposes only.
package/dist/index.cjs CHANGED
@@ -1388,6 +1388,12 @@ var PIPELINE_DATA = {
  modality: "cv",
  color: "indigo"
  },
+ "video-text-to-text": {
+ name: "Video-Text-to-Text",
+ modality: "multimodal",
+ color: "blue",
+ hideInDatasets: true
+ },
  other: {
  name: "Other",
  modality: "other",
@@ -1731,8 +1737,12 @@ var taskData5 = {
  ],
  spaces: [
  {
- description: "A leaderboard to rank best feature extraction models..",
+ description: "A leaderboard to rank text feature extraction models based on a benchmark.",
  id: "mteb/leaderboard"
+ },
+ {
+ description: "A leaderboard to rank best feature extraction models based on human feedback.",
+ id: "mteb/arena"
  }
  ],
  summary: "Feature extraction is the task of extracting features learnt in a model.",
@@ -1937,15 +1947,19 @@ var taskData8 = {
  },
  {
  description: "A strong image feature extraction model.",
- id: "google/vit-base-patch16-224-in21k"
+ id: "nvidia/MambaVision-T-1K"
  },
  {
- description: "A robust image feature extraction models.",
+ description: "A robust image feature extraction model.",
  id: "facebook/dino-vitb16"
  },
  {
- description: "Strong image-text-to-text model made for information retrieval from documents.",
+ description: "Strong image feature extraction model made for information retrieval from documents.",
  id: "vidore/colpali"
+ },
+ {
+ description: "Strong image feature extraction model that can be used on images and documents.",
+ id: "OpenGVLab/InternViT-6B-448px-V1-2"
  }
  ],
  spaces: [],
@@ -1997,8 +2011,8 @@ var taskData9 = {
  ],
  models: [
  {
- description: "A model that enhances images captured in low light conditions.",
- id: "keras-io/low-light-image-enhancement"
+ description: "An image-to-image model to improve image resolution.",
+ id: "fal/AuraSR-v2"
  },
  {
  description: "A model that increases the resolution of an image.",
@@ -2216,7 +2230,7 @@ var taskData11 = {
  ],
  summary: "Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input.",
  widgetModels: ["microsoft/kosmos-2-patch14-224"],
- youtubeId: ""
+ youtubeId: "IoGaGfU1CIg"
  };
  var data_default11 = taskData11;

@@ -2267,16 +2281,16 @@ var taskData12 = {
  id: "facebook/detr-resnet-50-panoptic"
  },
  {
- description: "Semantic segmentation model trained on ADE20k benchmark dataset.",
- id: "microsoft/beit-large-finetuned-ade-640-640"
+ description: "Background removal model.",
+ id: "briaai/RMBG-1.4"
  },
  {
  description: "Semantic segmentation model trained on ADE20k benchmark dataset with 512x512 resolution.",
  id: "nvidia/segformer-b0-finetuned-ade-512-512"
  },
  {
- description: "Semantic segmentation model trained Cityscapes dataset.",
- id: "facebook/mask2former-swin-large-cityscapes-semantic"
+ description: "A multipurpose image segmentation model for high resolution images.",
+ id: "ZhengPeng7/BiRefNet"
  },
  {
  description: "Panoptic segmentation model trained COCO (common objects) dataset.",
@@ -2340,13 +2354,13 @@ var taskData13 = {
  },
  {
  description: "Very strong mask generation model.",
- id: "facebook/sam-vit-huge"
+ id: "facebook/sam2-hiera-large"
  }
  ],
  spaces: [
  {
- description: "An application that combines a mask generation model with an image embedding model for open-vocabulary image segmentation.",
- id: "SkalskiP/SAM_and_MetaCLIP"
+ description: "An application that combines a mask generation model with a zero-shot object detection model for text-guided image segmentation.",
+ id: "merve/OWLSAM2"
  },
  {
  description: "An application that compares the performance of a large and a small mask generation model.",
@@ -2354,7 +2368,7 @@ var taskData13 = {
  },
  {
  description: "An application based on an improved mask generation model.",
- id: "linfanluntan/Grounded-SAM"
+ id: "SkalskiP/segment-anything-model-2"
  },
  {
  description: "An application to remove objects from videos using mask generation models.",
@@ -3050,15 +3064,15 @@ var taskData24 = {
  models: [
  {
  description: "One of the most powerful image generation models that can generate realistic outputs.",
- id: "stabilityai/stable-diffusion-xl-base-1.0"
+ id: "black-forest-labs/FLUX.1-dev"
  },
  {
  description: "A powerful yet fast image generation model.",
  id: "latent-consistency/lcm-lora-sdxl"
  },
  {
- description: "A very fast text-to-image model.",
- id: "ByteDance/SDXL-Lightning"
+ description: "Text-to-image model for photorealistic generation.",
+ id: "Kwai-Kolors/Kolors"
  },
  {
  description: "A powerful text-to-image model.",
@@ -3419,6 +3433,10 @@ var taskData29 = {
  {
  description: "An instruction dataset with preference ratings on responses.",
  id: "openbmb/UltraFeedback"
+ },
+ {
+ description: "A large synthetic dataset for alignment of text generation models.",
+ id: "argilla/magpie-ultra-v0.1"
  }
  ],
  demo: {
@@ -3449,32 +3467,32 @@ var taskData29 = {
  ],
  models: [
  {
- description: "A large language model trained for text generation.",
- id: "bigscience/bloom-560m"
+ description: "A text-generation model trained to follow instructions.",
+ id: "google/gemma-2-2b-it"
  },
  {
- description: "A large code generation model that can generate code in 80+ languages.",
+ description: "A code generation model that can generate code in 80+ languages.",
  id: "bigcode/starcoder"
  },
  {
- description: "A very powerful text generation model.",
- id: "mistralai/Mixtral-8x7B-Instruct-v0.1"
+ description: "Very powerful text generation model trained to follow instructions.",
+ id: "meta-llama/Meta-Llama-3.1-8B-Instruct"
  },
  {
  description: "Small yet powerful text generation model.",
- id: "microsoft/phi-2"
+ id: "microsoft/Phi-3-mini-4k-instruct"
  },
  {
- description: "A very powerful model that can chat, do mathematical reasoning and write code.",
- id: "openchat/openchat-3.5-0106"
+ description: "A very powerful model that can solve mathematical problems.",
+ id: "AI-MO/NuminaMath-7B-TIR"
  },
  {
- description: "Very strong yet small assistant model.",
- id: "HuggingFaceH4/zephyr-7b-beta"
+ description: "Strong coding assistant model.",
+ id: "HuggingFaceH4/starchat2-15b-v0.1"
  },
  {
  description: "Very strong open-source large language model.",
- id: "meta-llama/Llama-2-70b-hf"
+ id: "mistralai/Mistral-Nemo-Instruct-2407"
  }
  ],
  spaces: [
@@ -3501,7 +3519,7 @@ var taskData29 = {
  ],
  summary: "Generating text is the task of generating new text given another text. These models can, for example, fill in incomplete text or paraphrase.",
  widgetModels: ["HuggingFaceH4/zephyr-7b-beta"],
- youtubeId: "Vpjb1lu0MDk"
+ youtubeId: "e9gNEAlsOvU"
  };
  var data_default29 = taskData29;

@@ -4226,6 +4244,7 @@ var TASKS_MODEL_LIBRARIES = {
  ],
  translation: ["transformers", "transformers.js"],
  "unconditional-image-generation": ["diffusers"],
+ "video-text-to-text": ["transformers"],
  "visual-question-answering": ["transformers", "transformers.js"],
  "voice-activity-detection": [],
  "zero-shot-classification": ["transformers", "transformers.js"],
@@ -4285,6 +4304,7 @@ var TASKS_DATA = {
  "token-classification": getData("token-classification", data_default26),
  translation: getData("translation", data_default27),
  "unconditional-image-generation": getData("unconditional-image-generation", data_default31),
+ "video-text-to-text": getData("video-text-to-text", data_default16),
  "visual-question-answering": getData("visual-question-answering", data_default33),
  "voice-activity-detection": void 0,
  "zero-shot-classification": getData("zero-shot-classification", data_default34),
@@ -4522,6 +4542,23 @@ tokenizer = keras_nlp.models.Tokenizer.from_preset("hf://${model.id}")
  backbone = keras_nlp.models.Backbone.from_preset("hf://${model.id}")
  `
  ];
+ var llama_cpp_python = (model) => [
+ `from llama_cpp import Llama
+
+ llm = Llama.from_pretrained(
+ repo_id="${model.id}",
+ filename="{{GGUF_FILE}}",
+ )
+
+ llm.create_chat_completion(
+ messages = [
+ {
+ "role": "user",
+ "content": "What is the capital of France?"
+ }
+ ]
+ )`
+ ];
  var tf_keras = (model) => [
  `# Note: 'keras<3.x' or 'tf_keras' must be installed (legacy)
  # See https://github.com/keras-team/tf-keras for more details.
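The hunk above adds a llama_cpp_python snippet generator that renders a Python usage example from a model descriptor. A self-contained sketch of the same shape, assuming only that the generator receives an object carrying an `id` field as the template string suggests; the local re-creation and the repo id below are placeholders for illustration, not the package's API:

type ModelData = { id: string };

// Local re-creation of the generator's shape: one Python snippet string per model.
const llamaCppPythonSnippet = (model: ModelData): string[] => [
  `from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="${model.id}",
    filename="{{GGUF_FILE}}",
)`,
];

console.log(llamaCppPythonSnippet({ id: "someuser/some-model-GGUF" })[0]);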
@@ -4747,6 +4784,33 @@ var fastai = (model) => [

  learn = from_pretrained_fastai("${model.id}")`
  ];
+ var sam2 = (model) => {
+ const image_predictor = `# Use SAM2 with images
+ import torch
+ from sam2.sam2_image_predictor import SAM2ImagePredictor
+
+ predictor = SAM2ImagePredictor.from_pretrained(${model.id})
+
+ with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+ predictor.set_image(<your_image>)
+ masks, _, _ = predictor.predict(<input_prompts>)`;
+ const video_predictor = `# Use SAM2 with videos
+ import torch
+ from sam2.sam2_video_predictor import SAM2VideoPredictor
+
+ predictor = SAM2VideoPredictor.from_pretrained(${model.id})
+
+ with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+ state = predictor.init_state(<your_video>)
+
+ # add new prompts and instantly get the output on the same frame
+ frame_idx, object_ids, masks = predictor.add_new_points(state, <your_prompts>):
+
+ # propagate the prompts to get masklets throughout the video
+ for frame_idx, object_ids, masks in predictor.propagate_in_video(state):
+ ...`;
+ return [image_predictor, video_predictor];
+ };
  var sampleFactory = (model) => [
  `python -m sample_factory.huggingface.load_from_hub -r ${model.id} -d ./train_dir`
  ];
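The sam2 generator above is notable for returning two snippets, one per predictor (image and video), which downstream UIs can render as separate code tabs. A minimal sketch of that two-snippet shape, mirroring rather than reproducing the added function; the abbreviated snippet bodies are illustrative, and the model id is one that appears elsewhere in this diff:

type ModelData = { id: string };

// Two snippets per model: one for SAM2ImagePredictor, one for SAM2VideoPredictor.
const sam2Snippets = (model: ModelData): string[] => [
  `# Use SAM2 with images
from sam2.sam2_image_predictor import SAM2ImagePredictor
predictor = SAM2ImagePredictor.from_pretrained("${model.id}")`,
  `# Use SAM2 with videos
from sam2.sam2_video_predictor import SAM2VideoPredictor
predictor = SAM2VideoPredictor.from_pretrained("${model.id}")`,
];

console.log(sam2Snippets({ id: "facebook/sam2-hiera-large" }).length); // 2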
@@ -5292,6 +5356,12 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
  filter: false,
  countDownloads: `path:"liveportrait/landmark.onnx"`
  },
+ "llama-cpp-python": {
+ prettyLabel: "llama-cpp-python",
+ repoName: "llama-cpp-python",
+ repoUrl: "https://github.com/abetlen/llama-cpp-python",
+ snippets: llama_cpp_python
+ },
  mindspore: {
  prettyLabel: "MindSpore",
  repoName: "mindspore",
@@ -5407,6 +5477,14 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
  filter: false,
  countDownloads: `path:"tokenizer.model"`
  },
+ refiners: {
+ prettyLabel: "Refiners",
+ repoName: "Refiners",
+ repoUrl: "https://github.com/finegrain-ai/refiners",
+ docsUrl: "https://refine.rs/",
+ filter: false,
+ countDownloads: `path:"model.safetensors"`
+ },
  saelens: {
  prettyLabel: "SAELens",
  repoName: "SAELens",
@@ -5414,6 +5492,14 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
  snippets: saelens,
  filter: false
  },
+ sam2: {
+ prettyLabel: "sam2",
+ repoName: "sam2",
+ repoUrl: "https://github.com/facebookresearch/segment-anything-2",
+ filter: false,
+ snippets: sam2,
+ countDownloads: `path_extension:"pt"`
+ },
  "sample-factory": {
  prettyLabel: "sample-factory",
  repoName: "sample-factory",
@@ -6555,6 +6641,27 @@ var snippetLlamacpp = (model, filepath) => {
  }
  ];
  };
+ var snippetLocalAI = (model, filepath) => {
+ const command = (binary) => ["# Load and run the model:", `${binary} huggingface://${model.id}/${filepath ?? "{{GGUF_FILE}}"}`].join("\n");
+ return [
+ {
+ title: "Install from binary",
+ setup: "curl https://localai.io/install.sh | sh",
+ content: command("local-ai run")
+ },
+ {
+ title: "Use Docker images",
+ setup: [
+ // prettier-ignore
+ "# Pull the image:",
+ "docker pull localai/localai:latest-cpu"
+ ].join("\n"),
+ content: command(
+ "docker run -p 8080:8080 --name localai -v $PWD/models:/build/models localai/localai:latest-cpu"
+ )
+ }
+ ];
+ };
  var LOCAL_APPS = {
  "llama.cpp": {
  prettyLabel: "llama.cpp",
@@ -6570,6 +6677,13 @@ var LOCAL_APPS = {
  displayOnModelPage: isGgufModel,
  deeplink: (model, filepath) => new URL(`lmstudio://open_from_hf?model=${model.id}${filepath ? `&file=${filepath}` : ""}`)
  },
+ localai: {
+ prettyLabel: "LocalAI",
+ docsUrl: "https://github.com/mudler/LocalAI",
+ mainTask: "text-generation",
+ displayOnModelPage: isGgufModel,
+ snippet: snippetLocalAI
+ },
  jan: {
  prettyLabel: "Jan",
  docsUrl: "https://jan.ai",
@@ -6640,9 +6754,23 @@ var LOCAL_APPS = {
  docsUrl: "https://diffusionbee.com",
  mainTask: "text-to-image",
  macOSOnly: true,
- comingSoon: true,
  displayOnModelPage: (model) => model.library_name === "diffusers" && model.pipeline_tag === "text-to-image",
  deeplink: (model) => new URL(`diffusionbee://open_from_hf?model=${model.id}`)
+ },
+ joyfusion: {
+ prettyLabel: "JoyFusion",
+ docsUrl: "https://joyfusion.app",
+ mainTask: "text-to-image",
+ macOSOnly: true,
+ displayOnModelPage: (model) => model.tags.includes("coreml") && model.pipeline_tag === "text-to-image",
+ deeplink: (model) => new URL(`https://joyfusion.app/import_from_hf?repo_id=${model.id}`)
+ },
+ invoke: {
+ prettyLabel: "Invoke",
+ docsUrl: "https://github.com/invoke-ai/InvokeAI",
+ mainTask: "text-to-image",
+ displayOnModelPage: (model) => model.library_name === "diffusers" && model.pipeline_tag === "text-to-image",
+ deeplink: (model) => new URL(`https://models.invoke.ai/huggingface/${model.id}`)
  }
  };

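The final hunk drops DiffusionBee's comingSoon flag and registers two new text-to-image local apps, JoyFusion and Invoke, both of which expose a deeplink built from the model id. A sketch of how such a deeplink resolves, using a placeholder model id:

const modelId = "someuser/some-diffusers-model"; // placeholder repo id, not from this diff

// Invoke's deeplink is a plain URL templated on the repo id, per the hunk above.
const invokeLink = new URL(`https://models.invoke.ai/huggingface/${modelId}`);
console.log(invokeLink.toString()); // https://models.invoke.ai/huggingface/someuser/some-diffusers-model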