@huggingface/tasks 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -16,6 +16,7 @@ declare enum ModelLibrary {
     "flair" = "Flair",
     "keras" = "Keras",
     "k2" = "K2",
+    "mlx" = "mlx",
     "nemo" = "NeMo",
     "open_clip" = "OpenCLIP",
     "paddlenlp" = "PaddleNLP",
@@ -623,7 +624,7 @@ interface ModelData {
         inference?: boolean | {
             parameters?: Record<string, unknown>;
         };
-        base_model?: string;
+        base_model?: string | string[];
     };
     /**
      * Library name
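
The type-level change here is `base_model` widening from `string` to `string | string[]`, so a model card can now declare several base models. A minimal consumer-side sketch of handling both shapes, assuming only the field shown in the diff (the `CardDataSlice` type and `baseModels` helper are illustrative, not exports of the package):

```ts
// Illustrative only: a local stand-in for the relevant slice of ModelData["cardData"].
interface CardDataSlice {
  base_model?: string | string[];
}

// Normalize to an array whether the card declares one base model or several.
function baseModels(cardData?: CardDataSlice): string[] {
  const value = cardData?.base_model;
  if (value === undefined) return [];
  return Array.isArray(value) ? value : [value];
}

console.log(baseModels({ base_model: "some-org/base-model" })); // ["some-org/base-model"]
console.log(baseModels({ base_model: ["model-a", "model-b"] })); // ["model-a", "model-b"]
```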
package/dist/index.js CHANGED
@@ -119,7 +119,7 @@ var asteroid = (model) => [
 model = BaseModel.from_pretrained("${model.id}")`
 ];
 function get_base_diffusers_model(model) {
-  return model.cardData?.base_model ?? "fill-in-base-model";
+  return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }
 var bertopic = (model) => [
   `from bertopic import BERTopic
@@ -531,6 +531,12 @@ transcriptions = asr_model.transcribe(["file.wav"])`
   }
 };
 var mlAgents = (model) => [`mlagents-load-from-hf --repo-id="${model.id}" --local-dir="./downloads"`];
+var mlx = (model) => [
+  `pip install huggingface_hub hf_transfer
+
+export HF_HUB_ENABLE_HF_TRANSFER=1
+huggingface-cli download --local-dir ${nameWithoutNamespace(model.id)} ${model.id}`
+];
 var nemo = (model) => {
   let command = void 0;
   if (model.tags?.includes("automatic-speech-recognition")) {
@@ -605,6 +611,12 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
     docsUrl: "https://huggingface.co/docs/hub/keras",
     snippets: keras
   },
+  mlx: {
+    btnLabel: "MLX",
+    repoName: "MLX",
+    repoUrl: "https://github.com/ml-explore/mlx-examples/tree/main",
+    snippets: mlx
+  },
   nemo: {
     btnLabel: "NeMo",
     repoName: "NeMo",
@@ -2356,6 +2368,10 @@ var taskData5 = {
     {
       description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
       id: "naver-clova-ix/donut-base-finetuned-docvqa"
+    },
+    {
+      description: "A powerful model for document question answering.",
+      id: "google/pix2struct-docvqa-large"
     }
   ],
   spaces: [
@@ -2366,6 +2382,10 @@ var taskData5 = {
     {
       description: "An application that can answer questions from invoices.",
       id: "impira/invoices"
+    },
+    {
+      description: "An application to compare different document question answering models.",
+      id: "merve/compare_docvqa_models"
     }
   ],
   summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -2709,30 +2729,26 @@ var taskData10 = {
   models: [
     {
       description: "A robust image captioning model.",
-      id: "Salesforce/blip-image-captioning-large"
+      id: "Salesforce/blip2-opt-2.7b"
     },
     {
-      description: "A strong image captioning model.",
-      id: "nlpconnect/vit-gpt2-image-captioning"
+      description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+      id: "microsoft/kosmos-2-patch14-224"
     },
     {
       description: "A strong optical character recognition model.",
-      id: "microsoft/trocr-base-printed"
-    },
-    {
-      description: "A strong visual question answering model for scientific diagrams.",
-      id: "google/pix2struct-ai2d-base"
+      id: "facebook/nougat-base"
     },
     {
-      description: "A strong captioning model for UI components.",
-      id: "google/pix2struct-widget-captioning-base"
-    },
-    {
-      description: "A captioning model for images that contain text.",
-      id: "google/pix2struct-textcaps-base"
+      description: "A powerful model that lets you have a conversation with the image.",
+      id: "llava-hf/llava-1.5-7b-hf"
     }
   ],
   spaces: [
+    {
+      description: "An application that compares various image captioning models.",
+      id: "nielsr/comparing-captioning-models"
+    },
     {
       description: "A robust image captioning application.",
       id: "flax-community/image-captioning"
@@ -2894,7 +2910,6 @@ var taskData12 = {
   ],
   models: [
     {
-      // TO DO: write description
       description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
       id: "facebook/detr-resnet-50"
     },
@@ -2904,9 +2919,13 @@ var taskData12 = {
     }
   ],
   spaces: [
+    {
+      description: "Leaderboard to compare various object detection models across several metrics.",
+      id: "hf-vision/object_detection_leaderboard"
+    },
     {
       description: "An object detection application that can detect unseen objects out of the box.",
-      id: "adirik/OWL-ViT"
+      id: "merve/owlv2"
     },
     {
       description: "An application that contains various object detection models to try from.",
@@ -2952,14 +2971,16 @@ var taskData13 = {
   metrics: [],
   models: [
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on 1.4 million images.",
       id: "Intel/dpt-large"
     },
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on the KITTI dataset.",
-      id: "vinvino02/glpn-kitti"
+      id: "facebook/dpt-dinov2-large-kitti"
+    },
+    {
+      description: "A strong monocular depth estimation model.",
+      id: "Bingxin/Marigold"
     }
   ],
   spaces: [
@@ -3513,12 +3534,12 @@ var taskData22 = {
   ],
   models: [
     {
-      description: "A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.",
-      id: "CompVis/stable-diffusion-v1-4"
+      description: "One of the most powerful image generation models that can generate realistic outputs.",
+      id: "stabilityai/stable-diffusion-xl-base-1.0"
     },
     {
-      description: "A model that can be used to generate images based on text prompts. The DALL\xB7E Mega model is the largest version of DALLE Mini.",
-      id: "dalle-mini/dalle-mega"
+      description: "A powerful yet fast image generation model.",
+      id: "latent-consistency/lcm-lora-sdxl"
    },
     {
       description: "A text-to-image model that can generate coherent text inside image.",
@@ -3535,19 +3556,23 @@ var taskData22 = {
       id: "stabilityai/stable-diffusion"
     },
     {
-      description: "An text-to-image application that can generate coherent text inside the image.",
+      description: "A text-to-image application to generate comics.",
+      id: "jbilcke-hf/ai-comic-factory"
+    },
+    {
+      description: "A text-to-image application that can generate coherent text inside the image.",
       id: "DeepFloyd/IF"
     },
     {
-      description: "An powerful text-to-image application that can generate images.",
-      id: "kakaobrain/karlo"
+      description: "A powerful yet very fast image generation application.",
+      id: "latent-consistency/lcm-lora-for-sdxl"
     },
     {
-      description: "An powerful text-to-image application that can generates 3D representations.",
+      description: "A powerful text-to-image application that can generate 3D representations.",
       id: "hysts/Shap-E"
     },
     {
-      description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.",
+      description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
       id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI"
     }
   ],
@@ -4039,7 +4064,7 @@ var taskData28 = {
   models: [
     {
       description: "A strong model for video generation.",
-      id: "PAIR/text2video-zero-controlnet-canny-arcane"
+      id: "Vchitect/LaVie"
     },
     {
       description: "A robust model for text-to-video generation.",
@@ -4047,7 +4072,7 @@ var taskData28 = {
     },
     {
       description: "A text-to-video generation model with high quality and smooth outputs.",
-      id: "cerspense/zeroscope_v2_576w"
+      id: "hotshotco/Hotshot-XL"
     }
   ],
   spaces: [
@@ -4057,7 +4082,7 @@ var taskData28 = {
     },
     {
       description: "An application that generates video from image and text.",
-      id: "TempoFunk/makeavid-sd-jax"
+      id: "Vchitect/LaVie"
     },
     {
       description: "An application that generates videos from text and provides multi-model support.",
@@ -4287,6 +4312,10 @@ var taskData31 = {
     }
   ],
   spaces: [
+    {
+      description: "An application that compares visual question answering models across different tasks.",
+      id: "merve/pix2struct"
+    },
     {
       description: "An application that can answer questions based on images.",
       id: "nielsr/vilt-vqa"
@@ -4454,8 +4483,8 @@ var TASKS_MODEL_LIBRARIES = {
   "graph-ml": ["transformers"],
   "image-classification": ["keras", "timm", "transformers", "transformers.js"],
   "image-segmentation": ["transformers", "transformers.js"],
-  "image-to-image": ["diffusers", "transformers.js"],
-  "image-to-text": ["transformers.js"],
+  "image-to-image": ["diffusers", "transformers", "transformers.js"],
+  "image-to-text": ["transformers", "transformers.js"],
   "image-to-video": ["diffusers"],
   "video-classification": ["transformers"],
   "mask-generation": ["transformers"],
@@ -4572,6 +4601,7 @@ var ModelLibrary = /* @__PURE__ */ ((ModelLibrary2) => {
   ModelLibrary2["flair"] = "Flair";
   ModelLibrary2["keras"] = "Keras";
   ModelLibrary2["k2"] = "K2";
+  ModelLibrary2["mlx"] = "mlx";
   ModelLibrary2["nemo"] = "NeMo";
   ModelLibrary2["open_clip"] = "OpenCLIP";
   ModelLibrary2["paddlenlp"] = "PaddleNLP";
package/dist/index.mjs CHANGED
@@ -81,7 +81,7 @@ var asteroid = (model) => [
 model = BaseModel.from_pretrained("${model.id}")`
 ];
 function get_base_diffusers_model(model) {
-  return model.cardData?.base_model ?? "fill-in-base-model";
+  return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }
 var bertopic = (model) => [
   `from bertopic import BERTopic
@@ -493,6 +493,12 @@ transcriptions = asr_model.transcribe(["file.wav"])`
   }
 };
 var mlAgents = (model) => [`mlagents-load-from-hf --repo-id="${model.id}" --local-dir="./downloads"`];
+var mlx = (model) => [
+  `pip install huggingface_hub hf_transfer
+
+export HF_HUB_ENABLE_HF_TRANSFER=1
+huggingface-cli download --local-dir ${nameWithoutNamespace(model.id)} ${model.id}`
+];
 var nemo = (model) => {
   let command = void 0;
   if (model.tags?.includes("automatic-speech-recognition")) {
@@ -567,6 +573,12 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
     docsUrl: "https://huggingface.co/docs/hub/keras",
     snippets: keras
   },
+  mlx: {
+    btnLabel: "MLX",
+    repoName: "MLX",
+    repoUrl: "https://github.com/ml-explore/mlx-examples/tree/main",
+    snippets: mlx
+  },
   nemo: {
     btnLabel: "NeMo",
     repoName: "NeMo",
@@ -2318,6 +2330,10 @@ var taskData5 = {
     {
       description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
       id: "naver-clova-ix/donut-base-finetuned-docvqa"
+    },
+    {
+      description: "A powerful model for document question answering.",
+      id: "google/pix2struct-docvqa-large"
     }
   ],
   spaces: [
@@ -2328,6 +2344,10 @@ var taskData5 = {
     {
       description: "An application that can answer questions from invoices.",
       id: "impira/invoices"
+    },
+    {
+      description: "An application to compare different document question answering models.",
+      id: "merve/compare_docvqa_models"
     }
   ],
   summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -2671,30 +2691,26 @@ var taskData10 = {
   models: [
     {
       description: "A robust image captioning model.",
-      id: "Salesforce/blip-image-captioning-large"
+      id: "Salesforce/blip2-opt-2.7b"
     },
     {
-      description: "A strong image captioning model.",
-      id: "nlpconnect/vit-gpt2-image-captioning"
+      description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+      id: "microsoft/kosmos-2-patch14-224"
     },
     {
       description: "A strong optical character recognition model.",
-      id: "microsoft/trocr-base-printed"
-    },
-    {
-      description: "A strong visual question answering model for scientific diagrams.",
-      id: "google/pix2struct-ai2d-base"
+      id: "facebook/nougat-base"
     },
     {
-      description: "A strong captioning model for UI components.",
-      id: "google/pix2struct-widget-captioning-base"
-    },
-    {
-      description: "A captioning model for images that contain text.",
-      id: "google/pix2struct-textcaps-base"
+      description: "A powerful model that lets you have a conversation with the image.",
+      id: "llava-hf/llava-1.5-7b-hf"
     }
   ],
   spaces: [
+    {
+      description: "An application that compares various image captioning models.",
+      id: "nielsr/comparing-captioning-models"
+    },
     {
       description: "A robust image captioning application.",
       id: "flax-community/image-captioning"
@@ -2856,7 +2872,6 @@ var taskData12 = {
   ],
   models: [
     {
-      // TO DO: write description
       description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
       id: "facebook/detr-resnet-50"
     },
@@ -2866,9 +2881,13 @@ var taskData12 = {
     }
   ],
   spaces: [
+    {
+      description: "Leaderboard to compare various object detection models across several metrics.",
+      id: "hf-vision/object_detection_leaderboard"
+    },
     {
       description: "An object detection application that can detect unseen objects out of the box.",
-      id: "adirik/OWL-ViT"
+      id: "merve/owlv2"
     },
     {
       description: "An application that contains various object detection models to try from.",
@@ -2914,14 +2933,16 @@ var taskData13 = {
   metrics: [],
   models: [
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on 1.4 million images.",
       id: "Intel/dpt-large"
     },
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on the KITTI dataset.",
-      id: "vinvino02/glpn-kitti"
+      id: "facebook/dpt-dinov2-large-kitti"
+    },
+    {
+      description: "A strong monocular depth estimation model.",
+      id: "Bingxin/Marigold"
     }
   ],
   spaces: [
@@ -3475,12 +3496,12 @@ var taskData22 = {
   ],
   models: [
     {
-      description: "A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.",
-      id: "CompVis/stable-diffusion-v1-4"
+      description: "One of the most powerful image generation models that can generate realistic outputs.",
+      id: "stabilityai/stable-diffusion-xl-base-1.0"
     },
     {
-      description: "A model that can be used to generate images based on text prompts. The DALL\xB7E Mega model is the largest version of DALLE Mini.",
-      id: "dalle-mini/dalle-mega"
+      description: "A powerful yet fast image generation model.",
+      id: "latent-consistency/lcm-lora-sdxl"
     },
     {
       description: "A text-to-image model that can generate coherent text inside image.",
@@ -3497,19 +3518,23 @@ var taskData22 = {
       id: "stabilityai/stable-diffusion"
     },
     {
-      description: "An text-to-image application that can generate coherent text inside the image.",
+      description: "A text-to-image application to generate comics.",
+      id: "jbilcke-hf/ai-comic-factory"
+    },
+    {
+      description: "A text-to-image application that can generate coherent text inside the image.",
       id: "DeepFloyd/IF"
     },
     {
-      description: "An powerful text-to-image application that can generate images.",
-      id: "kakaobrain/karlo"
+      description: "A powerful yet very fast image generation application.",
+      id: "latent-consistency/lcm-lora-for-sdxl"
     },
     {
-      description: "An powerful text-to-image application that can generates 3D representations.",
+      description: "A powerful text-to-image application that can generate 3D representations.",
       id: "hysts/Shap-E"
     },
     {
-      description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.",
+      description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
       id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI"
     }
   ],
@@ -4001,7 +4026,7 @@ var taskData28 = {
   models: [
     {
       description: "A strong model for video generation.",
-      id: "PAIR/text2video-zero-controlnet-canny-arcane"
+      id: "Vchitect/LaVie"
     },
     {
       description: "A robust model for text-to-video generation.",
@@ -4009,7 +4034,7 @@
     },
     {
       description: "A text-to-video generation model with high quality and smooth outputs.",
-      id: "cerspense/zeroscope_v2_576w"
+      id: "hotshotco/Hotshot-XL"
     }
   ],
   spaces: [
@@ -4019,7 +4044,7 @@
     },
     {
       description: "An application that generates video from image and text.",
-      id: "TempoFunk/makeavid-sd-jax"
+      id: "Vchitect/LaVie"
     },
     {
       description: "An application that generates videos from text and provides multi-model support.",
@@ -4249,6 +4274,10 @@ var taskData31 = {
     }
   ],
   spaces: [
+    {
+      description: "An application that compares visual question answering models across different tasks.",
+      id: "merve/pix2struct"
+    },
     {
       description: "An application that can answer questions based on images.",
       id: "nielsr/vilt-vqa"
@@ -4416,8 +4445,8 @@ var TASKS_MODEL_LIBRARIES = {
   "graph-ml": ["transformers"],
   "image-classification": ["keras", "timm", "transformers", "transformers.js"],
   "image-segmentation": ["transformers", "transformers.js"],
-  "image-to-image": ["diffusers", "transformers.js"],
-  "image-to-text": ["transformers.js"],
+  "image-to-image": ["diffusers", "transformers", "transformers.js"],
+  "image-to-text": ["transformers", "transformers.js"],
   "image-to-video": ["diffusers"],
   "video-classification": ["transformers"],
   "mask-generation": ["transformers"],
@@ -4534,6 +4563,7 @@ var ModelLibrary = /* @__PURE__ */ ((ModelLibrary2) => {
   ModelLibrary2["flair"] = "Flair";
   ModelLibrary2["keras"] = "Keras";
   ModelLibrary2["k2"] = "K2";
+  ModelLibrary2["mlx"] = "mlx";
   ModelLibrary2["nemo"] = "NeMo";
   ModelLibrary2["open_clip"] = "OpenCLIP";
   ModelLibrary2["paddlenlp"] = "PaddleNLP";
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@huggingface/tasks",
   "packageManager": "pnpm@8.10.5",
-  "version": "0.1.0",
+  "version": "0.1.2",
   "description": "List of ML tasks for huggingface.co/tasks",
   "repository": "https://github.com/huggingface/huggingface.js.git",
   "publishConfig": {
@@ -72,7 +72,7 @@ model = BaseModel.from_pretrained("${model.id}")`,
 ];

 function get_base_diffusers_model(model: ModelData): string {
-  return model.cardData?.base_model ?? "fill-in-base-model";
+  return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }

 const bertopic = (model: ModelData) => [
@@ -541,6 +541,13 @@ transcriptions = asr_model.transcribe(["file.wav"])`,

 const mlAgents = (model: ModelData) => [`mlagents-load-from-hf --repo-id="${model.id}" --local-dir="./downloads"`];

+const mlx = (model: ModelData) => [
+  `pip install huggingface_hub hf_transfer
+
+export HF_HUB_ENABLE_HF_TRANSFER=1
+huggingface-cli download --local-dir ${nameWithoutNamespace(model.id)} ${model.id}`,
+];
+
 const nemo = (model: ModelData) => {
   let command: string[] | undefined = undefined;
   // Resolve the tag to a nemo domain/sub-domain
@@ -621,6 +628,12 @@ export const MODEL_LIBRARIES_UI_ELEMENTS: Partial<Record<ModelLibraryKey, Librar
     docsUrl: "https://huggingface.co/docs/hub/keras",
     snippets: keras,
   },
+  mlx: {
+    btnLabel: "MLX",
+    repoName: "MLX",
+    repoUrl: "https://github.com/ml-explore/mlx-examples/tree/main",
+    snippets: mlx,
+  },
   nemo: {
     btnLabel: "NeMo",
     repoName: "NeMo",
package/src/model-data.ts CHANGED
@@ -93,7 +93,7 @@ export interface ModelData {
       | {
           parameters?: Record<string, unknown>;
         };
-    base_model?: string;
+    base_model?: string | string[];
   };
   /**
    * Library name
@@ -16,6 +16,7 @@ export enum ModelLibrary {
   "flair" = "Flair",
   "keras" = "Keras",
   "k2" = "K2",
+  "mlx" = "mlx",
   "nemo" = "NeMo",
   "open_clip" = "OpenCLIP",
   "paddlenlp" = "PaddleNLP",
@@ -24,14 +24,16 @@ const taskData: TaskDataCustom = {
   metrics: [],
   models: [
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on 1.4 million images.",
       id: "Intel/dpt-large",
     },
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on the KITTI dataset.",
-      id: "vinvino02/glpn-kitti",
+      id: "facebook/dpt-dinov2-large-kitti",
+    },
+    {
+      description: "A strong monocular depth estimation model.",
+      id: "Bingxin/Marigold",
     },
   ],
   spaces: [
@@ -50,6 +50,10 @@ const taskData: TaskDataCustom = {
       description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
       id: "naver-clova-ix/donut-base-finetuned-docvqa",
     },
+    {
+      description: "A powerful model for document question answering.",
+      id: "google/pix2struct-docvqa-large",
+    },
   ],
   spaces: [
     {
@@ -60,6 +64,10 @@ const taskData: TaskDataCustom = {
       description: "An application that can answer questions from invoices.",
       id: "impira/invoices",
     },
+    {
+      description: "An application to compare different document question answering models.",
+      id: "merve/compare_docvqa_models",
+    },
   ],
   summary:
     "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -27,6 +27,19 @@ captioner("https://huggingface.co/datasets/Narsil/image_dummy/resolve/main/parro
 ## [{'generated_text': 'two birds are standing next to each other '}]
 ```

+### Conversation about the Image
+
+Some text generation models also take image inputs. These are called vision language models. You can use `image-to-text` pipeline to use these models like below.
+
+```python
+from transformers import pipeline
+
+mm_pipeline = pipeline("image-to-text",model="llava-hf/llava-1.5-7b-hf")
+mm_pipeline("https://huggingface.co/spaces/llava-hf/llava-4bit/resolve/main/examples/baklava.png", "How to make this pastry?")
+
+## [{'generated_text': 'To create these pastries, you will need a few key ingredients and tools. Firstly, gather the dough by combining flour with water in your mixing bowl until it forms into an elastic ball that can be easily rolled out on top of another surface or table without breaking apart (like pizza).'}]
+```
+
 ### OCR

 This code snippet uses Microsoft’s TrOCR, an encoder-decoder model consisting of an image Transformer encoder and a text Transformer decoder for state-of-the-art optical character recognition (OCR) on single-text line images.
@@ -32,30 +32,26 @@ const taskData: TaskDataCustom = {
   models: [
     {
       description: "A robust image captioning model.",
-      id: "Salesforce/blip-image-captioning-large",
+      id: "Salesforce/blip2-opt-2.7b",
     },
     {
-      description: "A strong image captioning model.",
-      id: "nlpconnect/vit-gpt2-image-captioning",
+      description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+      id: "microsoft/kosmos-2-patch14-224",
     },
     {
       description: "A strong optical character recognition model.",
-      id: "microsoft/trocr-base-printed",
+      id: "facebook/nougat-base",
     },
     {
-      description: "A strong visual question answering model for scientific diagrams.",
-      id: "google/pix2struct-ai2d-base",
-    },
-    {
-      description: "A strong captioning model for UI components.",
-      id: "google/pix2struct-widget-captioning-base",
-    },
-    {
-      description: "A captioning model for images that contain text.",
-      id: "google/pix2struct-textcaps-base",
+      description: "A powerful model that lets you have a conversation with the image.",
+      id: "llava-hf/llava-1.5-7b-hf",
     },
   ],
   spaces: [
+    {
+      description: "An application that compares various image captioning models.",
+      id: "nielsr/comparing-captioning-models",
+    },
     {
       description: "A robust image captioning application.",
       id: "flax-community/image-captioning",
@@ -51,8 +51,8 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
   "graph-ml": ["transformers"],
   "image-classification": ["keras", "timm", "transformers", "transformers.js"],
   "image-segmentation": ["transformers", "transformers.js"],
-  "image-to-image": ["diffusers", "transformers.js"],
-  "image-to-text": ["transformers.js"],
+  "image-to-image": ["diffusers", "transformers", "transformers.js"],
+  "image-to-text": ["transformers", "transformers.js"],
   "image-to-video": ["diffusers"],
   "video-classification": ["transformers"],
   "mask-generation": ["transformers"],
@@ -40,7 +40,6 @@ const taskData: TaskDataCustom = {
   ],
   models: [
     {
-      // TO DO: write description
       description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
       id: "facebook/detr-resnet-50",
     },
@@ -50,9 +49,13 @@ const taskData: TaskDataCustom = {
     },
   ],
   spaces: [
+    {
+      description: "Leaderboard to compare various object detection models across several metrics.",
+      id: "hf-vision/object_detection_leaderboard",
+    },
     {
       description: "An object detection application that can detect unseen objects out of the box.",
-      id: "adirik/OWL-ViT",
+      id: "merve/owlv2",
     },
     {
       description: "An application that contains various object detection models to try from.",
@@ -31,7 +31,7 @@ tqa = pipeline(task="table-question-answering", model="google/tapas-large-finetu

 # result

-print(tqa(table=table, query=query)['cells'][0])
+print(tqa(table=table, query=question)['cells'][0])
 #53

 ```
@@ -42,6 +42,10 @@ When it comes to text generation, the underlying language model can come in seve

 - **Human feedback models:** these models extend base and instruction-trained models by incorporating human feedback that rates the quality of the generated text according to criteria like [helpfulness, honesty, and harmlessness](https://arxiv.org/abs/2112.00861). The human feedback is then combined with an optimization technique like reinforcement learning to align the original model to be closer with human preferences. The overall methodology is often called [Reinforcement Learning from Human Feedback](https://huggingface.co/blog/rlhf), or RLHF for short. [Llama2-Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) is an open-source model aligned through human feedback.

+## Text Generation from Image and Text
+
+There are language models that can input both text and image and output text, called vision language models. [LLaVA](https://huggingface.co/llava-hf/llava-1.5-7b-hf) and [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b) are good examples. Although they work just like other language models by means of input parameters for generation, since they also take input images, you can use them with `image-to-text` pipeline. You can find information about the pipeline in [image-to-text](https://huggingface.co/tasks/image-to-text) task page.
+
 ## Inference

 You can use the 🤗 Transformers library `text-generation` pipeline to do inference with Text Generation models. It takes an incomplete text and returns multiple outputs with which the text can be completed.
@@ -45,14 +45,12 @@ const taskData: TaskDataCustom = {
   ],
   models: [
     {
-      description:
-        "A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.",
-      id: "CompVis/stable-diffusion-v1-4",
+      description: "One of the most powerful image generation models that can generate realistic outputs.",
+      id: "stabilityai/stable-diffusion-xl-base-1.0",
     },
     {
-      description:
-        "A model that can be used to generate images based on text prompts. The DALL·E Mega model is the largest version of DALLE Mini.",
-      id: "dalle-mini/dalle-mega",
+      description: "A powerful yet fast image generation model.",
+      id: "latent-consistency/lcm-lora-sdxl",
     },
     {
       description: "A text-to-image model that can generate coherent text inside image.",
@@ -69,19 +67,23 @@ const taskData: TaskDataCustom = {
       id: "stabilityai/stable-diffusion",
     },
     {
-      description: "An text-to-image application that can generate coherent text inside the image.",
+      description: "A text-to-image application to generate comics.",
+      id: "jbilcke-hf/ai-comic-factory",
+    },
+    {
+      description: "A text-to-image application that can generate coherent text inside the image.",
       id: "DeepFloyd/IF",
     },
     {
-      description: "An powerful text-to-image application that can generate images.",
-      id: "kakaobrain/karlo",
+      description: "A powerful yet very fast image generation application.",
+      id: "latent-consistency/lcm-lora-for-sdxl",
     },
     {
-      description: "An powerful text-to-image application that can generates 3D representations.",
+      description: "A powerful text-to-image application that can generate 3D representations.",
       id: "hysts/Shap-E",
     },
     {
-      description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.",
+      description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
       id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI",
     },
   ],
@@ -68,7 +68,7 @@ const taskData: TaskDataCustom = {
   models: [
     {
       description: "A strong model for video generation.",
-      id: "PAIR/text2video-zero-controlnet-canny-arcane",
+      id: "Vchitect/LaVie",
     },
     {
       description: "A robust model for text-to-video generation.",
@@ -76,7 +76,7 @@ const taskData: TaskDataCustom = {
     },
     {
       description: "A text-to-video generation model with high quality and smooth outputs.",
-      id: "cerspense/zeroscope_v2_576w",
+      id: "hotshotco/Hotshot-XL",
     },
   ],
   spaces: [
@@ -86,7 +86,7 @@ const taskData: TaskDataCustom = {
     },
     {
       description: "An application that generates video from image and text.",
-      id: "TempoFunk/makeavid-sd-jax",
+      id: "Vchitect/LaVie",
     },
     {
       description: "An application that generates videos from text and provides multi-model support.",
@@ -71,6 +71,10 @@ const taskData: TaskDataCustom = {
     },
   ],
   spaces: [
+    {
+      description: "An application that compares visual question answering models across different tasks.",
+      id: "merve/pix2struct",
+    },
     {
       description: "An application that can answer questions based on images.",
       id: "nielsr/vilt-vqa",