@huggingface/tasks 0.13.1-test → 0.13.1-test2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198) hide show
  1. package/package.json +4 -2
  2. package/src/dataset-libraries.ts +89 -0
  3. package/src/default-widget-inputs.ts +718 -0
  4. package/src/gguf.ts +40 -0
  5. package/src/hardware.ts +482 -0
  6. package/src/index.ts +59 -0
  7. package/src/library-to-tasks.ts +76 -0
  8. package/src/local-apps.ts +412 -0
  9. package/src/model-data.ts +149 -0
  10. package/src/model-libraries-downloads.ts +18 -0
  11. package/src/model-libraries-snippets.ts +1128 -0
  12. package/src/model-libraries.ts +820 -0
  13. package/src/pipelines.ts +698 -0
  14. package/src/snippets/common.ts +39 -0
  15. package/src/snippets/curl.spec.ts +94 -0
  16. package/src/snippets/curl.ts +120 -0
  17. package/src/snippets/index.ts +7 -0
  18. package/src/snippets/inputs.ts +167 -0
  19. package/src/snippets/js.spec.ts +148 -0
  20. package/src/snippets/js.ts +305 -0
  21. package/src/snippets/python.spec.ts +144 -0
  22. package/src/snippets/python.ts +321 -0
  23. package/src/snippets/types.ts +16 -0
  24. package/src/tasks/audio-classification/about.md +86 -0
  25. package/src/tasks/audio-classification/data.ts +81 -0
  26. package/src/tasks/audio-classification/inference.ts +52 -0
  27. package/src/tasks/audio-classification/spec/input.json +35 -0
  28. package/src/tasks/audio-classification/spec/output.json +11 -0
  29. package/src/tasks/audio-to-audio/about.md +56 -0
  30. package/src/tasks/audio-to-audio/data.ts +70 -0
  31. package/src/tasks/automatic-speech-recognition/about.md +90 -0
  32. package/src/tasks/automatic-speech-recognition/data.ts +82 -0
  33. package/src/tasks/automatic-speech-recognition/inference.ts +160 -0
  34. package/src/tasks/automatic-speech-recognition/spec/input.json +35 -0
  35. package/src/tasks/automatic-speech-recognition/spec/output.json +38 -0
  36. package/src/tasks/chat-completion/inference.ts +322 -0
  37. package/src/tasks/chat-completion/spec/input.json +350 -0
  38. package/src/tasks/chat-completion/spec/output.json +206 -0
  39. package/src/tasks/chat-completion/spec/stream_output.json +213 -0
  40. package/src/tasks/common-definitions.json +100 -0
  41. package/src/tasks/depth-estimation/about.md +45 -0
  42. package/src/tasks/depth-estimation/data.ts +70 -0
  43. package/src/tasks/depth-estimation/inference.ts +35 -0
  44. package/src/tasks/depth-estimation/spec/input.json +25 -0
  45. package/src/tasks/depth-estimation/spec/output.json +16 -0
  46. package/src/tasks/document-question-answering/about.md +53 -0
  47. package/src/tasks/document-question-answering/data.ts +85 -0
  48. package/src/tasks/document-question-answering/inference.ts +110 -0
  49. package/src/tasks/document-question-answering/spec/input.json +85 -0
  50. package/src/tasks/document-question-answering/spec/output.json +36 -0
  51. package/src/tasks/feature-extraction/about.md +72 -0
  52. package/src/tasks/feature-extraction/data.ts +57 -0
  53. package/src/tasks/feature-extraction/inference.ts +40 -0
  54. package/src/tasks/feature-extraction/spec/input.json +47 -0
  55. package/src/tasks/feature-extraction/spec/output.json +15 -0
  56. package/src/tasks/fill-mask/about.md +51 -0
  57. package/src/tasks/fill-mask/data.ts +79 -0
  58. package/src/tasks/fill-mask/inference.ts +62 -0
  59. package/src/tasks/fill-mask/spec/input.json +38 -0
  60. package/src/tasks/fill-mask/spec/output.json +29 -0
  61. package/src/tasks/image-classification/about.md +50 -0
  62. package/src/tasks/image-classification/data.ts +88 -0
  63. package/src/tasks/image-classification/inference.ts +52 -0
  64. package/src/tasks/image-classification/spec/input.json +35 -0
  65. package/src/tasks/image-classification/spec/output.json +11 -0
  66. package/src/tasks/image-feature-extraction/about.md +23 -0
  67. package/src/tasks/image-feature-extraction/data.ts +59 -0
  68. package/src/tasks/image-segmentation/about.md +63 -0
  69. package/src/tasks/image-segmentation/data.ts +99 -0
  70. package/src/tasks/image-segmentation/inference.ts +69 -0
  71. package/src/tasks/image-segmentation/spec/input.json +45 -0
  72. package/src/tasks/image-segmentation/spec/output.json +26 -0
  73. package/src/tasks/image-text-to-text/about.md +76 -0
  74. package/src/tasks/image-text-to-text/data.ts +102 -0
  75. package/src/tasks/image-to-3d/about.md +62 -0
  76. package/src/tasks/image-to-3d/data.ts +75 -0
  77. package/src/tasks/image-to-image/about.md +129 -0
  78. package/src/tasks/image-to-image/data.ts +101 -0
  79. package/src/tasks/image-to-image/inference.ts +68 -0
  80. package/src/tasks/image-to-image/spec/input.json +55 -0
  81. package/src/tasks/image-to-image/spec/output.json +12 -0
  82. package/src/tasks/image-to-text/about.md +61 -0
  83. package/src/tasks/image-to-text/data.ts +82 -0
  84. package/src/tasks/image-to-text/inference.ts +143 -0
  85. package/src/tasks/image-to-text/spec/input.json +34 -0
  86. package/src/tasks/image-to-text/spec/output.json +14 -0
  87. package/src/tasks/index.ts +312 -0
  88. package/src/tasks/keypoint-detection/about.md +57 -0
  89. package/src/tasks/keypoint-detection/data.ts +50 -0
  90. package/src/tasks/mask-generation/about.md +65 -0
  91. package/src/tasks/mask-generation/data.ts +55 -0
  92. package/src/tasks/object-detection/about.md +37 -0
  93. package/src/tasks/object-detection/data.ts +86 -0
  94. package/src/tasks/object-detection/inference.ts +75 -0
  95. package/src/tasks/object-detection/spec/input.json +31 -0
  96. package/src/tasks/object-detection/spec/output.json +50 -0
  97. package/src/tasks/placeholder/about.md +15 -0
  98. package/src/tasks/placeholder/data.ts +21 -0
  99. package/src/tasks/placeholder/spec/input.json +35 -0
  100. package/src/tasks/placeholder/spec/output.json +17 -0
  101. package/src/tasks/question-answering/about.md +56 -0
  102. package/src/tasks/question-answering/data.ts +75 -0
  103. package/src/tasks/question-answering/inference.ts +99 -0
  104. package/src/tasks/question-answering/spec/input.json +67 -0
  105. package/src/tasks/question-answering/spec/output.json +29 -0
  106. package/src/tasks/reinforcement-learning/about.md +167 -0
  107. package/src/tasks/reinforcement-learning/data.ts +75 -0
  108. package/src/tasks/sentence-similarity/about.md +97 -0
  109. package/src/tasks/sentence-similarity/data.ts +101 -0
  110. package/src/tasks/sentence-similarity/inference.ts +32 -0
  111. package/src/tasks/sentence-similarity/spec/input.json +40 -0
  112. package/src/tasks/sentence-similarity/spec/output.json +12 -0
  113. package/src/tasks/summarization/about.md +58 -0
  114. package/src/tasks/summarization/data.ts +76 -0
  115. package/src/tasks/summarization/inference.ts +57 -0
  116. package/src/tasks/summarization/spec/input.json +42 -0
  117. package/src/tasks/summarization/spec/output.json +14 -0
  118. package/src/tasks/table-question-answering/about.md +43 -0
  119. package/src/tasks/table-question-answering/data.ts +59 -0
  120. package/src/tasks/table-question-answering/inference.ts +61 -0
  121. package/src/tasks/table-question-answering/spec/input.json +44 -0
  122. package/src/tasks/table-question-answering/spec/output.json +40 -0
  123. package/src/tasks/tabular-classification/about.md +65 -0
  124. package/src/tasks/tabular-classification/data.ts +68 -0
  125. package/src/tasks/tabular-regression/about.md +87 -0
  126. package/src/tasks/tabular-regression/data.ts +57 -0
  127. package/src/tasks/text-classification/about.md +173 -0
  128. package/src/tasks/text-classification/data.ts +103 -0
  129. package/src/tasks/text-classification/inference.ts +51 -0
  130. package/src/tasks/text-classification/spec/input.json +35 -0
  131. package/src/tasks/text-classification/spec/output.json +11 -0
  132. package/src/tasks/text-generation/about.md +154 -0
  133. package/src/tasks/text-generation/data.ts +114 -0
  134. package/src/tasks/text-generation/inference.ts +200 -0
  135. package/src/tasks/text-generation/spec/input.json +219 -0
  136. package/src/tasks/text-generation/spec/output.json +179 -0
  137. package/src/tasks/text-generation/spec/stream_output.json +103 -0
  138. package/src/tasks/text-to-3d/about.md +62 -0
  139. package/src/tasks/text-to-3d/data.ts +56 -0
  140. package/src/tasks/text-to-audio/inference.ts +143 -0
  141. package/src/tasks/text-to-audio/spec/input.json +31 -0
  142. package/src/tasks/text-to-audio/spec/output.json +17 -0
  143. package/src/tasks/text-to-image/about.md +96 -0
  144. package/src/tasks/text-to-image/data.ts +100 -0
  145. package/src/tasks/text-to-image/inference.ts +75 -0
  146. package/src/tasks/text-to-image/spec/input.json +63 -0
  147. package/src/tasks/text-to-image/spec/output.json +13 -0
  148. package/src/tasks/text-to-speech/about.md +63 -0
  149. package/src/tasks/text-to-speech/data.ts +79 -0
  150. package/src/tasks/text-to-speech/inference.ts +145 -0
  151. package/src/tasks/text-to-speech/spec/input.json +31 -0
  152. package/src/tasks/text-to-speech/spec/output.json +7 -0
  153. package/src/tasks/text-to-video/about.md +41 -0
  154. package/src/tasks/text-to-video/data.ts +102 -0
  155. package/src/tasks/text2text-generation/inference.ts +55 -0
  156. package/src/tasks/text2text-generation/spec/input.json +55 -0
  157. package/src/tasks/text2text-generation/spec/output.json +14 -0
  158. package/src/tasks/token-classification/about.md +76 -0
  159. package/src/tasks/token-classification/data.ts +92 -0
  160. package/src/tasks/token-classification/inference.ts +85 -0
  161. package/src/tasks/token-classification/spec/input.json +65 -0
  162. package/src/tasks/token-classification/spec/output.json +37 -0
  163. package/src/tasks/translation/about.md +65 -0
  164. package/src/tasks/translation/data.ts +70 -0
  165. package/src/tasks/translation/inference.ts +67 -0
  166. package/src/tasks/translation/spec/input.json +50 -0
  167. package/src/tasks/translation/spec/output.json +14 -0
  168. package/src/tasks/unconditional-image-generation/about.md +50 -0
  169. package/src/tasks/unconditional-image-generation/data.ts +72 -0
  170. package/src/tasks/video-classification/about.md +37 -0
  171. package/src/tasks/video-classification/data.ts +84 -0
  172. package/src/tasks/video-classification/inference.ts +59 -0
  173. package/src/tasks/video-classification/spec/input.json +42 -0
  174. package/src/tasks/video-classification/spec/output.json +10 -0
  175. package/src/tasks/video-text-to-text/about.md +98 -0
  176. package/src/tasks/video-text-to-text/data.ts +66 -0
  177. package/src/tasks/visual-question-answering/about.md +48 -0
  178. package/src/tasks/visual-question-answering/data.ts +97 -0
  179. package/src/tasks/visual-question-answering/inference.ts +62 -0
  180. package/src/tasks/visual-question-answering/spec/input.json +41 -0
  181. package/src/tasks/visual-question-answering/spec/output.json +21 -0
  182. package/src/tasks/zero-shot-classification/about.md +40 -0
  183. package/src/tasks/zero-shot-classification/data.ts +70 -0
  184. package/src/tasks/zero-shot-classification/inference.ts +67 -0
  185. package/src/tasks/zero-shot-classification/spec/input.json +50 -0
  186. package/src/tasks/zero-shot-classification/spec/output.json +11 -0
  187. package/src/tasks/zero-shot-image-classification/about.md +75 -0
  188. package/src/tasks/zero-shot-image-classification/data.ts +84 -0
  189. package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
  190. package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
  191. package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
  192. package/src/tasks/zero-shot-object-detection/about.md +45 -0
  193. package/src/tasks/zero-shot-object-detection/data.ts +67 -0
  194. package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
  195. package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
  196. package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
  197. package/src/tokenizer-data.ts +32 -0
  198. package/src/widget-example.ts +125 -0
@@ -0,0 +1,129 @@
1
+ Image-to-image pipelines can also be used in text-to-image tasks, to provide visual guidance to the text-guided generation process.
2
+
3
+ ## Use Cases
4
+
5
+ ### Image inpainting
6
+
7
+ Image inpainting is widely used during photography editing to remove unwanted objects, such as poles, wires, or sensor dust.
8
+
9
+ ### Image colorization
10
+
11
+ Old or black and white images can be brought back to life using an image colorization model.
12
+
13
+ ### Super Resolution
14
+
15
+ Super-resolution models increase the resolution of an image, allowing for higher-quality viewing and printing.
16
+
17
+ ## Inference
18
+
19
+ You can use pipelines for image-to-image in the 🧨 diffusers library to easily use image-to-image models. See an example for `StableDiffusionImg2ImgPipeline` below.
20
+
21
+ ```python
22
+ import torch
23
+ from diffusers import AutoPipelineForImage2Image
24
+ from diffusers.utils import make_image_grid, load_image
25
+
26
+ pipeline = AutoPipelineForImage2Image.from_pretrained(
27
+ "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
28
+ )
29
+
30
+ # this helps us reduce memory usage — since SDXL is a bit heavy, this could help by
31
+ # offloading the model to CPU w/o hurting performance.
32
+ pipeline.enable_model_cpu_offload()
33
+
34
+ # prepare image
35
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-sdxl-init.png"
36
+ init_image = load_image(url)
37
+
38
+ prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
39
+
40
+ # pass prompt and image to pipeline
41
+ image = pipeline(prompt, image=init_image, strength=0.5).images[0]
42
+ make_image_grid([init_image, image], rows=1, cols=2)
43
+ ```
44
+
45
+ You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer image-to-image models on Hugging Face Hub.
46
+
47
+ ```javascript
48
+ import { HfInference } from "@huggingface/inference";
49
+
50
+ const inference = new HfInference(HF_TOKEN);
51
+ await inference.imageToImage({
52
+ data: await (await fetch("image")).blob(),
53
+ model: "timbrooks/instruct-pix2pix",
54
+ parameters: {
55
+ prompt: "Deblur this image",
56
+ },
57
+ });
58
+ ```
59
+
60
+ ## Use Cases for Text-Guided Image Generation
61
+
62
+ ### Style Transfer
63
+
64
+ One of the most popular use cases of image-to-image is style transfer. With style transfer models:
65
+
66
+ - a regular photo can be transformed into a variety of artistic styles or genres, such as a watercolor painting, a comic book illustration and more.
67
+ - new images can be generated using a text prompt, in the style of a reference input image.
68
+
69
+ See 🧨diffusers example for style transfer with `AutoPipelineForText2Image` below.
70
+
71
+ ```python
72
+ from diffusers import AutoPipelineForText2Image
73
+ from diffusers.utils import load_image
74
+ import torch
75
+
76
+ # load pipeline
77
+ pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
78
+ pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
79
+
80
+ # set the adapter and scales - this is a component that lets us add the style control from an image to the text-to-image model
81
+ scale = {
82
+ "down": {"block_2": [0.0, 1.0]},
83
+ "up": {"block_0": [0.0, 1.0, 0.0]},
84
+ }
85
+ pipeline.set_ip_adapter_scale(scale)
86
+
87
+ style_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg")
88
+
89
+ generator = torch.Generator(device="cpu").manual_seed(26)
90
+ image = pipeline(
91
+ prompt="a cat, masterpiece, best quality, high quality",
92
+ ip_adapter_image=style_image,
93
+ negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
94
+ guidance_scale=5,
95
+ num_inference_steps=30,
96
+ generator=generator,
97
+ ).images[0]
98
+ image
99
+ ```
100
+
101
+ ### ControlNet
102
+
103
+ Controlling the outputs of diffusion models only with a text prompt is a challenging problem. ControlNet is a neural network model that provides image-based control to diffusion models. Control images can be edges or other landmarks extracted from a source image.
104
+ ![Examples](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/neuron/models/12-sdxl-text2img-controlnet.png)
105
+
106
+ ## Pix2Pix
107
+
108
+ Pix2Pix is a popular model used for image-to-image translation tasks. It is based on a conditional-GAN (generative adversarial network) where instead of a noise vector a 2D image is given as input. More information about Pix2Pix can be retrieved from this [link](https://phillipi.github.io/pix2pix/) where the associated paper and the GitHub repository can be found.
109
+
110
+ The images below show some examples extracted from the Pix2Pix paper. This model can be applied to various use cases. It is capable of relatively simpler things, e.g., converting a grayscale image to its colored version. But more importantly, it can generate realistic pictures from rough sketches (can be seen in the purse example) or from painting-like images (can be seen in the street and facade examples below).
111
+
112
+ ![Examples](https://huggingface.co/datasets/huggingfacejs/tasks/resolve/main/image-to-image/pix2pix_examples.jpg)
113
+
114
+ ## Useful Resources
115
+
116
+ - [Image-to-image guide with diffusers](https://huggingface.co/docs/diffusers/using-diffusers/img2img)
117
+ - Image inpainting: [inpainting with 🧨diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/inpaint), [demo](https://huggingface.co/spaces/diffusers/stable-diffusion-xl-inpainting)
118
+ - Colorization: [demo](https://huggingface.co/spaces/modelscope/old_photo_restoration)
119
+ - Super resolution: [image upscaling with 🧨diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/upscale#super-resolution), [demo](https://huggingface.co/spaces/radames/Enhance-This-HiDiffusion-SDXL)
120
+ - [Style transfer and layout control with diffusers 🧨](https://huggingface.co/docs/diffusers/main/en/using-diffusers/ip_adapter#style--layout-control)
121
+ - [Train your ControlNet with diffusers 🧨](https://huggingface.co/blog/train-your-controlnet)
122
+ - [Ultra fast ControlNet with 🧨 Diffusers](https://huggingface.co/blog/controlnet)
123
+ - [List of ControlNets trained in the community JAX Diffusers sprint](https://huggingface.co/spaces/jax-diffusers-event/leaderboard)
124
+
125
+ ## References
126
+
127
+ [1] P. Isola, J. -Y. Zhu, T. Zhou and A. A. Efros, "Image-to-Image Translation with Conditional Adversarial Networks," 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2017, pp. 5967-5976, doi: 10.1109/CVPR.2017.632.
128
+
129
+ This page was made possible thanks to the efforts of [Paul Gafton](https://github.com/Paul92) and [Osman Alenbey](https://huggingface.co/osman93).
@@ -0,0 +1,101 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ description: "Synthetic dataset, for image relighting",
7
+ id: "VIDIT",
8
+ },
9
+ {
10
+ description: "Multiple images of celebrities, used for facial expression translation",
11
+ id: "huggan/CelebA-faces",
12
+ },
13
+ ],
14
+ demo: {
15
+ inputs: [
16
+ {
17
+ filename: "image-to-image-input.jpeg",
18
+ type: "img",
19
+ },
20
+ ],
21
+ outputs: [
22
+ {
23
+ filename: "image-to-image-output.png",
24
+ type: "img",
25
+ },
26
+ ],
27
+ },
28
+ isPlaceholder: false,
29
+ metrics: [
30
+ {
31
+ description:
32
+ "Peak Signal to Noise Ratio (PSNR) is an approximation of the human perception, considering the ratio of the absolute intensity with respect to the variations. Measured in dB, a high value indicates a high fidelity.",
33
+ id: "PSNR",
34
+ },
35
+ {
36
+ description:
37
+ "Structural Similarity Index (SSIM) is a perceptual metric which compares the luminance, contrast and structure of two images. The values of SSIM range between -1 and 1, and higher values indicate closer resemblance to the original image.",
38
+ id: "SSIM",
39
+ },
40
+ {
41
+ description:
42
+ "Inception Score (IS) is an analysis of the labels predicted by an image classification model when presented with a sample of the generated images.",
43
+ id: "IS",
44
+ },
45
+ ],
46
+ models: [
47
+ {
48
+ description: "An image-to-image model to improve image resolution.",
49
+ id: "fal/AuraSR-v2",
50
+ },
51
+ {
52
+ description: "A model that increases the resolution of an image.",
53
+ id: "keras-io/super-resolution",
54
+ },
55
+ {
56
+ description:
57
+ "A model that creates a set of variations of the input image in the style of DALL-E using Stable Diffusion.",
58
+ id: "lambdalabs/sd-image-variations-diffusers",
59
+ },
60
+ {
61
+ description: "A model that generates images based on segments in the input image and the text prompt.",
62
+ id: "mfidabel/controlnet-segment-anything",
63
+ },
64
+ {
65
+ description: "A model that takes an image and an instruction to edit the image.",
66
+ id: "timbrooks/instruct-pix2pix",
67
+ },
68
+ ],
69
+ spaces: [
70
+ {
71
+ description: "Image enhancer application for low light.",
72
+ id: "keras-io/low-light-image-enhancement",
73
+ },
74
+ {
75
+ description: "Style transfer application.",
76
+ id: "keras-io/neural-style-transfer",
77
+ },
78
+ {
79
+ description: "An application that generates images based on segment control.",
80
+ id: "mfidabel/controlnet-segment-anything",
81
+ },
82
+ {
83
+ description: "Image generation application that takes image control and text prompt.",
84
+ id: "hysts/ControlNet",
85
+ },
86
+ {
87
+ description: "Colorize any image using this app.",
88
+ id: "ioclab/brightness-controlnet",
89
+ },
90
+ {
91
+ description: "Edit images with instructions.",
92
+ id: "timbrooks/instruct-pix2pix",
93
+ },
94
+ ],
95
+ summary:
96
+ "Image-to-image is the task of transforming an input image through a variety of possible manipulations and enhancements, such as super-resolution, image inpainting, colorization, and more.",
97
+ widgetModels: ["stabilityai/stable-diffusion-2-inpainting"],
98
+ youtubeId: "",
99
+ };
100
+
101
+ export default taskData;
@@ -0,0 +1,68 @@
1
+ /**
2
+ * Inference code generated from the JSON schema spec in ./spec
3
+ *
4
+ * Using src/scripts/inference-codegen
5
+ */
6
+
7
+ /**
8
+ * Inputs for Image To Image inference
9
+ */
10
+ export interface ImageToImageInput {
11
+ /**
12
+ * The input image data as a base64-encoded string. If no `parameters` are provided, you can
13
+ * also provide the image data as a raw bytes payload.
14
+ */
15
+ inputs: string;
16
+ /**
17
+ * Additional inference parameters
18
+ */
19
+ parameters?: ImageToImageParameters;
20
+ [property: string]: unknown;
21
+ }
22
+
23
+ /**
24
+ * Additional inference parameters
25
+ *
26
+ * Additional inference parameters for Image To Image
27
+ */
28
+ export interface ImageToImageParameters {
29
+ /**
30
+ * For diffusion models. A higher guidance scale value encourages the model to generate
31
+ * images closely linked to the text prompt at the expense of lower image quality.
32
+ */
33
+ guidance_scale?: number;
34
+ /**
35
 + * One or several prompts to guide what NOT to include in image generation.
36
+ */
37
+ negative_prompt?: string[];
38
+ /**
39
+ * For diffusion models. The number of denoising steps. More denoising steps usually lead to
40
+ * a higher quality image at the expense of slower inference.
41
+ */
42
+ num_inference_steps?: number;
43
+ /**
44
+ * The size in pixel of the output image.
45
+ */
46
+ target_size?: TargetSize;
47
+ [property: string]: unknown;
48
+ }
49
+
50
+ /**
51
+ * The size in pixel of the output image.
52
+ */
53
+ export interface TargetSize {
54
+ height: number;
55
+ width: number;
56
+ [property: string]: unknown;
57
+ }
58
+
59
+ /**
60
+ * Outputs of inference for the Image To Image task
61
+ */
62
+ export interface ImageToImageOutput {
63
+ /**
64
+ * The output image returned as raw bytes in the payload.
65
+ */
66
+ image?: unknown;
67
+ [property: string]: unknown;
68
+ }
@@ -0,0 +1,55 @@
1
+ {
2
+ "$id": "/inference/schemas/image-to-image/input.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Inputs for Image To Image inference",
5
+ "title": "ImageToImageInput",
6
+ "type": "object",
7
+ "properties": {
8
+ "inputs": {
9
+ "type": "string",
10
+ "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload."
11
+ },
12
+ "parameters": {
13
+ "description": "Additional inference parameters",
14
+ "$ref": "#/$defs/ImageToImageParameters"
15
+ }
16
+ },
17
+ "$defs": {
18
+ "ImageToImageParameters": {
19
+ "title": "ImageToImageParameters",
20
+ "description": "Additional inference parameters for Image To Image",
21
+ "type": "object",
22
+ "properties": {
23
+ "guidance_scale": {
24
+ "type": "number",
25
+ "description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality."
26
+ },
27
+ "negative_prompt": {
28
+ "type": "array",
29
+ "items": {
30
+ "type": "string"
31
+ },
32
 + "description": "One or several prompts to guide what NOT to include in image generation."
33
+ },
34
+ "num_inference_steps": {
35
+ "type": "integer",
36
+ "description": "For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
37
+ },
38
+ "target_size": {
39
+ "type": "object",
40
+ "description": "The size in pixel of the output image.",
41
+ "properties": {
42
+ "width": {
43
+ "type": "integer"
44
+ },
45
+ "height": {
46
+ "type": "integer"
47
+ }
48
+ },
49
+ "required": ["width", "height"]
50
+ }
51
+ }
52
+ }
53
+ },
54
+ "required": ["inputs"]
55
+ }
@@ -0,0 +1,12 @@
1
+ {
2
+ "$id": "/inference/schemas/image-to-image/output.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Outputs of inference for the Image To Image task",
5
+ "title": "ImageToImageOutput",
6
+ "type": "object",
7
+ "properties": {
8
+ "image": {
9
+ "description": "The output image returned as raw bytes in the payload."
10
+ }
11
+ }
12
+ }
@@ -0,0 +1,61 @@
1
+ ## Use Cases
2
+
3
+ ### Image Captioning
4
+
5
+ Image Captioning is the process of generating a textual description of an image.
6
+ This can help visually impaired people understand what's happening in their surroundings.
7
+
8
+ ### Optical Character Recognition (OCR)
9
+
10
+ OCR models convert the text present in an image, e.g. a scanned document, to text.
11
+
12
+ ## Inference
13
+
14
+ ### Image Captioning
15
+
16
+ You can use the 🤗 Transformers library's `image-to-text` pipeline to generate a caption for an image input.
17
+
18
+ ```python
19
+ from transformers import pipeline
20
+
21
+ captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
22
+ captioner("https://huggingface.co/datasets/Narsil/image_dummy/resolve/main/parrots.png")
23
+ ## [{'generated_text': 'two birds are standing next to each other '}]
24
+ ```
25
+
26
+ ### OCR
27
+
28
+ This code snippet uses Microsoft’s TrOCR, an encoder-decoder model consisting of an image Transformer encoder and a text Transformer decoder for state-of-the-art optical character recognition (OCR) on single-text line images.
29
+
30
+ ```python
31
+ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
32
+
33
+ processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
34
+ model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')
35
+ pixel_values = processor(images="image.jpeg", return_tensors="pt").pixel_values
36
+
37
+ generated_ids = model.generate(pixel_values)
38
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
39
+
40
+ ```
41
+
42
+ You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer image-to-text models on Hugging Face Hub.
43
+
44
+ ```javascript
45
+ import { HfInference } from "@huggingface/inference";
46
+
47
+ const inference = new HfInference(HF_TOKEN);
48
+ await inference.imageToText({
49
+ data: await (await fetch("https://picsum.photos/300/300")).blob(),
50
+ model: "Salesforce/blip-image-captioning-base",
51
+ });
52
+ ```
53
+
54
+ ## Useful Resources
55
+
56
+ - [Image Captioning](https://huggingface.co/docs/transformers/main/en/tasks/image_captioning)
57
+ - [Image Captioning Use Case](https://blog.google/outreach-initiatives/accessibility/get-image-descriptions/)
58
+ - [Train Image Captioning model on your dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/GIT/Fine_tune_GIT_on_an_image_captioning_dataset.ipynb)
59
+ - [Train OCR model on your dataset](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/TrOCR)
60
+
61
+ This page was made possible thanks to the efforts of [Sukesh Perla](https://huggingface.co/hitchhiker3010) and [Johannes Kolbe](https://huggingface.co/johko).
@@ -0,0 +1,82 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ // TODO write proper description
7
+ description: "Dataset from 12M image-text of Reddit",
8
+ id: "red_caps",
9
+ },
10
+ {
11
+ // TODO write proper description
12
+ description: "Dataset from 3.3M images of Google",
13
+ id: "datasets/conceptual_captions",
14
+ },
15
+ ],
16
+ demo: {
17
+ inputs: [
18
+ {
19
+ filename: "savanna.jpg",
20
+ type: "img",
21
+ },
22
+ ],
23
+ outputs: [
24
+ {
25
+ label: "Detailed description",
26
+ content: "a herd of giraffes and zebras grazing in a field",
27
+ type: "text",
28
+ },
29
+ ],
30
+ },
31
+ metrics: [],
32
+ models: [
33
+ {
34
+ description: "A robust image captioning model.",
35
+ id: "Salesforce/blip2-opt-2.7b",
36
+ },
37
+ {
38
+ description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
39
+ id: "microsoft/kosmos-2-patch14-224",
40
+ },
41
+ {
42
+ description: "A strong optical character recognition model.",
43
+ id: "facebook/nougat-base",
44
+ },
45
+ {
46
+ description: "A powerful model that lets you have a conversation with the image.",
47
+ id: "llava-hf/llava-1.5-7b-hf",
48
+ },
49
+ ],
50
+ spaces: [
51
+ {
52
+ description: "An application that compares various image captioning models.",
53
+ id: "nielsr/comparing-captioning-models",
54
+ },
55
+ {
56
+ description: "A robust image captioning application.",
57
+ id: "flax-community/image-captioning",
58
+ },
59
+ {
60
+ description: "An application that transcribes handwritings into text.",
61
+ id: "nielsr/TrOCR-handwritten",
62
+ },
63
+ {
64
+ description: "An application that can caption images and answer questions about a given image.",
65
+ id: "Salesforce/BLIP",
66
+ },
67
+ {
68
+ description: "An application that can caption images and answer questions with a conversational agent.",
69
+ id: "Salesforce/BLIP2",
70
+ },
71
+ {
72
+ description: "An image captioning application that demonstrates the effect of noise on captions.",
73
+ id: "johko/capdec-image-captioning",
74
+ },
75
+ ],
76
+ summary:
77
+ "Image to text models output a text from a given image. Image captioning or optical character recognition can be considered as the most common applications of image to text.",
78
+ widgetModels: ["Salesforce/blip-image-captioning-large"],
79
+ youtubeId: "",
80
+ };
81
+
82
+ export default taskData;
@@ -0,0 +1,143 @@
1
+ /**
2
+ * Inference code generated from the JSON schema spec in ./spec
3
+ *
4
+ * Using src/scripts/inference-codegen
5
+ */
6
+
7
+ /**
8
+ * Inputs for Image To Text inference
9
+ */
10
+ export interface ImageToTextInput {
11
+ /**
12
+ * The input image data
13
+ */
14
+ inputs: unknown;
15
+ /**
16
+ * Additional inference parameters
17
+ */
18
+ parameters?: ImageToTextParameters;
19
+ [property: string]: unknown;
20
+ }
21
+
22
+ /**
23
+ * Additional inference parameters
24
+ *
25
+ * Additional inference parameters for Image To Text
26
+ */
27
+ export interface ImageToTextParameters {
28
+ /**
29
+ * Parametrization of the text generation process
30
+ */
31
+ generation_parameters?: GenerationParameters;
32
+ /**
33
+ * The amount of maximum tokens to generate.
34
+ */
35
+ max_new_tokens?: number;
36
+ [property: string]: unknown;
37
+ }
38
+
39
+ /**
40
+ * Parametrization of the text generation process
41
+ *
42
+ * Ad-hoc parametrization of the text generation process
43
+ */
44
+ export interface GenerationParameters {
45
+ /**
46
+ * Whether to use sampling instead of greedy decoding when generating new tokens.
47
+ */
48
+ do_sample?: boolean;
49
+ /**
50
+ * Controls the stopping condition for beam-based methods.
51
+ */
52
+ early_stopping?: EarlyStoppingUnion;
53
+ /**
54
+ * If set to float strictly between 0 and 1, only tokens with a conditional probability
55
+ * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
56
+ * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
57
+ * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
58
+ */
59
+ epsilon_cutoff?: number;
60
+ /**
61
+ * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
62
+ * float strictly between 0 and 1, a token is only considered if it is greater than either
63
+ * eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
64
+ * term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
65
+ * the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
66
+ * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
67
+ * for more details.
68
+ */
69
+ eta_cutoff?: number;
70
+ /**
71
+ * The maximum length (in tokens) of the generated text, including the input.
72
+ */
73
+ max_length?: number;
74
+ /**
75
+ * The maximum number of tokens to generate. Takes precedence over max_length.
76
+ */
77
+ max_new_tokens?: number;
78
+ /**
79
+ * The minimum length (in tokens) of the generated text, including the input.
80
+ */
81
+ min_length?: number;
82
+ /**
83
+ * The minimum number of tokens to generate. Takes precedence over min_length.
84
+ */
85
+ min_new_tokens?: number;
86
+ /**
87
+ * Number of groups to divide num_beams into in order to ensure diversity among different
88
+ * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
89
+ */
90
+ num_beam_groups?: number;
91
+ /**
92
+ * Number of beams to use for beam search.
93
+ */
94
+ num_beams?: number;
95
+ /**
96
+ * The value balances the model confidence and the degeneration penalty in contrastive
97
+ * search decoding.
98
+ */
99
+ penalty_alpha?: number;
100
+ /**
101
+ * The value used to modulate the next token probabilities.
102
+ */
103
+ temperature?: number;
104
+ /**
105
+ * The number of highest probability vocabulary tokens to keep for top-k-filtering.
106
+ */
107
+ top_k?: number;
108
+ /**
109
+ * If set to float < 1, only the smallest set of most probable tokens with probabilities
110
+ * that add up to top_p or higher are kept for generation.
111
+ */
112
+ top_p?: number;
113
+ /**
114
+ * Local typicality measures how similar the conditional probability of predicting a target
115
+ * token next is to the expected conditional probability of predicting a random token next,
116
+ * given the partial text already generated. If set to float < 1, the smallest set of the
117
+ * most locally typical tokens with probabilities that add up to typical_p or higher are
118
+ * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
119
+ */
120
+ typical_p?: number;
121
+ /**
122
+ * Whether the model should use the past last key/values attentions to speed up decoding
123
+ */
124
+ use_cache?: boolean;
125
+ [property: string]: unknown;
126
+ }
127
+
128
+ /**
129
+ * Controls the stopping condition for beam-based methods.
130
+ */
131
+ export type EarlyStoppingUnion = boolean | "never";
132
+
133
/**
 * Outputs of inference for the Image To Text task
 */
export interface ImageToTextOutput {
	// NOTE(review): looks like a codegen artifact — this duplicates `generated_text`
	// in camelCase but is required and untyped; confirm against ./spec/output.json
	// and the inference-codegen script before relying on it.
	generatedText: unknown;
	/**
	 * The generated text.
	 */
	generated_text?: string;
	[property: string]: unknown;
}
@@ -0,0 +1,34 @@
1
+ {
2
+ "$id": "/inference/schemas/image-to-text/input.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Inputs for Image To Text inference",
5
+ "title": "ImageToTextInput",
6
+ "type": "object",
7
+ "properties": {
8
+ "inputs": {
9
+ "description": "The input image data"
10
+ },
11
+ "parameters": {
12
+ "description": "Additional inference parameters",
13
+ "$ref": "#/$defs/ImageToTextParameters"
14
+ }
15
+ },
16
+ "$defs": {
17
+ "ImageToTextParameters": {
18
+ "title": "ImageToTextParameters",
19
+ "description": "Additional inference parameters for Image To Text",
20
+ "type": "object",
21
+ "properties": {
22
+ "max_new_tokens": {
23
+ "type": "integer",
24
+ "description": "The maximum number of tokens to generate."
25
+ },
26
+ "generation_parameters": {
27
+ "description": "Parametrization of the text generation process",
28
+ "$ref": "/inference/schemas/common-definitions.json#/definitions/GenerationParameters"
29
+ }
30
+ }
31
+ }
32
+ },
33
+ "required": ["inputs"]
34
+ }