@huggingface/tasks 0.13.1-test → 0.13.1-test2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198) hide show
  1. package/package.json +4 -2
  2. package/src/dataset-libraries.ts +89 -0
  3. package/src/default-widget-inputs.ts +718 -0
  4. package/src/gguf.ts +40 -0
  5. package/src/hardware.ts +482 -0
  6. package/src/index.ts +59 -0
  7. package/src/library-to-tasks.ts +76 -0
  8. package/src/local-apps.ts +412 -0
  9. package/src/model-data.ts +149 -0
  10. package/src/model-libraries-downloads.ts +18 -0
  11. package/src/model-libraries-snippets.ts +1128 -0
  12. package/src/model-libraries.ts +820 -0
  13. package/src/pipelines.ts +698 -0
  14. package/src/snippets/common.ts +39 -0
  15. package/src/snippets/curl.spec.ts +94 -0
  16. package/src/snippets/curl.ts +120 -0
  17. package/src/snippets/index.ts +7 -0
  18. package/src/snippets/inputs.ts +167 -0
  19. package/src/snippets/js.spec.ts +148 -0
  20. package/src/snippets/js.ts +305 -0
  21. package/src/snippets/python.spec.ts +144 -0
  22. package/src/snippets/python.ts +321 -0
  23. package/src/snippets/types.ts +16 -0
  24. package/src/tasks/audio-classification/about.md +86 -0
  25. package/src/tasks/audio-classification/data.ts +81 -0
  26. package/src/tasks/audio-classification/inference.ts +52 -0
  27. package/src/tasks/audio-classification/spec/input.json +35 -0
  28. package/src/tasks/audio-classification/spec/output.json +11 -0
  29. package/src/tasks/audio-to-audio/about.md +56 -0
  30. package/src/tasks/audio-to-audio/data.ts +70 -0
  31. package/src/tasks/automatic-speech-recognition/about.md +90 -0
  32. package/src/tasks/automatic-speech-recognition/data.ts +82 -0
  33. package/src/tasks/automatic-speech-recognition/inference.ts +160 -0
  34. package/src/tasks/automatic-speech-recognition/spec/input.json +35 -0
  35. package/src/tasks/automatic-speech-recognition/spec/output.json +38 -0
  36. package/src/tasks/chat-completion/inference.ts +322 -0
  37. package/src/tasks/chat-completion/spec/input.json +350 -0
  38. package/src/tasks/chat-completion/spec/output.json +206 -0
  39. package/src/tasks/chat-completion/spec/stream_output.json +213 -0
  40. package/src/tasks/common-definitions.json +100 -0
  41. package/src/tasks/depth-estimation/about.md +45 -0
  42. package/src/tasks/depth-estimation/data.ts +70 -0
  43. package/src/tasks/depth-estimation/inference.ts +35 -0
  44. package/src/tasks/depth-estimation/spec/input.json +25 -0
  45. package/src/tasks/depth-estimation/spec/output.json +16 -0
  46. package/src/tasks/document-question-answering/about.md +53 -0
  47. package/src/tasks/document-question-answering/data.ts +85 -0
  48. package/src/tasks/document-question-answering/inference.ts +110 -0
  49. package/src/tasks/document-question-answering/spec/input.json +85 -0
  50. package/src/tasks/document-question-answering/spec/output.json +36 -0
  51. package/src/tasks/feature-extraction/about.md +72 -0
  52. package/src/tasks/feature-extraction/data.ts +57 -0
  53. package/src/tasks/feature-extraction/inference.ts +40 -0
  54. package/src/tasks/feature-extraction/spec/input.json +47 -0
  55. package/src/tasks/feature-extraction/spec/output.json +15 -0
  56. package/src/tasks/fill-mask/about.md +51 -0
  57. package/src/tasks/fill-mask/data.ts +79 -0
  58. package/src/tasks/fill-mask/inference.ts +62 -0
  59. package/src/tasks/fill-mask/spec/input.json +38 -0
  60. package/src/tasks/fill-mask/spec/output.json +29 -0
  61. package/src/tasks/image-classification/about.md +50 -0
  62. package/src/tasks/image-classification/data.ts +88 -0
  63. package/src/tasks/image-classification/inference.ts +52 -0
  64. package/src/tasks/image-classification/spec/input.json +35 -0
  65. package/src/tasks/image-classification/spec/output.json +11 -0
  66. package/src/tasks/image-feature-extraction/about.md +23 -0
  67. package/src/tasks/image-feature-extraction/data.ts +59 -0
  68. package/src/tasks/image-segmentation/about.md +63 -0
  69. package/src/tasks/image-segmentation/data.ts +99 -0
  70. package/src/tasks/image-segmentation/inference.ts +69 -0
  71. package/src/tasks/image-segmentation/spec/input.json +45 -0
  72. package/src/tasks/image-segmentation/spec/output.json +26 -0
  73. package/src/tasks/image-text-to-text/about.md +76 -0
  74. package/src/tasks/image-text-to-text/data.ts +102 -0
  75. package/src/tasks/image-to-3d/about.md +62 -0
  76. package/src/tasks/image-to-3d/data.ts +75 -0
  77. package/src/tasks/image-to-image/about.md +129 -0
  78. package/src/tasks/image-to-image/data.ts +101 -0
  79. package/src/tasks/image-to-image/inference.ts +68 -0
  80. package/src/tasks/image-to-image/spec/input.json +55 -0
  81. package/src/tasks/image-to-image/spec/output.json +12 -0
  82. package/src/tasks/image-to-text/about.md +61 -0
  83. package/src/tasks/image-to-text/data.ts +82 -0
  84. package/src/tasks/image-to-text/inference.ts +143 -0
  85. package/src/tasks/image-to-text/spec/input.json +34 -0
  86. package/src/tasks/image-to-text/spec/output.json +14 -0
  87. package/src/tasks/index.ts +312 -0
  88. package/src/tasks/keypoint-detection/about.md +57 -0
  89. package/src/tasks/keypoint-detection/data.ts +50 -0
  90. package/src/tasks/mask-generation/about.md +65 -0
  91. package/src/tasks/mask-generation/data.ts +55 -0
  92. package/src/tasks/object-detection/about.md +37 -0
  93. package/src/tasks/object-detection/data.ts +86 -0
  94. package/src/tasks/object-detection/inference.ts +75 -0
  95. package/src/tasks/object-detection/spec/input.json +31 -0
  96. package/src/tasks/object-detection/spec/output.json +50 -0
  97. package/src/tasks/placeholder/about.md +15 -0
  98. package/src/tasks/placeholder/data.ts +21 -0
  99. package/src/tasks/placeholder/spec/input.json +35 -0
  100. package/src/tasks/placeholder/spec/output.json +17 -0
  101. package/src/tasks/question-answering/about.md +56 -0
  102. package/src/tasks/question-answering/data.ts +75 -0
  103. package/src/tasks/question-answering/inference.ts +99 -0
  104. package/src/tasks/question-answering/spec/input.json +67 -0
  105. package/src/tasks/question-answering/spec/output.json +29 -0
  106. package/src/tasks/reinforcement-learning/about.md +167 -0
  107. package/src/tasks/reinforcement-learning/data.ts +75 -0
  108. package/src/tasks/sentence-similarity/about.md +97 -0
  109. package/src/tasks/sentence-similarity/data.ts +101 -0
  110. package/src/tasks/sentence-similarity/inference.ts +32 -0
  111. package/src/tasks/sentence-similarity/spec/input.json +40 -0
  112. package/src/tasks/sentence-similarity/spec/output.json +12 -0
  113. package/src/tasks/summarization/about.md +58 -0
  114. package/src/tasks/summarization/data.ts +76 -0
  115. package/src/tasks/summarization/inference.ts +57 -0
  116. package/src/tasks/summarization/spec/input.json +42 -0
  117. package/src/tasks/summarization/spec/output.json +14 -0
  118. package/src/tasks/table-question-answering/about.md +43 -0
  119. package/src/tasks/table-question-answering/data.ts +59 -0
  120. package/src/tasks/table-question-answering/inference.ts +61 -0
  121. package/src/tasks/table-question-answering/spec/input.json +44 -0
  122. package/src/tasks/table-question-answering/spec/output.json +40 -0
  123. package/src/tasks/tabular-classification/about.md +65 -0
  124. package/src/tasks/tabular-classification/data.ts +68 -0
  125. package/src/tasks/tabular-regression/about.md +87 -0
  126. package/src/tasks/tabular-regression/data.ts +57 -0
  127. package/src/tasks/text-classification/about.md +173 -0
  128. package/src/tasks/text-classification/data.ts +103 -0
  129. package/src/tasks/text-classification/inference.ts +51 -0
  130. package/src/tasks/text-classification/spec/input.json +35 -0
  131. package/src/tasks/text-classification/spec/output.json +11 -0
  132. package/src/tasks/text-generation/about.md +154 -0
  133. package/src/tasks/text-generation/data.ts +114 -0
  134. package/src/tasks/text-generation/inference.ts +200 -0
  135. package/src/tasks/text-generation/spec/input.json +219 -0
  136. package/src/tasks/text-generation/spec/output.json +179 -0
  137. package/src/tasks/text-generation/spec/stream_output.json +103 -0
  138. package/src/tasks/text-to-3d/about.md +62 -0
  139. package/src/tasks/text-to-3d/data.ts +56 -0
  140. package/src/tasks/text-to-audio/inference.ts +143 -0
  141. package/src/tasks/text-to-audio/spec/input.json +31 -0
  142. package/src/tasks/text-to-audio/spec/output.json +17 -0
  143. package/src/tasks/text-to-image/about.md +96 -0
  144. package/src/tasks/text-to-image/data.ts +100 -0
  145. package/src/tasks/text-to-image/inference.ts +75 -0
  146. package/src/tasks/text-to-image/spec/input.json +63 -0
  147. package/src/tasks/text-to-image/spec/output.json +13 -0
  148. package/src/tasks/text-to-speech/about.md +63 -0
  149. package/src/tasks/text-to-speech/data.ts +79 -0
  150. package/src/tasks/text-to-speech/inference.ts +145 -0
  151. package/src/tasks/text-to-speech/spec/input.json +31 -0
  152. package/src/tasks/text-to-speech/spec/output.json +7 -0
  153. package/src/tasks/text-to-video/about.md +41 -0
  154. package/src/tasks/text-to-video/data.ts +102 -0
  155. package/src/tasks/text2text-generation/inference.ts +55 -0
  156. package/src/tasks/text2text-generation/spec/input.json +55 -0
  157. package/src/tasks/text2text-generation/spec/output.json +14 -0
  158. package/src/tasks/token-classification/about.md +76 -0
  159. package/src/tasks/token-classification/data.ts +92 -0
  160. package/src/tasks/token-classification/inference.ts +85 -0
  161. package/src/tasks/token-classification/spec/input.json +65 -0
  162. package/src/tasks/token-classification/spec/output.json +37 -0
  163. package/src/tasks/translation/about.md +65 -0
  164. package/src/tasks/translation/data.ts +70 -0
  165. package/src/tasks/translation/inference.ts +67 -0
  166. package/src/tasks/translation/spec/input.json +50 -0
  167. package/src/tasks/translation/spec/output.json +14 -0
  168. package/src/tasks/unconditional-image-generation/about.md +50 -0
  169. package/src/tasks/unconditional-image-generation/data.ts +72 -0
  170. package/src/tasks/video-classification/about.md +37 -0
  171. package/src/tasks/video-classification/data.ts +84 -0
  172. package/src/tasks/video-classification/inference.ts +59 -0
  173. package/src/tasks/video-classification/spec/input.json +42 -0
  174. package/src/tasks/video-classification/spec/output.json +10 -0
  175. package/src/tasks/video-text-to-text/about.md +98 -0
  176. package/src/tasks/video-text-to-text/data.ts +66 -0
  177. package/src/tasks/visual-question-answering/about.md +48 -0
  178. package/src/tasks/visual-question-answering/data.ts +97 -0
  179. package/src/tasks/visual-question-answering/inference.ts +62 -0
  180. package/src/tasks/visual-question-answering/spec/input.json +41 -0
  181. package/src/tasks/visual-question-answering/spec/output.json +21 -0
  182. package/src/tasks/zero-shot-classification/about.md +40 -0
  183. package/src/tasks/zero-shot-classification/data.ts +70 -0
  184. package/src/tasks/zero-shot-classification/inference.ts +67 -0
  185. package/src/tasks/zero-shot-classification/spec/input.json +50 -0
  186. package/src/tasks/zero-shot-classification/spec/output.json +11 -0
  187. package/src/tasks/zero-shot-image-classification/about.md +75 -0
  188. package/src/tasks/zero-shot-image-classification/data.ts +84 -0
  189. package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
  190. package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
  191. package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
  192. package/src/tasks/zero-shot-object-detection/about.md +45 -0
  193. package/src/tasks/zero-shot-object-detection/data.ts +67 -0
  194. package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
  195. package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
  196. package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
  197. package/src/tokenizer-data.ts +32 -0
  198. package/src/widget-example.ts +125 -0
@@ -0,0 +1,14 @@
1
+ {
2
+ "$id": "/inference/schemas/image-to-text/output.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Outputs of inference for the Image To Text task",
5
+ "title": "ImageToTextOutput",
6
+ "type": "object",
7
+ "properties": {
8
+ "generated_text": {
9
+ "type": "string",
10
+ "description": "The generated text."
11
+ }
12
+ },
13
+ "required": ["generated_text"]
14
+ }
@@ -0,0 +1,312 @@
1
+ import type { PipelineType } from "../pipelines.js";
2
+ import { PIPELINE_DATA } from "../pipelines.js";
3
+
4
+ import audioClassification from "./audio-classification/data.js";
5
+ import audioToAudio from "./audio-to-audio/data.js";
6
+ import automaticSpeechRecognition from "./automatic-speech-recognition/data.js";
7
+ import documentQuestionAnswering from "./document-question-answering/data.js";
8
+ import featureExtraction from "./feature-extraction/data.js";
9
+ import fillMask from "./fill-mask/data.js";
10
+ import imageClassification from "./image-classification/data.js";
11
+ import imageFeatureExtraction from "./image-feature-extraction/data.js";
12
+ import imageToImage from "./image-to-image/data.js";
13
+ import imageToText from "./image-to-text/data.js";
14
+ import imageTextToText from "./image-text-to-text/data.js";
15
+ import imageSegmentation from "./image-segmentation/data.js";
16
+ import maskGeneration from "./mask-generation/data.js";
17
+ import objectDetection from "./object-detection/data.js";
18
+ import depthEstimation from "./depth-estimation/data.js";
19
+ import placeholder from "./placeholder/data.js";
20
+ import reinforcementLearning from "./reinforcement-learning/data.js";
21
+ import questionAnswering from "./question-answering/data.js";
22
+ import sentenceSimilarity from "./sentence-similarity/data.js";
23
+ import summarization from "./summarization/data.js";
24
+ import tableQuestionAnswering from "./table-question-answering/data.js";
25
+ import tabularClassification from "./tabular-classification/data.js";
26
+ import tabularRegression from "./tabular-regression/data.js";
27
+ import textToImage from "./text-to-image/data.js";
28
+ import textToSpeech from "./text-to-speech/data.js";
29
+ import tokenClassification from "./token-classification/data.js";
30
+ import translation from "./translation/data.js";
31
+ import textClassification from "./text-classification/data.js";
32
+ import textGeneration from "./text-generation/data.js";
33
+ import textToVideo from "./text-to-video/data.js";
34
+ import unconditionalImageGeneration from "./unconditional-image-generation/data.js";
35
+ import videoClassification from "./video-classification/data.js";
36
+ import visualQuestionAnswering from "./visual-question-answering/data.js";
37
+ import zeroShotClassification from "./zero-shot-classification/data.js";
38
+ import zeroShotImageClassification from "./zero-shot-image-classification/data.js";
39
+ import zeroShotObjectDetection from "./zero-shot-object-detection/data.js";
40
+ import imageTo3D from "./image-to-3d/data.js";
41
+ import textTo3D from "./text-to-3d/data.js";
42
+ import keypointDetection from "./keypoint-detection/data.js";
43
+ import videoTextToText from "./video-text-to-text/data.js";
44
+
45
+ export type * from "./audio-classification/inference.js";
46
+ export type * from "./automatic-speech-recognition/inference.js";
47
+ export type {
48
+ ChatCompletionInput,
49
+ ChatCompletionInputMessage,
50
+ ChatCompletionOutput,
51
+ ChatCompletionOutputComplete,
52
+ ChatCompletionOutputMessage,
53
+ ChatCompletionStreamOutput,
54
+ ChatCompletionStreamOutputChoice,
55
+ ChatCompletionStreamOutputDelta,
56
+ } from "./chat-completion/inference.js";
57
+ export type * from "./document-question-answering/inference.js";
58
+ export type * from "./feature-extraction/inference.js";
59
+ export type * from "./fill-mask/inference.js";
60
+ export type {
61
+ ImageClassificationInput,
62
+ ImageClassificationOutput,
63
+ ImageClassificationOutputElement,
64
+ ImageClassificationParameters,
65
+ } from "./image-classification/inference.js";
66
+ export type * from "./image-to-image/inference.js";
67
+ export type { ImageToTextInput, ImageToTextOutput, ImageToTextParameters } from "./image-to-text/inference.js";
68
+ export type * from "./image-segmentation/inference.js";
69
+ export type * from "./object-detection/inference.js";
70
+ export type * from "./depth-estimation/inference.js";
71
+ export type * from "./question-answering/inference.js";
72
+ export type * from "./sentence-similarity/inference.js";
73
+ export type * from "./summarization/inference.js";
74
+ export type * from "./table-question-answering/inference.js";
75
+ export type { TextToImageInput, TextToImageOutput, TextToImageParameters } from "./text-to-image/inference.js";
76
+ export type { TextToSpeechParameters, TextToSpeechInput, TextToSpeechOutput } from "./text-to-speech/inference.js";
77
+ export type * from "./token-classification/inference.js";
78
+ export type { TranslationInput, TranslationOutput } from "./translation/inference.js";
79
+ export type {
80
+ ClassificationOutputTransform,
81
+ TextClassificationInput,
82
+ TextClassificationOutput,
83
+ TextClassificationOutputElement,
84
+ TextClassificationParameters,
85
+ } from "./text-classification/inference.js";
86
+ export type {
87
+ TextGenerationOutputFinishReason,
88
+ TextGenerationOutputPrefillToken,
89
+ TextGenerationInput,
90
+ TextGenerationOutput,
91
+ TextGenerationOutputDetails,
92
+ TextGenerationInputGenerateParameters,
93
+ TextGenerationOutputBestOfSequence,
94
+ TextGenerationOutputToken,
95
+ TextGenerationStreamOutputStreamDetails,
96
+ TextGenerationStreamOutput,
97
+ } from "./text-generation/inference.js";
98
+ export type * from "./video-classification/inference.js";
99
+ export type * from "./visual-question-answering/inference.js";
100
+ export type * from "./zero-shot-classification/inference.js";
101
+ export type * from "./zero-shot-image-classification/inference.js";
102
+ export type {
103
+ BoundingBox,
104
+ ZeroShotObjectDetectionInput,
105
+ ZeroShotObjectDetectionInputData,
106
+ ZeroShotObjectDetectionOutput,
107
+ ZeroShotObjectDetectionOutputElement,
108
+ } from "./zero-shot-object-detection/inference.js";
109
+
110
+ import type { ModelLibraryKey } from "../model-libraries.js";
111
+
112
+ /**
113
+ * Model libraries compatible with each ML task
114
+ */
115
+ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
116
+ "audio-classification": ["speechbrain", "transformers", "transformers.js"],
117
+ "audio-to-audio": ["asteroid", "fairseq", "speechbrain"],
118
+ "automatic-speech-recognition": ["espnet", "nemo", "speechbrain", "transformers", "transformers.js"],
119
+ "depth-estimation": ["transformers", "transformers.js"],
120
+ "document-question-answering": ["transformers", "transformers.js"],
121
+ "feature-extraction": ["sentence-transformers", "transformers", "transformers.js"],
122
+ "fill-mask": ["transformers", "transformers.js"],
123
+ "graph-ml": ["transformers"],
124
+ "image-classification": ["keras", "timm", "transformers", "transformers.js"],
125
+ "image-feature-extraction": ["timm", "transformers"],
126
+ "image-segmentation": ["transformers", "transformers.js"],
127
+ "image-text-to-text": ["transformers"],
128
+ "image-to-image": ["diffusers", "transformers", "transformers.js"],
129
+ "image-to-text": ["transformers", "transformers.js"],
130
+ "image-to-video": ["diffusers"],
131
+ "keypoint-detection": ["transformers"],
132
+ "video-classification": ["transformers"],
133
+ "mask-generation": ["transformers"],
134
+ "multiple-choice": ["transformers"],
135
+ "object-detection": ["transformers", "transformers.js"],
136
+ other: [],
137
+ "question-answering": ["adapter-transformers", "allennlp", "transformers", "transformers.js"],
138
+ robotics: [],
139
+ "reinforcement-learning": ["transformers", "stable-baselines3", "ml-agents", "sample-factory"],
140
+ "sentence-similarity": ["sentence-transformers", "spacy", "transformers.js"],
141
+ summarization: ["transformers", "transformers.js"],
142
+ "table-question-answering": ["transformers"],
143
+ "table-to-text": ["transformers"],
144
+ "tabular-classification": ["sklearn"],
145
+ "tabular-regression": ["sklearn"],
146
+ "tabular-to-text": ["transformers"],
147
+ "text-classification": ["adapter-transformers", "setfit", "spacy", "transformers", "transformers.js"],
148
+ "text-generation": ["transformers", "transformers.js"],
149
+ "text-retrieval": [],
150
+ "text-to-image": ["diffusers"],
151
+ "text-to-speech": ["espnet", "tensorflowtts", "transformers", "transformers.js"],
152
+ "text-to-audio": ["transformers", "transformers.js"],
153
+ "text-to-video": ["diffusers"],
154
+ "text2text-generation": ["transformers", "transformers.js"],
155
+ "time-series-forecasting": [],
156
+ "token-classification": [
157
+ "adapter-transformers",
158
+ "flair",
159
+ "spacy",
160
+ "span-marker",
161
+ "stanza",
162
+ "transformers",
163
+ "transformers.js",
164
+ ],
165
+ translation: ["transformers", "transformers.js"],
166
+ "unconditional-image-generation": ["diffusers"],
167
+ "video-text-to-text": ["transformers"],
168
+ "visual-question-answering": ["transformers", "transformers.js"],
169
+ "voice-activity-detection": [],
170
+ "zero-shot-classification": ["transformers", "transformers.js"],
171
+ "zero-shot-image-classification": ["transformers", "transformers.js"],
172
+ "zero-shot-object-detection": ["transformers", "transformers.js"],
173
+ "text-to-3d": ["diffusers"],
174
+ "image-to-3d": ["diffusers"],
175
+ "any-to-any": ["transformers"],
176
+ };
177
+
178
+ /**
179
+ * Return the whole TaskData object for a certain task.
180
+ * If the partialTaskData argument is left undefined,
181
+ * the default placeholder data will be used.
182
+ */
183
+ function getData(type: PipelineType, partialTaskData: TaskDataCustom = placeholder): TaskData {
184
+ return {
185
+ ...partialTaskData,
186
+ id: type,
187
+ label: PIPELINE_DATA[type].name,
188
+ libraries: TASKS_MODEL_LIBRARIES[type],
189
+ };
190
+ }
191
+
192
+ // To make comparisons easier, task order is the same as in const.ts
193
+ // Tasks set to undefined won't have an associated task page.
194
+ // Tasks that call getData() without the second argument will
195
+ // have a "placeholder" page.
196
+ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
197
+ "any-to-any": getData("any-to-any", placeholder),
198
+ "audio-classification": getData("audio-classification", audioClassification),
199
+ "audio-to-audio": getData("audio-to-audio", audioToAudio),
200
+ "automatic-speech-recognition": getData("automatic-speech-recognition", automaticSpeechRecognition),
201
+ "depth-estimation": getData("depth-estimation", depthEstimation),
202
+ "document-question-answering": getData("document-question-answering", documentQuestionAnswering),
203
+ "feature-extraction": getData("feature-extraction", featureExtraction),
204
+ "fill-mask": getData("fill-mask", fillMask),
205
+ "graph-ml": undefined,
206
+ "image-classification": getData("image-classification", imageClassification),
207
+ "image-feature-extraction": getData("image-feature-extraction", imageFeatureExtraction),
208
+ "image-segmentation": getData("image-segmentation", imageSegmentation),
209
+ "image-to-image": getData("image-to-image", imageToImage),
210
+ "image-text-to-text": getData("image-text-to-text", imageTextToText),
211
+ "image-to-text": getData("image-to-text", imageToText),
212
+ "image-to-video": undefined,
213
+ "keypoint-detection": getData("keypoint-detection", keypointDetection),
214
+ "mask-generation": getData("mask-generation", maskGeneration),
215
+ "multiple-choice": undefined,
216
+ "object-detection": getData("object-detection", objectDetection),
217
+ "video-classification": getData("video-classification", videoClassification),
218
+ other: undefined,
219
+ "question-answering": getData("question-answering", questionAnswering),
220
+ "reinforcement-learning": getData("reinforcement-learning", reinforcementLearning),
221
+ robotics: undefined,
222
+ "sentence-similarity": getData("sentence-similarity", sentenceSimilarity),
223
+ summarization: getData("summarization", summarization),
224
+ "table-question-answering": getData("table-question-answering", tableQuestionAnswering),
225
+ "table-to-text": undefined,
226
+ "tabular-classification": getData("tabular-classification", tabularClassification),
227
+ "tabular-regression": getData("tabular-regression", tabularRegression),
228
+ "tabular-to-text": undefined,
229
+ "text-classification": getData("text-classification", textClassification),
230
+ "text-generation": getData("text-generation", textGeneration),
231
+ "text-retrieval": undefined,
232
+ "text-to-image": getData("text-to-image", textToImage),
233
+ "text-to-speech": getData("text-to-speech", textToSpeech),
234
+ "text-to-audio": undefined,
235
+ "text-to-video": getData("text-to-video", textToVideo),
236
+ "text2text-generation": undefined,
237
+ "time-series-forecasting": undefined,
238
+ "token-classification": getData("token-classification", tokenClassification),
239
+ translation: getData("translation", translation),
240
+ "unconditional-image-generation": getData("unconditional-image-generation", unconditionalImageGeneration),
241
+ "video-text-to-text": getData("video-text-to-text", videoTextToText),
242
+ "visual-question-answering": getData("visual-question-answering", visualQuestionAnswering),
243
+ "voice-activity-detection": undefined,
244
+ "zero-shot-classification": getData("zero-shot-classification", zeroShotClassification),
245
+ "zero-shot-image-classification": getData("zero-shot-image-classification", zeroShotImageClassification),
246
+ "zero-shot-object-detection": getData("zero-shot-object-detection", zeroShotObjectDetection),
247
+ "text-to-3d": getData("text-to-3d", textTo3D),
248
+ "image-to-3d": getData("image-to-3d", imageTo3D),
249
+ } as const;
250
+
251
+ export interface ExampleRepo {
252
+ description: string;
253
+ id: string;
254
+ }
255
+
256
+ export type TaskDemoEntry =
257
+ | {
258
+ filename: string;
259
+ type: "audio";
260
+ }
261
+ | {
262
+ data: Array<{
263
+ label: string;
264
+ score: number;
265
+ }>;
266
+ type: "chart";
267
+ }
268
+ | {
269
+ filename: string;
270
+ type: "img";
271
+ }
272
+ | {
273
+ table: string[][];
274
+ type: "tabular";
275
+ }
276
+ | {
277
+ content: string;
278
+ label: string;
279
+ type: "text";
280
+ }
281
+ | {
282
+ text: string;
283
+ tokens: Array<{
284
+ end: number;
285
+ start: number;
286
+ type: string;
287
+ }>;
288
+ type: "text-with-tokens";
289
+ };
290
+
291
+ export interface TaskDemo {
292
+ inputs: TaskDemoEntry[];
293
+ outputs: TaskDemoEntry[];
294
+ }
295
+
296
+ export interface TaskData {
297
+ datasets: ExampleRepo[];
298
+ demo: TaskDemo;
299
+ id: PipelineType;
300
+ canonicalId?: PipelineType;
301
+ isPlaceholder?: boolean;
302
+ label: string;
303
+ libraries: ModelLibraryKey[];
304
+ metrics: ExampleRepo[];
305
+ models: ExampleRepo[];
306
+ spaces: ExampleRepo[];
307
+ summary: string;
308
+ widgetModels: string[];
309
+ youtubeId?: string;
310
+ }
311
+
312
+ export type TaskDataCustom = Omit<TaskData, "id" | "label" | "libraries">;
@@ -0,0 +1,57 @@
1
+ ## Task Variants
2
+
3
+ ### Pose Estimation
4
+
5
+ Pose estimation is the process of determining the position and orientation of an object or a camera in a 3D space. It is a fundamental task in computer vision and is widely used in various applications such as robotics, augmented reality, and 3D reconstruction.
6
+
7
+ ## Use Cases for Keypoint Detection
8
+
9
+ ### Facial Landmark Estimation
10
+
11
+ Keypoint detection models can be used to estimate the position of facial landmarks. Facial landmarks are points on the face such as the corners of the mouth, the outer corners of the eyes, and the tip of the nose. These landmarks can be used for a variety of applications, such as facial expression recognition, 3D face reconstruction, and cinematic animation.
12
+
13
+ ### Fitness Tracking
14
+
15
+ Keypoint detection models can be used to track the movement of the human body, e.g. position of the joints in a 3D space. This can be used for a variety of applications, such as fitness tracking, sports analysis or virtual reality applications.
16
+
17
+ ## Inference Code
18
+
19
+ Below you can find an example of how to use a keypoint detection model and how to visualize the results.
20
+
21
+ ```python
22
+ from transformers import AutoImageProcessor, SuperPointForKeypointDetection
23
+ import torch
24
+ import matplotlib.pyplot as plt
25
+ from PIL import Image
26
+ import requests
27
+
28
+ url_image = "http://images.cocodataset.org/val2017/000000039769.jpg"
29
+ image = Image.open(requests.get(url_image, stream=True).raw)
30
+
31
+ # initialize the model and processor
32
+ processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
33
+ model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
34
+
35
+ # infer
36
+ inputs = processor(image, return_tensors="pt").to(model.device, model.dtype)
37
+ outputs = model(**inputs)
38
+
39
+ # postprocess
40
+ image_sizes = [(image.size[1], image.size[0])]
41
+ outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
42
+ keypoints = outputs[0]["keypoints"].detach().numpy()
43
+ scores = outputs[0]["scores"].detach().numpy()
44
+ image_width, image_height = image.size
45
+
46
+ # plot
47
+ plt.axis('off')
48
+ plt.imshow(image)
49
+ plt.scatter(
50
+ keypoints[:, 0],
51
+ keypoints[:, 1],
52
+ s=scores * 100,
53
+ c='cyan',
54
+ alpha=0.4
55
+ )
56
+ plt.show()
57
+ ```
@@ -0,0 +1,50 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ description: "A dataset of hand keypoints of over 500k examples.",
7
+ id: "Vincent-luo/hagrid-mediapipe-hands",
8
+ },
9
+ ],
10
+ demo: {
11
+ inputs: [
12
+ {
13
+ filename: "keypoint-detection-input.png",
14
+ type: "img",
15
+ },
16
+ ],
17
+ outputs: [
18
+ {
19
+ filename: "keypoint-detection-output.png",
20
+ type: "img",
21
+ },
22
+ ],
23
+ },
24
+ metrics: [],
25
+ models: [
26
+ {
27
+ description: "A robust keypoint detection model.",
28
+ id: "magic-leap-community/superpoint",
29
+ },
30
+ {
31
+ description: "Strong keypoint detection model used to detect human pose.",
32
+ id: "facebook/sapiens-pose-1b",
33
+ },
34
+ ],
35
+ spaces: [
36
+ {
37
+ description: "An application that detects hand keypoints in real-time.",
38
+ id: "datasciencedojo/Hand-Keypoint-Detection-Realtime",
39
+ },
40
+ {
41
+ description: "An application to try a universal keypoint detection model.",
42
+ id: "merve/SuperPoint",
43
+ },
44
+ ],
45
+ summary: "Keypoint detection is the task of identifying meaningful distinctive points or features in an image.",
46
+ widgetModels: [],
47
+ youtubeId: "",
48
+ };
49
+
50
+ export default taskData;
@@ -0,0 +1,65 @@
1
+ ## Use Cases
2
+
3
+ ### Filtering an Image
4
+
5
+ When filtering for an image, the generated masks might serve as an initial filter to eliminate irrelevant information. For instance, when monitoring vegetation in satellite imaging, mask generation models identify green spots, highlighting the relevant region of the image.
6
+
7
+ ### Masked Image Modelling
8
+
9
+ Generating masks can facilitate learning, especially in semi or unsupervised learning. For example, the [BEiT model](https://huggingface.co/docs/transformers/model_doc/beit) uses image-mask patches in the pre-training.
10
+
11
+ ### Human-in-the-loop Computer Vision Applications
12
+
13
+ For applications where humans are in the loop, masks highlight certain regions of images for humans to validate.
14
+
15
+ ## Task Variants
16
+
17
+ ### Segmentation
18
+
19
+ Image Segmentation divides an image into segments where each pixel is mapped to an object. This task has multiple variants, such as instance segmentation, panoptic segmentation, and semantic segmentation. You can learn more about segmentation on its [task page](https://huggingface.co/tasks/image-segmentation).
20
+
21
+ ## Inference
22
+
23
+ Mask generation models often work in two modes: segment everything or prompt mode.
24
+ The example below works in segment-everything-mode, where many masks will be returned.
25
+
26
+ ```python
27
+ from transformers import pipeline
28
+
29
+ generator = pipeline("mask-generation", model="Zigeng/SlimSAM-uniform-50", points_per_batch=64, device="cuda")
30
+ image_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
31
+ outputs = generator(image_url)
32
+ outputs["masks"]
33
+ # array of multiple binary masks returned for each generated mask
34
+ ```
35
+
36
+ Prompt mode takes in three types of prompts:
37
+
38
+ - **Point prompt:** The user can select a point on the image, and a meaningful segment around the point will be returned.
39
+ - **Box prompt:** The user can draw a box on the image, and a meaningful segment within the box will be returned.
40
+ - **Text prompt:** The user can input a text, and the objects of that type will be segmented. Note that this capability has not yet been released and has only been explored in research.
41
+
42
+ Below you can see how to use an input-point prompt. It also demonstrates direct model inference without the `pipeline` abstraction. The input prompt here is a nested list where the outermost list is the batch size (`1`), then the number of points (also `1` in this example), and the innermost list contains the actual coordinates of the point (`[450, 600]`).
43
+
44
+ ```python
45
+ from transformers import SamModel, SamProcessor
46
+ from PIL import Image
47
+ import requests
48
+
49
+ model = SamModel.from_pretrained("Zigeng/SlimSAM-uniform-50").to("cuda")
50
+ processor = SamProcessor.from_pretrained("Zigeng/SlimSAM-uniform-50")
51
+
52
+ image_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+ raw_image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
53
+ # pointing to the car window
54
+ input_points = [[[450, 600]]]
55
+ inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to("cuda")
56
+ outputs = model(**inputs)
57
+ masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())
58
+ scores = outputs.iou_scores
59
+ ```
60
+
61
+ ## Useful Resources
62
+
63
+ Would you like to learn more about mask generation? Great! Here you can find some curated resources that you may find helpful!
64
+
65
+ - [Segment anything model](https://huggingface.co/docs/transformers/main/model_doc/sam)
@@ -0,0 +1,55 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [],
5
+ demo: {
6
+ inputs: [
7
+ {
8
+ filename: "mask-generation-input.png",
9
+ type: "img",
10
+ },
11
+ ],
12
+ outputs: [
13
+ {
14
+ filename: "mask-generation-output.png",
15
+ type: "img",
16
+ },
17
+ ],
18
+ },
19
+ metrics: [],
20
+ models: [
21
+ {
22
+ description: "Small yet powerful mask generation model.",
23
+ id: "Zigeng/SlimSAM-uniform-50",
24
+ },
25
+ {
26
+ description: "Very strong mask generation model.",
27
+ id: "facebook/sam2-hiera-large",
28
+ },
29
+ ],
30
+ spaces: [
31
+ {
32
+ description:
33
+ "An application that combines a mask generation model with a zero-shot object detection model for text-guided image segmentation.",
34
+ id: "merve/OWLSAM2",
35
+ },
36
+ {
37
+ description: "An application that compares the performance of a large and a small mask generation model.",
38
+ id: "merve/slimsam",
39
+ },
40
+ {
41
+ description: "An application based on an improved mask generation model.",
42
+ id: "SkalskiP/segment-anything-model-2",
43
+ },
44
+ {
45
+ description: "An application to remove objects from videos using mask generation models.",
46
+ id: "SkalskiP/SAM_and_ProPainter",
47
+ },
48
+ ],
49
+ summary:
50
+ "Mask generation is the task of generating masks that identify a specific object or region of interest in a given image. Masks are often used in segmentation tasks, where they provide a precise way to isolate the object of interest for further processing or analysis.",
51
+ widgetModels: [],
52
+ youtubeId: "",
53
+ };
54
+
55
+ export default taskData;
@@ -0,0 +1,37 @@
1
+ ## Use Cases
2
+
3
+ ### Autonomous Driving
4
+
5
+ Object Detection is widely used in computer vision for autonomous driving. Self-driving cars use Object Detection models to detect pedestrians, bicycles, traffic lights and road signs to decide which step to take.
6
+
7
+ ### Object Tracking in Matches
8
+
9
+ Object Detection models are widely used in sports where the ball or a player is tracked for monitoring and refereeing during matches.
10
+
11
+ ### Image Search
12
+
13
+ Object Detection models are widely used in image search. Smartphones use Object Detection models to detect entities (such as specific places or objects) and allow the user to search for the entity on the Internet.
14
+
15
+ ### Object Counting
16
+
17
+ Object Detection models are used to count instances of objects in a given image, this can include counting the objects in warehouses or stores, or counting the number of visitors in a store. They are also used to manage crowds at events to prevent disasters.
18
+
19
+ ## Inference
20
+
21
+ You can infer with Object Detection models through the `object-detection` pipeline. When calling the pipeline you just need to specify a path or http link to an image.
22
+
23
+ ```python
24
+ model = pipeline("object-detection")
25
+
26
+ model("path_to_cat_image")
27
+
28
+ # [{'label': 'cat',
29
+ #  'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117},
30
+ #  'score': 0.917},
31
+ #...]
32
+ ```
33
+
34
+ ## Useful Resources
35
+
36
+ - [Walkthrough of Computer Vision Ecosystem in Hugging Face - CV Study Group](https://www.youtube.com/watch?v=oL-xmufhZM8)
37
+ - [Object detection task guide](https://huggingface.co/docs/transformers/tasks/object_detection)
@@ -0,0 +1,86 @@
1
+ import type { TaskDataCustom } from "../index.js";
2
+
3
+ const taskData: TaskDataCustom = {
4
+ datasets: [
5
+ {
6
+ description: "Widely used benchmark dataset for multiple vision tasks.",
7
+ id: "merve/coco2017",
8
+ },
9
+ {
10
+ description: "Multi-task computer vision benchmark.",
11
+ id: "merve/pascal-voc",
12
+ },
13
+ ],
14
+ demo: {
15
+ inputs: [
16
+ {
17
+ filename: "object-detection-input.jpg",
18
+ type: "img",
19
+ },
20
+ ],
21
+ outputs: [
22
+ {
23
+ filename: "object-detection-output.jpg",
24
+ type: "img",
25
+ },
26
+ ],
27
+ },
28
+ metrics: [
29
+ {
30
+ description:
31
+ "The Average Precision (AP) metric is the Area Under the PR Curve (AUC-PR). It is calculated for each class separately",
32
+ id: "Average Precision",
33
+ },
34
+ {
35
+ description: "The Mean Average Precision (mAP) metric is the overall average of the AP values",
36
+ id: "Mean Average Precision",
37
+ },
38
+ {
39
+ description:
40
+ "The APα metric is the Average Precision at the IoU threshold of a α value, for example, AP50 and AP75",
41
+ id: "APα",
42
+ },
43
+ ],
44
+ models: [
45
+ {
46
+ description: "Solid object detection model pre-trained on the COCO 2017 dataset.",
47
+ id: "facebook/detr-resnet-50",
48
+ },
49
+ {
50
+ description: "Real-time and accurate object detection model.",
51
+ id: "jameslahm/yolov10x",
52
+ },
53
+ {
54
+ description: "Fast and accurate object detection model trained on COCO and Object365 datasets.",
55
+ id: "PekingU/rtdetr_r18vd_coco_o365",
56
+ },
57
+ ],
58
+ spaces: [
59
+ {
60
+ description: "Leaderboard to compare various object detection models across several metrics.",
61
+ id: "hf-vision/object_detection_leaderboard",
62
+ },
63
+ {
64
+ description: "An application that contains various object detection models to try from.",
65
+ id: "Gradio-Blocks/Object-Detection-With-DETR-and-YOLOS",
66
+ },
67
+ {
68
+ description: "An application that shows multiple cutting edge techniques for object detection and tracking.",
69
+ id: "kadirnar/torchyolo",
70
+ },
71
+ {
72
+ description: "An object tracking, segmentation and inpainting application.",
73
+ id: "VIPLab/Track-Anything",
74
+ },
75
+ {
76
+ description: "Very fast object tracking application based on object detection.",
77
+ id: "merve/RT-DETR-tracking-coco",
78
+ },
79
+ ],
80
+ summary:
81
+ "Object Detection models allow users to identify objects of certain defined classes. Object detection models receive an image as input and output the images with bounding boxes and labels on detected objects.",
82
+ widgetModels: ["facebook/detr-resnet-50"],
83
+ youtubeId: "WdAeKSOpxhw",
84
+ };
85
+
86
+ export default taskData;