@huggingface/tasks 0.13.2 → 0.13.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commonjs/index.d.ts +1 -0
- package/dist/commonjs/index.d.ts.map +1 -1
- package/dist/commonjs/model-libraries.d.ts +8 -1
- package/dist/commonjs/model-libraries.d.ts.map +1 -1
- package/dist/commonjs/model-libraries.js +7 -0
- package/dist/commonjs/pipelines.d.ts +7 -1
- package/dist/commonjs/pipelines.d.ts.map +1 -1
- package/dist/commonjs/pipelines.js +6 -0
- package/dist/commonjs/tasks/automatic-speech-recognition/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/automatic-speech-recognition/data.js +15 -3
- package/dist/commonjs/tasks/document-question-answering/inference.d.ts +0 -4
- package/dist/commonjs/tasks/document-question-answering/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/index.d.ts +1 -1
- package/dist/commonjs/tasks/index.d.ts.map +1 -1
- package/dist/commonjs/tasks/index.js +2 -0
- package/dist/commonjs/tasks/mask-generation/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/mask-generation/data.js +16 -2
- package/dist/commonjs/tasks/text-to-speech/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/text-to-speech/data.js +15 -7
- package/dist/commonjs/tasks/zero-shot-classification/inference.d.ts +9 -19
- package/dist/commonjs/tasks/zero-shot-classification/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/zero-shot-image-classification/inference.d.ts +9 -19
- package/dist/commonjs/tasks/zero-shot-image-classification/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/zero-shot-object-detection/inference.d.ts +8 -12
- package/dist/commonjs/tasks/zero-shot-object-detection/inference.d.ts.map +1 -1
- package/dist/esm/index.d.ts +1 -0
- package/dist/esm/index.d.ts.map +1 -1
- package/dist/esm/model-libraries.d.ts +8 -1
- package/dist/esm/model-libraries.d.ts.map +1 -1
- package/dist/esm/model-libraries.js +7 -0
- package/dist/esm/pipelines.d.ts +7 -1
- package/dist/esm/pipelines.d.ts.map +1 -1
- package/dist/esm/pipelines.js +6 -0
- package/dist/esm/tasks/automatic-speech-recognition/data.d.ts.map +1 -1
- package/dist/esm/tasks/automatic-speech-recognition/data.js +15 -3
- package/dist/esm/tasks/document-question-answering/inference.d.ts +0 -4
- package/dist/esm/tasks/document-question-answering/inference.d.ts.map +1 -1
- package/dist/esm/tasks/index.d.ts +1 -1
- package/dist/esm/tasks/index.d.ts.map +1 -1
- package/dist/esm/tasks/index.js +2 -0
- package/dist/esm/tasks/mask-generation/data.d.ts.map +1 -1
- package/dist/esm/tasks/mask-generation/data.js +16 -2
- package/dist/esm/tasks/text-to-speech/data.d.ts.map +1 -1
- package/dist/esm/tasks/text-to-speech/data.js +15 -7
- package/dist/esm/tasks/zero-shot-classification/inference.d.ts +9 -19
- package/dist/esm/tasks/zero-shot-classification/inference.d.ts.map +1 -1
- package/dist/esm/tasks/zero-shot-image-classification/inference.d.ts +9 -19
- package/dist/esm/tasks/zero-shot-image-classification/inference.d.ts.map +1 -1
- package/dist/esm/tasks/zero-shot-object-detection/inference.d.ts +8 -12
- package/dist/esm/tasks/zero-shot-object-detection/inference.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/index.ts +1 -0
- package/src/model-libraries.ts +7 -0
- package/src/pipelines.ts +6 -0
- package/src/tasks/automatic-speech-recognition/data.ts +15 -3
- package/src/tasks/document-question-answering/inference.ts +0 -4
- package/src/tasks/document-question-answering/spec/output.json +1 -8
- package/src/tasks/index.ts +2 -1
- package/src/tasks/mask-generation/about.md +10 -0
- package/src/tasks/mask-generation/data.ts +16 -2
- package/src/tasks/text-to-speech/data.ts +15 -7
- package/src/tasks/zero-shot-classification/inference.ts +9 -19
- package/src/tasks/zero-shot-classification/spec/input.json +13 -20
- package/src/tasks/zero-shot-image-classification/inference.ts +9 -19
- package/src/tasks/zero-shot-image-classification/spec/input.json +13 -19
- package/src/tasks/zero-shot-object-detection/inference.ts +8 -12
- package/src/tasks/zero-shot-object-detection/spec/input.json +13 -18
package/dist/esm/pipelines.js
CHANGED
@@ -306,6 +306,12 @@ export const PIPELINE_DATA = {
 		modality: "audio",
 		color: "green",
 	},
+	"audio-text-to-text": {
+		name: "Audio-Text-to-Text",
+		modality: "multimodal",
+		color: "red",
+		hideInDatasets: true,
+	},
 	"voice-activity-detection": {
 		name: "Voice Activity Detection",
 		modality: "audio",
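
Note: for consumers, the effect of this hunk is one new key in `PIPELINE_DATA`. A minimal sketch of reading it (assuming the root re-export of `PIPELINE_DATA` that the package already provides):

```ts
import { PIPELINE_DATA } from "@huggingface/tasks";

// New in 0.13.4: a multimodal "audio-text-to-text" pipeline type.
const entry = PIPELINE_DATA["audio-text-to-text"];
console.log(entry.name); // "Audio-Text-to-Text"
console.log(entry.modality); // "multimodal"
```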
package/dist/esm/tasks/automatic-speech-recognition/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/automatic-speech-recognition/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/automatic-speech-recognition/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAyFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/automatic-speech-recognition/data.js
CHANGED
@@ -4,12 +4,16 @@ const taskData = {
 			description: "31,175 hours of multilingual audio-text dataset in 108 languages.",
 			id: "mozilla-foundation/common_voice_17_0",
 		},
+		{
+			description: "Multilingual and diverse audio dataset with 101k hours of audio.",
+			id: "amphion/Emilia-Dataset",
+		},
 		{
 			description: "A dataset with 44.6k hours of English speaker data and 6k hours of other language speakers.",
 			id: "parler-tts/mls_eng",
 		},
 		{
-			description: "A
+			description: "A multilingual audio dataset with 370K hours of audio.",
 			id: "espnet/yodas",
 		},
 	],
@@ -52,6 +56,10 @@ const taskData = {
 			description: "An end-to-end model that performs ASR and Speech Translation by MetaAI.",
 			id: "facebook/seamless-m4t-v2-large",
 		},
+		{
+			description: "A powerful multilingual ASR and Speech Translation model by Nvidia.",
+			id: "nvidia/canary-1b",
+		},
 		{
 			description: "Powerful speaker diarization model.",
 			id: "pyannote/speaker-diarization-3.1",
@@ -63,13 +71,17 @@ const taskData = {
 			id: "hf-audio/whisper-large-v3",
 		},
 		{
-			description: "
-			id: "
+			description: "Latest ASR model from Useful Sensors.",
+			id: "mrfakename/Moonshinex",
 		},
 		{
 			description: "A high quality speech and text translation model by Meta.",
 			id: "facebook/seamless_m4t",
 		},
+		{
+			description: "A powerful multilingual ASR and Speech Translation model by Nvidia",
+			id: "nvidia/canary-1b",
+		},
 	],
 	summary: "Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. It has many applications, such as voice user interfaces.",
 	widgetModels: ["openai/whisper-large-v3"],
package/dist/esm/tasks/document-question-answering/inference.d.ts
CHANGED
@@ -102,10 +102,6 @@ export interface DocumentQuestionAnsweringOutputElement {
 	 * boxes).
 	 */
 	start: number;
-	/**
-	 * The index of each word/box pair that is in the answer
-	 */
-	words: number[];
 	[property: string]: unknown;
 }
 //# sourceMappingURL=inference.d.ts.map
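
Note: `words` is gone from `DocumentQuestionAnsweringOutputElement` (mirrored in the output spec further down), so downstream code should stop relying on it. A hedged sketch of coping with both shapes during an upgrade (the helper is illustrative; `answer`, `score`, `start`, `end` stay required per the spec):

```ts
import type { DocumentQuestionAnsweringOutputElement } from "@huggingface/tasks";

function logAnswer(el: DocumentQuestionAnsweringOutputElement): void {
	console.log(`${el.answer} (score=${el.score}, span=${el.start}..${el.end})`);
	// Pre-0.13.4 payloads may still carry `words`; it is only reachable
	// through the index signature now.
	const legacyWords = el["words"] as number[] | undefined;
	if (legacyWords) console.log(`legacy word indices: ${legacyWords.join(", ")}`);
}
```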
package/dist/esm/tasks/document-question-answering/inference.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/document-question-answering/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,8BAA8B;IAC9C;;OAEG;IACH,MAAM,EAAE,kCAAkC,CAAC;IAC3C;;OAEG;IACH,UAAU,CAAC,EAAE,mCAAmC,CAAC;IACjD,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;GAEG;AACH,MAAM,WAAW,kCAAkC;IAClD;;OAEG;IACH,KAAK,EAAE,OAAO,CAAC;IACf;;OAEG;IACH,QAAQ,EAAE,MAAM,CAAC;IACjB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,mCAAmC;IACnD;;;;OAIG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;OAEG;IACH,wBAAwB,CAAC,EAAE,OAAO,CAAC;IACnC;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB;;OAEG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,EAAE,CAAC;IACvB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,OAAO,GAAG,MAAM,EAAE,GAAG,MAAM,CAAC;AACxC,MAAM,MAAM,+BAA+B,GAAG,sCAAsC,EAAE,CAAC;AACvF;;GAEG;AACH,MAAM,WAAW,sCAAsC;IACtD;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,GAAG,EAAE,MAAM,CAAC;IACZ;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,KAAK,EAAE,MAAM,CAAC;IACd
+{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/document-question-answering/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,8BAA8B;IAC9C;;OAEG;IACH,MAAM,EAAE,kCAAkC,CAAC;IAC3C;;OAEG;IACH,UAAU,CAAC,EAAE,mCAAmC,CAAC;IACjD,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;GAEG;AACH,MAAM,WAAW,kCAAkC;IAClD;;OAEG;IACH,KAAK,EAAE,OAAO,CAAC;IACf;;OAEG;IACH,QAAQ,EAAE,MAAM,CAAC;IACjB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,mCAAmC;IACnD;;;;OAIG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;OAEG;IACH,wBAAwB,CAAC,EAAE,OAAO,CAAC;IACnC;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB;;OAEG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,EAAE,CAAC;IACvB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,OAAO,GAAG,MAAM,EAAE,GAAG,MAAM,CAAC;AACxC,MAAM,MAAM,+BAA+B,GAAG,sCAAsC,EAAE,CAAC;AACvF;;GAEG;AACH,MAAM,WAAW,sCAAsC;IACtD;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,GAAG,EAAE,MAAM,CAAC;IACZ;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
@@ -25,7 +25,7 @@ export type * from "./video-classification/inference.js";
|
|
|
25
25
|
export type * from "./visual-question-answering/inference.js";
|
|
26
26
|
export type * from "./zero-shot-classification/inference.js";
|
|
27
27
|
export type * from "./zero-shot-image-classification/inference.js";
|
|
28
|
-
export type { BoundingBox, ZeroShotObjectDetectionInput,
|
|
28
|
+
export type { BoundingBox, ZeroShotObjectDetectionInput, ZeroShotObjectDetectionOutput, ZeroShotObjectDetectionOutputElement, } from "./zero-shot-object-detection/inference.js";
|
|
29
29
|
import type { ModelLibraryKey } from "../model-libraries.js";
|
|
30
30
|
/**
|
|
31
31
|
* Model libraries compatible with each ML task
|
|
package/dist/esm/tasks/index.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/tasks/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AA4CpD,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,6CAA6C,CAAC;AACjE,YAAY,EACX,mBAAmB,EACnB,0BAA0B,EAC1B,oBAAoB,EACpB,4BAA4B,EAC5B,2BAA2B,EAC3B,0BAA0B,EAC1B,gCAAgC,EAChC,+BAA+B,GAC/B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,4CAA4C,CAAC;AAChE,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,0BAA0B,CAAC;AAC9C,YAAY,EACX,wBAAwB,EACxB,yBAAyB,EACzB,gCAAgC,EAChC,6BAA6B,GAC7B,MAAM,qCAAqC,CAAC;AAC7C,mBAAmB,+BAA+B,CAAC;AACnD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,oCAAoC,CAAC;AACxD,mBAAmB,8BAA8B,CAAC;AAClD,mBAAmB,yCAAyC,CAAC;AAC7D,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,YAAY,EAAE,sBAAsB,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,MAAM,+BAA+B,CAAC;AACnH,mBAAmB,qCAAqC,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AACtF,YAAY,EACX,6BAA6B,EAC7B,uBAAuB,EACvB,wBAAwB,EACxB,+BAA+B,EAC/B,4BAA4B,GAC5B,MAAM,oCAAoC,CAAC;AAC5C,YAAY,EACX,gCAAgC,EAChC,gCAAgC,EAChC,mBAAmB,EACnB,oBAAoB,EACpB,2BAA2B,EAC3B,qCAAqC,EACrC,kCAAkC,EAClC,yBAAyB,EACzB,uCAAuC,EACvC,0BAA0B,GAC1B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,0CAA0C,CAAC;AAC9D,mBAAmB,yCAAyC,CAAC;AAC7D,mBAAmB,+CAA+C,CAAC;AACnE,YAAY,EACX,WAAW,EACX,4BAA4B,EAC5B,
+{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/tasks/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AA4CpD,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,6CAA6C,CAAC;AACjE,YAAY,EACX,mBAAmB,EACnB,0BAA0B,EAC1B,oBAAoB,EACpB,4BAA4B,EAC5B,2BAA2B,EAC3B,0BAA0B,EAC1B,gCAAgC,EAChC,+BAA+B,GAC/B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,4CAA4C,CAAC;AAChE,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,0BAA0B,CAAC;AAC9C,YAAY,EACX,wBAAwB,EACxB,yBAAyB,EACzB,gCAAgC,EAChC,6BAA6B,GAC7B,MAAM,qCAAqC,CAAC;AAC7C,mBAAmB,+BAA+B,CAAC;AACnD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,oCAAoC,CAAC;AACxD,mBAAmB,8BAA8B,CAAC;AAClD,mBAAmB,yCAAyC,CAAC;AAC7D,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,YAAY,EAAE,sBAAsB,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,MAAM,+BAA+B,CAAC;AACnH,mBAAmB,qCAAqC,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AACtF,YAAY,EACX,6BAA6B,EAC7B,uBAAuB,EACvB,wBAAwB,EACxB,+BAA+B,EAC/B,4BAA4B,GAC5B,MAAM,oCAAoC,CAAC;AAC5C,YAAY,EACX,gCAAgC,EAChC,gCAAgC,EAChC,mBAAmB,EACnB,oBAAoB,EACpB,2BAA2B,EAC3B,qCAAqC,EACrC,kCAAkC,EAClC,yBAAyB,EACzB,uCAAuC,EACvC,0BAA0B,GAC1B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,0CAA0C,CAAC;AAC9D,mBAAmB,yCAAyC,CAAC;AAC7D,mBAAmB,+CAA+C,CAAC;AACnE,YAAY,EACX,WAAW,EACX,4BAA4B,EAC5B,6BAA6B,EAC7B,oCAAoC,GACpC,MAAM,2CAA2C,CAAC;AAEnD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAE7D;;GAEG;AACH,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,YAAY,EAAE,eAAe,EAAE,CA8DzE,CAAC;AAoBF,eAAO,MAAM,UAAU,EAAE,MAAM,CAAC,YAAY,EAAE,QAAQ,GAAG,SAAS,CAsDxD,CAAC;AAEX,MAAM,WAAW,WAAW;IAC3B,WAAW,EAAE,MAAM,CAAC;IACpB,EAAE,EAAE,MAAM,CAAC;CACX;AAED,MAAM,MAAM,aAAa,GACtB;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,OAAO,CAAC;CACb,GACD;IACA,IAAI,EAAE,KAAK,CAAC;QACX,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;KACd,CAAC,CAAC;IACH,IAAI,EAAE,OAAO,CAAC;CACb,GACD;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,KAAK,CAAC;CACX,GACD;IACA,KAAK,EAAE,MAAM,EAAE,EAAE,CAAC;IAClB,IAAI,EAAE,SAAS,CAAC;CACf,GACD;IACA,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;CACZ,GACD;IACA,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,KAAK,CAAC;QACb,GAAG,EAAE,MAAM,CAAC;QACZ,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;KACb,CAAC,CAAC;IACH,IAAI,EAAE,kBAAkB,CAAC;CACxB,CAAC;AAEL,MAAM,WAAW,QAAQ;IACxB,MAAM,EAAE,aAAa,EAAE,CAAC;IACxB,OAAO,EAAE,aAAa,EAAE,CAAC;CACzB;AAED,MAAM,WAAW,QAAQ;IACxB,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB,IAAI,EAAE,QAAQ,CAAC;IACf,EAAE,EAAE,YAAY,CAAC;IACjB,WAAW,CAAC,EAAE,YAAY,CAAC;IAC3B,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,eAAe,EAAE,CAAC;IAC7B,OAAO,EAAE,WAAW,EAAE,CAAC;IACvB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,MAAM,cAAc,GAAG,IAAI,CAAC,QAAQ,EAAE,IAAI,GAAG,OAAO,GAAG,WAAW,CAAC,CAAC"}
package/dist/esm/tasks/index.js
CHANGED
@@ -46,6 +46,7 @@ export const TASKS_MODEL_LIBRARIES = {
 	"audio-classification": ["speechbrain", "transformers", "transformers.js"],
 	"audio-to-audio": ["asteroid", "fairseq", "speechbrain"],
 	"automatic-speech-recognition": ["espnet", "nemo", "speechbrain", "transformers", "transformers.js"],
+	"audio-text-to-text": [],
 	"depth-estimation": ["transformers", "transformers.js"],
 	"document-question-answering": ["transformers", "transformers.js"],
 	"feature-extraction": ["sentence-transformers", "transformers", "transformers.js"],
@@ -125,6 +126,7 @@ export const TASKS_DATA = {
 	"any-to-any": getData("any-to-any", placeholder),
 	"audio-classification": getData("audio-classification", audioClassification),
 	"audio-to-audio": getData("audio-to-audio", audioToAudio),
+	"audio-text-to-text": getData("audio-text-to-text", placeholder),
 	"automatic-speech-recognition": getData("automatic-speech-recognition", automaticSpeechRecognition),
 	"depth-estimation": getData("depth-estimation", depthEstimation),
 	"document-question-answering": getData("document-question-answering", documentQuestionAnswering),
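
Note: the new pipeline type is registered with `placeholder` task data and an empty library list, so the key exists but carries no curated content yet. A sketch of probing it (assuming both constants are reachable from the root export):

```ts
import { TASKS_DATA, TASKS_MODEL_LIBRARIES } from "@huggingface/tasks";

// Registered in 0.13.4 with placeholder data and no associated libraries.
console.log(TASKS_MODEL_LIBRARIES["audio-text-to-text"]); // []
console.log(TASKS_DATA["audio-text-to-text"]?.label);
```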
package/dist/esm/tasks/mask-generation/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/mask-generation/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/mask-generation/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAgEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/mask-generation/data.js
CHANGED
@@ -1,5 +1,14 @@
 const taskData = {
-	datasets: [
+	datasets: [
+		{
+			description: "Widely used benchmark dataset for multiple Vision tasks.",
+			id: "merve/coco2017",
+		},
+		{
+			description: "Medical Imaging dataset of the Human Brain for segmentation and mask generating tasks",
+			id: "rocky93/BraTS_segmentation",
+		},
+	],
 	demo: {
 		inputs: [
 			{
@@ -14,7 +23,12 @@ const taskData = {
 			},
 		],
 	},
-	metrics: [
+	metrics: [
+		{
+			description: "IoU is used to measure the overlap between predicted mask and the ground truth mask.",
+			id: "Intersection over Union (IoU)",
+		},
+	],
 	models: [
 		{
 			description: "Small yet powerful mask generation model.",
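
Note: the IoU metric documented in this hunk is easy to state precisely; a self-contained sketch over flat binary masks (illustrative only, not part of the package):

```ts
// Intersection over Union: |A ∩ B| / |A ∪ B| for two equal-length binary masks.
function iou(pred: boolean[], truth: boolean[]): number {
	let intersection = 0;
	let union = 0;
	for (let i = 0; i < pred.length; i++) {
		if (pred[i] && truth[i]) intersection++;
		if (pred[i] || truth[i]) union++;
	}
	// Convention: two empty masks overlap perfectly.
	return union === 0 ? 1 : intersection / union;
}
```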
package/dist/esm/tasks/text-to-speech/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAkFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/text-to-speech/data.js
CHANGED
@@ -9,6 +9,10 @@ const taskData = {
 			description: "Multi-speaker English dataset.",
 			id: "mythicinfinity/libritts_r",
 		},
+		{
+			description: "Mulit-lingual dataset.",
+			id: "facebook/multilingual_librispeech",
+		},
 	],
 	demo: {
 		inputs: [
@@ -33,20 +37,24 @@ const taskData = {
 	],
 	models: [
 		{
-			description: "A powerful TTS model.",
+			description: "A prompt based, powerful TTS model.",
 			id: "parler-tts/parler-tts-large-v1",
 		},
+		{
+			description: "A powerful TTS model that supports English and Chinese.",
+			id: "SWivid/F5-TTS",
+		},
 		{
 			description: "A massively multi-lingual TTS model.",
 			id: "coqui/XTTS-v2",
 		},
 		{
-			description: "
-			id: "
+			description: "A powerful TTS model.",
+			id: "amphion/MaskGCT",
 		},
 		{
-			description: "A
-			id: "
+			description: "A Llama based TTS model.",
+			id: "OuteAI/OuteTTS-0.1-350M",
 		},
 	],
 	spaces: [
@@ -63,8 +71,8 @@ const taskData = {
 			id: "mrfakename/E2-F5-TTS",
 		},
 		{
-			description: "An application that synthesizes speech for diverse speaker prompts.",
-			id: "parler-tts/
+			description: "An application that synthesizes emotional speech for diverse speaker prompts.",
+			id: "parler-tts/parler-tts-expresso",
 		},
 	],
 	summary: "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.",
package/dist/esm/tasks/zero-shot-classification/inference.d.ts
CHANGED
@@ -8,27 +8,13 @@
  */
 export interface ZeroShotClassificationInput {
 	/**
-	 * The
+	 * The text to classify
 	 */
-	inputs:
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */
-	parameters
-	[property: string]: unknown;
-}
-/**
- * The input text data, with candidate labels
- */
-export interface ZeroShotClassificationInputData {
-	/**
-	 * The set of possible class labels to classify the text into.
-	 */
-	candidateLabels: string[];
-	/**
-	 * The text to classify
-	 */
-	text: string;
+	parameters: ZeroShotClassificationParameters;
 	[property: string]: unknown;
 }
 /**
@@ -38,8 +24,12 @@ export interface ZeroShotClassificationInputData {
  */
 export interface ZeroShotClassificationParameters {
 	/**
-	 * The
-
+	 * The set of possible class labels to classify the text into.
+	 */
+	candidate_labels: string[];
+	/**
+	 * The sentence used in conjunction with `candidate_labels` to attempt the text
+	 * classification by replacing the placeholder with the candidate labels.
 	 */
 	hypothesis_template?: string;
 	/**
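
Note: this is a breaking reshape of the request type: `ZeroShotClassificationInputData` is removed, `inputs` is now the bare text, and `candidate_labels` (snake_case) moves onto the now-required `parameters` object. A sketch of a payload under the new shape (assuming the type is reachable from the root export):

```ts
import type { ZeroShotClassificationInput } from "@huggingface/tasks";

const request: ZeroShotClassificationInput = {
	inputs: "I loved this movie, the pacing was perfect.",
	parameters: {
		candidate_labels: ["positive", "negative", "neutral"],
		// hypothesis_template (and, per the source map, multi_label) stay optional.
	},
};
```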
package/dist/esm/tasks/zero-shot-classification/inference.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,2BAA2B;IAC3C;;OAEG;IACH,MAAM,EAAE
+{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,2BAA2B;IAC3C;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf;;OAEG;IACH,UAAU,EAAE,gCAAgC,CAAC;IAC7C,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,gCAAgC;IAChD;;OAEG;IACH,gBAAgB,EAAE,MAAM,EAAE,CAAC;IAC3B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B;;;;OAIG;IACH,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,4BAA4B,GAAG,mCAAmC,EAAE,CAAC;AACjF;;GAEG;AACH,MAAM,WAAW,mCAAmC;IACnD;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
package/dist/esm/tasks/zero-shot-image-classification/inference.d.ts
CHANGED
@@ -8,27 +8,13 @@
  */
 export interface ZeroShotImageClassificationInput {
 	/**
-	 * The input image data
+	 * The input image data to classify as a base64-encoded string.
 	 */
-	inputs:
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */
-	parameters
-	[property: string]: unknown;
-}
-/**
- * The input image data, with candidate labels
- */
-export interface ZeroShotImageClassificationInputData {
-	/**
-	 * The candidate labels for this image
-	 */
-	candidateLabels: string[];
-	/**
-	 * The image data to classify
-	 */
-	image: unknown;
+	parameters: ZeroShotImageClassificationParameters;
 	[property: string]: unknown;
 }
 /**
@@ -38,8 +24,12 @@ export interface ZeroShotImageClassificationInputData {
  */
 export interface ZeroShotImageClassificationParameters {
 	/**
-	 * The
-
+	 * The candidate labels for this image
+	 */
+	candidate_labels: string[];
+	/**
+	 * The sentence used in conjunction with `candidate_labels` to attempt the image
+	 * classification by replacing the placeholder with the candidate labels.
 	 */
 	hypothesis_template?: string;
 	[property: string]: unknown;
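
Note: the image task gets the same flattening, and the doc comment now pins `inputs` to a base64-encoded image. A sketch of building a request in Node (the file name is illustrative):

```ts
import { readFileSync } from "node:fs";
import type { ZeroShotImageClassificationInput } from "@huggingface/tasks";

const request: ZeroShotImageClassificationInput = {
	// Base64-encode the raw image bytes, as the new `inputs` doc comment asks.
	inputs: readFileSync("cat.png").toString("base64"),
	parameters: { candidate_labels: ["cat", "dog", "bird"] },
};
```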
package/dist/esm/tasks/zero-shot-image-classification/inference.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-image-classification/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,gCAAgC;IAChD;;OAEG;IACH,MAAM,EAAE,
+{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-image-classification/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,gCAAgC;IAChD;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf;;OAEG;IACH,UAAU,EAAE,qCAAqC,CAAC;IAClD,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,qCAAqC;IACrD;;OAEG;IACH,gBAAgB,EAAE,MAAM,EAAE,CAAC;IAC3B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,iCAAiC,GAAG,wCAAwC,EAAE,CAAC;AAC3F;;GAEG;AACH,MAAM,WAAW,wCAAwC;IACxD;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
package/dist/esm/tasks/zero-shot-object-detection/inference.d.ts
CHANGED
@@ -8,29 +8,25 @@
  */
 export interface ZeroShotObjectDetectionInput {
 	/**
-	 * The input image data
+	 * The input image data as a base64-encoded string.
 	 */
-	inputs:
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */
-	parameters
-	[key: string]: unknown;
-	};
+	parameters: ZeroShotObjectDetectionParameters;
 	[property: string]: unknown;
 }
 /**
- *
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Zero Shot Object Detection
  */
-export interface
+export interface ZeroShotObjectDetectionParameters {
 	/**
 	 * The candidate labels for this image
 	 */
-
-	/**
-	 * The image data to generate bounding boxes from
-	 */
-	image: unknown;
+	candidate_labels: string[];
 	[property: string]: unknown;
 }
 /**
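
Note: alongside this reshape, `BoundingBox` and the output element are re-exported from `tasks/index.d.ts` (see the hunk above). A sketch of building a request and consuming a detection (the corner-field names follow the published `BoundingBox` interface; the formatting helper is illustrative):

```ts
import type {
	ZeroShotObjectDetectionInput,
	ZeroShotObjectDetectionOutputElement,
} from "@huggingface/tasks";

const request: ZeroShotObjectDetectionInput = {
	inputs: "<base64-encoded image>",
	parameters: { candidate_labels: ["person", "bicycle"] },
};

function describe(det: ZeroShotObjectDetectionOutputElement): string {
	const { xmin, ymin, xmax, ymax } = det.box;
	return `${det.label} (${det.score.toFixed(2)}) at [${xmin}, ${ymin}, ${xmax}, ${ymax}]`;
}
```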
package/dist/esm/tasks/zero-shot-object-detection/inference.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-object-detection/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C;;OAEG;IACH,MAAM,EAAE,
+{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-object-detection/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf;;OAEG;IACH,UAAU,EAAE,iCAAiC,CAAC;IAC9C,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,iCAAiC;IACjD;;OAEG;IACH,gBAAgB,EAAE,MAAM,EAAE,CAAC;IAC3B,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;GAGG;AACH,MAAM,WAAW,WAAW;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,6BAA6B,GAAG,oCAAoC,EAAE,CAAC;AACnF;;GAEG;AACH,MAAM,WAAW,oCAAoC;IACpD;;;OAGG;IACH,GAAG,EAAE,WAAW,CAAC;IACjB;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
 	"name": "@huggingface/tasks",
 	"packageManager": "pnpm@8.10.5",
-	"version": "0.13.2",
+	"version": "0.13.4",
 	"description": "List of ML tasks for huggingface.co/tasks",
 	"repository": "https://github.com/huggingface/huggingface.js.git",
 	"publishConfig": {
package/src/index.ts
CHANGED
@@ -49,6 +49,7 @@ import * as snippets from "./snippets/index.js";
 export * from "./gguf.js";
 
 export { snippets };
+export type { InferenceSnippet } from "./snippets/index.js";
 
 export { SKUS, DEFAULT_MEMORY_OPTIONS } from "./hardware.js";
 export type { HardwareSpec, SkuType } from "./hardware.js";
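
Note: `InferenceSnippet` was previously only reachable via a deep import; it now ships from the package root. A sketch (the `content` field is an assumption about the snippet shape, not confirmed by this diff):

```ts
import type { InferenceSnippet } from "@huggingface/tasks";

function render(snippet: InferenceSnippet): void {
	// Field access is illustrative; check the snippets module for the actual shape.
	console.log(snippet.content);
}
```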
package/src/model-libraries.ts
CHANGED
@@ -212,6 +212,13 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
 		repoUrl: "https://github.com/cartesia-ai/cartesia_mlx",
 		snippets: snippets.cartesia_mlx,
 	},
+	clipscope: {
+		prettyLabel: "clipscope",
+		repoName: "clipscope",
+		repoUrl: "https://github.com/Lewington-pitsos/clipscope",
+		filter: false,
+		countDownloads: `path_extension:"pt"`,
+	},
 	cotracker: {
 		prettyLabel: "CoTracker",
 		repoName: "CoTracker",
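
Note: `countDownloads` entries in this file are Hub search expressions that define which file downloads count toward a library's totals; for `clipscope`, any `.pt` file counts, and `filter: false` appears to keep the library out of the filterable list. A sketch of reading the new entry (assuming the root re-export of `MODEL_LIBRARIES_UI_ELEMENTS`):

```ts
import { MODEL_LIBRARIES_UI_ELEMENTS } from "@huggingface/tasks";

const lib = MODEL_LIBRARIES_UI_ELEMENTS.clipscope;
console.log(lib.prettyLabel); // "clipscope"
console.log(lib.countDownloads); // path_extension:"pt"
```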
package/src/pipelines.ts
CHANGED
@@ -355,6 +355,12 @@ export const PIPELINE_DATA = {
 		modality: "audio",
 		color: "green",
 	},
+	"audio-text-to-text": {
+		name: "Audio-Text-to-Text",
+		modality: "multimodal",
+		color: "red",
+		hideInDatasets: true,
+	},
 	"voice-activity-detection": {
 		name: "Voice Activity Detection",
 		modality: "audio",
package/src/tasks/automatic-speech-recognition/data.ts
CHANGED
@@ -6,12 +6,16 @@ const taskData: TaskDataCustom = {
 			description: "31,175 hours of multilingual audio-text dataset in 108 languages.",
 			id: "mozilla-foundation/common_voice_17_0",
 		},
+		{
+			description: "Multilingual and diverse audio dataset with 101k hours of audio.",
+			id: "amphion/Emilia-Dataset",
+		},
 		{
 			description: "A dataset with 44.6k hours of English speaker data and 6k hours of other language speakers.",
 			id: "parler-tts/mls_eng",
 		},
 		{
-			description: "A
+			description: "A multilingual audio dataset with 370K hours of audio.",
 			id: "espnet/yodas",
 		},
 	],
@@ -54,6 +58,10 @@ const taskData: TaskDataCustom = {
 			description: "An end-to-end model that performs ASR and Speech Translation by MetaAI.",
 			id: "facebook/seamless-m4t-v2-large",
 		},
+		{
+			description: "A powerful multilingual ASR and Speech Translation model by Nvidia.",
+			id: "nvidia/canary-1b",
+		},
 		{
 			description: "Powerful speaker diarization model.",
 			id: "pyannote/speaker-diarization-3.1",
@@ -65,13 +73,17 @@ const taskData: TaskDataCustom = {
 			id: "hf-audio/whisper-large-v3",
 		},
 		{
-			description: "
-			id: "
+			description: "Latest ASR model from Useful Sensors.",
+			id: "mrfakename/Moonshinex",
 		},
 		{
 			description: "A high quality speech and text translation model by Meta.",
 			id: "facebook/seamless_m4t",
 		},
+		{
+			description: "A powerful multilingual ASR and Speech Translation model by Nvidia",
+			id: "nvidia/canary-1b",
+		},
 	],
 	summary:
 		"Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. It has many applications, such as voice user interfaces.",
package/src/tasks/document-question-answering/spec/output.json
CHANGED
@@ -22,15 +22,8 @@
 			"end": {
 				"type": "integer",
 				"description": "The end word index of the answer (in the OCR\u2019d version of the input or provided word boxes)."
-			},
-			"words": {
-				"type": "array",
-				"items": {
-					"type": "integer"
-				},
-				"description": "The index of each word/box pair that is in the answer"
 			}
 		},
-		"required": ["answer", "score", "start", "end"
+		"required": ["answer", "score", "start", "end"]
 	}
 }
package/src/tasks/index.ts
CHANGED
@@ -102,7 +102,6 @@ export type * from "./zero-shot-image-classification/inference.js";
 export type {
 	BoundingBox,
 	ZeroShotObjectDetectionInput,
-	ZeroShotObjectDetectionInputData,
 	ZeroShotObjectDetectionOutput,
 	ZeroShotObjectDetectionOutputElement,
 } from "./zero-shot-object-detection/inference.js";
@@ -116,6 +115,7 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
 	"audio-classification": ["speechbrain", "transformers", "transformers.js"],
 	"audio-to-audio": ["asteroid", "fairseq", "speechbrain"],
 	"automatic-speech-recognition": ["espnet", "nemo", "speechbrain", "transformers", "transformers.js"],
+	"audio-text-to-text": [],
 	"depth-estimation": ["transformers", "transformers.js"],
 	"document-question-answering": ["transformers", "transformers.js"],
 	"feature-extraction": ["sentence-transformers", "transformers", "transformers.js"],
@@ -197,6 +197,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
 	"any-to-any": getData("any-to-any", placeholder),
 	"audio-classification": getData("audio-classification", audioClassification),
 	"audio-to-audio": getData("audio-to-audio", audioToAudio),
+	"audio-text-to-text": getData("audio-text-to-text", placeholder),
 	"automatic-speech-recognition": getData("automatic-speech-recognition", automaticSpeechRecognition),
 	"depth-estimation": getData("depth-estimation", depthEstimation),
 	"document-question-answering": getData("document-question-answering", documentQuestionAnswering),
package/src/tasks/mask-generation/about.md
CHANGED
@@ -12,6 +12,16 @@ Generating masks can facilitate learning, especially in semi or unsupervised lea
 
 For applications where humans are in the loop, masks highlight certain regions of images for humans to validate.
 
+### Medical Imaging
+
+Mask generation models are used in medical imaging to aid in segmenting and analyzing specific regions.
+
+### Autonomous Vehicles
+
+Mask generation models are used to create segments and masks for obstacles and other objects in view.
+
+This page was made possible thanks to the efforts of [Raj Aryan](https://huggingface.co/thatrajaryan) and other contributors.
+
 ## Task Variants
 
 ### Segmentation
package/src/tasks/mask-generation/data.ts
CHANGED
@@ -1,7 +1,16 @@
 import type { TaskDataCustom } from "../index.js";
 
 const taskData: TaskDataCustom = {
-	datasets: [
+	datasets: [
+		{
+			description: "Widely used benchmark dataset for multiple Vision tasks.",
+			id: "merve/coco2017",
+		},
+		{
+			description: "Medical Imaging dataset of the Human Brain for segmentation and mask generating tasks",
+			id: "rocky93/BraTS_segmentation",
+		},
+	],
 	demo: {
 		inputs: [
 			{
@@ -16,7 +25,12 @@ const taskData: TaskDataCustom = {
 			},
 		],
 	},
-	metrics: [
+	metrics: [
+		{
+			description: "IoU is used to measure the overlap between predicted mask and the ground truth mask.",
+			id: "Intersection over Union (IoU)",
+		},
+	],
 	models: [
 		{
 			description: "Small yet powerful mask generation model.",
package/src/tasks/text-to-speech/data.ts
CHANGED
@@ -11,6 +11,10 @@ const taskData: TaskDataCustom = {
 			description: "Multi-speaker English dataset.",
 			id: "mythicinfinity/libritts_r",
 		},
+		{
+			description: "Mulit-lingual dataset.",
+			id: "facebook/multilingual_librispeech",
+		},
 	],
 	demo: {
 		inputs: [
@@ -35,20 +39,24 @@ const taskData: TaskDataCustom = {
 	],
 	models: [
 		{
-			description: "A powerful TTS model.",
+			description: "A prompt based, powerful TTS model.",
 			id: "parler-tts/parler-tts-large-v1",
 		},
+		{
+			description: "A powerful TTS model that supports English and Chinese.",
+			id: "SWivid/F5-TTS",
+		},
 		{
 			description: "A massively multi-lingual TTS model.",
 			id: "coqui/XTTS-v2",
 		},
 		{
-			description: "
-			id: "
+			description: "A powerful TTS model.",
+			id: "amphion/MaskGCT",
 		},
 		{
-			description: "A
-			id: "
+			description: "A Llama based TTS model.",
+			id: "OuteAI/OuteTTS-0.1-350M",
 		},
 	],
 	spaces: [
@@ -66,8 +74,8 @@ const taskData: TaskDataCustom = {
 			id: "mrfakename/E2-F5-TTS",
 		},
 		{
-			description: "An application that synthesizes speech for diverse speaker prompts.",
-			id: "parler-tts/
+			description: "An application that synthesizes emotional speech for diverse speaker prompts.",
+			id: "parler-tts/parler-tts-expresso",
 		},
 	],
 	summary: