@huggingface/tasks 0.13.2 → 0.13.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commonjs/index.d.ts +1 -0
- package/dist/commonjs/index.d.ts.map +1 -1
- package/dist/commonjs/model-libraries.d.ts +8 -1
- package/dist/commonjs/model-libraries.d.ts.map +1 -1
- package/dist/commonjs/model-libraries.js +7 -0
- package/dist/commonjs/pipelines.d.ts +7 -1
- package/dist/commonjs/pipelines.d.ts.map +1 -1
- package/dist/commonjs/pipelines.js +6 -0
- package/dist/commonjs/tasks/automatic-speech-recognition/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/automatic-speech-recognition/data.js +15 -3
- package/dist/commonjs/tasks/document-question-answering/inference.d.ts +0 -4
- package/dist/commonjs/tasks/document-question-answering/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/index.d.ts +1 -1
- package/dist/commonjs/tasks/index.d.ts.map +1 -1
- package/dist/commonjs/tasks/index.js +2 -0
- package/dist/commonjs/tasks/mask-generation/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/mask-generation/data.js +16 -2
- package/dist/commonjs/tasks/text-to-speech/data.d.ts.map +1 -1
- package/dist/commonjs/tasks/text-to-speech/data.js +15 -7
- package/dist/commonjs/tasks/zero-shot-classification/inference.d.ts +9 -19
- package/dist/commonjs/tasks/zero-shot-classification/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/zero-shot-image-classification/inference.d.ts +9 -19
- package/dist/commonjs/tasks/zero-shot-image-classification/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/zero-shot-object-detection/inference.d.ts +8 -12
- package/dist/commonjs/tasks/zero-shot-object-detection/inference.d.ts.map +1 -1
- package/dist/esm/index.d.ts +1 -0
- package/dist/esm/index.d.ts.map +1 -1
- package/dist/esm/model-libraries.d.ts +8 -1
- package/dist/esm/model-libraries.d.ts.map +1 -1
- package/dist/esm/model-libraries.js +7 -0
- package/dist/esm/pipelines.d.ts +7 -1
- package/dist/esm/pipelines.d.ts.map +1 -1
- package/dist/esm/pipelines.js +6 -0
- package/dist/esm/tasks/automatic-speech-recognition/data.d.ts.map +1 -1
- package/dist/esm/tasks/automatic-speech-recognition/data.js +15 -3
- package/dist/esm/tasks/document-question-answering/inference.d.ts +0 -4
- package/dist/esm/tasks/document-question-answering/inference.d.ts.map +1 -1
- package/dist/esm/tasks/index.d.ts +1 -1
- package/dist/esm/tasks/index.d.ts.map +1 -1
- package/dist/esm/tasks/index.js +2 -0
- package/dist/esm/tasks/mask-generation/data.d.ts.map +1 -1
- package/dist/esm/tasks/mask-generation/data.js +16 -2
- package/dist/esm/tasks/text-to-speech/data.d.ts.map +1 -1
- package/dist/esm/tasks/text-to-speech/data.js +15 -7
- package/dist/esm/tasks/zero-shot-classification/inference.d.ts +9 -19
- package/dist/esm/tasks/zero-shot-classification/inference.d.ts.map +1 -1
- package/dist/esm/tasks/zero-shot-image-classification/inference.d.ts +9 -19
- package/dist/esm/tasks/zero-shot-image-classification/inference.d.ts.map +1 -1
- package/dist/esm/tasks/zero-shot-object-detection/inference.d.ts +8 -12
- package/dist/esm/tasks/zero-shot-object-detection/inference.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/index.ts +1 -0
- package/src/model-libraries.ts +7 -0
- package/src/pipelines.ts +6 -0
- package/src/tasks/automatic-speech-recognition/data.ts +15 -3
- package/src/tasks/document-question-answering/inference.ts +0 -4
- package/src/tasks/document-question-answering/spec/output.json +1 -8
- package/src/tasks/index.ts +2 -1
- package/src/tasks/mask-generation/about.md +10 -0
- package/src/tasks/mask-generation/data.ts +16 -2
- package/src/tasks/text-to-speech/data.ts +15 -7
- package/src/tasks/zero-shot-classification/inference.ts +9 -19
- package/src/tasks/zero-shot-classification/spec/input.json +13 -20
- package/src/tasks/zero-shot-image-classification/inference.ts +9 -19
- package/src/tasks/zero-shot-image-classification/spec/input.json +13 -19
- package/src/tasks/zero-shot-object-detection/inference.ts +8 -12
- package/src/tasks/zero-shot-object-detection/spec/input.json +13 -18
package/dist/esm/pipelines.js
CHANGED
@@ -306,6 +306,12 @@ export const PIPELINE_DATA = {
 		modality: "audio",
 		color: "green",
 	},
+	"audio-text-to-text": {
+		name: "Audio-Text-to-Text",
+		modality: "multimodal",
+		color: "red",
+		hideInDatasets: true,
+	},
 	"voice-activity-detection": {
 		name: "Voice Activity Detection",
 		modality: "audio",
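
Note: for consumers, the effect of this hunk is one new key in `PIPELINE_DATA`. A minimal sketch of reading it (assuming the root re-export of `PIPELINE_DATA` that the package already provides):

```ts
import { PIPELINE_DATA } from "@huggingface/tasks";

// New in 0.13.4: a multimodal "audio-text-to-text" pipeline type.
const entry = PIPELINE_DATA["audio-text-to-text"];
console.log(entry.name); // "Audio-Text-to-Text"
console.log(entry.modality); // "multimodal"
```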
package/dist/esm/tasks/automatic-speech-recognition/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/automatic-speech-recognition/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/automatic-speech-recognition/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAyFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/automatic-speech-recognition/data.js
CHANGED
@@ -4,12 +4,16 @@ const taskData = {
 			description: "31,175 hours of multilingual audio-text dataset in 108 languages.",
 			id: "mozilla-foundation/common_voice_17_0",
 		},
+		{
+			description: "Multilingual and diverse audio dataset with 101k hours of audio.",
+			id: "amphion/Emilia-Dataset",
+		},
 		{
 			description: "A dataset with 44.6k hours of English speaker data and 6k hours of other language speakers.",
 			id: "parler-tts/mls_eng",
 		},
 		{
-			description: "A
+			description: "A multilingual audio dataset with 370K hours of audio.",
 			id: "espnet/yodas",
 		},
 	],
@@ -52,6 +56,10 @@ const taskData = {
 			description: "An end-to-end model that performs ASR and Speech Translation by MetaAI.",
 			id: "facebook/seamless-m4t-v2-large",
 		},
+		{
+			description: "A powerful multilingual ASR and Speech Translation model by Nvidia.",
+			id: "nvidia/canary-1b",
+		},
 		{
 			description: "Powerful speaker diarization model.",
 			id: "pyannote/speaker-diarization-3.1",
@@ -63,13 +71,17 @@ const taskData = {
 			id: "hf-audio/whisper-large-v3",
 		},
 		{
-			description: "
-			id: "
+			description: "Latest ASR model from Useful Sensors.",
+			id: "mrfakename/Moonshinex",
 		},
 		{
 			description: "A high quality speech and text translation model by Meta.",
 			id: "facebook/seamless_m4t",
 		},
+		{
+			description: "A powerful multilingual ASR and Speech Translation model by Nvidia",
+			id: "nvidia/canary-1b",
+		},
 	],
 	summary: "Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. It has many applications, such as voice user interfaces.",
 	widgetModels: ["openai/whisper-large-v3"],
package/dist/esm/tasks/document-question-answering/inference.d.ts
CHANGED
@@ -102,10 +102,6 @@ export interface DocumentQuestionAnsweringOutputElement {
 	 * boxes).
 	 */
 	start: number;
-	/**
-	 * The index of each word/box pair that is in the answer
-	 */
-	words: number[];
 	[property: string]: unknown;
 }
 //# sourceMappingURL=inference.d.ts.map
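
Note: `words` is gone from `DocumentQuestionAnsweringOutputElement` (mirrored in the output spec further down), so downstream code should stop relying on it. A hedged sketch of coping with both shapes during an upgrade (the helper is illustrative; `answer`, `score`, `start`, `end` stay required per the spec):

```ts
import type { DocumentQuestionAnsweringOutputElement } from "@huggingface/tasks";

function logAnswer(el: DocumentQuestionAnsweringOutputElement): void {
	console.log(`${el.answer} (score=${el.score}, span=${el.start}..${el.end})`);
	// Pre-0.13.4 payloads may still carry `words`; it is only reachable
	// through the index signature now.
	const legacyWords = el["words"] as number[] | undefined;
	if (legacyWords) console.log(`legacy word indices: ${legacyWords.join(", ")}`);
}
```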
package/dist/esm/tasks/document-question-answering/inference.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/document-question-answering/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,8BAA8B;IAC9C;;OAEG;IACH,MAAM,EAAE,kCAAkC,CAAC;IAC3C;;OAEG;IACH,UAAU,CAAC,EAAE,mCAAmC,CAAC;IACjD,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;GAEG;AACH,MAAM,WAAW,kCAAkC;IAClD;;OAEG;IACH,KAAK,EAAE,OAAO,CAAC;IACf;;OAEG;IACH,QAAQ,EAAE,MAAM,CAAC;IACjB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,mCAAmC;IACnD;;;;OAIG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;OAEG;IACH,wBAAwB,CAAC,EAAE,OAAO,CAAC;IACnC;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB;;OAEG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,EAAE,CAAC;IACvB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,OAAO,GAAG,MAAM,EAAE,GAAG,MAAM,CAAC;AACxC,MAAM,MAAM,+BAA+B,GAAG,sCAAsC,EAAE,CAAC;AACvF;;GAEG;AACH,MAAM,WAAW,sCAAsC;IACtD;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,GAAG,EAAE,MAAM,CAAC;IACZ;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,KAAK,EAAE,MAAM,CAAC;IACd
+{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/document-question-answering/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,8BAA8B;IAC9C;;OAEG;IACH,MAAM,EAAE,kCAAkC,CAAC;IAC3C;;OAEG;IACH,UAAU,CAAC,EAAE,mCAAmC,CAAC;IACjD,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;GAEG;AACH,MAAM,WAAW,kCAAkC;IAClD;;OAEG;IACH,KAAK,EAAE,OAAO,CAAC;IACf;;OAEG;IACH,QAAQ,EAAE,MAAM,CAAC;IACjB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,mCAAmC;IACnD;;;;OAIG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;OAEG;IACH,wBAAwB,CAAC,EAAE,OAAO,CAAC;IACnC;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB;;OAEG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,EAAE,CAAC;IACvB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,OAAO,GAAG,MAAM,EAAE,GAAG,MAAM,CAAC;AACxC,MAAM,MAAM,+BAA+B,GAAG,sCAAsC,EAAE,CAAC;AACvF;;GAEG;AACH,MAAM,WAAW,sCAAsC;IACtD;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,GAAG,EAAE,MAAM,CAAC;IACZ;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
@@ -25,7 +25,7 @@ export type * from "./video-classification/inference.js";
|
|
|
25
25
|
export type * from "./visual-question-answering/inference.js";
|
|
26
26
|
export type * from "./zero-shot-classification/inference.js";
|
|
27
27
|
export type * from "./zero-shot-image-classification/inference.js";
|
|
28
|
-
export type { BoundingBox, ZeroShotObjectDetectionInput,
|
|
28
|
+
export type { BoundingBox, ZeroShotObjectDetectionInput, ZeroShotObjectDetectionOutput, ZeroShotObjectDetectionOutputElement, } from "./zero-shot-object-detection/inference.js";
|
|
29
29
|
import type { ModelLibraryKey } from "../model-libraries.js";
|
|
30
30
|
/**
|
|
31
31
|
* Model libraries compatible with each ML task
|
|
package/dist/esm/tasks/index.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/tasks/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AA4CpD,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,6CAA6C,CAAC;AACjE,YAAY,EACX,mBAAmB,EACnB,0BAA0B,EAC1B,oBAAoB,EACpB,4BAA4B,EAC5B,2BAA2B,EAC3B,0BAA0B,EAC1B,gCAAgC,EAChC,+BAA+B,GAC/B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,4CAA4C,CAAC;AAChE,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,0BAA0B,CAAC;AAC9C,YAAY,EACX,wBAAwB,EACxB,yBAAyB,EACzB,gCAAgC,EAChC,6BAA6B,GAC7B,MAAM,qCAAqC,CAAC;AAC7C,mBAAmB,+BAA+B,CAAC;AACnD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,oCAAoC,CAAC;AACxD,mBAAmB,8BAA8B,CAAC;AAClD,mBAAmB,yCAAyC,CAAC;AAC7D,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,YAAY,EAAE,sBAAsB,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,MAAM,+BAA+B,CAAC;AACnH,mBAAmB,qCAAqC,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AACtF,YAAY,EACX,6BAA6B,EAC7B,uBAAuB,EACvB,wBAAwB,EACxB,+BAA+B,EAC/B,4BAA4B,GAC5B,MAAM,oCAAoC,CAAC;AAC5C,YAAY,EACX,gCAAgC,EAChC,gCAAgC,EAChC,mBAAmB,EACnB,oBAAoB,EACpB,2BAA2B,EAC3B,qCAAqC,EACrC,kCAAkC,EAClC,yBAAyB,EACzB,uCAAuC,EACvC,0BAA0B,GAC1B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,0CAA0C,CAAC;AAC9D,mBAAmB,yCAAyC,CAAC;AAC7D,mBAAmB,+CAA+C,CAAC;AACnE,YAAY,EACX,WAAW,EACX,4BAA4B,EAC5B,
+{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/tasks/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AA4CpD,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,6CAA6C,CAAC;AACjE,YAAY,EACX,mBAAmB,EACnB,0BAA0B,EAC1B,oBAAoB,EACpB,4BAA4B,EAC5B,2BAA2B,EAC3B,0BAA0B,EAC1B,gCAAgC,EAChC,+BAA+B,GAC/B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,4CAA4C,CAAC;AAChE,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,0BAA0B,CAAC;AAC9C,YAAY,EACX,wBAAwB,EACxB,yBAAyB,EACzB,gCAAgC,EAChC,6BAA6B,GAC7B,MAAM,qCAAqC,CAAC;AAC7C,mBAAmB,+BAA+B,CAAC;AACnD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,oCAAoC,CAAC;AACxD,mBAAmB,8BAA8B,CAAC;AAClD,mBAAmB,yCAAyC,CAAC;AAC7D,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,YAAY,EAAE,sBAAsB,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,MAAM,+BAA+B,CAAC;AACnH,mBAAmB,qCAAqC,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AACtF,YAAY,EACX,6BAA6B,EAC7B,uBAAuB,EACvB,wBAAwB,EACxB,+BAA+B,EAC/B,4BAA4B,GAC5B,MAAM,oCAAoC,CAAC;AAC5C,YAAY,EACX,gCAAgC,EAChC,gCAAgC,EAChC,mBAAmB,EACnB,oBAAoB,EACpB,2BAA2B,EAC3B,qCAAqC,EACrC,kCAAkC,EAClC,yBAAyB,EACzB,uCAAuC,EACvC,0BAA0B,GAC1B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,0CAA0C,CAAC;AAC9D,mBAAmB,yCAAyC,CAAC;AAC7D,mBAAmB,+CAA+C,CAAC;AACnE,YAAY,EACX,WAAW,EACX,4BAA4B,EAC5B,6BAA6B,EAC7B,oCAAoC,GACpC,MAAM,2CAA2C,CAAC;AAEnD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAE7D;;GAEG;AACH,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,YAAY,EAAE,eAAe,EAAE,CA8DzE,CAAC;AAoBF,eAAO,MAAM,UAAU,EAAE,MAAM,CAAC,YAAY,EAAE,QAAQ,GAAG,SAAS,CAsDxD,CAAC;AAEX,MAAM,WAAW,WAAW;IAC3B,WAAW,EAAE,MAAM,CAAC;IACpB,EAAE,EAAE,MAAM,CAAC;CACX;AAED,MAAM,MAAM,aAAa,GACtB;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,OAAO,CAAC;CACb,GACD;IACA,IAAI,EAAE,KAAK,CAAC;QACX,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;KACd,CAAC,CAAC;IACH,IAAI,EAAE,OAAO,CAAC;CACb,GACD;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,KAAK,CAAC;CACX,GACD;IACA,KAAK,EAAE,MAAM,EAAE,EAAE,CAAC;IAClB,IAAI,EAAE,SAAS,CAAC;CACf,GACD;IACA,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;CACZ,GACD;IACA,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,KAAK,CAAC;QACb,GAAG,EAAE,MAAM,CAAC;QACZ,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;KACb,CAAC,CAAC;IACH,IAAI,EAAE,kBAAkB,CAAC;CACxB,CAAC;AAEL,MAAM,WAAW,QAAQ;IACxB,MAAM,EAAE,aAAa,EAAE,CAAC;IACxB,OAAO,EAAE,aAAa,EAAE,CAAC;CACzB;AAED,MAAM,WAAW,QAAQ;IACxB,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB,IAAI,EAAE,QAAQ,CAAC;IACf,EAAE,EAAE,YAAY,CAAC;IACjB,WAAW,CAAC,EAAE,YAAY,CAAC;IAC3B,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,eAAe,EAAE,CAAC;IAC7B,OAAO,EAAE,WAAW,EAAE,CAAC;IACvB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,MAAM,cAAc,GAAG,IAAI,CAAC,QAAQ,EAAE,IAAI,GAAG,OAAO,GAAG,WAAW,CAAC,CAAC"}
package/dist/esm/tasks/index.js
CHANGED
@@ -46,6 +46,7 @@ export const TASKS_MODEL_LIBRARIES = {
 	"audio-classification": ["speechbrain", "transformers", "transformers.js"],
 	"audio-to-audio": ["asteroid", "fairseq", "speechbrain"],
 	"automatic-speech-recognition": ["espnet", "nemo", "speechbrain", "transformers", "transformers.js"],
+	"audio-text-to-text": [],
 	"depth-estimation": ["transformers", "transformers.js"],
 	"document-question-answering": ["transformers", "transformers.js"],
 	"feature-extraction": ["sentence-transformers", "transformers", "transformers.js"],
@@ -125,6 +126,7 @@ export const TASKS_DATA = {
 	"any-to-any": getData("any-to-any", placeholder),
 	"audio-classification": getData("audio-classification", audioClassification),
 	"audio-to-audio": getData("audio-to-audio", audioToAudio),
+	"audio-text-to-text": getData("audio-text-to-text", placeholder),
 	"automatic-speech-recognition": getData("automatic-speech-recognition", automaticSpeechRecognition),
 	"depth-estimation": getData("depth-estimation", depthEstimation),
 	"document-question-answering": getData("document-question-answering", documentQuestionAnswering),
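
Note: the new pipeline type is registered with `placeholder` task data and an empty library list, so the key exists but carries no curated content yet. A sketch of probing it (assuming both constants are reachable from the root export):

```ts
import { TASKS_DATA, TASKS_MODEL_LIBRARIES } from "@huggingface/tasks";

// Registered in 0.13.4 with placeholder data and no associated libraries.
console.log(TASKS_MODEL_LIBRARIES["audio-text-to-text"]); // []
console.log(TASKS_DATA["audio-text-to-text"]?.label);
```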
package/dist/esm/tasks/mask-generation/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/mask-generation/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/mask-generation/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAgEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/mask-generation/data.js
CHANGED
@@ -1,5 +1,14 @@
 const taskData = {
-	datasets: [
+	datasets: [
+		{
+			description: "Widely used benchmark dataset for multiple Vision tasks.",
+			id: "merve/coco2017",
+		},
+		{
+			description: "Medical Imaging dataset of the Human Brain for segmentation and mask generating tasks",
+			id: "rocky93/BraTS_segmentation",
+		},
+	],
 	demo: {
 		inputs: [
 			{
@@ -14,7 +23,12 @@ const taskData = {
 			},
 		],
 	},
-	metrics: [
+	metrics: [
+		{
+			description: "IoU is used to measure the overlap between predicted mask and the ground truth mask.",
+			id: "Intersection over Union (IoU)",
+		},
+	],
 	models: [
 		{
 			description: "Small yet powerful mask generation model.",
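
Note: the IoU metric documented in this hunk is easy to state precisely; a self-contained sketch over flat binary masks (illustrative only, not part of the package):

```ts
// Intersection over Union: |A ∩ B| / |A ∪ B| for two equal-length binary masks.
function iou(pred: boolean[], truth: boolean[]): number {
	let intersection = 0;
	let union = 0;
	for (let i = 0; i < pred.length; i++) {
		if (pred[i] && truth[i]) intersection++;
		if (pred[i] || truth[i]) union++;
	}
	// Convention: two empty masks overlap perfectly.
	return union === 0 ? 1 : intersection / union;
}
```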
package/dist/esm/tasks/text-to-speech/data.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,
+{"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAkFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
package/dist/esm/tasks/text-to-speech/data.js
CHANGED
@@ -9,6 +9,10 @@ const taskData = {
 			description: "Multi-speaker English dataset.",
 			id: "mythicinfinity/libritts_r",
 		},
+		{
+			description: "Mulit-lingual dataset.",
+			id: "facebook/multilingual_librispeech",
+		},
 	],
 	demo: {
 		inputs: [
@@ -33,20 +37,24 @@ const taskData = {
 	],
 	models: [
 		{
-			description: "A powerful TTS model.",
+			description: "A prompt based, powerful TTS model.",
 			id: "parler-tts/parler-tts-large-v1",
 		},
+		{
+			description: "A powerful TTS model that supports English and Chinese.",
+			id: "SWivid/F5-TTS",
+		},
 		{
 			description: "A massively multi-lingual TTS model.",
 			id: "coqui/XTTS-v2",
 		},
 		{
-			description: "
-			id: "
+			description: "A powerful TTS model.",
+			id: "amphion/MaskGCT",
 		},
 		{
-			description: "A
-			id: "
+			description: "A Llama based TTS model.",
+			id: "OuteAI/OuteTTS-0.1-350M",
 		},
 	],
 	spaces: [
@@ -63,8 +71,8 @@ const taskData = {
 			id: "mrfakename/E2-F5-TTS",
 		},
 		{
-			description: "An application that synthesizes speech for diverse speaker prompts.",
-			id: "parler-tts/
+			description: "An application that synthesizes emotional speech for diverse speaker prompts.",
+			id: "parler-tts/parler-tts-expresso",
 		},
 	],
 	summary: "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.",
package/dist/esm/tasks/zero-shot-classification/inference.d.ts
CHANGED
@@ -8,27 +8,13 @@
  */
 export interface ZeroShotClassificationInput {
 	/**
-	 * The
+	 * The text to classify
 	 */
-	inputs:
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */
-	parameters
-	[property: string]: unknown;
-}
-/**
- * The input text data, with candidate labels
- */
-export interface ZeroShotClassificationInputData {
-	/**
-	 * The set of possible class labels to classify the text into.
-	 */
-	candidateLabels: string[];
-	/**
-	 * The text to classify
-	 */
-	text: string;
+	parameters: ZeroShotClassificationParameters;
 	[property: string]: unknown;
 }
 /**
@@ -38,8 +24,12 @@ export interface ZeroShotClassificationInputData {
  */
 export interface ZeroShotClassificationParameters {
 	/**
-	 * The
-
+	 * The set of possible class labels to classify the text into.
+	 */
+	candidate_labels: string[];
+	/**
+	 * The sentence used in conjunction with `candidate_labels` to attempt the text
+	 * classification by replacing the placeholder with the candidate labels.
 	 */
 	hypothesis_template?: string;
 	/**
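
Note: this is a breaking reshape of the request type: `ZeroShotClassificationInputData` is removed, `inputs` is now the bare text, and `candidate_labels` (snake_case) moves onto the now-required `parameters` object. A sketch of a payload under the new shape (assuming the type is reachable from the root export):

```ts
import type { ZeroShotClassificationInput } from "@huggingface/tasks";

const request: ZeroShotClassificationInput = {
	inputs: "I loved this movie, the pacing was perfect.",
	parameters: {
		candidate_labels: ["positive", "negative", "neutral"],
		// hypothesis_template (and, per the source map, multi_label) stay optional.
	},
};
```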
package/dist/esm/tasks/zero-shot-classification/inference.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,2BAA2B;IAC3C;;OAEG;IACH,MAAM,EAAE
+{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,2BAA2B;IAC3C;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf;;OAEG;IACH,UAAU,EAAE,gCAAgC,CAAC;IAC7C,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,gCAAgC;IAChD;;OAEG;IACH,gBAAgB,EAAE,MAAM,EAAE,CAAC;IAC3B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B;;;;OAIG;IACH,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,4BAA4B,GAAG,mCAAmC,EAAE,CAAC;AACjF;;GAEG;AACH,MAAM,WAAW,mCAAmC;IACnD;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
package/dist/esm/tasks/zero-shot-image-classification/inference.d.ts
CHANGED
@@ -8,27 +8,13 @@
  */
 export interface ZeroShotImageClassificationInput {
 	/**
-	 * The input image data
+	 * The input image data to classify as a base64-encoded string.
 	 */
-	inputs:
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */
-	parameters
-	[property: string]: unknown;
-}
-/**
- * The input image data, with candidate labels
- */
-export interface ZeroShotImageClassificationInputData {
-	/**
-	 * The candidate labels for this image
-	 */
-	candidateLabels: string[];
-	/**
-	 * The image data to classify
-	 */
-	image: unknown;
+	parameters: ZeroShotImageClassificationParameters;
 	[property: string]: unknown;
 }
 /**
@@ -38,8 +24,12 @@ export interface ZeroShotImageClassificationInputData {
  */
 export interface ZeroShotImageClassificationParameters {
 	/**
-	 * The
-
+	 * The candidate labels for this image
+	 */
+	candidate_labels: string[];
+	/**
+	 * The sentence used in conjunction with `candidate_labels` to attempt the image
+	 * classification by replacing the placeholder with the candidate labels.
 	 */
 	hypothesis_template?: string;
 	[property: string]: unknown;
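
Note: the image task gets the same flattening, and the doc comment now pins `inputs` to a base64-encoded image. A sketch of building a request in Node (the file name is illustrative):

```ts
import { readFileSync } from "node:fs";
import type { ZeroShotImageClassificationInput } from "@huggingface/tasks";

const request: ZeroShotImageClassificationInput = {
	// Base64-encode the raw image bytes, as the new `inputs` doc comment asks.
	inputs: readFileSync("cat.png").toString("base64"),
	parameters: { candidate_labels: ["cat", "dog", "bird"] },
};
```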
package/dist/esm/tasks/zero-shot-image-classification/inference.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-image-classification/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,gCAAgC;IAChD;;OAEG;IACH,MAAM,EAAE,
+{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-image-classification/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,gCAAgC;IAChD;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf;;OAEG;IACH,UAAU,EAAE,qCAAqC,CAAC;IAClD,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,qCAAqC;IACrD;;OAEG;IACH,gBAAgB,EAAE,MAAM,EAAE,CAAC;IAC3B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,iCAAiC,GAAG,wCAAwC,EAAE,CAAC;AAC3F;;GAEG;AACH,MAAM,WAAW,wCAAwC;IACxD;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
package/dist/esm/tasks/zero-shot-object-detection/inference.d.ts
CHANGED
@@ -8,29 +8,25 @@
  */
 export interface ZeroShotObjectDetectionInput {
 	/**
-	 * The input image data
+	 * The input image data as a base64-encoded string.
 	 */
-	inputs:
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */
-	parameters
-	[key: string]: unknown;
-	};
+	parameters: ZeroShotObjectDetectionParameters;
 	[property: string]: unknown;
 }
 /**
- *
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Zero Shot Object Detection
  */
-export interface
+export interface ZeroShotObjectDetectionParameters {
 	/**
 	 * The candidate labels for this image
 	 */
-
-	/**
-	 * The image data to generate bounding boxes from
-	 */
-	image: unknown;
+	candidate_labels: string[];
 	[property: string]: unknown;
 }
 /**
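
Note: alongside this reshape, `BoundingBox` and the output element are re-exported from `tasks/index.d.ts` (see the hunk above). A sketch of building a request and consuming a detection (the corner-field names follow the published `BoundingBox` interface; the formatting helper is illustrative):

```ts
import type {
	ZeroShotObjectDetectionInput,
	ZeroShotObjectDetectionOutputElement,
} from "@huggingface/tasks";

const request: ZeroShotObjectDetectionInput = {
	inputs: "<base64-encoded image>",
	parameters: { candidate_labels: ["person", "bicycle"] },
};

function describe(det: ZeroShotObjectDetectionOutputElement): string {
	const { xmin, ymin, xmax, ymax } = det.box;
	return `${det.label} (${det.score.toFixed(2)}) at [${xmin}, ${ymin}, ${xmax}, ${ymax}]`;
}
```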
package/dist/esm/tasks/zero-shot-object-detection/inference.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-object-detection/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C;;OAEG;IACH,MAAM,EAAE,
+{"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-object-detection/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf;;OAEG;IACH,UAAU,EAAE,iCAAiC,CAAC;IAC9C,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,iCAAiC;IACjD;;OAEG;IACH,gBAAgB,EAAE,MAAM,EAAE,CAAC;IAC3B,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;GAGG;AACH,MAAM,WAAW,WAAW;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,6BAA6B,GAAG,oCAAoC,EAAE,CAAC;AACnF;;GAEG;AACH,MAAM,WAAW,oCAAoC;IACpD;;;OAGG;IACH,GAAG,EAAE,WAAW,CAAC;IACjB;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
 	"name": "@huggingface/tasks",
 	"packageManager": "pnpm@8.10.5",
-	"version": "0.13.2",
+	"version": "0.13.4",
 	"description": "List of ML tasks for huggingface.co/tasks",
 	"repository": "https://github.com/huggingface/huggingface.js.git",
 	"publishConfig": {
package/src/index.ts
CHANGED
@@ -49,6 +49,7 @@ import * as snippets from "./snippets/index.js";
 export * from "./gguf.js";
 
 export { snippets };
+export type { InferenceSnippet } from "./snippets/index.js";
 
 export { SKUS, DEFAULT_MEMORY_OPTIONS } from "./hardware.js";
 export type { HardwareSpec, SkuType } from "./hardware.js";
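
Note: `InferenceSnippet` was previously only reachable via a deep import; it now ships from the package root. A sketch (the `content` field is an assumption about the snippet shape, not confirmed by this diff):

```ts
import type { InferenceSnippet } from "@huggingface/tasks";

function render(snippet: InferenceSnippet): void {
	// Field access is illustrative; check the snippets module for the actual shape.
	console.log(snippet.content);
}
```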
package/src/model-libraries.ts
CHANGED
@@ -212,6 +212,13 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
 		repoUrl: "https://github.com/cartesia-ai/cartesia_mlx",
 		snippets: snippets.cartesia_mlx,
 	},
+	clipscope: {
+		prettyLabel: "clipscope",
+		repoName: "clipscope",
+		repoUrl: "https://github.com/Lewington-pitsos/clipscope",
+		filter: false,
+		countDownloads: `path_extension:"pt"`,
+	},
 	cotracker: {
 		prettyLabel: "CoTracker",
 		repoName: "CoTracker",
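
Note: `countDownloads` entries in this file are Hub search expressions that define which file downloads count toward a library's totals; for `clipscope`, any `.pt` file counts, and `filter: false` appears to keep the library out of the filterable list. A sketch of reading the new entry (assuming the root re-export of `MODEL_LIBRARIES_UI_ELEMENTS`):

```ts
import { MODEL_LIBRARIES_UI_ELEMENTS } from "@huggingface/tasks";

const lib = MODEL_LIBRARIES_UI_ELEMENTS.clipscope;
console.log(lib.prettyLabel); // "clipscope"
console.log(lib.countDownloads); // path_extension:"pt"
```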
package/src/pipelines.ts
CHANGED
@@ -355,6 +355,12 @@ export const PIPELINE_DATA = {
 		modality: "audio",
 		color: "green",
 	},
+	"audio-text-to-text": {
+		name: "Audio-Text-to-Text",
+		modality: "multimodal",
+		color: "red",
+		hideInDatasets: true,
+	},
 	"voice-activity-detection": {
 		name: "Voice Activity Detection",
 		modality: "audio",
package/src/tasks/automatic-speech-recognition/data.ts
CHANGED
@@ -6,12 +6,16 @@ const taskData: TaskDataCustom = {
 			description: "31,175 hours of multilingual audio-text dataset in 108 languages.",
 			id: "mozilla-foundation/common_voice_17_0",
 		},
+		{
+			description: "Multilingual and diverse audio dataset with 101k hours of audio.",
+			id: "amphion/Emilia-Dataset",
+		},
 		{
 			description: "A dataset with 44.6k hours of English speaker data and 6k hours of other language speakers.",
 			id: "parler-tts/mls_eng",
 		},
 		{
-			description: "A
+			description: "A multilingual audio dataset with 370K hours of audio.",
 			id: "espnet/yodas",
 		},
 	],
@@ -54,6 +58,10 @@ const taskData: TaskDataCustom = {
 			description: "An end-to-end model that performs ASR and Speech Translation by MetaAI.",
 			id: "facebook/seamless-m4t-v2-large",
 		},
+		{
+			description: "A powerful multilingual ASR and Speech Translation model by Nvidia.",
+			id: "nvidia/canary-1b",
+		},
 		{
 			description: "Powerful speaker diarization model.",
 			id: "pyannote/speaker-diarization-3.1",
@@ -65,13 +73,17 @@ const taskData: TaskDataCustom = {
 			id: "hf-audio/whisper-large-v3",
 		},
 		{
-			description: "
-			id: "
+			description: "Latest ASR model from Useful Sensors.",
+			id: "mrfakename/Moonshinex",
 		},
 		{
 			description: "A high quality speech and text translation model by Meta.",
 			id: "facebook/seamless_m4t",
 		},
+		{
+			description: "A powerful multilingual ASR and Speech Translation model by Nvidia",
+			id: "nvidia/canary-1b",
+		},
 	],
 	summary:
 		"Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. It has many applications, such as voice user interfaces.",
package/src/tasks/document-question-answering/spec/output.json
CHANGED
@@ -22,15 +22,8 @@
 			"end": {
 				"type": "integer",
 				"description": "The end word index of the answer (in the OCR\u2019d version of the input or provided word boxes)."
-			},
-			"words": {
-				"type": "array",
-				"items": {
-					"type": "integer"
-				},
-				"description": "The index of each word/box pair that is in the answer"
 			}
 		},
-		"required": ["answer", "score", "start", "end"
+		"required": ["answer", "score", "start", "end"]
 	}
 }
package/src/tasks/index.ts
CHANGED
@@ -102,7 +102,6 @@ export type * from "./zero-shot-image-classification/inference.js";
 export type {
 	BoundingBox,
 	ZeroShotObjectDetectionInput,
-	ZeroShotObjectDetectionInputData,
 	ZeroShotObjectDetectionOutput,
 	ZeroShotObjectDetectionOutputElement,
 } from "./zero-shot-object-detection/inference.js";
@@ -116,6 +115,7 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
 	"audio-classification": ["speechbrain", "transformers", "transformers.js"],
 	"audio-to-audio": ["asteroid", "fairseq", "speechbrain"],
 	"automatic-speech-recognition": ["espnet", "nemo", "speechbrain", "transformers", "transformers.js"],
+	"audio-text-to-text": [],
 	"depth-estimation": ["transformers", "transformers.js"],
 	"document-question-answering": ["transformers", "transformers.js"],
 	"feature-extraction": ["sentence-transformers", "transformers", "transformers.js"],
@@ -197,6 +197,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
 	"any-to-any": getData("any-to-any", placeholder),
 	"audio-classification": getData("audio-classification", audioClassification),
 	"audio-to-audio": getData("audio-to-audio", audioToAudio),
+	"audio-text-to-text": getData("audio-text-to-text", placeholder),
 	"automatic-speech-recognition": getData("automatic-speech-recognition", automaticSpeechRecognition),
 	"depth-estimation": getData("depth-estimation", depthEstimation),
 	"document-question-answering": getData("document-question-answering", documentQuestionAnswering),
package/src/tasks/mask-generation/about.md
CHANGED
@@ -12,6 +12,16 @@ Generating masks can facilitate learning, especially in semi or unsupervised lea
 
 For applications where humans are in the loop, masks highlight certain regions of images for humans to validate.
 
+### Medical Imaging
+
+Mask generation models are used in medical imaging to aid in segmenting and analyzing specific regions.
+
+### Autonomous Vehicles
+
+Mask generation models are used to create segments and masks for obstacles and other objects in view.
+
+This page was made possible thanks to the efforts of [Raj Aryan](https://huggingface.co/thatrajaryan) and other contributors.
+
 ## Task Variants
 
 ### Segmentation
package/src/tasks/mask-generation/data.ts
CHANGED
@@ -1,7 +1,16 @@
 import type { TaskDataCustom } from "../index.js";
 
 const taskData: TaskDataCustom = {
-	datasets: [
+	datasets: [
+		{
+			description: "Widely used benchmark dataset for multiple Vision tasks.",
+			id: "merve/coco2017",
+		},
+		{
+			description: "Medical Imaging dataset of the Human Brain for segmentation and mask generating tasks",
+			id: "rocky93/BraTS_segmentation",
+		},
+	],
 	demo: {
 		inputs: [
 			{
@@ -16,7 +25,12 @@ const taskData: TaskDataCustom = {
 			},
 		],
 	},
-	metrics: [
+	metrics: [
+		{
+			description: "IoU is used to measure the overlap between predicted mask and the ground truth mask.",
+			id: "Intersection over Union (IoU)",
+		},
+	],
 	models: [
 		{
 			description: "Small yet powerful mask generation model.",
package/src/tasks/text-to-speech/data.ts
CHANGED
@@ -11,6 +11,10 @@ const taskData: TaskDataCustom = {
 			description: "Multi-speaker English dataset.",
 			id: "mythicinfinity/libritts_r",
 		},
+		{
+			description: "Mulit-lingual dataset.",
+			id: "facebook/multilingual_librispeech",
+		},
 	],
 	demo: {
 		inputs: [
@@ -35,20 +39,24 @@ const taskData: TaskDataCustom = {
 	],
 	models: [
 		{
-			description: "A powerful TTS model.",
+			description: "A prompt based, powerful TTS model.",
 			id: "parler-tts/parler-tts-large-v1",
 		},
+		{
+			description: "A powerful TTS model that supports English and Chinese.",
+			id: "SWivid/F5-TTS",
+		},
 		{
 			description: "A massively multi-lingual TTS model.",
 			id: "coqui/XTTS-v2",
 		},
 		{
-			description: "
-			id: "
+			description: "A powerful TTS model.",
+			id: "amphion/MaskGCT",
 		},
 		{
-			description: "A
-			id: "
+			description: "A Llama based TTS model.",
+			id: "OuteAI/OuteTTS-0.1-350M",
 		},
 	],
 	spaces: [
@@ -66,8 +74,8 @@ const taskData: TaskDataCustom = {
 			id: "mrfakename/E2-F5-TTS",
 		},
 		{
-			description: "An application that synthesizes speech for diverse speaker prompts.",
-			id: "parler-tts/
+			description: "An application that synthesizes emotional speech for diverse speaker prompts.",
+			id: "parler-tts/parler-tts-expresso",
 		},
 	],
 	summary: