@huggingface/tasks 0.13.2 → 0.13.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/dist/commonjs/index.d.ts +1 -0
  2. package/dist/commonjs/index.d.ts.map +1 -1
  3. package/dist/commonjs/model-libraries.d.ts +8 -1
  4. package/dist/commonjs/model-libraries.d.ts.map +1 -1
  5. package/dist/commonjs/model-libraries.js +7 -0
  6. package/dist/commonjs/pipelines.d.ts +7 -1
  7. package/dist/commonjs/pipelines.d.ts.map +1 -1
  8. package/dist/commonjs/pipelines.js +6 -0
  9. package/dist/commonjs/tasks/automatic-speech-recognition/data.d.ts.map +1 -1
  10. package/dist/commonjs/tasks/automatic-speech-recognition/data.js +15 -3
  11. package/dist/commonjs/tasks/document-question-answering/inference.d.ts +0 -4
  12. package/dist/commonjs/tasks/document-question-answering/inference.d.ts.map +1 -1
  13. package/dist/commonjs/tasks/index.d.ts +1 -1
  14. package/dist/commonjs/tasks/index.d.ts.map +1 -1
  15. package/dist/commonjs/tasks/index.js +2 -0
  16. package/dist/commonjs/tasks/mask-generation/data.d.ts.map +1 -1
  17. package/dist/commonjs/tasks/mask-generation/data.js +16 -2
  18. package/dist/commonjs/tasks/text-to-speech/data.d.ts.map +1 -1
  19. package/dist/commonjs/tasks/text-to-speech/data.js +15 -7
  20. package/dist/commonjs/tasks/zero-shot-classification/inference.d.ts +9 -19
  21. package/dist/commonjs/tasks/zero-shot-classification/inference.d.ts.map +1 -1
  22. package/dist/commonjs/tasks/zero-shot-image-classification/inference.d.ts +9 -19
  23. package/dist/commonjs/tasks/zero-shot-image-classification/inference.d.ts.map +1 -1
  24. package/dist/commonjs/tasks/zero-shot-object-detection/inference.d.ts +8 -12
  25. package/dist/commonjs/tasks/zero-shot-object-detection/inference.d.ts.map +1 -1
  26. package/dist/esm/index.d.ts +1 -0
  27. package/dist/esm/index.d.ts.map +1 -1
  28. package/dist/esm/model-libraries.d.ts +8 -1
  29. package/dist/esm/model-libraries.d.ts.map +1 -1
  30. package/dist/esm/model-libraries.js +7 -0
  31. package/dist/esm/pipelines.d.ts +7 -1
  32. package/dist/esm/pipelines.d.ts.map +1 -1
  33. package/dist/esm/pipelines.js +6 -0
  34. package/dist/esm/tasks/automatic-speech-recognition/data.d.ts.map +1 -1
  35. package/dist/esm/tasks/automatic-speech-recognition/data.js +15 -3
  36. package/dist/esm/tasks/document-question-answering/inference.d.ts +0 -4
  37. package/dist/esm/tasks/document-question-answering/inference.d.ts.map +1 -1
  38. package/dist/esm/tasks/index.d.ts +1 -1
  39. package/dist/esm/tasks/index.d.ts.map +1 -1
  40. package/dist/esm/tasks/index.js +2 -0
  41. package/dist/esm/tasks/mask-generation/data.d.ts.map +1 -1
  42. package/dist/esm/tasks/mask-generation/data.js +16 -2
  43. package/dist/esm/tasks/text-to-speech/data.d.ts.map +1 -1
  44. package/dist/esm/tasks/text-to-speech/data.js +15 -7
  45. package/dist/esm/tasks/zero-shot-classification/inference.d.ts +9 -19
  46. package/dist/esm/tasks/zero-shot-classification/inference.d.ts.map +1 -1
  47. package/dist/esm/tasks/zero-shot-image-classification/inference.d.ts +9 -19
  48. package/dist/esm/tasks/zero-shot-image-classification/inference.d.ts.map +1 -1
  49. package/dist/esm/tasks/zero-shot-object-detection/inference.d.ts +8 -12
  50. package/dist/esm/tasks/zero-shot-object-detection/inference.d.ts.map +1 -1
  51. package/package.json +1 -1
  52. package/src/index.ts +1 -0
  53. package/src/model-libraries.ts +7 -0
  54. package/src/pipelines.ts +6 -0
  55. package/src/tasks/automatic-speech-recognition/data.ts +15 -3
  56. package/src/tasks/document-question-answering/inference.ts +0 -4
  57. package/src/tasks/document-question-answering/spec/output.json +1 -8
  58. package/src/tasks/index.ts +2 -1
  59. package/src/tasks/mask-generation/about.md +10 -0
  60. package/src/tasks/mask-generation/data.ts +16 -2
  61. package/src/tasks/text-to-speech/data.ts +15 -7
  62. package/src/tasks/zero-shot-classification/inference.ts +9 -19
  63. package/src/tasks/zero-shot-classification/spec/input.json +13 -20
  64. package/src/tasks/zero-shot-image-classification/inference.ts +9 -19
  65. package/src/tasks/zero-shot-image-classification/spec/input.json +13 -19
  66. package/src/tasks/zero-shot-object-detection/inference.ts +8 -12
  67. package/src/tasks/zero-shot-object-detection/spec/input.json +13 -18
@@ -306,6 +306,12 @@ export const PIPELINE_DATA = {
  modality: "audio",
  color: "green",
  },
+ "audio-text-to-text": {
+ name: "Audio-Text-to-Text",
+ modality: "multimodal",
+ color: "red",
+ hideInDatasets: true,
+ },
  "voice-activity-detection": {
  name: "Voice Activity Detection",
  modality: "audio",
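Both compiled targets and `src/pipelines.ts` gain this same entry, so the new task is a first-class pipeline key downstream. A minimal sketch of looking it up (assuming `PIPELINE_DATA` remains exported from the package root as in 0.13.2); the `hideInDatasets: true` flag tells dataset-facing UIs to leave this tag out of their filters:

```ts
import { PIPELINE_DATA } from "@huggingface/tasks";

// The new pipeline registers like any other entry.
const entry = PIPELINE_DATA["audio-text-to-text"];
console.log(entry.name);     // "Audio-Text-to-Text"
console.log(entry.modality); // "multimodal"
```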
@@ -1 +1 @@
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/automatic-speech-recognition/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cA6Ef,CAAC;AAEF,eAAe,QAAQ,CAAC"}
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/automatic-speech-recognition/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAyFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
@@ -4,12 +4,16 @@ const taskData = {
  description: "31,175 hours of multilingual audio-text dataset in 108 languages.",
  id: "mozilla-foundation/common_voice_17_0",
  },
+ {
+ description: "Multilingual and diverse audio dataset with 101k hours of audio.",
+ id: "amphion/Emilia-Dataset",
+ },
  {
  description: "A dataset with 44.6k hours of English speaker data and 6k hours of other language speakers.",
  id: "parler-tts/mls_eng",
  },
  {
- description: "A multi-lingual audio dataset with 370K hours of audio.",
+ description: "A multilingual audio dataset with 370K hours of audio.",
  id: "espnet/yodas",
  },
  ],
@@ -52,6 +56,10 @@ const taskData = {
  description: "An end-to-end model that performs ASR and Speech Translation by MetaAI.",
  id: "facebook/seamless-m4t-v2-large",
  },
+ {
+ description: "A powerful multilingual ASR and Speech Translation model by Nvidia.",
+ id: "nvidia/canary-1b",
+ },
  {
  description: "Powerful speaker diarization model.",
  id: "pyannote/speaker-diarization-3.1",
@@ -63,13 +71,17 @@ const taskData = {
  id: "hf-audio/whisper-large-v3",
  },
  {
- description: "Fastest speech recognition application.",
- id: "sanchit-gandhi/whisper-jax",
+ description: "Latest ASR model from Useful Sensors.",
+ id: "mrfakename/Moonshinex",
  },
  {
  description: "A high quality speech and text translation model by Meta.",
  id: "facebook/seamless_m4t",
  },
+ {
+ description: "A powerful multilingual ASR and Speech Translation model by Nvidia",
+ id: "nvidia/canary-1b",
+ },
  ],
  summary: "Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. It has many applications, such as voice user interfaces.",
  widgetModels: ["openai/whisper-large-v3"],
@@ -102,10 +102,6 @@ export interface DocumentQuestionAnsweringOutputElement {
  * boxes).
  */
  start: number;
- /**
- * The index of each word/box pair that is in the answer
- */
- words: number[];
  [property: string]: unknown;
  }
  //# sourceMappingURL=inference.d.ts.map
@@ -1 +1 @@
- {"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/document-question-answering/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,8BAA8B;IAC9C;;OAEG;IACH,MAAM,EAAE,kCAAkC,CAAC;IAC3C;;OAEG;IACH,UAAU,CAAC,EAAE,mCAAmC,CAAC;IACjD,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;GAEG;AACH,MAAM,WAAW,kCAAkC;IAClD;;OAEG;IACH,KAAK,EAAE,OAAO,CAAC;IACf;;OAEG;IACH,QAAQ,EAAE,MAAM,CAAC;IACjB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,mCAAmC;IACnD;;;;OAIG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;OAEG;IACH,wBAAwB,CAAC,EAAE,OAAO,CAAC;IACnC;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB;;OAEG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,EAAE,CAAC;IACvB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,OAAO,GAAG,MAAM,EAAE,GAAG,MAAM,CAAC;AACxC,MAAM,MAAM,+BAA+B,GAAG,sCAAsC,EAAE,CAAC;AACvF;;GAEG;AACH,MAAM,WAAW,sCAAsC;IACtD;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,GAAG,EAAE,MAAM,CAAC;IACZ;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,KAAK,EAAE,MAAM,EAAE,CAAC;IAChB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
+ {"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/document-question-answering/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,8BAA8B;IAC9C;;OAEG;IACH,MAAM,EAAE,kCAAkC,CAAC;IAC3C;;OAEG;IACH,UAAU,CAAC,EAAE,mCAAmC,CAAC;IACjD,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;GAEG;AACH,MAAM,WAAW,kCAAkC;IAClD;;OAEG;IACH,KAAK,EAAE,OAAO,CAAC;IACf;;OAEG;IACH,QAAQ,EAAE,MAAM,CAAC;IACjB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,mCAAmC;IACnD;;;;OAIG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;OAEG;IACH,wBAAwB,CAAC,EAAE,OAAO,CAAC;IACnC;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB;;OAEG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,EAAE,CAAC;IACvB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,OAAO,GAAG,MAAM,EAAE,GAAG,MAAM,CAAC;AACxC,MAAM,MAAM,+BAA+B,GAAG,sCAAsC,EAAE,CAAC;AACvF;;GAEG;AACH,MAAM,WAAW,sCAAsC;IACtD;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf;;;OAGG;IACH,GAAG,EAAE,MAAM,CAAC;IACZ;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
@@ -25,7 +25,7 @@ export type * from "./video-classification/inference.js";
  export type * from "./visual-question-answering/inference.js";
  export type * from "./zero-shot-classification/inference.js";
  export type * from "./zero-shot-image-classification/inference.js";
- export type { BoundingBox, ZeroShotObjectDetectionInput, ZeroShotObjectDetectionInputData, ZeroShotObjectDetectionOutput, ZeroShotObjectDetectionOutputElement, } from "./zero-shot-object-detection/inference.js";
+ export type { BoundingBox, ZeroShotObjectDetectionInput, ZeroShotObjectDetectionOutput, ZeroShotObjectDetectionOutputElement, } from "./zero-shot-object-detection/inference.js";
  import type { ModelLibraryKey } from "../model-libraries.js";
  /**
  * Model libraries compatible with each ML task
@@ -1 +1 @@
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/tasks/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AA4CpD,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,6CAA6C,CAAC;AACjE,YAAY,EACX,mBAAmB,EACnB,0BAA0B,EAC1B,oBAAoB,EACpB,4BAA4B,EAC5B,2BAA2B,EAC3B,0BAA0B,EAC1B,gCAAgC,EAChC,+BAA+B,GAC/B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,4CAA4C,CAAC;AAChE,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,0BAA0B,CAAC;AAC9C,YAAY,EACX,wBAAwB,EACxB,yBAAyB,EACzB,gCAAgC,EAChC,6BAA6B,GAC7B,MAAM,qCAAqC,CAAC;AAC7C,mBAAmB,+BAA+B,CAAC;AACnD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,oCAAoC,CAAC;AACxD,mBAAmB,8BAA8B,CAAC;AAClD,mBAAmB,yCAAyC,CAAC;AAC7D,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,YAAY,EAAE,sBAAsB,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,MAAM,+BAA+B,CAAC;AACnH,mBAAmB,qCAAqC,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AACtF,YAAY,EACX,6BAA6B,EAC7B,uBAAuB,EACvB,wBAAwB,EACxB,+BAA+B,EAC/B,4BAA4B,GAC5B,MAAM,oCAAoC,CAAC;AAC5C,YAAY,EACX,gCAAgC,EAChC,gCAAgC,EAChC,mBAAmB,EACnB,oBAAoB,EACpB,2BAA2B,EAC3B,qCAAqC,EACrC,kCAAkC,EAClC,yBAAyB,EACzB,uCAAuC,EACvC,0BAA0B,GAC1B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,0CAA0C,CAAC;AAC9D,mBAAmB,yCAAyC,CAAC;AAC7D,mBAAmB,+CAA+C,CAAC;AACnE,YAAY,EACX,WAAW,EACX,4BAA4B,EAC5B,gCAAgC,EAChC,6BAA6B,EAC7B,oCAAoC,GACpC,MAAM,2CAA2C,CAAC;AAEnD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAE7D;;GAEG;AACH,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,YAAY,EAAE,eAAe,EAAE,CA6DzE,CAAC;AAoBF,eAAO,MAAM,UAAU,EAAE,MAAM,CAAC,YAAY,EAAE,QAAQ,GAAG,SAAS,CAqDxD,CAAC;AAEX,MAAM,WAAW,WAAW;IAC3B,WAAW,EAAE,MAAM,CAAC;IACpB,EAAE,EAAE,MAAM,CAAC;CACX;AAED,MAAM,MAAM,aAAa,GACtB;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,OAAO,CAAC;CACb,GACD;IACA,IAAI,EAAE,KAAK,CAAC;QACX,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;KACd,CAAC,CAAC;IACH,IAAI,EAAE,OAAO,CAAC;CACb,GACD;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,KAAK,CAAC;CACX,GACD;IACA,KAAK,EAAE,MAAM,EAAE,EAAE,CAAC;IAClB,IAAI,EAAE,SAAS,CAAC;CACf,GACD;IACA,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;CACZ,GACD;IACA,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,KAAK,CAAC;QACb,GAAG,EAAE,MAAM,CAAC;QACZ,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;KACb,CAAC,CAAC;IACH,IAAI,EAAE,kBAAkB,CAAC;CACxB,CAAC;AAEL,MAAM,WAAW,QAAQ;IACxB,MAAM,EAAE,aAAa,EAAE,CAAC;IACxB,OAAO,EAAE,aAAa,EAAE,CAAC;CACzB;AAED,MAAM,WAAW,QAAQ;IACxB,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB,IAAI,EAAE,QAAQ,CAAC;IACf,EAAE,EAAE,YAAY,CAAC;IACjB,WAAW,CAAC,EAAE,YAAY,CAAC;IAC3B,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,eAAe,EAAE,CAAC;IAC7B,OAAO,EAAE,WAAW,EAAE,CAAC;IACvB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,MAAM,cAAc,GAAG,IAAI,CAAC,QAAQ,EAAE,IAAI,GAAG,OAAO,GAAG,WAAW,CAAC,CAAC"}
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/tasks/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AA4CpD,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,6CAA6C,CAAC;AACjE,YAAY,EACX,mBAAmB,EACnB,0BAA0B,EAC1B,oBAAoB,EACpB,4BAA4B,EAC5B,2BAA2B,EAC3B,0BAA0B,EAC1B,gCAAgC,EAChC,+BAA+B,GAC/B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,4CAA4C,CAAC;AAChE,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,0BAA0B,CAAC;AAC9C,YAAY,EACX,wBAAwB,EACxB,yBAAyB,EACzB,gCAAgC,EAChC,6BAA6B,GAC7B,MAAM,qCAAqC,CAAC;AAC7C,mBAAmB,+BAA+B,CAAC;AACnD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,iCAAiC,CAAC;AACrD,mBAAmB,mCAAmC,CAAC;AACvD,mBAAmB,oCAAoC,CAAC;AACxD,mBAAmB,8BAA8B,CAAC;AAClD,mBAAmB,yCAAyC,CAAC;AAC7D,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AAC/G,YAAY,EAAE,sBAAsB,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,MAAM,+BAA+B,CAAC;AACnH,mBAAmB,qCAAqC,CAAC;AACzD,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AACtF,YAAY,EACX,6BAA6B,EAC7B,uBAAuB,EACvB,wBAAwB,EACxB,+BAA+B,EAC/B,4BAA4B,GAC5B,MAAM,oCAAoC,CAAC;AAC5C,YAAY,EACX,gCAAgC,EAChC,gCAAgC,EAChC,mBAAmB,EACnB,oBAAoB,EACpB,2BAA2B,EAC3B,qCAAqC,EACrC,kCAAkC,EAClC,yBAAyB,EACzB,uCAAuC,EACvC,0BAA0B,GAC1B,MAAM,gCAAgC,CAAC;AACxC,mBAAmB,qCAAqC,CAAC;AACzD,mBAAmB,0CAA0C,CAAC;AAC9D,mBAAmB,yCAAyC,CAAC;AAC7D,mBAAmB,+CAA+C,CAAC;AACnE,YAAY,EACX,WAAW,EACX,4BAA4B,EAC5B,6BAA6B,EAC7B,oCAAoC,GACpC,MAAM,2CAA2C,CAAC;AAEnD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAE7D;;GAEG;AACH,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,YAAY,EAAE,eAAe,EAAE,CA8DzE,CAAC;AAoBF,eAAO,MAAM,UAAU,EAAE,MAAM,CAAC,YAAY,EAAE,QAAQ,GAAG,SAAS,CAsDxD,CAAC;AAEX,MAAM,WAAW,WAAW;IAC3B,WAAW,EAAE,MAAM,CAAC;IACpB,EAAE,EAAE,MAAM,CAAC;CACX;AAED,MAAM,MAAM,aAAa,GACtB;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,OAAO,CAAC;CACb,GACD;IACA,IAAI,EAAE,KAAK,CAAC;QACX,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;KACd,CAAC,CAAC;IACH,IAAI,EAAE,OAAO,CAAC;CACb,GACD;IACA,QAAQ,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,KAAK,CAAC;CACX,GACD;IACA,KAAK,EAAE,MAAM,EAAE,EAAE,CAAC;IAClB,IAAI,EAAE,SAAS,CAAC;CACf,GACD;IACA,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;CACZ,GACD;IACA,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,KAAK,CAAC;QACb,GAAG,EAAE,MAAM,CAAC;QACZ,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;KACb,CAAC,CAAC;IACH,IAAI,EAAE,kBAAkB,CAAC;CACxB,CAAC;AAEL,MAAM,WAAW,QAAQ;IACxB,MAAM,EAAE,aAAa,EAAE,CAAC;IACxB,OAAO,EAAE,aAAa,EAAE,CAAC;CACzB;AAED,MAAM,WAAW,QAAQ;IACxB,QAAQ,EAAE,WAAW,EAAE,CAAC;IACxB,IAAI,EAAE,QAAQ,CAAC;IACf,EAAE,EAAE,YAAY,CAAC;IACjB,WAAW,CAAC,EAAE,YAAY,CAAC;IAC3B,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,eAAe,EAAE,CAAC;IAC7B,OAAO,EAAE,WAAW,EAAE,CAAC;IACvB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,MAAM,EAAE,WAAW,EAAE,CAAC;IACtB,OAAO,EAAE,MAAM,CAAC;IAChB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,MAAM,cAAc,GAAG,IAAI,CAAC,QAAQ,EAAE,IAAI,GAAG,OAAO,GAAG,WAAW,CAAC,CAAC"}
@@ -46,6 +46,7 @@ export const TASKS_MODEL_LIBRARIES = {
  "audio-classification": ["speechbrain", "transformers", "transformers.js"],
  "audio-to-audio": ["asteroid", "fairseq", "speechbrain"],
  "automatic-speech-recognition": ["espnet", "nemo", "speechbrain", "transformers", "transformers.js"],
+ "audio-text-to-text": [],
  "depth-estimation": ["transformers", "transformers.js"],
  "document-question-answering": ["transformers", "transformers.js"],
  "feature-extraction": ["sentence-transformers", "transformers", "transformers.js"],
@@ -125,6 +126,7 @@ export const TASKS_DATA = {
  "any-to-any": getData("any-to-any", placeholder),
  "audio-classification": getData("audio-classification", audioClassification),
  "audio-to-audio": getData("audio-to-audio", audioToAudio),
+ "audio-text-to-text": getData("audio-text-to-text", placeholder),
  "automatic-speech-recognition": getData("automatic-speech-recognition", automaticSpeechRecognition),
  "depth-estimation": getData("depth-estimation", depthEstimation),
  "document-question-answering": getData("document-question-answering", documentQuestionAnswering),
@@ -1 +1 @@
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/mask-generation/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAkDf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/mask-generation/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAgEf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
@@ -1,5 +1,14 @@
  const taskData = {
- datasets: [],
+ datasets: [
+ {
+ description: "Widely used benchmark dataset for multiple Vision tasks.",
+ id: "merve/coco2017",
+ },
+ {
+ description: "Medical Imaging dataset of the Human Brain for segmentation and mask generating tasks",
+ id: "rocky93/BraTS_segmentation",
+ },
+ ],
  demo: {
  inputs: [
  {
@@ -14,7 +23,12 @@ const taskData = {
  },
  ],
  },
- metrics: [],
+ metrics: [
+ {
+ description: "IoU is used to measure the overlap between predicted mask and the ground truth mask.",
+ id: "Intersection over Union (IoU)",
+ },
+ ],
  models: [
  {
  description: "Small yet powerful mask generation model.",
@@ -1 +1 @@
- {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cA0Ef,CAAC;AAEF,eAAe,QAAQ,CAAC"}
+ {"version":3,"file":"data.d.ts","sourceRoot":"","sources":["../../../../src/tasks/text-to-speech/data.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAElD,QAAA,MAAM,QAAQ,EAAE,cAkFf,CAAC;AAEF,eAAe,QAAQ,CAAC"}
@@ -9,6 +9,10 @@ const taskData = {
  description: "Multi-speaker English dataset.",
  id: "mythicinfinity/libritts_r",
  },
+ {
+ description: "Multilingual dataset.",
+ id: "facebook/multilingual_librispeech",
+ },
  ],
  demo: {
  inputs: [
@@ -33,20 +37,24 @@ const taskData = {
  ],
  models: [
  {
- description: "A powerful TTS model.",
+ description: "A prompt based, powerful TTS model.",
  id: "parler-tts/parler-tts-large-v1",
  },
+ {
+ description: "A powerful TTS model that supports English and Chinese.",
+ id: "SWivid/F5-TTS",
+ },
  {
  description: "A massively multi-lingual TTS model.",
  id: "coqui/XTTS-v2",
  },
  {
- description: "Robust TTS model.",
- id: "metavoiceio/metavoice-1B-v0.1",
+ description: "A powerful TTS model.",
+ id: "amphion/MaskGCT",
  },
  {
- description: "A prompt based, powerful TTS model.",
- id: "parler-tts/parler_tts_mini_v0.1",
+ description: "A Llama based TTS model.",
+ id: "OuteAI/OuteTTS-0.1-350M",
  },
  ],
  spaces: [
@@ -63,8 +71,8 @@ const taskData = {
  id: "mrfakename/E2-F5-TTS",
  },
  {
- description: "An application that synthesizes speech for diverse speaker prompts.",
- id: "parler-tts/parler_tts_mini",
+ description: "An application that synthesizes emotional speech for diverse speaker prompts.",
+ id: "parler-tts/parler-tts-expresso",
  },
  ],
  summary: "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages."
@@ -8,27 +8,13 @@
  */
  export interface ZeroShotClassificationInput {
  /**
- * The input text data, with candidate labels
+ * The text to classify
  */
- inputs: ZeroShotClassificationInputData;
+ inputs: string;
  /**
  * Additional inference parameters
  */
- parameters?: ZeroShotClassificationParameters;
- [property: string]: unknown;
- }
- /**
- * The input text data, with candidate labels
- */
- export interface ZeroShotClassificationInputData {
- /**
- * The set of possible class labels to classify the text into.
- */
- candidateLabels: string[];
- /**
- * The text to classify
- */
- text: string;
+ parameters: ZeroShotClassificationParameters;
  [property: string]: unknown;
  }
  /**
@@ -38,8 +24,12 @@ export interface ZeroShotClassificationInputData {
  */
  export interface ZeroShotClassificationParameters {
  /**
- * The sentence used in conjunction with candidateLabels to attempt the text classification
- * by replacing the placeholder with the candidate labels.
+ * The set of possible class labels to classify the text into.
+ */
+ candidate_labels: string[];
+ /**
+ * The sentence used in conjunction with `candidate_labels` to attempt the text
+ * classification by replacing the placeholder with the candidate labels.
  */
  hypothesis_template?: string;
  /**
@@ -1 +1 @@
- {"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,2BAA2B;IAC3C;;OAEG;IACH,MAAM,EAAE,+BAA+B,CAAC;IACxC;;OAEG;IACH,UAAU,CAAC,EAAE,gCAAgC,CAAC;IAC9C,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;GAEG;AACH,MAAM,WAAW,+BAA+B;IAC/C;;OAEG;IACH,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B;;OAEG;IACH,IAAI,EAAE,MAAM,CAAC;IACb,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,gCAAgC;IAChD;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B;;;;OAIG;IACH,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,4BAA4B,GAAG,mCAAmC,EAAE,CAAC;AACjF;;GAEG;AACH,MAAM,WAAW,mCAAmC;IACnD;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
+ {"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-classification/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,2BAA2B;IAC3C;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf;;OAEG;IACH,UAAU,EAAE,gCAAgC,CAAC;IAC7C,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,gCAAgC;IAChD;;OAEG;IACH,gBAAgB,EAAE,MAAM,EAAE,CAAC;IAC3B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B;;;;OAIG;IACH,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,4BAA4B,GAAG,mCAAmC,EAAE,CAAC;AACjF;;GAEG;AACH,MAAM,WAAW,mCAAmC;IACnD;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
@@ -8,27 +8,13 @@
  */
  export interface ZeroShotImageClassificationInput {
  /**
- * The input image data, with candidate labels
+ * The input image data to classify as a base64-encoded string.
  */
- inputs: ZeroShotImageClassificationInputData;
+ inputs: string;
  /**
  * Additional inference parameters
  */
- parameters?: ZeroShotImageClassificationParameters;
- [property: string]: unknown;
- }
- /**
- * The input image data, with candidate labels
- */
- export interface ZeroShotImageClassificationInputData {
- /**
- * The candidate labels for this image
- */
- candidateLabels: string[];
- /**
- * The image data to classify
- */
- image: unknown;
+ parameters: ZeroShotImageClassificationParameters;
  [property: string]: unknown;
  }
  /**
@@ -38,8 +24,12 @@ export interface ZeroShotImageClassificationInputData {
  */
  export interface ZeroShotImageClassificationParameters {
  /**
- * The sentence used in conjunction with candidateLabels to attempt the text classification
- * by replacing the placeholder with the candidate labels.
+ * The candidate labels for this image
+ */
+ candidate_labels: string[];
+ /**
+ * The sentence used in conjunction with `candidate_labels` to attempt the image
+ * classification by replacing the placeholder with the candidate labels.
  */
  hypothesis_template?: string;
  [property: string]: unknown;
@@ -1 +1 @@
- {"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-image-classification/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,gCAAgC;IAChD;;OAEG;IACH,MAAM,EAAE,oCAAoC,CAAC;IAC7C;;OAEG;IACH,UAAU,CAAC,EAAE,qCAAqC,CAAC;IACnD,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;GAEG;AACH,MAAM,WAAW,oCAAoC;IACpD;;OAEG;IACH,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B;;OAEG;IACH,KAAK,EAAE,OAAO,CAAC;IACf,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,qCAAqC;IACrD;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,iCAAiC,GAAG,wCAAwC,EAAE,CAAC;AAC3F;;GAEG;AACH,MAAM,WAAW,wCAAwC;IACxD;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
+ {"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-image-classification/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,gCAAgC;IAChD;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf;;OAEG;IACH,UAAU,EAAE,qCAAqC,CAAC;IAClD,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,qCAAqC;IACrD;;OAEG;IACH,gBAAgB,EAAE,MAAM,EAAE,CAAC;IAC3B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,iCAAiC,GAAG,wCAAwC,EAAE,CAAC;AAC3F;;GAEG;AACH,MAAM,WAAW,wCAAwC;IACxD;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
@@ -8,29 +8,25 @@
  */
  export interface ZeroShotObjectDetectionInput {
  /**
- * The input image data, with candidate labels
+ * The input image data as a base64-encoded string.
  */
- inputs: ZeroShotObjectDetectionInputData;
+ inputs: string;
  /**
  * Additional inference parameters
  */
- parameters?: {
- [key: string]: unknown;
- };
+ parameters: ZeroShotObjectDetectionParameters;
  [property: string]: unknown;
  }
  /**
- * The input image data, with candidate labels
+ * Additional inference parameters
+ *
+ * Additional inference parameters for Zero Shot Object Detection
  */
- export interface ZeroShotObjectDetectionInputData {
+ export interface ZeroShotObjectDetectionParameters {
  /**
  * The candidate labels for this image
  */
- candidateLabels: string[];
- /**
- * The image data to generate bounding boxes from
- */
- image: unknown;
+ candidate_labels: string[];
  [property: string]: unknown;
  }
  /**
@@ -1 +1 @@
- {"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-object-detection/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C;;OAEG;IACH,MAAM,EAAE,gCAAgC,CAAC;IACzC;;OAEG;IACH,UAAU,CAAC,EAAE;QACZ,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;KACvB,CAAC;IACF,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;GAEG;AACH,MAAM,WAAW,gCAAgC;IAChD;;OAEG;IACH,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B;;OAEG;IACH,KAAK,EAAE,OAAO,CAAC;IACf,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;GAGG;AACH,MAAM,WAAW,WAAW;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,6BAA6B,GAAG,oCAAoC,EAAE,CAAC;AACnF;;GAEG;AACH,MAAM,WAAW,oCAAoC;IACpD;;;OAGG;IACH,GAAG,EAAE,WAAW,CAAC;IACjB;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
+ {"version":3,"file":"inference.d.ts","sourceRoot":"","sources":["../../../../src/tasks/zero-shot-object-detection/inference.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH;;GAEG;AACH,MAAM,WAAW,4BAA4B;IAC5C;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf;;OAEG;IACH,UAAU,EAAE,iCAAiC,CAAC;IAC9C,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;;GAIG;AACH,MAAM,WAAW,iCAAiC;IACjD;;OAEG;IACH,gBAAgB,EAAE,MAAM,EAAE,CAAC;IAC3B,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD;;;GAGG;AACH,MAAM,WAAW,WAAW;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B;AACD,MAAM,MAAM,6BAA6B,GAAG,oCAAoC,EAAE,CAAC;AACnF;;GAEG;AACH,MAAM,WAAW,oCAAoC;IACpD;;;OAGG;IACH,GAAG,EAAE,WAAW,CAAC;IACjB;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC;CAC5B"}
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@huggingface/tasks",
  "packageManager": "pnpm@8.10.5",
- "version": "0.13.2",
+ "version": "0.13.4",
  "description": "List of ML tasks for huggingface.co/tasks",
  "repository": "https://github.com/huggingface/huggingface.js.git",
  "publishConfig": {
package/src/index.ts CHANGED
@@ -49,6 +49,7 @@ import * as snippets from "./snippets/index.js";
  export * from "./gguf.js";

  export { snippets };
+ export type { InferenceSnippet } from "./snippets/index.js";

  export { SKUS, DEFAULT_MEMORY_OPTIONS } from "./hardware.js";
  export type { HardwareSpec, SkuType } from "./hardware.js";
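`InferenceSnippet` was previously only reachable through the `snippets` namespace; it is now a top-level type export. A minimal consumer sketch (the snippet shape is assumed from this version's snippets module: a `content` string plus optional client metadata):

```ts
import type { InferenceSnippet } from "@huggingface/tasks";

// Join generated inference snippets for display.
function renderSnippets(snippets: InferenceSnippet[]): string {
	return snippets.map((s) => s.content).join("\n\n");
}
```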
package/src/model-libraries.ts CHANGED
@@ -212,6 +212,13 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
  repoUrl: "https://github.com/cartesia-ai/cartesia_mlx",
  snippets: snippets.cartesia_mlx,
  },
+ clipscope: {
+ prettyLabel: "clipscope",
+ repoName: "clipscope",
+ repoUrl: "https://github.com/Lewington-pitsos/clipscope",
+ filter: false,
+ countDownloads: `path_extension:"pt"`,
+ },
  cotracker: {
  prettyLabel: "CoTracker",
  repoName: "CoTracker",
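The lone model-library addition is `clipscope`, which registers UI metadata only (no code snippets); its `countDownloads` query counts Hub downloads of `.pt` files. A lookup sketch, assuming `MODEL_LIBRARIES_UI_ELEMENTS` stays exported as in 0.13.2:

```ts
import { MODEL_LIBRARIES_UI_ELEMENTS } from "@huggingface/tasks";

const clipscope = MODEL_LIBRARIES_UI_ELEMENTS.clipscope;
console.log(clipscope.repoUrl);        // "https://github.com/Lewington-pitsos/clipscope"
console.log(clipscope.countDownloads); // path_extension:"pt"
```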
package/src/pipelines.ts CHANGED
@@ -355,6 +355,12 @@ export const PIPELINE_DATA = {
  modality: "audio",
  color: "green",
  },
+ "audio-text-to-text": {
+ name: "Audio-Text-to-Text",
+ modality: "multimodal",
+ color: "red",
+ hideInDatasets: true,
+ },
  "voice-activity-detection": {
  name: "Voice Activity Detection",
  modality: "audio",
package/src/tasks/automatic-speech-recognition/data.ts CHANGED
@@ -6,12 +6,16 @@ const taskData: TaskDataCustom = {
  description: "31,175 hours of multilingual audio-text dataset in 108 languages.",
  id: "mozilla-foundation/common_voice_17_0",
  },
+ {
+ description: "Multilingual and diverse audio dataset with 101k hours of audio.",
+ id: "amphion/Emilia-Dataset",
+ },
  {
  description: "A dataset with 44.6k hours of English speaker data and 6k hours of other language speakers.",
  id: "parler-tts/mls_eng",
  },
  {
- description: "A multi-lingual audio dataset with 370K hours of audio.",
+ description: "A multilingual audio dataset with 370K hours of audio.",
  id: "espnet/yodas",
  },
  ],
@@ -54,6 +58,10 @@ const taskData: TaskDataCustom = {
  description: "An end-to-end model that performs ASR and Speech Translation by MetaAI.",
  id: "facebook/seamless-m4t-v2-large",
  },
+ {
+ description: "A powerful multilingual ASR and Speech Translation model by Nvidia.",
+ id: "nvidia/canary-1b",
+ },
  {
  description: "Powerful speaker diarization model.",
  id: "pyannote/speaker-diarization-3.1",
@@ -65,13 +73,17 @@ const taskData: TaskDataCustom = {
  id: "hf-audio/whisper-large-v3",
  },
  {
- description: "Fastest speech recognition application.",
- id: "sanchit-gandhi/whisper-jax",
+ description: "Latest ASR model from Useful Sensors.",
+ id: "mrfakename/Moonshinex",
  },
  {
  description: "A high quality speech and text translation model by Meta.",
  id: "facebook/seamless_m4t",
  },
+ {
+ description: "A powerful multilingual ASR and Speech Translation model by Nvidia",
+ id: "nvidia/canary-1b",
+ },
  ],
  summary:
  "Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. It has many applications, such as voice user interfaces.",
package/src/tasks/document-question-answering/inference.ts CHANGED
@@ -102,9 +102,5 @@ export interface DocumentQuestionAnsweringOutputElement {
  * boxes).
  */
  start: number;
- /**
- * The index of each word/box pair that is in the answer
- */
- words: number[];
  [property: string]: unknown;
  }
package/src/tasks/document-question-answering/spec/output.json CHANGED
@@ -22,15 +22,8 @@
  "end": {
  "type": "integer",
  "description": "The end word index of the answer (in the OCR\u2019d version of the input or provided word boxes)."
- },
- "words": {
- "type": "array",
- "items": {
- "type": "integer"
- },
- "description": "The index of each word/box pair that is in the answer"
  }
  },
- "required": ["answer", "score", "start", "end", "words"]
+ "required": ["answer", "score", "start", "end"]
  }
  }
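`words` is dropped from the document question answering output in both the TypeScript type and the JSON spec, so consumers should stop relying on it. The surviving shape, sketched with the fields the spec still requires (values illustrative):

```ts
import type { DocumentQuestionAnsweringOutputElement } from "@huggingface/tasks";

// 0.13.4 output element: `words: number[]` is gone; the answer span remains.
const element: DocumentQuestionAnsweringOutputElement = {
	answer: "2019",
	score: 0.97,
	start: 12, // start word index in the OCR'd input
	end: 12,   // end word index in the OCR'd input
};
```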
package/src/tasks/index.ts CHANGED
@@ -102,7 +102,6 @@ export type * from "./zero-shot-image-classification/inference.js";
  export type {
  BoundingBox,
  ZeroShotObjectDetectionInput,
- ZeroShotObjectDetectionInputData,
  ZeroShotObjectDetectionOutput,
  ZeroShotObjectDetectionOutputElement,
  } from "./zero-shot-object-detection/inference.js";
@@ -116,6 +115,7 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
  "audio-classification": ["speechbrain", "transformers", "transformers.js"],
  "audio-to-audio": ["asteroid", "fairseq", "speechbrain"],
  "automatic-speech-recognition": ["espnet", "nemo", "speechbrain", "transformers", "transformers.js"],
+ "audio-text-to-text": [],
  "depth-estimation": ["transformers", "transformers.js"],
  "document-question-answering": ["transformers", "transformers.js"],
  "feature-extraction": ["sentence-transformers", "transformers", "transformers.js"],
@@ -197,6 +197,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
  "any-to-any": getData("any-to-any", placeholder),
  "audio-classification": getData("audio-classification", audioClassification),
  "audio-to-audio": getData("audio-to-audio", audioToAudio),
+ "audio-text-to-text": getData("audio-text-to-text", placeholder),
  "automatic-speech-recognition": getData("automatic-speech-recognition", automaticSpeechRecognition),
  "depth-estimation": getData("depth-estimation", depthEstimation),
  "document-question-answering": getData("document-question-answering", documentQuestionAnswering),
package/src/tasks/mask-generation/about.md CHANGED
@@ -12,6 +12,16 @@ Generating masks can facilitate learning, especially in semi or unsupervised lea

  For applications where humans are in the loop, masks highlight certain regions of images for humans to validate.

+ ### Medical Imaging
+
+ Mask generation models are used in medical imaging to aid in segmenting and analyzing specific regions.
+
+ ### Autonomous Vehicles
+
+ Mask generation models are used to create segments and masks for obstacles and other objects in view.
+
+ This page was made possible thanks to the efforts of [Raj Aryan](https://huggingface.co/thatrajaryan) and other contributors.
+
  ## Task Variants

  ### Segmentation
package/src/tasks/mask-generation/data.ts CHANGED
@@ -1,7 +1,16 @@
  import type { TaskDataCustom } from "../index.js";

  const taskData: TaskDataCustom = {
- datasets: [],
+ datasets: [
+ {
+ description: "Widely used benchmark dataset for multiple Vision tasks.",
+ id: "merve/coco2017",
+ },
+ {
+ description: "Medical Imaging dataset of the Human Brain for segmentation and mask generating tasks",
+ id: "rocky93/BraTS_segmentation",
+ },
+ ],
  demo: {
  inputs: [
  {
@@ -16,7 +25,12 @@ const taskData: TaskDataCustom = {
  },
  ],
  },
- metrics: [],
+ metrics: [
+ {
+ description: "IoU is used to measure the overlap between predicted mask and the ground truth mask.",
+ id: "Intersection over Union (IoU)",
+ },
+ ],
  models: [
  {
  description: "Small yet powerful mask generation model.",
package/src/tasks/text-to-speech/data.ts CHANGED
@@ -11,6 +11,10 @@ const taskData: TaskDataCustom = {
  description: "Multi-speaker English dataset.",
  id: "mythicinfinity/libritts_r",
  },
+ {
+ description: "Multilingual dataset.",
+ id: "facebook/multilingual_librispeech",
+ },
  ],
  demo: {
  inputs: [
@@ -35,20 +39,24 @@ const taskData: TaskDataCustom = {
  ],
  models: [
  {
- description: "A powerful TTS model.",
+ description: "A prompt based, powerful TTS model.",
  id: "parler-tts/parler-tts-large-v1",
  },
+ {
+ description: "A powerful TTS model that supports English and Chinese.",
+ id: "SWivid/F5-TTS",
+ },
  {
  description: "A massively multi-lingual TTS model.",
  id: "coqui/XTTS-v2",
  },
  {
- description: "Robust TTS model.",
- id: "metavoiceio/metavoice-1B-v0.1",
+ description: "A powerful TTS model.",
+ id: "amphion/MaskGCT",
  },
  {
- description: "A prompt based, powerful TTS model.",
- id: "parler-tts/parler_tts_mini_v0.1",
+ description: "A Llama based TTS model.",
+ id: "OuteAI/OuteTTS-0.1-350M",
  },
  ],
  spaces: [
@@ -66,8 +74,8 @@ const taskData: TaskDataCustom = {
  id: "mrfakename/E2-F5-TTS",
  },
  {
- description: "An application that synthesizes speech for diverse speaker prompts.",
- id: "parler-tts/parler_tts_mini",
+ description: "An application that synthesizes emotional speech for diverse speaker prompts.",
+ id: "parler-tts/parler-tts-expresso",
  },
  ],
  summary: