@huggingface/tasks 0.13.17 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. package/dist/commonjs/model-libraries-snippets.d.ts +2 -2
  2. package/dist/commonjs/model-libraries-snippets.d.ts.map +1 -1
  3. package/dist/commonjs/model-libraries-snippets.js +2 -2
  4. package/dist/commonjs/model-libraries.d.ts +2 -2
  5. package/dist/commonjs/tasks/audio-classification/inference.d.ts +1 -1
  6. package/dist/commonjs/tasks/audio-classification/inference.d.ts.map +1 -1
  7. package/dist/commonjs/tasks/automatic-speech-recognition/inference.d.ts +1 -1
  8. package/dist/commonjs/tasks/automatic-speech-recognition/inference.d.ts.map +1 -1
  9. package/dist/commonjs/tasks/automatic-speech-recognition/inference.js +0 -5
  10. package/dist/commonjs/tasks/chat-completion/inference.d.ts.map +1 -1
  11. package/dist/commonjs/tasks/chat-completion/inference.js +0 -5
  12. package/dist/commonjs/tasks/depth-estimation/inference.d.ts.map +1 -1
  13. package/dist/commonjs/tasks/depth-estimation/inference.js +0 -5
  14. package/dist/commonjs/tasks/feature-extraction/inference.d.ts.map +1 -1
  15. package/dist/commonjs/tasks/feature-extraction/inference.js +0 -5
  16. package/dist/commonjs/tasks/image-classification/inference.d.ts +1 -1
  17. package/dist/commonjs/tasks/image-classification/inference.d.ts.map +1 -1
  18. package/dist/commonjs/tasks/image-segmentation/inference.d.ts +1 -1
  19. package/dist/commonjs/tasks/image-segmentation/inference.d.ts.map +1 -1
  20. package/dist/commonjs/tasks/image-to-image/inference.d.ts +3 -3
  21. package/dist/commonjs/tasks/image-to-image/inference.d.ts.map +1 -1
  22. package/dist/commonjs/tasks/image-to-image/inference.js +0 -5
  23. package/dist/commonjs/tasks/image-to-text/inference.d.ts +1 -1
  24. package/dist/commonjs/tasks/image-to-text/inference.d.ts.map +1 -1
  25. package/dist/commonjs/tasks/image-to-text/inference.js +0 -5
  26. package/dist/commonjs/tasks/index.d.ts +1 -0
  27. package/dist/commonjs/tasks/index.d.ts.map +1 -1
  28. package/dist/commonjs/tasks/object-detection/inference.d.ts +1 -1
  29. package/dist/commonjs/tasks/object-detection/inference.d.ts.map +1 -1
  30. package/dist/commonjs/tasks/sentence-similarity/inference.d.ts.map +1 -1
  31. package/dist/commonjs/tasks/sentence-similarity/inference.js +0 -5
  32. package/dist/commonjs/tasks/summarization/inference.d.ts.map +1 -1
  33. package/dist/commonjs/tasks/summarization/inference.js +0 -5
  34. package/dist/commonjs/tasks/text-generation/inference.d.ts.map +1 -1
  35. package/dist/commonjs/tasks/text-generation/inference.js +0 -5
  36. package/dist/commonjs/tasks/text-to-audio/inference.d.ts +14 -15
  37. package/dist/commonjs/tasks/text-to-audio/inference.d.ts.map +1 -1
  38. package/dist/commonjs/tasks/text-to-audio/inference.js +0 -5
  39. package/dist/commonjs/tasks/text-to-image/inference.d.ts +2 -2
  40. package/dist/commonjs/tasks/text-to-image/inference.d.ts.map +1 -1
  41. package/dist/commonjs/tasks/text-to-image/inference.js +0 -5
  42. package/dist/commonjs/tasks/text-to-speech/inference.d.ts +14 -17
  43. package/dist/commonjs/tasks/text-to-speech/inference.d.ts.map +1 -1
  44. package/dist/commonjs/tasks/text-to-speech/inference.js +0 -5
  45. package/dist/commonjs/tasks/text-to-video/inference.d.ts +58 -0
  46. package/dist/commonjs/tasks/text-to-video/inference.d.ts.map +1 -0
  47. package/dist/commonjs/tasks/text-to-video/inference.js +2 -0
  48. package/dist/commonjs/tasks/text2text-generation/inference.d.ts.map +1 -1
  49. package/dist/commonjs/tasks/text2text-generation/inference.js +0 -5
  50. package/dist/commonjs/tasks/translation/inference.d.ts.map +1 -1
  51. package/dist/commonjs/tasks/translation/inference.js +0 -5
  52. package/dist/commonjs/tasks/visual-question-answering/inference.d.ts +1 -1
  53. package/dist/commonjs/tasks/visual-question-answering/inference.d.ts.map +1 -1
  54. package/dist/commonjs/tasks/zero-shot-image-classification/inference.d.ts +1 -1
  55. package/dist/commonjs/tasks/zero-shot-image-classification/inference.d.ts.map +1 -1
  56. package/dist/commonjs/tasks/zero-shot-object-detection/inference.d.ts +1 -1
  57. package/dist/commonjs/tasks/zero-shot-object-detection/inference.d.ts.map +1 -1
  58. package/dist/esm/model-libraries-snippets.d.ts +2 -2
  59. package/dist/esm/model-libraries-snippets.d.ts.map +1 -1
  60. package/dist/esm/model-libraries-snippets.js +2 -2
  61. package/dist/esm/model-libraries.d.ts +2 -2
  62. package/dist/esm/tasks/audio-classification/inference.d.ts +1 -1
  63. package/dist/esm/tasks/audio-classification/inference.d.ts.map +1 -1
  64. package/dist/esm/tasks/automatic-speech-recognition/inference.d.ts +1 -1
  65. package/dist/esm/tasks/automatic-speech-recognition/inference.d.ts.map +1 -1
  66. package/dist/esm/tasks/automatic-speech-recognition/inference.js +0 -5
  67. package/dist/esm/tasks/chat-completion/inference.d.ts.map +1 -1
  68. package/dist/esm/tasks/chat-completion/inference.js +0 -5
  69. package/dist/esm/tasks/depth-estimation/inference.d.ts.map +1 -1
  70. package/dist/esm/tasks/depth-estimation/inference.js +0 -5
  71. package/dist/esm/tasks/feature-extraction/inference.d.ts.map +1 -1
  72. package/dist/esm/tasks/feature-extraction/inference.js +0 -5
  73. package/dist/esm/tasks/image-classification/inference.d.ts +1 -1
  74. package/dist/esm/tasks/image-classification/inference.d.ts.map +1 -1
  75. package/dist/esm/tasks/image-segmentation/inference.d.ts +1 -1
  76. package/dist/esm/tasks/image-segmentation/inference.d.ts.map +1 -1
  77. package/dist/esm/tasks/image-to-image/inference.d.ts +3 -3
  78. package/dist/esm/tasks/image-to-image/inference.d.ts.map +1 -1
  79. package/dist/esm/tasks/image-to-image/inference.js +0 -5
  80. package/dist/esm/tasks/image-to-text/inference.d.ts +1 -1
  81. package/dist/esm/tasks/image-to-text/inference.d.ts.map +1 -1
  82. package/dist/esm/tasks/image-to-text/inference.js +0 -5
  83. package/dist/esm/tasks/index.d.ts +1 -0
  84. package/dist/esm/tasks/index.d.ts.map +1 -1
  85. package/dist/esm/tasks/object-detection/inference.d.ts +1 -1
  86. package/dist/esm/tasks/object-detection/inference.d.ts.map +1 -1
  87. package/dist/esm/tasks/sentence-similarity/inference.d.ts.map +1 -1
  88. package/dist/esm/tasks/sentence-similarity/inference.js +0 -5
  89. package/dist/esm/tasks/summarization/inference.d.ts.map +1 -1
  90. package/dist/esm/tasks/summarization/inference.js +0 -5
  91. package/dist/esm/tasks/text-generation/inference.d.ts.map +1 -1
  92. package/dist/esm/tasks/text-generation/inference.js +0 -5
  93. package/dist/esm/tasks/text-to-audio/inference.d.ts +14 -15
  94. package/dist/esm/tasks/text-to-audio/inference.d.ts.map +1 -1
  95. package/dist/esm/tasks/text-to-audio/inference.js +0 -5
  96. package/dist/esm/tasks/text-to-image/inference.d.ts +2 -2
  97. package/dist/esm/tasks/text-to-image/inference.d.ts.map +1 -1
  98. package/dist/esm/tasks/text-to-image/inference.js +0 -5
  99. package/dist/esm/tasks/text-to-speech/inference.d.ts +14 -17
  100. package/dist/esm/tasks/text-to-speech/inference.d.ts.map +1 -1
  101. package/dist/esm/tasks/text-to-speech/inference.js +0 -5
  102. package/dist/esm/tasks/text-to-video/inference.d.ts +58 -0
  103. package/dist/esm/tasks/text-to-video/inference.d.ts.map +1 -0
  104. package/dist/esm/tasks/text-to-video/inference.js +1 -0
  105. package/dist/esm/tasks/text2text-generation/inference.d.ts.map +1 -1
  106. package/dist/esm/tasks/text2text-generation/inference.js +0 -5
  107. package/dist/esm/tasks/translation/inference.d.ts.map +1 -1
  108. package/dist/esm/tasks/translation/inference.js +0 -5
  109. package/dist/esm/tasks/visual-question-answering/inference.d.ts +1 -1
  110. package/dist/esm/tasks/visual-question-answering/inference.d.ts.map +1 -1
  111. package/dist/esm/tasks/zero-shot-image-classification/inference.d.ts +1 -1
  112. package/dist/esm/tasks/zero-shot-image-classification/inference.d.ts.map +1 -1
  113. package/dist/esm/tasks/zero-shot-object-detection/inference.d.ts +1 -1
  114. package/dist/esm/tasks/zero-shot-object-detection/inference.d.ts.map +1 -1
  115. package/package.json +1 -1
  116. package/src/model-libraries-snippets.ts +3 -3
  117. package/src/tasks/audio-classification/inference.ts +1 -1
  118. package/src/tasks/audio-classification/spec/input.json +2 -1
  119. package/src/tasks/automatic-speech-recognition/inference.ts +1 -7
  120. package/src/tasks/automatic-speech-recognition/spec/input.json +2 -1
  121. package/src/tasks/chat-completion/inference.ts +0 -33
  122. package/src/tasks/depth-estimation/inference.ts +3 -3
  123. package/src/tasks/document-question-answering/spec/input.json +2 -1
  124. package/src/tasks/feature-extraction/inference.ts +0 -3
  125. package/src/tasks/image-classification/inference.ts +1 -1
  126. package/src/tasks/image-classification/spec/input.json +2 -1
  127. package/src/tasks/image-segmentation/inference.ts +1 -1
  128. package/src/tasks/image-segmentation/spec/input.json +2 -1
  129. package/src/tasks/image-to-image/inference.ts +3 -7
  130. package/src/tasks/image-to-image/spec/input.json +4 -6
  131. package/src/tasks/image-to-text/inference.ts +1 -6
  132. package/src/tasks/image-to-text/spec/input.json +2 -1
  133. package/src/tasks/index.ts +1 -0
  134. package/src/tasks/object-detection/inference.ts +1 -1
  135. package/src/tasks/object-detection/spec/input.json +2 -1
  136. package/src/tasks/sentence-similarity/inference.ts +3 -4
  137. package/src/tasks/summarization/inference.ts +3 -5
  138. package/src/tasks/text-generation/inference.ts +0 -13
  139. package/src/tasks/text-to-audio/inference.ts +14 -20
  140. package/src/tasks/text-to-audio/spec/output.json +3 -2
  141. package/src/tasks/text-to-image/inference.ts +2 -6
  142. package/src/tasks/text-to-image/spec/input.json +2 -5
  143. package/src/tasks/text-to-speech/inference.ts +14 -22
  144. package/src/tasks/text-to-speech/spec/output.json +13 -2
  145. package/src/tasks/text-to-video/inference.ts +57 -0
  146. package/src/tasks/text-to-video/spec/input.json +49 -0
  147. package/src/tasks/text-to-video/spec/output.json +13 -0
  148. package/src/tasks/text2text-generation/inference.ts +3 -5
  149. package/src/tasks/translation/inference.ts +3 -5
  150. package/src/tasks/visual-question-answering/inference.ts +1 -1
  151. package/src/tasks/visual-question-answering/spec/input.json +4 -2
  152. package/src/tasks/zero-shot-image-classification/inference.ts +1 -1
  153. package/src/tasks/zero-shot-image-classification/spec/input.json +2 -1
  154. package/src/tasks/zero-shot-object-detection/inference.ts +1 -1
  155. package/src/tasks/zero-shot-object-detection/spec/input.json +2 -1
@@ -3,7 +3,6 @@
3
3
  *
4
4
  * Using src/scripts/inference-codegen
5
5
  */
6
-
7
6
  /**
8
7
  * Inputs for Image To Image inference
9
8
  */
@@ -12,14 +11,13 @@ export interface ImageToImageInput {
12
11
  * The input image data as a base64-encoded string. If no `parameters` are provided, you can
13
12
  * also provide the image data as a raw bytes payload.
14
13
  */
15
- inputs: string;
14
+ inputs: Blob;
16
15
  /**
17
16
  * Additional inference parameters for Image To Image
18
17
  */
19
18
  parameters?: ImageToImageParameters;
20
19
  [property: string]: unknown;
21
20
  }
22
-
23
21
  /**
24
22
  * Additional inference parameters for Image To Image
25
23
  */
@@ -30,9 +28,9 @@ export interface ImageToImageParameters {
30
28
  */
31
29
  guidance_scale?: number;
32
30
  /**
33
- * One or several prompt to guide what NOT to include in image generation.
31
+ * One prompt to guide what NOT to include in image generation.
34
32
  */
35
- negative_prompt?: string[];
33
+ negative_prompt?: string;
36
34
  /**
37
35
  * For diffusion models. The number of denoising steps. More denoising steps usually lead to
38
36
  * a higher quality image at the expense of slower inference.
@@ -44,7 +42,6 @@ export interface ImageToImageParameters {
44
42
  target_size?: TargetSize;
45
43
  [property: string]: unknown;
46
44
  }
47
-
48
45
  /**
49
46
  * The size in pixel of the output image.
50
47
  */
@@ -53,7 +50,6 @@ export interface TargetSize {
53
50
  width: number;
54
51
  [property: string]: unknown;
55
52
  }
56
-
57
53
  /**
58
54
  * Outputs of inference for the Image To Image task
59
55
  */
@@ -7,7 +7,8 @@
7
7
  "properties": {
8
8
  "inputs": {
9
9
  "type": "string",
10
- "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload."
10
+ "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload.",
11
+ "comment": "type=binary"
11
12
  },
12
13
  "parameters": {
13
14
  "description": "Additional inference parameters for Image To Image",
@@ -24,11 +25,8 @@
24
25
  "description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality."
25
26
  },
26
27
  "negative_prompt": {
27
- "type": "array",
28
- "items": {
29
- "type": "string"
30
- },
31
- "description": "One or several prompt to guide what NOT to include in image generation."
28
+ "type": "string",
29
+ "description": "One prompt to guide what NOT to include in image generation."
32
30
  },
33
31
  "num_inference_steps": {
34
32
  "type": "integer",
@@ -3,7 +3,6 @@
3
3
  *
4
4
  * Using src/scripts/inference-codegen
5
5
  */
6
-
7
6
  /**
8
7
  * Inputs for Image To Text inference
9
8
  */
@@ -11,14 +10,13 @@ export interface ImageToTextInput {
11
10
  /**
12
11
  * The input image data
13
12
  */
14
- inputs: unknown;
13
+ inputs: Blob;
15
14
  /**
16
15
  * Additional inference parameters for Image To Text
17
16
  */
18
17
  parameters?: ImageToTextParameters;
19
18
  [property: string]: unknown;
20
19
  }
21
-
22
20
  /**
23
21
  * Additional inference parameters for Image To Text
24
22
  */
@@ -33,7 +31,6 @@ export interface ImageToTextParameters {
33
31
  max_new_tokens?: number;
34
32
  [property: string]: unknown;
35
33
  }
36
-
37
34
  /**
38
35
  * Parametrization of the text generation process
39
36
  */
@@ -120,12 +117,10 @@ export interface GenerationParameters {
120
117
  use_cache?: boolean;
121
118
  [property: string]: unknown;
122
119
  }
123
-
124
120
  /**
125
121
  * Controls the stopping condition for beam-based methods.
126
122
  */
127
123
  export type EarlyStoppingUnion = boolean | "never";
128
-
129
124
  /**
130
125
  * Outputs of inference for the Image To Text task
131
126
  */
@@ -6,7 +6,8 @@
6
6
  "type": "object",
7
7
  "properties": {
8
8
  "inputs": {
9
- "description": "The input image data"
9
+ "description": "The input image data",
10
+ "comment": "type=binary"
10
11
  },
11
12
  "parameters": {
12
13
  "description": "Additional inference parameters for Image To Text",
@@ -73,6 +73,7 @@ export type * from "./sentence-similarity/inference.js";
73
73
  export type * from "./summarization/inference.js";
74
74
  export type * from "./table-question-answering/inference.js";
75
75
  export type { TextToImageInput, TextToImageOutput, TextToImageParameters } from "./text-to-image/inference.js";
76
+ export type { TextToVideoParameters, TextToVideoOutput, TextToVideoInput } from "./text-to-video/inference.js";
76
77
  export type { TextToSpeechParameters, TextToSpeechInput, TextToSpeechOutput } from "./text-to-speech/inference.js";
77
78
  export type * from "./token-classification/inference.js";
78
79
  export type { TranslationInput, TranslationOutput } from "./translation/inference.js";
@@ -11,7 +11,7 @@ export interface ObjectDetectionInput {
11
11
  * The input image data as a base64-encoded string. If no `parameters` are provided, you can
12
12
  * also provide the image data as a raw bytes payload.
13
13
  */
14
- inputs: string;
14
+ inputs: Blob;
15
15
  /**
16
16
  * Additional inference parameters for Object Detection
17
17
  */
@@ -7,7 +7,8 @@
7
7
  "properties": {
8
8
  "inputs": {
9
9
  "type": "string",
10
- "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload."
10
+ "description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload.",
11
+ "comment": "type=binary"
11
12
  },
12
13
  "parameters": {
13
14
  "description": "Additional inference parameters for Object Detection",
@@ -3,9 +3,7 @@
3
3
  *
4
4
  * Using src/scripts/inference-codegen
5
5
  */
6
-
7
6
  export type SentenceSimilarityOutput = number[];
8
-
9
7
  /**
10
8
  * Inputs for Sentence similarity inference
11
9
  */
@@ -14,10 +12,11 @@ export interface SentenceSimilarityInput {
14
12
  /**
15
13
  * Additional inference parameters for Sentence Similarity
16
14
  */
17
- parameters?: { [key: string]: unknown };
15
+ parameters?: {
16
+ [key: string]: unknown;
17
+ };
18
18
  [property: string]: unknown;
19
19
  }
20
-
21
20
  export interface SentenceSimilarityInputData {
22
21
  /**
23
22
  * A list of strings which will be compared against the source_sentence.
@@ -3,7 +3,6 @@
3
3
  *
4
4
  * Using src/scripts/inference-codegen
5
5
  */
6
-
7
6
  /**
8
7
  * Inputs for Summarization inference
9
8
  */
@@ -18,7 +17,6 @@ export interface SummarizationInput {
18
17
  parameters?: SummarizationParameters;
19
18
  [property: string]: unknown;
20
19
  }
21
-
22
20
  /**
23
21
  * Additional inference parameters for summarization.
24
22
  */
@@ -30,19 +28,19 @@ export interface SummarizationParameters {
30
28
  /**
31
29
  * Additional parametrization of the text generation algorithm.
32
30
  */
33
- generate_parameters?: { [key: string]: unknown };
31
+ generate_parameters?: {
32
+ [key: string]: unknown;
33
+ };
34
34
  /**
35
35
  * The truncation strategy to use.
36
36
  */
37
37
  truncation?: SummarizationTruncationStrategy;
38
38
  [property: string]: unknown;
39
39
  }
40
-
41
40
  /**
42
41
  * The truncation strategy to use.
43
42
  */
44
43
  export type SummarizationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
45
-
46
44
  /**
47
45
  * Outputs of inference for the Summarization task
48
46
  */
@@ -3,7 +3,6 @@
3
3
  *
4
4
  * Using src/scripts/inference-codegen
5
5
  */
6
-
7
6
  /**
8
7
  * Text Generation Input.
9
8
  *
@@ -17,7 +16,6 @@ export interface TextGenerationInput {
17
16
  stream?: boolean;
18
17
  [property: string]: unknown;
19
18
  }
20
-
21
19
  export interface TextGenerationInputGenerateParameters {
22
20
  /**
23
21
  * Lora adapter id
@@ -100,7 +98,6 @@ export interface TextGenerationInputGenerateParameters {
100
98
  watermark?: boolean;
101
99
  [property: string]: unknown;
102
100
  }
103
-
104
101
  export interface TextGenerationInputGrammarType {
105
102
  type: Type;
106
103
  /**
@@ -112,9 +109,7 @@ export interface TextGenerationInputGrammarType {
112
109
  value: unknown;
113
110
  [property: string]: unknown;
114
111
  }
115
-
116
112
  export type Type = "json" | "regex";
117
-
118
113
  /**
119
114
  * Text Generation Output.
120
115
  *
@@ -127,7 +122,6 @@ export interface TextGenerationOutput {
127
122
  generated_text: string;
128
123
  [property: string]: unknown;
129
124
  }
130
-
131
125
  export interface TextGenerationOutputDetails {
132
126
  best_of_sequences?: TextGenerationOutputBestOfSequence[];
133
127
  finish_reason: TextGenerationOutputFinishReason;
@@ -138,7 +132,6 @@ export interface TextGenerationOutputDetails {
138
132
  top_tokens?: Array<TextGenerationOutputToken[]>;
139
133
  [property: string]: unknown;
140
134
  }
141
-
142
135
  export interface TextGenerationOutputBestOfSequence {
143
136
  finish_reason: TextGenerationOutputFinishReason;
144
137
  generated_text: string;
@@ -149,16 +142,13 @@ export interface TextGenerationOutputBestOfSequence {
149
142
  top_tokens?: Array<TextGenerationOutputToken[]>;
150
143
  [property: string]: unknown;
151
144
  }
152
-
153
145
  export type TextGenerationOutputFinishReason = "length" | "eos_token" | "stop_sequence";
154
-
155
146
  export interface TextGenerationOutputPrefillToken {
156
147
  id: number;
157
148
  logprob: number;
158
149
  text: string;
159
150
  [property: string]: unknown;
160
151
  }
161
-
162
152
  export interface TextGenerationOutputToken {
163
153
  id: number;
164
154
  logprob: number;
@@ -166,7 +156,6 @@ export interface TextGenerationOutputToken {
166
156
  text: string;
167
157
  [property: string]: unknown;
168
158
  }
169
-
170
159
  /**
171
160
  * Text Generation Stream Output.
172
161
  *
@@ -182,7 +171,6 @@ export interface TextGenerationStreamOutput {
182
171
  top_tokens?: TextGenerationStreamOutputToken[];
183
172
  [property: string]: unknown;
184
173
  }
185
-
186
174
  export interface TextGenerationStreamOutputStreamDetails {
187
175
  finish_reason: TextGenerationOutputFinishReason;
188
176
  generated_tokens: number;
@@ -190,7 +178,6 @@ export interface TextGenerationStreamOutputStreamDetails {
190
178
  seed?: number;
191
179
  [property: string]: unknown;
192
180
  }
193
-
194
181
  export interface TextGenerationStreamOutputToken {
195
182
  id: number;
196
183
  logprob: number;
@@ -1,9 +1,22 @@
1
+ /**
2
+ * Outputs of inference for the Text To Audio task
3
+ */
4
+ export interface TextToAudioOutput {
5
+ /**
6
+ * The generated audio waveform.
7
+ */
8
+ audio: Blob;
9
+ /**
10
+ * The sampling rate of the generated audio waveform.
11
+ */
12
+ sampling_rate: number;
13
+ [property: string]: unknown;
14
+ }
1
15
  /**
2
16
  * Inference code generated from the JSON schema spec in ./spec
3
17
  *
4
18
  * Using src/scripts/inference-codegen
5
19
  */
6
-
7
20
  /**
8
21
  * Inputs for Text To Audio inference
9
22
  */
@@ -18,7 +31,6 @@ export interface TextToAudioInput {
18
31
  parameters?: TextToAudioParameters;
19
32
  [property: string]: unknown;
20
33
  }
21
-
22
34
  /**
23
35
  * Additional inference parameters for Text To Audio
24
36
  */
@@ -29,7 +41,6 @@ export interface TextToAudioParameters {
29
41
  generation_parameters?: GenerationParameters;
30
42
  [property: string]: unknown;
31
43
  }
32
-
33
44
  /**
34
45
  * Parametrization of the text generation process
35
46
  */
@@ -116,24 +127,7 @@ export interface GenerationParameters {
116
127
  use_cache?: boolean;
117
128
  [property: string]: unknown;
118
129
  }
119
-
120
130
  /**
121
131
  * Controls the stopping condition for beam-based methods.
122
132
  */
123
133
  export type EarlyStoppingUnion = boolean | "never";
124
-
125
- /**
126
- * Outputs of inference for the Text To Audio task
127
- */
128
- export interface TextToAudioOutput {
129
- /**
130
- * The generated audio waveform.
131
- */
132
- audio: unknown;
133
- samplingRate: unknown;
134
- /**
135
- * The sampling rate of the generated audio waveform.
136
- */
137
- sampling_rate?: number;
138
- [property: string]: unknown;
139
- }
@@ -6,12 +6,13 @@
6
6
  "type": "object",
7
7
  "properties": {
8
8
  "audio": {
9
- "description": "The generated audio waveform."
9
+ "description": "The generated audio waveform.",
10
+ "comment": "type=binary"
10
11
  },
11
12
  "sampling_rate": {
12
13
  "type": "number",
13
14
  "description": "The sampling rate of the generated audio waveform."
14
15
  }
15
16
  },
16
- "required": ["audio", "samplingRate"]
17
+ "required": ["audio", "sampling_rate"]
17
18
  }
@@ -3,7 +3,6 @@
3
3
  *
4
4
  * Using src/scripts/inference-codegen
5
5
  */
6
-
7
6
  /**
8
7
  * Inputs for Text To Image inference
9
8
  */
@@ -18,7 +17,6 @@ export interface TextToImageInput {
18
17
  parameters?: TextToImageParameters;
19
18
  [property: string]: unknown;
20
19
  }
21
-
22
20
  /**
23
21
  * Additional inference parameters for Text To Image
24
22
  */
@@ -29,9 +27,9 @@ export interface TextToImageParameters {
29
27
  */
30
28
  guidance_scale?: number;
31
29
  /**
32
- * One or several prompt to guide what NOT to include in image generation.
30
+ * One prompt to guide what NOT to include in image generation.
33
31
  */
34
- negative_prompt?: string[];
32
+ negative_prompt?: string;
35
33
  /**
36
34
  * The number of denoising steps. More denoising steps usually lead to a higher quality
37
35
  * image at the expense of slower inference.
@@ -51,7 +49,6 @@ export interface TextToImageParameters {
51
49
  target_size?: TargetSize;
52
50
  [property: string]: unknown;
53
51
  }
54
-
55
52
  /**
56
53
  * The size in pixel of the output image
57
54
  */
@@ -60,7 +57,6 @@ export interface TargetSize {
60
57
  width: number;
61
58
  [property: string]: unknown;
62
59
  }
63
-
64
60
  /**
65
61
  * Outputs of inference for the Text To Image task
66
62
  */
@@ -24,11 +24,8 @@
24
24
  "description": "A higher guidance scale value encourages the model to generate images closely linked to the text prompt, but values too high may cause saturation and other artifacts."
25
25
  },
26
26
  "negative_prompt": {
27
- "type": "array",
28
- "items": {
29
- "type": "string"
30
- },
31
- "description": "One or several prompt to guide what NOT to include in image generation."
27
+ "type": "string",
28
+ "description": "One prompt to guide what NOT to include in image generation."
32
29
  },
33
30
  "num_inference_steps": {
34
31
  "type": "integer",
@@ -1,9 +1,22 @@
1
+ /**
2
+ * Outputs of inference for the Text To Speech task
3
+ */
4
+ export interface TextToSpeechOutput {
5
+ /**
6
+ * The generated audio
7
+ */
8
+ audio: Blob;
9
+ /**
10
+ * The sampling rate of the generated audio waveform.
11
+ */
12
+ sampling_rate?: number;
13
+ [property: string]: unknown;
14
+ }
1
15
  /**
2
16
  * Inference code generated from the JSON schema spec in ./spec
3
17
  *
4
18
  * Using src/scripts/inference-codegen
5
19
  */
6
-
7
20
  /**
8
21
  * Inputs for Text To Speech inference
9
22
  */
@@ -18,7 +31,6 @@ export interface TextToSpeechInput {
18
31
  parameters?: TextToSpeechParameters;
19
32
  [property: string]: unknown;
20
33
  }
21
-
22
34
  /**
23
35
  * Additional inference parameters for Text To Speech
24
36
  */
@@ -29,7 +41,6 @@ export interface TextToSpeechParameters {
29
41
  generation_parameters?: GenerationParameters;
30
42
  [property: string]: unknown;
31
43
  }
32
-
33
44
  /**
34
45
  * Parametrization of the text generation process
35
46
  */
@@ -116,26 +127,7 @@ export interface GenerationParameters {
116
127
  use_cache?: boolean;
117
128
  [property: string]: unknown;
118
129
  }
119
-
120
130
  /**
121
131
  * Controls the stopping condition for beam-based methods.
122
132
  */
123
133
  export type EarlyStoppingUnion = boolean | "never";
124
-
125
- /**
126
- * Outputs for Text to Speech inference
127
- *
128
- * Outputs of inference for the Text To Audio task
129
- */
130
- export interface TextToSpeechOutput {
131
- /**
132
- * The generated audio waveform.
133
- */
134
- audio: unknown;
135
- samplingRate: unknown;
136
- /**
137
- * The sampling rate of the generated audio waveform.
138
- */
139
- sampling_rate?: number;
140
- [property: string]: unknown;
141
- }
@@ -1,7 +1,18 @@
1
1
  {
2
- "$ref": "/inference/schemas/text-to-audio/output.json",
3
2
  "$id": "/inference/schemas/text-to-speech/output.json",
4
3
  "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Outputs of inference for the Text To Speech task",
5
5
  "title": "TextToSpeechOutput",
6
- "description": "Outputs for Text to Speech inference"
6
+ "type": "object",
7
+ "properties": {
8
+ "audio": {
9
+ "description": "The generated audio",
10
+ "comment": "type=binary"
11
+ },
12
+ "sampling_rate": {
13
+ "type": "number",
14
+ "description": "The sampling rate of the generated audio waveform."
15
+ }
16
+ },
17
+ "required": ["audio"]
7
18
  }
@@ -0,0 +1,57 @@
1
+ /**
2
+ * Inference code generated from the JSON schema spec in ./spec
3
+ *
4
+ * Using src/scripts/inference-codegen
5
+ */
6
+ /**
7
+ * Inputs for Text To Video inference
8
+ */
9
+ export interface TextToVideoInput {
10
+ /**
11
+ * The input text data (sometimes called "prompt")
12
+ */
13
+ inputs: string;
14
+ /**
15
+ * Additional inference parameters for Text To Video
16
+ */
17
+ parameters?: TextToVideoParameters;
18
+ [property: string]: unknown;
19
+ }
20
+ /**
21
+ * Additional inference parameters for Text To Video
22
+ */
23
+ export interface TextToVideoParameters {
24
+ /**
25
+ * A higher guidance scale value encourages the model to generate images closely linked to
26
+ * the text prompt, but values too high may cause saturation and other artifacts.
27
+ */
28
+ guidance_scale?: number;
29
+ /**
30
+ * One or several prompt to guide what NOT to include in image generation.
31
+ */
32
+ negative_prompt?: string[];
33
+ /**
34
+ * The num_frames parameter determines how many video frames are generated.
35
+ */
36
+ num_frames?: number;
37
+ /**
38
+ * The number of denoising steps. More denoising steps usually lead to a higher quality
39
+ * image at the expense of slower inference.
40
+ */
41
+ num_inference_steps?: number;
42
+ /**
43
+ * Seed for the random number generator.
44
+ */
45
+ seed?: number;
46
+ [property: string]: unknown;
47
+ }
48
+ /**
49
+ * Outputs of inference for the Text To Video task
50
+ */
51
+ export interface TextToVideoOutput {
52
+ /**
53
+ * The generated video returned as raw bytes in the payload.
54
+ */
55
+ video: unknown;
56
+ [property: string]: unknown;
57
+ }
@@ -0,0 +1,49 @@
1
+ {
2
+ "$id": "/inference/schemas/text-to-video/input.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Inputs for Text To Video inference",
5
+ "title": "TextToVideoInput",
6
+ "type": "object",
7
+ "properties": {
8
+ "inputs": {
9
+ "description": "The input text data (sometimes called \"prompt\")",
10
+ "type": "string"
11
+ },
12
+ "parameters": {
13
+ "description": "Additional inference parameters for Text To Video",
14
+ "$ref": "#/$defs/TextToVideoParameters"
15
+ }
16
+ },
17
+ "$defs": {
18
+ "TextToVideoParameters": {
19
+ "title": "TextToVideoParameters",
20
+ "type": "object",
21
+ "properties": {
22
+ "num_frames": {
23
+ "type": "number",
24
+ "description": "The num_frames parameter determines how many video frames are generated."
25
+ },
26
+ "guidance_scale": {
27
+ "type": "number",
28
+ "description": "A higher guidance scale value encourages the model to generate images closely linked to the text prompt, but values too high may cause saturation and other artifacts."
29
+ },
30
+ "negative_prompt": {
31
+ "type": "array",
32
+ "items": {
33
+ "type": "string"
34
+ },
35
+ "description": "One or several prompt to guide what NOT to include in image generation."
36
+ },
37
+ "num_inference_steps": {
38
+ "type": "integer",
39
+ "description": "The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
40
+ },
41
+ "seed": {
42
+ "type": "integer",
43
+ "description": "Seed for the random number generator."
44
+ }
45
+ }
46
+ }
47
+ },
48
+ "required": ["inputs"]
49
+ }
@@ -0,0 +1,13 @@
1
+ {
2
+ "$id": "/inference/schemas/text-to-video/output.json",
3
+ "$schema": "http://json-schema.org/draft-06/schema#",
4
+ "description": "Outputs of inference for the Text To Video task",
5
+ "title": "TextToVideoOutput",
6
+ "type": "object",
7
+ "properties": {
8
+ "video": {
9
+ "description": "The generated video returned as raw bytes in the payload."
10
+ }
11
+ },
12
+ "required": ["video"]
13
+ }