@huggingface/tasks 0.13.17 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commonjs/model-libraries-snippets.d.ts +2 -2
- package/dist/commonjs/model-libraries-snippets.d.ts.map +1 -1
- package/dist/commonjs/model-libraries-snippets.js +2 -2
- package/dist/commonjs/model-libraries.d.ts +2 -2
- package/dist/commonjs/tasks/audio-classification/inference.d.ts +1 -1
- package/dist/commonjs/tasks/audio-classification/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/automatic-speech-recognition/inference.d.ts +1 -1
- package/dist/commonjs/tasks/automatic-speech-recognition/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/automatic-speech-recognition/inference.js +0 -5
- package/dist/commonjs/tasks/chat-completion/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/chat-completion/inference.js +0 -5
- package/dist/commonjs/tasks/depth-estimation/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/depth-estimation/inference.js +0 -5
- package/dist/commonjs/tasks/feature-extraction/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/feature-extraction/inference.js +0 -5
- package/dist/commonjs/tasks/image-classification/inference.d.ts +1 -1
- package/dist/commonjs/tasks/image-classification/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/image-segmentation/inference.d.ts +1 -1
- package/dist/commonjs/tasks/image-segmentation/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/image-to-image/inference.d.ts +3 -3
- package/dist/commonjs/tasks/image-to-image/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/image-to-image/inference.js +0 -5
- package/dist/commonjs/tasks/image-to-text/inference.d.ts +1 -1
- package/dist/commonjs/tasks/image-to-text/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/image-to-text/inference.js +0 -5
- package/dist/commonjs/tasks/index.d.ts +1 -0
- package/dist/commonjs/tasks/index.d.ts.map +1 -1
- package/dist/commonjs/tasks/object-detection/inference.d.ts +1 -1
- package/dist/commonjs/tasks/object-detection/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/sentence-similarity/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/sentence-similarity/inference.js +0 -5
- package/dist/commonjs/tasks/summarization/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/summarization/inference.js +0 -5
- package/dist/commonjs/tasks/text-generation/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/text-generation/inference.js +0 -5
- package/dist/commonjs/tasks/text-to-audio/inference.d.ts +14 -15
- package/dist/commonjs/tasks/text-to-audio/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/text-to-audio/inference.js +0 -5
- package/dist/commonjs/tasks/text-to-image/inference.d.ts +2 -2
- package/dist/commonjs/tasks/text-to-image/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/text-to-image/inference.js +0 -5
- package/dist/commonjs/tasks/text-to-speech/inference.d.ts +14 -17
- package/dist/commonjs/tasks/text-to-speech/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/text-to-speech/inference.js +0 -5
- package/dist/commonjs/tasks/text-to-video/inference.d.ts +58 -0
- package/dist/commonjs/tasks/text-to-video/inference.d.ts.map +1 -0
- package/dist/commonjs/tasks/text-to-video/inference.js +2 -0
- package/dist/commonjs/tasks/text2text-generation/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/text2text-generation/inference.js +0 -5
- package/dist/commonjs/tasks/translation/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/translation/inference.js +0 -5
- package/dist/commonjs/tasks/visual-question-answering/inference.d.ts +1 -1
- package/dist/commonjs/tasks/visual-question-answering/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/zero-shot-image-classification/inference.d.ts +1 -1
- package/dist/commonjs/tasks/zero-shot-image-classification/inference.d.ts.map +1 -1
- package/dist/commonjs/tasks/zero-shot-object-detection/inference.d.ts +1 -1
- package/dist/commonjs/tasks/zero-shot-object-detection/inference.d.ts.map +1 -1
- package/dist/esm/model-libraries-snippets.d.ts +2 -2
- package/dist/esm/model-libraries-snippets.d.ts.map +1 -1
- package/dist/esm/model-libraries-snippets.js +2 -2
- package/dist/esm/model-libraries.d.ts +2 -2
- package/dist/esm/tasks/audio-classification/inference.d.ts +1 -1
- package/dist/esm/tasks/audio-classification/inference.d.ts.map +1 -1
- package/dist/esm/tasks/automatic-speech-recognition/inference.d.ts +1 -1
- package/dist/esm/tasks/automatic-speech-recognition/inference.d.ts.map +1 -1
- package/dist/esm/tasks/automatic-speech-recognition/inference.js +0 -5
- package/dist/esm/tasks/chat-completion/inference.d.ts.map +1 -1
- package/dist/esm/tasks/chat-completion/inference.js +0 -5
- package/dist/esm/tasks/depth-estimation/inference.d.ts.map +1 -1
- package/dist/esm/tasks/depth-estimation/inference.js +0 -5
- package/dist/esm/tasks/feature-extraction/inference.d.ts.map +1 -1
- package/dist/esm/tasks/feature-extraction/inference.js +0 -5
- package/dist/esm/tasks/image-classification/inference.d.ts +1 -1
- package/dist/esm/tasks/image-classification/inference.d.ts.map +1 -1
- package/dist/esm/tasks/image-segmentation/inference.d.ts +1 -1
- package/dist/esm/tasks/image-segmentation/inference.d.ts.map +1 -1
- package/dist/esm/tasks/image-to-image/inference.d.ts +3 -3
- package/dist/esm/tasks/image-to-image/inference.d.ts.map +1 -1
- package/dist/esm/tasks/image-to-image/inference.js +0 -5
- package/dist/esm/tasks/image-to-text/inference.d.ts +1 -1
- package/dist/esm/tasks/image-to-text/inference.d.ts.map +1 -1
- package/dist/esm/tasks/image-to-text/inference.js +0 -5
- package/dist/esm/tasks/index.d.ts +1 -0
- package/dist/esm/tasks/index.d.ts.map +1 -1
- package/dist/esm/tasks/object-detection/inference.d.ts +1 -1
- package/dist/esm/tasks/object-detection/inference.d.ts.map +1 -1
- package/dist/esm/tasks/sentence-similarity/inference.d.ts.map +1 -1
- package/dist/esm/tasks/sentence-similarity/inference.js +0 -5
- package/dist/esm/tasks/summarization/inference.d.ts.map +1 -1
- package/dist/esm/tasks/summarization/inference.js +0 -5
- package/dist/esm/tasks/text-generation/inference.d.ts.map +1 -1
- package/dist/esm/tasks/text-generation/inference.js +0 -5
- package/dist/esm/tasks/text-to-audio/inference.d.ts +14 -15
- package/dist/esm/tasks/text-to-audio/inference.d.ts.map +1 -1
- package/dist/esm/tasks/text-to-audio/inference.js +0 -5
- package/dist/esm/tasks/text-to-image/inference.d.ts +2 -2
- package/dist/esm/tasks/text-to-image/inference.d.ts.map +1 -1
- package/dist/esm/tasks/text-to-image/inference.js +0 -5
- package/dist/esm/tasks/text-to-speech/inference.d.ts +14 -17
- package/dist/esm/tasks/text-to-speech/inference.d.ts.map +1 -1
- package/dist/esm/tasks/text-to-speech/inference.js +0 -5
- package/dist/esm/tasks/text-to-video/inference.d.ts +58 -0
- package/dist/esm/tasks/text-to-video/inference.d.ts.map +1 -0
- package/dist/esm/tasks/text-to-video/inference.js +1 -0
- package/dist/esm/tasks/text2text-generation/inference.d.ts.map +1 -1
- package/dist/esm/tasks/text2text-generation/inference.js +0 -5
- package/dist/esm/tasks/translation/inference.d.ts.map +1 -1
- package/dist/esm/tasks/translation/inference.js +0 -5
- package/dist/esm/tasks/visual-question-answering/inference.d.ts +1 -1
- package/dist/esm/tasks/visual-question-answering/inference.d.ts.map +1 -1
- package/dist/esm/tasks/zero-shot-image-classification/inference.d.ts +1 -1
- package/dist/esm/tasks/zero-shot-image-classification/inference.d.ts.map +1 -1
- package/dist/esm/tasks/zero-shot-object-detection/inference.d.ts +1 -1
- package/dist/esm/tasks/zero-shot-object-detection/inference.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/model-libraries-snippets.ts +3 -3
- package/src/tasks/audio-classification/inference.ts +1 -1
- package/src/tasks/audio-classification/spec/input.json +2 -1
- package/src/tasks/automatic-speech-recognition/inference.ts +1 -7
- package/src/tasks/automatic-speech-recognition/spec/input.json +2 -1
- package/src/tasks/chat-completion/inference.ts +0 -33
- package/src/tasks/depth-estimation/inference.ts +3 -3
- package/src/tasks/document-question-answering/spec/input.json +2 -1
- package/src/tasks/feature-extraction/inference.ts +0 -3
- package/src/tasks/image-classification/inference.ts +1 -1
- package/src/tasks/image-classification/spec/input.json +2 -1
- package/src/tasks/image-segmentation/inference.ts +1 -1
- package/src/tasks/image-segmentation/spec/input.json +2 -1
- package/src/tasks/image-to-image/inference.ts +3 -7
- package/src/tasks/image-to-image/spec/input.json +4 -6
- package/src/tasks/image-to-text/inference.ts +1 -6
- package/src/tasks/image-to-text/spec/input.json +2 -1
- package/src/tasks/index.ts +1 -0
- package/src/tasks/object-detection/inference.ts +1 -1
- package/src/tasks/object-detection/spec/input.json +2 -1
- package/src/tasks/sentence-similarity/inference.ts +3 -4
- package/src/tasks/summarization/inference.ts +3 -5
- package/src/tasks/text-generation/inference.ts +0 -13
- package/src/tasks/text-to-audio/inference.ts +14 -20
- package/src/tasks/text-to-audio/spec/output.json +3 -2
- package/src/tasks/text-to-image/inference.ts +2 -6
- package/src/tasks/text-to-image/spec/input.json +2 -5
- package/src/tasks/text-to-speech/inference.ts +14 -22
- package/src/tasks/text-to-speech/spec/output.json +13 -2
- package/src/tasks/text-to-video/inference.ts +57 -0
- package/src/tasks/text-to-video/spec/input.json +49 -0
- package/src/tasks/text-to-video/spec/output.json +13 -0
- package/src/tasks/text2text-generation/inference.ts +3 -5
- package/src/tasks/translation/inference.ts +3 -5
- package/src/tasks/visual-question-answering/inference.ts +1 -1
- package/src/tasks/visual-question-answering/spec/input.json +4 -2
- package/src/tasks/zero-shot-image-classification/inference.ts +1 -1
- package/src/tasks/zero-shot-image-classification/spec/input.json +2 -1
- package/src/tasks/zero-shot-object-detection/inference.ts +1 -1
- package/src/tasks/zero-shot-object-detection/spec/input.json +2 -1
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Using src/scripts/inference-codegen
|
|
5
5
|
*/
|
|
6
|
-
|
|
7
6
|
/**
|
|
8
7
|
* Inputs for Image To Image inference
|
|
9
8
|
*/
|
|
@@ -12,14 +11,13 @@ export interface ImageToImageInput {
|
|
|
12
11
|
* The input image data as a base64-encoded string. If no `parameters` are provided, you can
|
|
13
12
|
* also provide the image data as a raw bytes payload.
|
|
14
13
|
*/
|
|
15
|
-
inputs: string;
|
|
14
|
+
inputs: Blob;
|
|
16
15
|
/**
|
|
17
16
|
* Additional inference parameters for Image To Image
|
|
18
17
|
*/
|
|
19
18
|
parameters?: ImageToImageParameters;
|
|
20
19
|
[property: string]: unknown;
|
|
21
20
|
}
|
|
22
|
-
|
|
23
21
|
/**
|
|
24
22
|
* Additional inference parameters for Image To Image
|
|
25
23
|
*/
|
|
@@ -30,9 +28,9 @@ export interface ImageToImageParameters {
|
|
|
30
28
|
*/
|
|
31
29
|
guidance_scale?: number;
|
|
32
30
|
/**
|
|
33
|
-
* One or several prompt to guide what NOT to include in image generation.
|
|
31
|
+
* One prompt to guide what NOT to include in image generation.
|
|
34
32
|
*/
|
|
35
|
-
negative_prompt?: string[];
|
|
33
|
+
negative_prompt?: string;
|
|
36
34
|
/**
|
|
37
35
|
* For diffusion models. The number of denoising steps. More denoising steps usually lead to
|
|
38
36
|
* a higher quality image at the expense of slower inference.
|
|
@@ -44,7 +42,6 @@ export interface ImageToImageParameters {
|
|
|
44
42
|
target_size?: TargetSize;
|
|
45
43
|
[property: string]: unknown;
|
|
46
44
|
}
|
|
47
|
-
|
|
48
45
|
/**
|
|
49
46
|
* The size in pixel of the output image.
|
|
50
47
|
*/
|
|
@@ -53,7 +50,6 @@ export interface TargetSize {
|
|
|
53
50
|
width: number;
|
|
54
51
|
[property: string]: unknown;
|
|
55
52
|
}
|
|
56
|
-
|
|
57
53
|
/**
|
|
58
54
|
* Outputs of inference for the Image To Image task
|
|
59
55
|
*/
|
|
@@ -7,7 +7,8 @@
|
|
|
7
7
|
"properties": {
|
|
8
8
|
"inputs": {
|
|
9
9
|
"type": "string",
|
|
10
|
-
"description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload."
|
|
10
|
+
"description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload.",
|
|
11
|
+
"comment": "type=binary"
|
|
11
12
|
},
|
|
12
13
|
"parameters": {
|
|
13
14
|
"description": "Additional inference parameters for Image To Image",
|
|
@@ -24,11 +25,8 @@
|
|
|
24
25
|
"description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality."
|
|
25
26
|
},
|
|
26
27
|
"negative_prompt": {
|
|
27
|
-
"type": "array",
|
|
28
|
-
"items": {
|
|
29
|
-
"type": "string"
|
|
30
|
-
},
|
|
31
|
-
"description": "One or several prompt to guide what NOT to include in image generation."
|
|
28
|
+
"type": "string",
|
|
29
|
+
"description": "One prompt to guide what NOT to include in image generation."
|
|
32
30
|
},
|
|
33
31
|
"num_inference_steps": {
|
|
34
32
|
"type": "integer",
|
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Using src/scripts/inference-codegen
|
|
5
5
|
*/
|
|
6
|
-
|
|
7
6
|
/**
|
|
8
7
|
* Inputs for Image To Text inference
|
|
9
8
|
*/
|
|
@@ -11,14 +10,13 @@ export interface ImageToTextInput {
|
|
|
11
10
|
/**
|
|
12
11
|
* The input image data
|
|
13
12
|
*/
|
|
14
|
-
inputs:
|
|
13
|
+
inputs: Blob;
|
|
15
14
|
/**
|
|
16
15
|
* Additional inference parameters for Image To Text
|
|
17
16
|
*/
|
|
18
17
|
parameters?: ImageToTextParameters;
|
|
19
18
|
[property: string]: unknown;
|
|
20
19
|
}
|
|
21
|
-
|
|
22
20
|
/**
|
|
23
21
|
* Additional inference parameters for Image To Text
|
|
24
22
|
*/
|
|
@@ -33,7 +31,6 @@ export interface ImageToTextParameters {
|
|
|
33
31
|
max_new_tokens?: number;
|
|
34
32
|
[property: string]: unknown;
|
|
35
33
|
}
|
|
36
|
-
|
|
37
34
|
/**
|
|
38
35
|
* Parametrization of the text generation process
|
|
39
36
|
*/
|
|
@@ -120,12 +117,10 @@ export interface GenerationParameters {
|
|
|
120
117
|
use_cache?: boolean;
|
|
121
118
|
[property: string]: unknown;
|
|
122
119
|
}
|
|
123
|
-
|
|
124
120
|
/**
|
|
125
121
|
* Controls the stopping condition for beam-based methods.
|
|
126
122
|
*/
|
|
127
123
|
export type EarlyStoppingUnion = boolean | "never";
|
|
128
|
-
|
|
129
124
|
/**
|
|
130
125
|
* Outputs of inference for the Image To Text task
|
|
131
126
|
*/
|
package/src/tasks/index.ts
CHANGED
|
@@ -73,6 +73,7 @@ export type * from "./sentence-similarity/inference.js";
|
|
|
73
73
|
export type * from "./summarization/inference.js";
|
|
74
74
|
export type * from "./table-question-answering/inference.js";
|
|
75
75
|
export type { TextToImageInput, TextToImageOutput, TextToImageParameters } from "./text-to-image/inference.js";
|
|
76
|
+
export type { TextToVideoParameters, TextToVideoOutput, TextToVideoInput } from "./text-to-video/inference.js";
|
|
76
77
|
export type { TextToSpeechParameters, TextToSpeechInput, TextToSpeechOutput } from "./text-to-speech/inference.js";
|
|
77
78
|
export type * from "./token-classification/inference.js";
|
|
78
79
|
export type { TranslationInput, TranslationOutput } from "./translation/inference.js";
|
|
@@ -11,7 +11,7 @@ export interface ObjectDetectionInput {
|
|
|
11
11
|
* The input image data as a base64-encoded string. If no `parameters` are provided, you can
|
|
12
12
|
* also provide the image data as a raw bytes payload.
|
|
13
13
|
*/
|
|
14
|
-
inputs: string;
|
|
14
|
+
inputs: Blob;
|
|
15
15
|
/**
|
|
16
16
|
* Additional inference parameters for Object Detection
|
|
17
17
|
*/
|
|
@@ -7,7 +7,8 @@
|
|
|
7
7
|
"properties": {
|
|
8
8
|
"inputs": {
|
|
9
9
|
"type": "string",
|
|
10
|
-
"description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload."
|
|
10
|
+
"description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload.",
|
|
11
|
+
"comment": "type=binary"
|
|
11
12
|
},
|
|
12
13
|
"parameters": {
|
|
13
14
|
"description": "Additional inference parameters for Object Detection",
|
|
@@ -3,9 +3,7 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Using src/scripts/inference-codegen
|
|
5
5
|
*/
|
|
6
|
-
|
|
7
6
|
export type SentenceSimilarityOutput = number[];
|
|
8
|
-
|
|
9
7
|
/**
|
|
10
8
|
* Inputs for Sentence similarity inference
|
|
11
9
|
*/
|
|
@@ -14,10 +12,11 @@ export interface SentenceSimilarityInput {
|
|
|
14
12
|
/**
|
|
15
13
|
* Additional inference parameters for Sentence Similarity
|
|
16
14
|
*/
|
|
17
|
-
parameters?: { [key: string]: unknown };
|
|
15
|
+
parameters?: {
|
|
16
|
+
[key: string]: unknown;
|
|
17
|
+
};
|
|
18
18
|
[property: string]: unknown;
|
|
19
19
|
}
|
|
20
|
-
|
|
21
20
|
export interface SentenceSimilarityInputData {
|
|
22
21
|
/**
|
|
23
22
|
* A list of strings which will be compared against the source_sentence.
|
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Using src/scripts/inference-codegen
|
|
5
5
|
*/
|
|
6
|
-
|
|
7
6
|
/**
|
|
8
7
|
* Inputs for Summarization inference
|
|
9
8
|
*/
|
|
@@ -18,7 +17,6 @@ export interface SummarizationInput {
|
|
|
18
17
|
parameters?: SummarizationParameters;
|
|
19
18
|
[property: string]: unknown;
|
|
20
19
|
}
|
|
21
|
-
|
|
22
20
|
/**
|
|
23
21
|
* Additional inference parameters for summarization.
|
|
24
22
|
*/
|
|
@@ -30,19 +28,19 @@ export interface SummarizationParameters {
|
|
|
30
28
|
/**
|
|
31
29
|
* Additional parametrization of the text generation algorithm.
|
|
32
30
|
*/
|
|
33
|
-
generate_parameters?: { [key: string]: unknown };
|
|
31
|
+
generate_parameters?: {
|
|
32
|
+
[key: string]: unknown;
|
|
33
|
+
};
|
|
34
34
|
/**
|
|
35
35
|
* The truncation strategy to use.
|
|
36
36
|
*/
|
|
37
37
|
truncation?: SummarizationTruncationStrategy;
|
|
38
38
|
[property: string]: unknown;
|
|
39
39
|
}
|
|
40
|
-
|
|
41
40
|
/**
|
|
42
41
|
* The truncation strategy to use.
|
|
43
42
|
*/
|
|
44
43
|
export type SummarizationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
|
|
45
|
-
|
|
46
44
|
/**
|
|
47
45
|
* Outputs of inference for the Summarization task
|
|
48
46
|
*/
|
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Using src/scripts/inference-codegen
|
|
5
5
|
*/
|
|
6
|
-
|
|
7
6
|
/**
|
|
8
7
|
* Text Generation Input.
|
|
9
8
|
*
|
|
@@ -17,7 +16,6 @@ export interface TextGenerationInput {
|
|
|
17
16
|
stream?: boolean;
|
|
18
17
|
[property: string]: unknown;
|
|
19
18
|
}
|
|
20
|
-
|
|
21
19
|
export interface TextGenerationInputGenerateParameters {
|
|
22
20
|
/**
|
|
23
21
|
* Lora adapter id
|
|
@@ -100,7 +98,6 @@ export interface TextGenerationInputGenerateParameters {
|
|
|
100
98
|
watermark?: boolean;
|
|
101
99
|
[property: string]: unknown;
|
|
102
100
|
}
|
|
103
|
-
|
|
104
101
|
export interface TextGenerationInputGrammarType {
|
|
105
102
|
type: Type;
|
|
106
103
|
/**
|
|
@@ -112,9 +109,7 @@ export interface TextGenerationInputGrammarType {
|
|
|
112
109
|
value: unknown;
|
|
113
110
|
[property: string]: unknown;
|
|
114
111
|
}
|
|
115
|
-
|
|
116
112
|
export type Type = "json" | "regex";
|
|
117
|
-
|
|
118
113
|
/**
|
|
119
114
|
* Text Generation Output.
|
|
120
115
|
*
|
|
@@ -127,7 +122,6 @@ export interface TextGenerationOutput {
|
|
|
127
122
|
generated_text: string;
|
|
128
123
|
[property: string]: unknown;
|
|
129
124
|
}
|
|
130
|
-
|
|
131
125
|
export interface TextGenerationOutputDetails {
|
|
132
126
|
best_of_sequences?: TextGenerationOutputBestOfSequence[];
|
|
133
127
|
finish_reason: TextGenerationOutputFinishReason;
|
|
@@ -138,7 +132,6 @@ export interface TextGenerationOutputDetails {
|
|
|
138
132
|
top_tokens?: Array<TextGenerationOutputToken[]>;
|
|
139
133
|
[property: string]: unknown;
|
|
140
134
|
}
|
|
141
|
-
|
|
142
135
|
export interface TextGenerationOutputBestOfSequence {
|
|
143
136
|
finish_reason: TextGenerationOutputFinishReason;
|
|
144
137
|
generated_text: string;
|
|
@@ -149,16 +142,13 @@ export interface TextGenerationOutputBestOfSequence {
|
|
|
149
142
|
top_tokens?: Array<TextGenerationOutputToken[]>;
|
|
150
143
|
[property: string]: unknown;
|
|
151
144
|
}
|
|
152
|
-
|
|
153
145
|
export type TextGenerationOutputFinishReason = "length" | "eos_token" | "stop_sequence";
|
|
154
|
-
|
|
155
146
|
export interface TextGenerationOutputPrefillToken {
|
|
156
147
|
id: number;
|
|
157
148
|
logprob: number;
|
|
158
149
|
text: string;
|
|
159
150
|
[property: string]: unknown;
|
|
160
151
|
}
|
|
161
|
-
|
|
162
152
|
export interface TextGenerationOutputToken {
|
|
163
153
|
id: number;
|
|
164
154
|
logprob: number;
|
|
@@ -166,7 +156,6 @@ export interface TextGenerationOutputToken {
|
|
|
166
156
|
text: string;
|
|
167
157
|
[property: string]: unknown;
|
|
168
158
|
}
|
|
169
|
-
|
|
170
159
|
/**
|
|
171
160
|
* Text Generation Stream Output.
|
|
172
161
|
*
|
|
@@ -182,7 +171,6 @@ export interface TextGenerationStreamOutput {
|
|
|
182
171
|
top_tokens?: TextGenerationStreamOutputToken[];
|
|
183
172
|
[property: string]: unknown;
|
|
184
173
|
}
|
|
185
|
-
|
|
186
174
|
export interface TextGenerationStreamOutputStreamDetails {
|
|
187
175
|
finish_reason: TextGenerationOutputFinishReason;
|
|
188
176
|
generated_tokens: number;
|
|
@@ -190,7 +178,6 @@ export interface TextGenerationStreamOutputStreamDetails {
|
|
|
190
178
|
seed?: number;
|
|
191
179
|
[property: string]: unknown;
|
|
192
180
|
}
|
|
193
|
-
|
|
194
181
|
export interface TextGenerationStreamOutputToken {
|
|
195
182
|
id: number;
|
|
196
183
|
logprob: number;
|
|
@@ -1,9 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Outputs of inference for the Text To Audio task
|
|
3
|
+
*/
|
|
4
|
+
export interface TextToAudioOutput {
|
|
5
|
+
/**
|
|
6
|
+
* The generated audio waveform.
|
|
7
|
+
*/
|
|
8
|
+
audio: Blob;
|
|
9
|
+
/**
|
|
10
|
+
* The sampling rate of the generated audio waveform.
|
|
11
|
+
*/
|
|
12
|
+
sampling_rate: number;
|
|
13
|
+
[property: string]: unknown;
|
|
14
|
+
}
|
|
1
15
|
/**
|
|
2
16
|
* Inference code generated from the JSON schema spec in ./spec
|
|
3
17
|
*
|
|
4
18
|
* Using src/scripts/inference-codegen
|
|
5
19
|
*/
|
|
6
|
-
|
|
7
20
|
/**
|
|
8
21
|
* Inputs for Text To Audio inference
|
|
9
22
|
*/
|
|
@@ -18,7 +31,6 @@ export interface TextToAudioInput {
|
|
|
18
31
|
parameters?: TextToAudioParameters;
|
|
19
32
|
[property: string]: unknown;
|
|
20
33
|
}
|
|
21
|
-
|
|
22
34
|
/**
|
|
23
35
|
* Additional inference parameters for Text To Audio
|
|
24
36
|
*/
|
|
@@ -29,7 +41,6 @@ export interface TextToAudioParameters {
|
|
|
29
41
|
generation_parameters?: GenerationParameters;
|
|
30
42
|
[property: string]: unknown;
|
|
31
43
|
}
|
|
32
|
-
|
|
33
44
|
/**
|
|
34
45
|
* Parametrization of the text generation process
|
|
35
46
|
*/
|
|
@@ -116,24 +127,7 @@ export interface GenerationParameters {
|
|
|
116
127
|
use_cache?: boolean;
|
|
117
128
|
[property: string]: unknown;
|
|
118
129
|
}
|
|
119
|
-
|
|
120
130
|
/**
|
|
121
131
|
* Controls the stopping condition for beam-based methods.
|
|
122
132
|
*/
|
|
123
133
|
export type EarlyStoppingUnion = boolean | "never";
|
|
124
|
-
|
|
125
|
-
/**
|
|
126
|
-
* Outputs of inference for the Text To Audio task
|
|
127
|
-
*/
|
|
128
|
-
export interface TextToAudioOutput {
|
|
129
|
-
/**
|
|
130
|
-
* The generated audio waveform.
|
|
131
|
-
*/
|
|
132
|
-
audio: unknown;
|
|
133
|
-
samplingRate: unknown;
|
|
134
|
-
/**
|
|
135
|
-
* The sampling rate of the generated audio waveform.
|
|
136
|
-
*/
|
|
137
|
-
sampling_rate?: number;
|
|
138
|
-
[property: string]: unknown;
|
|
139
|
-
}
|
|
@@ -6,12 +6,13 @@
|
|
|
6
6
|
"type": "object",
|
|
7
7
|
"properties": {
|
|
8
8
|
"audio": {
|
|
9
|
-
"description": "The generated audio waveform."
|
|
9
|
+
"description": "The generated audio waveform.",
|
|
10
|
+
"comment": "type=binary"
|
|
10
11
|
},
|
|
11
12
|
"sampling_rate": {
|
|
12
13
|
"type": "number",
|
|
13
14
|
"description": "The sampling rate of the generated audio waveform."
|
|
14
15
|
}
|
|
15
16
|
},
|
|
16
|
-
"required": ["audio", "samplingRate"]
|
|
17
|
+
"required": ["audio", "sampling_rate"]
|
|
17
18
|
}
|
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Using src/scripts/inference-codegen
|
|
5
5
|
*/
|
|
6
|
-
|
|
7
6
|
/**
|
|
8
7
|
* Inputs for Text To Image inference
|
|
9
8
|
*/
|
|
@@ -18,7 +17,6 @@ export interface TextToImageInput {
|
|
|
18
17
|
parameters?: TextToImageParameters;
|
|
19
18
|
[property: string]: unknown;
|
|
20
19
|
}
|
|
21
|
-
|
|
22
20
|
/**
|
|
23
21
|
* Additional inference parameters for Text To Image
|
|
24
22
|
*/
|
|
@@ -29,9 +27,9 @@ export interface TextToImageParameters {
|
|
|
29
27
|
*/
|
|
30
28
|
guidance_scale?: number;
|
|
31
29
|
/**
|
|
32
|
-
* One or several prompt to guide what NOT to include in image generation.
|
|
30
|
+
* One prompt to guide what NOT to include in image generation.
|
|
33
31
|
*/
|
|
34
|
-
negative_prompt?: string[];
|
|
32
|
+
negative_prompt?: string;
|
|
35
33
|
/**
|
|
36
34
|
* The number of denoising steps. More denoising steps usually lead to a higher quality
|
|
37
35
|
* image at the expense of slower inference.
|
|
@@ -51,7 +49,6 @@ export interface TextToImageParameters {
|
|
|
51
49
|
target_size?: TargetSize;
|
|
52
50
|
[property: string]: unknown;
|
|
53
51
|
}
|
|
54
|
-
|
|
55
52
|
/**
|
|
56
53
|
* The size in pixel of the output image
|
|
57
54
|
*/
|
|
@@ -60,7 +57,6 @@ export interface TargetSize {
|
|
|
60
57
|
width: number;
|
|
61
58
|
[property: string]: unknown;
|
|
62
59
|
}
|
|
63
|
-
|
|
64
60
|
/**
|
|
65
61
|
* Outputs of inference for the Text To Image task
|
|
66
62
|
*/
|
|
@@ -24,11 +24,8 @@
|
|
|
24
24
|
"description": "A higher guidance scale value encourages the model to generate images closely linked to the text prompt, but values too high may cause saturation and other artifacts."
|
|
25
25
|
},
|
|
26
26
|
"negative_prompt": {
|
|
27
|
-
"type": "array",
|
|
28
|
-
"items": {
|
|
29
|
-
"type": "string"
|
|
30
|
-
},
|
|
31
|
-
"description": "One or several prompt to guide what NOT to include in image generation."
|
|
27
|
+
"type": "string",
|
|
28
|
+
"description": "One prompt to guide what NOT to include in image generation."
|
|
32
29
|
},
|
|
33
30
|
"num_inference_steps": {
|
|
34
31
|
"type": "integer",
|
|
@@ -1,9 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Outputs of inference for the Text To Speech task
|
|
3
|
+
*/
|
|
4
|
+
export interface TextToSpeechOutput {
|
|
5
|
+
/**
|
|
6
|
+
* The generated audio
|
|
7
|
+
*/
|
|
8
|
+
audio: Blob;
|
|
9
|
+
/**
|
|
10
|
+
* The sampling rate of the generated audio waveform.
|
|
11
|
+
*/
|
|
12
|
+
sampling_rate?: number;
|
|
13
|
+
[property: string]: unknown;
|
|
14
|
+
}
|
|
1
15
|
/**
|
|
2
16
|
* Inference code generated from the JSON schema spec in ./spec
|
|
3
17
|
*
|
|
4
18
|
* Using src/scripts/inference-codegen
|
|
5
19
|
*/
|
|
6
|
-
|
|
7
20
|
/**
|
|
8
21
|
* Inputs for Text To Speech inference
|
|
9
22
|
*/
|
|
@@ -18,7 +31,6 @@ export interface TextToSpeechInput {
|
|
|
18
31
|
parameters?: TextToSpeechParameters;
|
|
19
32
|
[property: string]: unknown;
|
|
20
33
|
}
|
|
21
|
-
|
|
22
34
|
/**
|
|
23
35
|
* Additional inference parameters for Text To Speech
|
|
24
36
|
*/
|
|
@@ -29,7 +41,6 @@ export interface TextToSpeechParameters {
|
|
|
29
41
|
generation_parameters?: GenerationParameters;
|
|
30
42
|
[property: string]: unknown;
|
|
31
43
|
}
|
|
32
|
-
|
|
33
44
|
/**
|
|
34
45
|
* Parametrization of the text generation process
|
|
35
46
|
*/
|
|
@@ -116,26 +127,7 @@ export interface GenerationParameters {
|
|
|
116
127
|
use_cache?: boolean;
|
|
117
128
|
[property: string]: unknown;
|
|
118
129
|
}
|
|
119
|
-
|
|
120
130
|
/**
|
|
121
131
|
* Controls the stopping condition for beam-based methods.
|
|
122
132
|
*/
|
|
123
133
|
export type EarlyStoppingUnion = boolean | "never";
|
|
124
|
-
|
|
125
|
-
/**
|
|
126
|
-
* Outputs for Text to Speech inference
|
|
127
|
-
*
|
|
128
|
-
* Outputs of inference for the Text To Audio task
|
|
129
|
-
*/
|
|
130
|
-
export interface TextToSpeechOutput {
|
|
131
|
-
/**
|
|
132
|
-
* The generated audio waveform.
|
|
133
|
-
*/
|
|
134
|
-
audio: unknown;
|
|
135
|
-
samplingRate: unknown;
|
|
136
|
-
/**
|
|
137
|
-
* The sampling rate of the generated audio waveform.
|
|
138
|
-
*/
|
|
139
|
-
sampling_rate?: number;
|
|
140
|
-
[property: string]: unknown;
|
|
141
|
-
}
|
|
@@ -1,7 +1,18 @@
|
|
|
1
1
|
{
|
|
2
|
-
"$ref": "/inference/schemas/text-to-audio/output.json",
|
|
3
2
|
"$id": "/inference/schemas/text-to-speech/output.json",
|
|
4
3
|
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Outputs of inference for the Text To Speech task",
|
|
5
5
|
"title": "TextToSpeechOutput",
|
|
6
|
-
"description": "Outputs for Text to Speech inference"
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"audio": {
|
|
9
|
+
"description": "The generated audio",
|
|
10
|
+
"comment": "type=binary"
|
|
11
|
+
},
|
|
12
|
+
"sampling_rate": {
|
|
13
|
+
"type": "number",
|
|
14
|
+
"description": "The sampling rate of the generated audio waveform."
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
"required": ["audio"]
|
|
7
18
|
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inference code generated from the JSON schema spec in ./spec
|
|
3
|
+
*
|
|
4
|
+
* Using src/scripts/inference-codegen
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* Inputs for Text To Video inference
|
|
8
|
+
*/
|
|
9
|
+
export interface TextToVideoInput {
|
|
10
|
+
/**
|
|
11
|
+
* The input text data (sometimes called "prompt")
|
|
12
|
+
*/
|
|
13
|
+
inputs: string;
|
|
14
|
+
/**
|
|
15
|
+
* Additional inference parameters for Text To Video
|
|
16
|
+
*/
|
|
17
|
+
parameters?: TextToVideoParameters;
|
|
18
|
+
[property: string]: unknown;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Additional inference parameters for Text To Video
|
|
22
|
+
*/
|
|
23
|
+
export interface TextToVideoParameters {
|
|
24
|
+
/**
|
|
25
|
+
* A higher guidance scale value encourages the model to generate images closely linked to
|
|
26
|
+
* the text prompt, but values too high may cause saturation and other artifacts.
|
|
27
|
+
*/
|
|
28
|
+
guidance_scale?: number;
|
|
29
|
+
/**
|
|
30
|
+
* One or several prompt to guide what NOT to include in image generation.
|
|
31
|
+
*/
|
|
32
|
+
negative_prompt?: string[];
|
|
33
|
+
/**
|
|
34
|
+
* The num_frames parameter determines how many video frames are generated.
|
|
35
|
+
*/
|
|
36
|
+
num_frames?: number;
|
|
37
|
+
/**
|
|
38
|
+
* The number of denoising steps. More denoising steps usually lead to a higher quality
|
|
39
|
+
* image at the expense of slower inference.
|
|
40
|
+
*/
|
|
41
|
+
num_inference_steps?: number;
|
|
42
|
+
/**
|
|
43
|
+
* Seed for the random number generator.
|
|
44
|
+
*/
|
|
45
|
+
seed?: number;
|
|
46
|
+
[property: string]: unknown;
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Outputs of inference for the Text To Video task
|
|
50
|
+
*/
|
|
51
|
+
export interface TextToVideoOutput {
|
|
52
|
+
/**
|
|
53
|
+
* The generated video returned as raw bytes in the payload.
|
|
54
|
+
*/
|
|
55
|
+
video: unknown;
|
|
56
|
+
[property: string]: unknown;
|
|
57
|
+
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/text-to-video/input.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Inputs for Text To Video inference",
|
|
5
|
+
"title": "TextToVideoInput",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"inputs": {
|
|
9
|
+
"description": "The input text data (sometimes called \"prompt\")",
|
|
10
|
+
"type": "string"
|
|
11
|
+
},
|
|
12
|
+
"parameters": {
|
|
13
|
+
"description": "Additional inference parameters for Text To Video",
|
|
14
|
+
"$ref": "#/$defs/TextToVideoParameters"
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
"$defs": {
|
|
18
|
+
"TextToVideoParameters": {
|
|
19
|
+
"title": "TextToVideoParameters",
|
|
20
|
+
"type": "object",
|
|
21
|
+
"properties": {
|
|
22
|
+
"num_frames": {
|
|
23
|
+
"type": "number",
|
|
24
|
+
"description": "The num_frames parameter determines how many video frames are generated."
|
|
25
|
+
},
|
|
26
|
+
"guidance_scale": {
|
|
27
|
+
"type": "number",
|
|
28
|
+
"description": "A higher guidance scale value encourages the model to generate images closely linked to the text prompt, but values too high may cause saturation and other artifacts."
|
|
29
|
+
},
|
|
30
|
+
"negative_prompt": {
|
|
31
|
+
"type": "array",
|
|
32
|
+
"items": {
|
|
33
|
+
"type": "string"
|
|
34
|
+
},
|
|
35
|
+
"description": "One or several prompt to guide what NOT to include in image generation."
|
|
36
|
+
},
|
|
37
|
+
"num_inference_steps": {
|
|
38
|
+
"type": "integer",
|
|
39
|
+
"description": "The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
|
|
40
|
+
},
|
|
41
|
+
"seed": {
|
|
42
|
+
"type": "integer",
|
|
43
|
+
"description": "Seed for the random number generator."
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
},
|
|
48
|
+
"required": ["inputs"]
|
|
49
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/text-to-video/output.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Outputs of inference for the Text To Video task",
|
|
5
|
+
"title": "TextToVideoOutput",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"video": {
|
|
9
|
+
"description": "The generated video returned as raw bytes in the payload."
|
|
10
|
+
}
|
|
11
|
+
},
|
|
12
|
+
"required": ["video"]
|
|
13
|
+
}
|