@huggingface/tasks 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/index.cjs +3144 -3085
- package/dist/index.d.ts +441 -74
- package/dist/index.js +3143 -3084
- package/package.json +1 -1
- package/src/index.ts +2 -5
- package/src/library-to-tasks.ts +1 -1
- package/src/model-libraries-downloads.ts +20 -0
- package/src/{library-ui-elements.ts → model-libraries-snippets.ts} +46 -292
- package/src/model-libraries.ts +375 -44
- package/src/pipelines.ts +14 -8
- package/src/tasks/audio-classification/inference.ts +4 -4
- package/src/tasks/audio-classification/spec/input.json +4 -4
- package/src/tasks/audio-classification/spec/output.json +1 -12
- package/src/tasks/automatic-speech-recognition/inference.ts +35 -30
- package/src/tasks/automatic-speech-recognition/spec/input.json +3 -3
- package/src/tasks/automatic-speech-recognition/spec/output.json +30 -28
- package/src/tasks/common-definitions.json +25 -17
- package/src/tasks/depth-estimation/inference.ts +10 -10
- package/src/tasks/depth-estimation/spec/input.json +3 -8
- package/src/tasks/depth-estimation/spec/output.json +9 -3
- package/src/tasks/document-question-answering/inference.ts +16 -8
- package/src/tasks/document-question-answering/spec/input.json +9 -9
- package/src/tasks/document-question-answering/spec/output.json +2 -2
- package/src/tasks/feature-extraction/inference.ts +1 -1
- package/src/tasks/feature-extraction/spec/input.json +2 -2
- package/src/tasks/fill-mask/inference.ts +4 -3
- package/src/tasks/fill-mask/spec/input.json +3 -3
- package/src/tasks/fill-mask/spec/output.json +1 -1
- package/src/tasks/image-classification/inference.ts +3 -3
- package/src/tasks/image-classification/spec/input.json +4 -4
- package/src/tasks/image-segmentation/inference.ts +3 -3
- package/src/tasks/image-segmentation/spec/input.json +4 -4
- package/src/tasks/image-to-image/inference.ts +5 -5
- package/src/tasks/image-to-image/spec/input.json +9 -7
- package/src/tasks/image-to-text/inference.ts +25 -20
- package/src/tasks/image-to-text/spec/input.json +3 -3
- package/src/tasks/image-to-text/spec/output.json +8 -11
- package/src/tasks/index.ts +2 -0
- package/src/tasks/object-detection/inference.ts +1 -1
- package/src/tasks/object-detection/spec/input.json +2 -2
- package/src/tasks/placeholder/spec/input.json +4 -4
- package/src/tasks/placeholder/spec/output.json +1 -1
- package/src/tasks/question-answering/inference.ts +8 -8
- package/src/tasks/question-answering/spec/input.json +9 -9
- package/src/tasks/sentence-similarity/inference.ts +1 -1
- package/src/tasks/sentence-similarity/spec/input.json +2 -2
- package/src/tasks/summarization/inference.ts +5 -4
- package/src/tasks/table-question-answering/inference.ts +1 -1
- package/src/tasks/table-question-answering/spec/input.json +8 -3
- package/src/tasks/text-classification/inference.ts +3 -3
- package/src/tasks/text-classification/spec/input.json +4 -4
- package/src/tasks/text-generation/inference.ts +123 -14
- package/src/tasks/text-generation/spec/input.json +28 -12
- package/src/tasks/text-generation/spec/output.json +112 -9
- package/src/tasks/text-to-audio/inference.ts +24 -19
- package/src/tasks/text-to-audio/spec/input.json +2 -2
- package/src/tasks/text-to-audio/spec/output.json +10 -13
- package/src/tasks/text-to-image/inference.ts +6 -8
- package/src/tasks/text-to-image/spec/input.json +9 -7
- package/src/tasks/text-to-image/spec/output.json +7 -9
- package/src/tasks/text-to-speech/inference.ts +18 -17
- package/src/tasks/text2text-generation/inference.ts +10 -8
- package/src/tasks/text2text-generation/spec/input.json +4 -4
- package/src/tasks/text2text-generation/spec/output.json +8 -11
- package/src/tasks/token-classification/inference.ts +4 -4
- package/src/tasks/token-classification/spec/input.json +4 -4
- package/src/tasks/token-classification/spec/output.json +1 -1
- package/src/tasks/translation/inference.ts +5 -4
- package/src/tasks/video-classification/inference.ts +5 -5
- package/src/tasks/video-classification/spec/input.json +6 -6
- package/src/tasks/visual-question-answering/inference.ts +2 -2
- package/src/tasks/visual-question-answering/spec/input.json +3 -3
- package/src/tasks/zero-shot-classification/inference.ts +3 -3
- package/src/tasks/zero-shot-classification/spec/input.json +4 -4
- package/src/tasks/zero-shot-image-classification/inference.ts +2 -2
- package/src/tasks/zero-shot-image-classification/spec/input.json +3 -3
- package/src/tasks/zero-shot-object-detection/inference.ts +1 -1
- package/src/tasks/zero-shot-object-detection/spec/input.json +2 -2
package/src/tasks/text-to-audio/spec/output.json

@@ -3,18 +3,15 @@
   "$schema": "http://json-schema.org/draft-06/schema#",
   "description": "Outputs of inference for the Text To Audio task",
   "title": "TextToAudioOutput",
-  "type": "
-  "
-  "
-
-  "audio": {
-  "description": "The generated audio waveform."
-  },
-  "samplingRate": {
-  "type": "number",
-  "description": "The sampling rate of the generated audio waveform."
-  }
+  "type": "object",
+  "properties": {
+    "audio": {
+      "description": "The generated audio waveform."
     },
-  "
-
+    "sampling_rate": {
+      "type": "number",
+      "description": "The sampling rate of the generated audio waveform."
+    }
+  },
+  "required": ["audio", "samplingRate"]
 }
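The output property is renamed to snake_case `sampling_rate`. A minimal sketch of a conforming value; the interface is restated from the schema above rather than imported, since this diff view does not show the corresponding inference.ts for the task:

```ts
// Restated from the schema above (not imported from the package).
// The schema leaves `audio` untyped, so it stays `unknown` here.
interface TextToAudioOutput {
	audio: unknown;
	sampling_rate?: number; // renamed from samplingRate in this release
	[property: string]: unknown;
}

const example: TextToAudioOutput = {
	audio: new Uint8Array(), // e.g. raw waveform bytes
	sampling_rate: 16_000,
};
```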
package/src/tasks/text-to-image/inference.ts

@@ -11,7 +11,7 @@ export interface TextToImageInput {
 	/**
 	 * The input text data (sometimes called "prompt"
 	 */
-
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */
@@ -29,16 +29,16 @@ export interface TextToImageParameters {
 	 * For diffusion models. A higher guidance scale value encourages the model to generate
 	 * images closely linked to the text prompt at the expense of lower image quality.
 	 */
-
+	guidance_scale?: number;
 	/**
 	 * One or several prompt to guide what NOT to include in image generation.
 	 */
-
+	negative_prompt?: string[];
 	/**
 	 * For diffusion models. The number of denoising steps. More denoising steps usually lead to
 	 * a higher quality image at the expense of slower inference.
 	 */
-
+	num_inference_steps?: number;
 	/**
 	 * For diffusion models. Override the scheduler with a compatible one
 	 */
@@ -46,7 +46,7 @@ export interface TextToImageParameters {
 	/**
 	 * The size in pixel of the output image
 	 */
-
+	target_size?: TargetSize;
 	[property: string]: unknown;
 }
 
@@ -62,9 +62,7 @@ export interface TargetSize {
 	/**
 	 * Outputs of inference for the Text To Image task
 	 */
-export
-
-export interface TextToImageOutputObject {
+export interface TextToImageOutput {
 	/**
 	 * The generated image
 	 */
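The practical effect is that request payloads are now typed with snake_case keys, and the output type collapses to a single `TextToImageOutput` interface. A minimal sketch, assuming the task types are re-exported from the package root (the file list shows src/tasks/index.ts gaining exports):

```ts
// Import path is an assumption; adjust to wherever your build resolves the types.
import type { TextToImageInput } from "@huggingface/tasks";

const request: TextToImageInput = {
	inputs: "a watercolor lighthouse at dusk", // the prompt
	parameters: {
		guidance_scale: 7.5, // camelCase before this release
		negative_prompt: ["blurry", "low resolution"],
		num_inference_steps: 30,
	},
};
```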
package/src/tasks/text-to-image/spec/input.json

@@ -5,7 +5,7 @@
   "title": "TextToImageInput",
   "type": "object",
   "properties": {
-    "
+    "inputs": {
       "description": "The input text data (sometimes called \"prompt\"",
       "type": "string"
     },
@@ -20,20 +20,22 @@
       "description": "Additional inference parameters for Text To Image",
       "type": "object",
       "properties": {
-        "
+        "guidance_scale": {
           "type": "number",
           "description": "For diffusion models. A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality."
         },
-        "
+        "negative_prompt": {
           "type": "array",
-          "items": {
+          "items": {
+            "type": "string"
+          },
           "description": "One or several prompt to guide what NOT to include in image generation."
         },
-        "
+        "num_inference_steps": {
           "type": "integer",
           "description": "For diffusion models. The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."
         },
-        "
+        "target_size": {
           "type": "object",
           "description": "The size in pixel of the output image",
           "properties": {
@@ -53,5 +55,5 @@
       }
     }
   },
-  "required": ["
+  "required": ["inputs"]
 }
package/src/tasks/text-to-image/spec/output.json

@@ -3,13 +3,11 @@
   "$schema": "http://json-schema.org/draft-06/schema#",
   "description": "Outputs of inference for the Text To Image task",
   "title": "TextToImageOutput",
-  "type": "
-  "
-  "
-  "
-
-
-
-  "required": ["image"]
-  }
+  "type": "object",
+  "properties": {
+    "image": {
+      "description": "The generated image"
+    }
+  },
+  "required": ["image"]
 }
package/src/tasks/text-to-speech/inference.ts

@@ -13,7 +13,7 @@ export interface TextToSpeechInput {
 	/**
 	 * The input text data
 	 */
-
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */
@@ -43,18 +43,18 @@ export interface GenerationParameters {
 	/**
 	 * Whether to use sampling instead of greedy decoding when generating new tokens.
 	 */
-
+	do_sample?: boolean;
 	/**
 	 * Controls the stopping condition for beam-based methods.
 	 */
-
+	early_stopping?: EarlyStoppingUnion;
 	/**
 	 * If set to float strictly between 0 and 1, only tokens with a conditional probability
 	 * greater than epsilon_cutoff will be sampled. In the paper, suggested values range from
 	 * 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language
 	 * Model Desmoothing](https://hf.co/papers/2210.15191) for more details.
 	 */
-
+	epsilon_cutoff?: number;
 	/**
 	 * Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
 	 * float strictly between 0 and 1, a token is only considered if it is greater than either
@@ -64,37 +64,37 @@ export interface GenerationParameters {
 	 * See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
 	 * for more details.
 	 */
-
+	eta_cutoff?: number;
 	/**
 	 * The maximum length (in tokens) of the generated text, including the input.
 	 */
-
+	max_length?: number;
 	/**
 	 * The maximum number of tokens to generate. Takes precedence over maxLength.
 	 */
-
+	max_new_tokens?: number;
 	/**
 	 * The minimum length (in tokens) of the generated text, including the input.
 	 */
-
+	min_length?: number;
 	/**
 	 * The minimum number of tokens to generate. Takes precedence over maxLength.
 	 */
-
+	min_new_tokens?: number;
 	/**
 	 * Number of groups to divide num_beams into in order to ensure diversity among different
 	 * groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
 	 */
-
+	num_beam_groups?: number;
 	/**
 	 * Number of beams to use for beam search.
 	 */
-
+	num_beams?: number;
 	/**
 	 * The value balances the model confidence and the degeneration penalty in contrastive
 	 * search decoding.
 	 */
-
+	penalty_alpha?: number;
 	/**
 	 * The value used to modulate the next token probabilities.
 	 */
@@ -102,12 +102,12 @@ export interface GenerationParameters {
 	/**
 	 * The number of highest probability vocabulary tokens to keep for top-k-filtering.
 	 */
-
+	top_k?: number;
 	/**
 	 * If set to float < 1, only the smallest set of most probable tokens with probabilities
 	 * that add up to top_p or higher are kept for generation.
 	 */
-
+	top_p?: number;
 	/**
 	 * Local typicality measures how similar the conditional probability of predicting a target
 	 * token next is to the expected conditional probability of predicting a random token next,
@@ -115,11 +115,11 @@ export interface GenerationParameters {
 	 * most locally typical tokens with probabilities that add up to typical_p or higher are
 	 * kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
 	 */
-
+	typical_p?: number;
 	/**
 	 * Whether the model should use the past last key/values attentions to speed up decoding
 	 */
-
+	use_cache?: boolean;
 	[property: string]: unknown;
 }
 
@@ -138,9 +138,10 @@ export interface TextToSpeechOutput {
 	 * The generated audio waveform.
 	 */
 	audio: unknown;
+	samplingRate: unknown;
 	/**
 	 * The sampling rate of the generated audio waveform.
 	 */
-
+	sampling_rate?: number;
 	[property: string]: unknown;
 }
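Every field of the shared `GenerationParameters` moves to snake_case in one sweep. A sketch of a parameter object under the new names (same import-path assumption as above; `EarlyStoppingUnion` is referenced but not defined in this excerpt, so it is omitted):

```ts
// Import path is an assumption; the type itself is shown in the diff above.
import type { GenerationParameters } from "@huggingface/tasks";

const generation: GenerationParameters = {
	do_sample: true,
	max_new_tokens: 256, // takes precedence over max_length
	num_beams: 4,
	top_k: 50,
	top_p: 0.95,
};
```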
package/src/tasks/text2text-generation/inference.ts

@@ -3,6 +3,7 @@
  *
  * Using src/scripts/inference-codegen
  */
+
 /**
  * Inputs for Text2text Generation inference
  */
@@ -10,13 +11,14 @@ export interface Text2TextGenerationInput {
 	/**
 	 * The input text data
 	 */
-
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */
 	parameters?: Text2TextGenerationParameters;
 	[property: string]: unknown;
 }
+
 /**
  * Additional inference parameters
  *
@@ -26,28 +28,28 @@ export interface Text2TextGenerationParameters {
 	/**
 	 * Whether to clean up the potential extra spaces in the text output.
 	 */
-
+	clean_up_tokenization_spaces?: boolean;
 	/**
 	 * Additional parametrization of the text generation algorithm
 	 */
-
-		[key: string]: unknown;
-	};
+	generate_parameters?: { [key: string]: unknown };
 	/**
 	 * The truncation strategy to use
 	 */
 	truncation?: Text2TextGenerationTruncationStrategy;
 	[property: string]: unknown;
 }
+
 export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
-
+
 /**
  * Outputs of inference for the Text2text Generation task
 */
-export interface
+export interface Text2TextGenerationOutput {
+	generatedText: unknown;
 	/**
	 * The generated text.
	 */
-
+	generated_text?: string;
 	[property: string]: unknown;
 }
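A sketch of a request against the regenerated input type (same import-path assumption as above):

```ts
import type { Text2TextGenerationInput } from "@huggingface/tasks";

const request: Text2TextGenerationInput = {
	inputs: "translate English to German: The house is wonderful.",
	parameters: {
		clean_up_tokenization_spaces: true,
		truncation: "longest_first", // one of the four strategies in the union above
		generate_parameters: { max_new_tokens: 64 }, // free-form passthrough object
	},
};
```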
package/src/tasks/text2text-generation/spec/input.json

@@ -5,7 +5,7 @@
   "title": "Text2TextGenerationInput",
   "type": "object",
   "properties": {
-    "
+    "inputs": {
       "description": "The input text data",
       "type": "string"
     },
@@ -20,7 +20,7 @@
       "description": "Additional inference parameters for Text2text Generation",
       "type": "object",
       "properties": {
-        "
+        "clean_up_tokenization_spaces": {
           "type": "boolean",
           "description": "Whether to clean up the potential extra spaces in the text output."
         },
@@ -43,7 +43,7 @@
           }
         ]
       },
-        "
+        "generate_parameters": {
          "title": "generateParameters",
          "type": "object",
          "description": "Additional parametrization of the text generation algorithm"
@@ -51,5 +51,5 @@
       }
     }
   },
-  "required": ["
+  "required": ["inputs"]
 }
package/src/tasks/text2text-generation/spec/output.json

@@ -3,15 +3,12 @@
   "$schema": "http://json-schema.org/draft-06/schema#",
   "description": "Outputs of inference for the Text2text Generation task",
   "title": "Text2TextGenerationOutput",
-  "type": "
-  "
-  "
-
-  "
-
-
-
-  },
-  "required": ["generatedText"]
-  }
+  "type": "object",
+  "properties": {
+    "generated_text": {
+      "type": "string",
+      "description": "The generated text."
+    }
+  },
+  "required": ["generatedText"]
 }
package/src/tasks/token-classification/inference.ts

@@ -10,7 +10,7 @@ export interface TokenClassificationInput {
 	/**
 	 * The input text data
 	 */
-
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */
@@ -26,11 +26,11 @@ export interface TokenClassificationParameters {
 	/**
 	 * The strategy used to fuse tokens based on model predictions
 	 */
-
+	aggregation_strategy?: TokenClassificationAggregationStrategy;
 	/**
 	 * A list of labels to ignore
 	 */
-
+	ignore_labels?: string[];
 	/**
 	 * The number of overlapping tokens between chunks when splitting the input text.
 	 */
@@ -64,7 +64,7 @@ export interface TokenClassificationOutputElement {
 	/**
 	 * The predicted label for that group of tokens
 	 */
-
+	entity_group?: string;
 	label: unknown;
 	/**
 	 * The associated score / probability
package/src/tasks/token-classification/spec/input.json

@@ -5,7 +5,7 @@
   "title": "TokenClassificationInput",
   "type": "object",
   "properties": {
-    "
+    "inputs": {
       "description": "The input text data",
       "type": "string"
     },
@@ -20,7 +20,7 @@
       "description": "Additional inference parameters for Token Classification",
       "type": "object",
       "properties": {
-        "
+        "ignore_labels": {
           "type": "array",
           "items": {
             "type": "string"
@@ -31,7 +31,7 @@
           "type": "integer",
           "description": "The number of overlapping tokens between chunks when splitting the input text."
         },
-        "
+        "aggregation_strategy": {
           "title": "TokenClassificationAggregationStrategy",
           "type": "string",
           "description": "The strategy used to fuse tokens based on model predictions",
@@ -61,5 +61,5 @@
       }
     }
   },
-  "required": ["
+  "required": ["inputs"]
 }
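A sketch of a request under the new names. The enum values of `TokenClassificationAggregationStrategy` are outside this excerpt, so the `"simple"` value below is an assumption:

```ts
import type { TokenClassificationInput } from "@huggingface/tasks";

const request: TokenClassificationInput = {
	inputs: "My name is Clara and I live in Berkeley.",
	parameters: {
		aggregation_strategy: "simple", // assumed member of the enum referenced above
		ignore_labels: ["O"], // drop the "outside" label from NER output
	},
};
```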
package/src/tasks/translation/inference.ts

@@ -13,7 +13,7 @@ export interface TranslationInput {
 	/**
 	 * The input text data
 	 */
-
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */
@@ -30,11 +30,11 @@ export interface Text2TextGenerationParameters {
 	/**
 	 * Whether to clean up the potential extra spaces in the text output.
 	 */
-
+	clean_up_tokenization_spaces?: boolean;
 	/**
 	 * Additional parametrization of the text generation algorithm
 	 */
-
+	generate_parameters?: { [key: string]: unknown };
 	/**
 	 * The truncation strategy to use
 	 */
@@ -50,9 +50,10 @@ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest
 * Outputs of inference for the Text2text Generation task
 */
 export interface TranslationOutput {
+	generatedText: unknown;
 	/**
	 * The generated text.
	 */
-
+	generated_text?: string;
 	[property: string]: unknown;
 }
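On the output side, the typed field is now `generated_text`; camelCase `generatedText` survives only as an untyped leftover of the codegen. A small sketch of reading a result:

```ts
import type { TranslationOutput } from "@huggingface/tasks";

function readTranslation(output: TranslationOutput): string | undefined {
	// Prefer the typed snake_case field; `generatedText` is `unknown`.
	return output.generated_text;
}
```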
package/src/tasks/video-classification/inference.ts

@@ -10,7 +10,7 @@ export interface VideoClassificationInput {
 	/**
 	 * The input video data
 	 */
-
+	inputs: unknown;
 	/**
 	 * Additional inference parameters
 	 */
@@ -26,16 +26,16 @@ export interface VideoClassificationParameters {
 	/**
 	 * The sampling rate used to select frames from the video.
 	 */
-
-
+	frame_sampling_rate?: number;
+	function_to_apply?: ClassificationOutputTransform;
 	/**
 	 * The number of sampled frames to consider for classification.
 	 */
-
+	num_frames?: number;
 	/**
 	 * When specified, limits the output to the top K most probable classes.
 	 */
-
+	top_k?: number;
 	[property: string]: unknown;
 }
 /**
package/src/tasks/video-classification/spec/input.json

@@ -5,7 +5,7 @@
   "title": "VideoClassificationInput",
   "type": "object",
   "properties": {
-    "
+    "inputs": {
       "description": "The input video data"
     },
     "parameters": {
@@ -19,24 +19,24 @@
       "description": "Additional inference parameters for Video Classification",
       "type": "object",
       "properties": {
-        "
+        "function_to_apply": {
           "title": "TextClassificationOutputTransform",
           "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
         },
-        "
+        "num_frames": {
           "type": "integer",
           "description": "The number of sampled frames to consider for classification."
         },
-        "
+        "frame_sampling_rate": {
           "type": "integer",
           "description": "The sampling rate used to select frames from the video."
         },
-        "
+        "top_k": {
           "type": "integer",
           "description": "When specified, limits the output to the top K most probable classes."
         }
       }
     }
   },
-  "required": ["
+  "required": ["inputs"]
 }
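A sketch of a request with the renamed parameters; the schema leaves the video payload untyped (`inputs: unknown`), so the placeholder string below stands in for whatever binary encoding a client actually sends:

```ts
import type { VideoClassificationInput } from "@huggingface/tasks";

const request: VideoClassificationInput = {
	inputs: "<binary video payload>", // untyped in the schema; placeholder here
	parameters: {
		num_frames: 16,
		frame_sampling_rate: 2,
		top_k: 5,
	},
};
```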
package/src/tasks/visual-question-answering/inference.ts

@@ -10,7 +10,7 @@ export interface VisualQuestionAnsweringInput {
 	/**
 	 * One (image, question) pair to answer
 	 */
-
+	inputs: VisualQuestionAnsweringInputData;
 	/**
 	 * Additional inference parameters
 	 */
@@ -42,7 +42,7 @@ export interface VisualQuestionAnsweringParameters {
 	 * return less than topk answers if there are not enough options available within the
 	 * context.
 	 */
-
+	top_k?: number;
 	[property: string]: unknown;
 }
 export type VisualQuestionAnsweringOutput = VisualQuestionAnsweringOutputElement[];
package/src/tasks/visual-question-answering/spec/input.json

@@ -5,7 +5,7 @@
   "title": "VisualQuestionAnsweringInput",
   "type": "object",
   "properties": {
-    "
+    "inputs": {
       "description": "One (image, question) pair to answer",
       "type": "object",
       "title": "VisualQuestionAnsweringInputData",
@@ -30,12 +30,12 @@
       "description": "Additional inference parameters for Visual Question Answering",
       "type": "object",
       "properties": {
-        "
+        "top_k": {
           "type": "integer",
           "description": "The number of answers to return (will be chosen by order of likelihood). Note that we return less than topk answers if there are not enough options available within the context."
         }
       }
     }
   },
-  "required": ["
+  "required": ["inputs"]
 }
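A sketch of a request; the property list of `VisualQuestionAnsweringInputData` is truncated in this diff view, so the `image`/`question` field names below are assumptions inferred from the "(image, question) pair" description:

```ts
import type { VisualQuestionAnsweringInput } from "@huggingface/tasks";

const request: VisualQuestionAnsweringInput = {
	inputs: {
		image: "cats.jpg", // field names assumed; not fully visible in this diff
		question: "How many cats are there?",
	},
	parameters: { top_k: 3 },
};
```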
package/src/tasks/zero-shot-classification/inference.ts

@@ -10,7 +10,7 @@ export interface ZeroShotClassificationInput {
 	/**
 	 * The input text data, with candidate labels
 	 */
-
+	inputs: ZeroShotClassificationInputData;
 	/**
 	 * Additional inference parameters
 	 */
@@ -41,13 +41,13 @@ export interface ZeroShotClassificationParameters {
 	 * The sentence used in conjunction with candidateLabels to attempt the text classification
 	 * by replacing the placeholder with the candidate labels.
 	 */
-
+	hypothesis_template?: string;
 	/**
 	 * Whether multiple candidate labels can be true. If false, the scores are normalized such
 	 * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
 	 * considered independent and probabilities are normalized for each candidate.
 	 */
-
+	multi_label?: boolean;
 	[property: string]: unknown;
 }
 export type ZeroShotClassificationOutput = ZeroShotClassificationOutputElement[];
package/src/tasks/zero-shot-classification/spec/input.json

@@ -5,7 +5,7 @@
   "title": "ZeroShotClassificationInput",
   "type": "object",
   "properties": {
-    "
+    "inputs": {
       "description": "The input text data, with candidate labels",
       "type": "object",
       "title": "ZeroShotClassificationInputData",
@@ -35,16 +35,16 @@
       "description": "Additional inference parameters for Zero Shot Classification",
       "type": "object",
       "properties": {
-        "
+        "hypothesis_template": {
           "type": "string",
           "description": "The sentence used in conjunction with candidateLabels to attempt the text classification by replacing the placeholder with the candidate labels."
         },
-        "
+        "multi_label": {
           "type": "boolean",
           "description": "Whether multiple candidate labels can be true. If false, the scores are normalized such that the sum of the label likelihoods for each sequence is 1. If true, the labels are considered independent and probabilities are normalized for each candidate."
         }
       }
     }
   },
-  "required": ["
+  "required": ["inputs"]
 }
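A closing sketch of a zero-shot request; the `ZeroShotClassificationInputData` field names are not fully visible in this diff view, so `text`/`candidateLabels` below are assumptions (the latter inferred from the camelCase mention in the `hypothesis_template` description):

```ts
import type { ZeroShotClassificationInput } from "@huggingface/tasks";

const request: ZeroShotClassificationInput = {
	inputs: {
		text: "I have a problem with my iphone that needs to be resolved asap!",
		candidateLabels: ["urgent", "not urgent", "phone", "tablet"], // assumed field name
	},
	parameters: {
		hypothesis_template: "This example is {}.",
		multi_label: false, // scores across labels sum to 1
	},
};
```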