@civitai/client 0.2.0-beta.31 → 0.2.0-beta.32

This diff shows the contents of two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
@@ -1,3 +1,16 @@
+ /**
+  * Represents AI vs real image recognition results.
+  */
+ export type AiRecognitionResult = {
+     /**
+      * Classification label: "ai" or "real".
+      */
+     label: string;
+     /**
+      * Confidence score for the classification (0.0 to 1.0).
+      */
+     confidence: number;
+ };
  /**
   * Base input for AI Toolkit training across all ecosystems
   */
@@ -75,6 +88,78 @@ export type AiToolkitTrainingInput = TrainingInput & {
  } & {
      engine: 'ai-toolkit';
  };
+ /**
+  * Input parameters for ACE Step 1.5 audio generation workflow step
+  */
+ export type AceStepAudioInput = {
+     musicDescription: string;
+     lyrics: string;
+     /**
+      * Random seed for reproducible generation
+      */
+     seed?: number;
+     /**
+      * Duration in seconds (1-190)
+      */
+     duration?: number;
+     /**
+      * Beats per minute (40-200)
+      */
+     bpm?: number;
+     /**
+      * Time signature (e.g., "4" for 4/4 time)
+      */
+     timeSignature?: string;
+     /**
+      * Language code (e.g., "en", "zh", "ja", "ko")
+      */
+     language?: string;
+     /**
+      * Musical key (e.g., "C major", "E minor")
+      */
+     key?: string;
+     /**
+      * Weight for instrumental elements (0.0-1.0)
+      */
+     instrumentalWeight?: number;
+     /**
+      * Weight for vocal elements (0.0-1.0)
+      */
+     vocalWeight?: number;
+     /**
+      * Optional model override (uses default ACE Step 1.5 turbo if not specified)
+      */
+     model?: string | null;
+ };
+ /**
+  * Output from ACE Step 1.5 audio generation workflow step
+  */
+ export type AceStepAudioOutput = {
+     audioBlob: AudioBlob;
+ };
+ /**
+  * Workflow step for generating music using ACE Step 1.5.
+  * Produces full songs from text descriptions and structured lyrics.
+  */
+ export type AceStepAudioStep = WorkflowStep & {
+     $type: 'aceStepAudio';
+ } & {
+     input: AceStepAudioInput;
+     output?: AceStepAudioOutput;
+ } & {
+     $type: 'aceStepAudio';
+ };
+ /**
+  * Workflow step for generating music using ACE Step 1.5.
+  * Produces full songs from text descriptions and structured lyrics.
+  */
+ export type AceStepAudioStepTemplate = WorkflowStepTemplate & {
+     $type: 'aceStepAudio';
+ } & {
+     input: AceStepAudioInput;
+ } & {
+     $type: 'aceStepAudio';
+ };
  export type AgeClassificationInput = {
      /**
       * An optional model to use for age classification. If not provided, the default model will be determined by the worker
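
For orientation, here is a minimal sketch of a valid AceStepAudioInput built only from the fields and documented ranges declared above; the import from the package root is an assumption, and the lyric text is placeholder content:

import type { AceStepAudioInput } from '@civitai/client';

const aceInput: AceStepAudioInput = {
    musicDescription: 'Upbeat synthwave with a driving bassline', // required
    lyrics: '[verse] Neon lights across the bay', // required; placeholder lyrics
    duration: 90, // seconds, within the documented 1-190 range
    bpm: 120, // within the documented 40-200 range
    timeSignature: '4', // 4/4 time
    language: 'en',
    key: 'E minor',
    instrumentalWeight: 0.5, // 0.0-1.0
    vocalWeight: 0.7, // 0.0-1.0
    seed: 42, // fixed seed for reproducible generation
    // model omitted => default ACE Step 1.5 turbo, per the doc comment above
};
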
@@ -86,10 +171,51 @@ export type AgeClassificationInput = {
      mediaUrl: string;
  };
  export type AgeClassificationOutput = {
+     /**
+      * Age classification results per image
+      */
      labels: {
          [key: string]: Array<AgeClassifierLabel>;
      };
+     /**
+      * Whether any minor was detected in the dataset
+      */
      hasMinor: boolean;
+     /**
+      * Total number of age predictions made
+      */
+     numPredictions?: number;
+     /**
+      * Number of flagged predictions (minors)
+      */
+     numFlagged?: number;
+     /**
+      * Face recognition results per image (if available)
+      */
+     faceRecognitionFaces?: {
+         [key: string]: Array<FaceDetectionInfo>;
+     } | null;
+     /**
+      * Total number of faces detected across all images
+      */
+     faceRecognitionNumFaces?: number | null;
+     /**
+      * Estimated number of unique people in the dataset (based on similarity clustering)
+      */
+     faceRecognitionNumUniquePeople?: number | null;
+     /**
+      * Whether face recognition data is available for this result
+      */
+     faceRecognitionAvailable?: boolean;
+ };
+ /**
+  * Represents age classification results for media content.
+  */
+ export type AgeClassificationResult = {
+     /**
+      * Array of detected people with age classifications.
+      */
+     detections: Array<AgeDetection>;
  };
  /**
   * Age classification
@@ -117,6 +243,35 @@ export type AgeClassifierLabel = {
      isMinor: boolean;
      boundingBox: Array<number>;
  };
+ /**
+  * Represents a single age detection in an image.
+  */
+ export type AgeDetection = {
+     /**
+      * The detector type used (e.g., "yolo", "mediapipe", "ensemble").
+      */
+     detectorType: string;
+     boundingBox: BoundingBox;
+     /**
+      * The age label (e.g., "adult", "child", "teen").
+      */
+     ageLabel: string;
+     /**
+      * Confidence score for the age classification (0.0 to 1.0).
+      */
+     confidence: number;
+     /**
+      * Whether the detected person is classified as a minor.
+      */
+     isMinor: boolean;
+     /**
+      * Full probability distribution across all age labels.
+      * Keys are age labels (e.g., "Child 0-12", "Adult 21-44"), values are confidence scores (0.0 to 1.0).
+      */
+     topK?: {
+         [key: string]: number;
+     } | null;
+ };
  export declare const AnimalPoseBboxDetector: {
      readonly YOLOX_L_TORCHSCRIPT_PT: 'yolox_l.torchscript.pt';
      readonly YOLOX_L_ONNX: 'yolox_l.onnx';
@@ -131,6 +286,19 @@ export declare const AnimalPoseEstimator: {
      readonly RTMPOSE_M_AP10K_256_ONNX: 'rtmpose-m_ap10k_256.onnx';
  };
  export type AnimalPoseEstimator = (typeof AnimalPoseEstimator)[keyof typeof AnimalPoseEstimator];
+ /**
+  * Represents anime vs real image recognition results.
+  */
+ export type AnimeRecognitionResult = {
+     /**
+      * Classification label: "anime" or "real".
+      */
+     label: string;
+     /**
+      * Confidence score for the classification (0.0 to 1.0).
+      */
+     confidence: number;
+ };
  export declare const AnylineMergeWith: {
      readonly LINEART_STANDARD: 'lineart_standard';
      readonly LINEART_REALISTIC: 'lineart_realistic';
@@ -141,7 +309,9 @@ export type AnylineMergeWith = (typeof AnylineMergeWith)[keyof typeof AnylineMer
  /**
   * An assistant message representing a prior response.
   */
- export type AssistantMessage = {
+ export type AssistantMessage = ChatCompletionMessage & {
+     role: 'assistant';
+ } & {
      /**
       * The assistant message content (text only).
       */
@@ -154,6 +324,18 @@ export type AssistantMessage = {
       * Optional refusal message if the model refused to respond.
       */
      refusal?: string | null;
+ } & {
+     role: 'assistant';
+ };
+ export type AudioBlob = Blob & {
+     type: 'audio';
+ } & {
+     /**
+      * Duration of the audio in seconds
+      */
+     duration?: number | null;
+ } & {
+     type: 'audio';
  };
  export type BatchOcrSafetyClassificationInput = {
      mediaUrls: Array<string>;
@@ -197,6 +379,65 @@ export type Blob = {
       */
      blockedReason?: string | null;
  };
+ /**
+  * A rectangular region defined by pixel coordinates.
+  */
+ export type BlurRegion = {
+     /**
+      * Left edge X coordinate in pixels.
+      */
+     x1: number;
+     /**
+      * Top edge Y coordinate in pixels.
+      */
+     y1: number;
+     /**
+      * Right edge X coordinate in pixels.
+      */
+     x2: number;
+     /**
+      * Bottom edge Y coordinate in pixels.
+      */
+     y2: number;
+ };
+ /**
+  * Determines how regions are applied to the blur operation.
+  */
+ export declare const BlurRegionMode: {
+     readonly INCLUDE: 'include';
+     readonly EXCLUDE: 'exclude';
+ };
+ /**
+  * Determines how regions are applied to the blur operation.
+  */
+ export type BlurRegionMode = (typeof BlurRegionMode)[keyof typeof BlurRegionMode];
+ /**
+  * Applies a Gaussian blur with optional region-based masking.
+  */
+ export type BlurTransform = ImageTransform & {
+     type: 'blur';
+ } & {
+     /**
+      * The blur intensity (1-100).
+      */
+     blur: number;
+     mode: BlurRegionMode;
+     /**
+      * Rectangular regions that define the blur mask.
+      */
+     regions?: Array<BlurRegion>;
+ } & {
+     type: 'blur';
+ };
+ /**
+  * Represents a bounding box with coordinates.
+  */
+ export type BoundingBox = {
+     x1: number;
+     y1: number;
+     x2: number;
+     y2: number;
+ };
  export declare const BuzzClientAccount: {
      readonly YELLOW: 'yellow';
      readonly BLUE: 'blue';
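
A short sketch of the new blur transform, assuming the ImageTransform base adds no required members beyond the discriminator shown here; the import path and coordinates are illustrative:

import type { BlurTransform } from '@civitai/client';

const blurFaces: BlurTransform = {
    type: 'blur',
    blur: 40, // intensity, documented range 1-100
    mode: 'include', // BlurRegionMode: blur only the listed regions
    regions: [
        { x1: 120, y1: 80, x2: 260, y2: 220 }, // pixel-coordinate rectangle
    ],
};

With mode: 'exclude', the same regions would instead stay sharp while the rest of the image is blurred.
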
@@ -216,12 +457,39 @@ export type ChatCompletionChoice = {
      /**
       * The reason the model stopped generating.
       */
-     finishReason: string;
+     finishReason?: string | null;
      /**
       * Log probability information (if requested).
       */
      logprobs?: unknown;
  };
+ /**
+  * Base type for message content parts.
+  * Supports both camelCase (imageUrl) and snake_case (image_url) type discriminators via ContentPartJsonConverter.
+  */
+ export type ChatCompletionContentPart = {
+     type?: string;
+     /**
+      * The text content.
+      */
+     text?: string | null;
+     imageUrl?: ChatCompletionImageUrl;
+ };
+ /**
+  * Image URL details matching OpenAI API spec.
+  */
+ export type ChatCompletionImageUrl = {
+     /**
+      * The image source (can be a URL, base64 data URI, or raw base64).
+      * After processing, this will contain the blob URL.
+      */
+     url: string;
+     /**
+      * The detail level for processing the image.
+      * "low", "high", or "auto" (default).
+      */
+     detail?: string | null;
+ };
  /**
   * Input for a chat completion step, compatible with OpenAI Chat Completions API.
   */
@@ -277,7 +545,7 @@ export type ChatCompletionInput = {
   * Uses ChatCompletionMessageJsonConverter to handle polymorphism and user message content flexibility.
   */
  export type ChatCompletionMessage = {
-     [key: string]: never;
+     role: string;
  };
  /**
   * Output from a chat completion step.
@@ -527,6 +795,47 @@ export type CursedArrayOfTelemetryCursorAndWorkflow = {
      next: string;
      items: Array<Workflow>;
  };
+ export type CustomTextToSpeechInput = TextToSpeechInput & {
+     engine: 'custom';
+ } & {
+     /**
+      * Reference audio URL for Base voice-cloning mode.
+      * Accepts AIR URNs (existing resources) or HTTP(S) URLs.
+      */
+     refAudioUrl?: string | null;
+     /**
+      * Transcript of the reference audio.
+      * Required for Base mode unless XVectorOnlyMode is true.
+      */
+     refText?: string | null;
+     /**
+      * If true, uses only speaker embedding for Base mode (ref_text not required).
+      */
+     xVectorOnlyMode?: boolean;
+     /**
+      * Built-in speaker name for CustomVoice mode.
+      */
+     speaker?:
+         | 'aiden'
+         | 'dylan'
+         | 'eric'
+         | 'ono_anna'
+         | 'ryan'
+         | 'serena'
+         | 'sohee'
+         | 'uncle_fu'
+         | 'vivian';
+     /**
+      * Optional style instruction for CustomVoice mode.
+      */
+     instruct?: string | null;
+     /**
+      * Optional generation cap for max tokens.
+      */
+     maxNewTokens?: number | null;
+ } & {
+     engine: 'custom';
+ };
  export declare const DensePoseColormap: {
      readonly 'VIRIDIS (_MAGIC_ANIMATE)': 'Viridis (MagicAnimate)';
      readonly 'PARULA (_CIVIT_AI)': 'Parula (CivitAI)';
@@ -627,6 +936,56 @@ export type EpochResult = {
       */
      blobUrl: string;
  };
+ /**
+  * Represents a single face detection with embeddings and landmarks.
+  */
+ export type FaceDetection = {
+     boundingBox: BoundingBox;
+     /**
+      * Facial landmarks (e.g., eyes, nose, mouth positions).
+      * Dictionary keys are landmark names, values are (x, y) coordinates.
+      */
+     landmarks?: {
+         [key: string]: ValueTupleOfDoubleAndDouble;
+     } | null;
+     /**
+      * Face embedding vector for similarity comparison.
+      */
+     embedding?: Array<number> | null;
+ };
+ /**
+  * Information about a detected face in an image
+  */
+ export type FaceDetectionInfo = {
+     /**
+      * Bounding box coordinates [x1, y1, x2, y2]
+      */
+     boundingBox: Array<number>;
+     /**
+      * Facial landmarks (e.g., eyes, nose, mouth positions)
+      */
+     landmarks?: {
+         [key: string]: Array<number>;
+     } | null;
+     /**
+      * Whether this face has an embedding (for similarity comparison)
+      */
+     hasEmbedding?: boolean;
+ };
+ /**
+  * Represents face recognition results for media content.
+  */
+ export type FaceRecognitionResult = {
+     /**
+      * Array of detected faces with embeddings and landmarks.
+      */
+     faces: Array<FaceDetection>;
+     /**
+      * Pairwise cosine similarity matrix between detected faces.
+      * Matrix[i][j] represents similarity between face i and face j.
+      */
+     similarityMatrix?: Array<Array<number>> | null;
+ };
  export declare const FileFormat: {
      readonly UNKNOWN: 'unknown';
      readonly SAFE_TENSOR: 'safeTensor';
@@ -1099,6 +1458,12 @@ export type ImageJobNetworkParams = {
   */
  export type ImageOutputFormat = {
      format: string;
+     /**
+      * When true, metadata such as EXIF data will be stripped from the output image. Defaults to false.
+      * Note that some formats like JPEG may still include minimal metadata even when this is true, due to encoder behavior.
+      * Setting this to true will attempt to remove all metadata, but results may vary by format and encoder implementation.
+      */
+     hideMetadata?: boolean;
  };
  export declare const ImageResouceTrainingModerationStatus: {
      readonly EVALUATING: 'evaluating';
@@ -1237,7 +1602,14 @@ export type ImageUpscalerInput = {
       * Either a URL, a DataURL or a Base64 string
       */
      image: string;
-     scaleFactor?: number;
+     /**
+      * The upscaler model to use (AIR URN format).
+      */
+     model?: string | null;
+     /**
+      * Number of times to repeat the upscale operation (1-3). Each repeat doubles the resolution.
+      */
+     numberOfRepeats?: number;
  };
  export type ImageUpscalerOutput = {
      blob: ImageBlob;
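
The upscaler input drops scaleFactor in favor of a model reference plus a repeat count. A hedged sketch of the new shape; the AIR URN below is a made-up placeholder, not a real resource:

import type { ImageUpscalerInput } from '@civitai/client';

const upscale: ImageUpscalerInput = {
    image: 'https://example.com/low-res.png', // URL, DataURL, or Base64
    model: 'urn:air:other:upscaler:civitai:000000@000000', // hypothetical AIR URN
    numberOfRepeats: 2, // 1-3; each repeat doubles the resolution, so 2 yields 4x
};
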
@@ -1366,6 +1738,51 @@ export declare const KlingModel: {
      readonly V2_5_TURBO: 'v2.5-turbo';
  };
  export type KlingModel = (typeof KlingModel)[keyof typeof KlingModel];
+ export type KlingV3ElementInput = {
+     /**
+      * Either a URL, a DataURL or a Base64 string
+      */
+     frontalImage?: string | null;
+     referenceImages?: Array<string>;
+     videoUrl?: string | null;
+ };
+ export type KlingV3MultiPrompt = {
+     prompt: string;
+     duration?: number;
+ };
+ export declare const KlingV3Operation: {
+     readonly TEXT_TO_VIDEO: 'text-to-video';
+     readonly IMAGE_TO_VIDEO: 'image-to-video';
+     readonly REFERENCE_TO_VIDEO: 'reference-to-video';
+     readonly VIDEO_TO_VIDEO_EDIT: 'video-to-video-edit';
+     readonly VIDEO_TO_VIDEO_REFERENCE: 'video-to-video-reference';
+ };
+ export type KlingV3Operation = (typeof KlingV3Operation)[keyof typeof KlingV3Operation];
+ export type KlingV3VideoGenInput = VideoGenInput & {
+     engine: 'kling-v3';
+ } & {
+     operation?: KlingV3Operation;
+     mode?: KlingMode;
+     duration?: number;
+     aspectRatio?: KlingVideoGenAspectRatio;
+     /**
+      * Either a URL, a DataURL or a Base64 string
+      */
+     sourceImage?: string | null;
+     /**
+      * Either a URL, a DataURL or a Base64 string
+      */
+     endImage?: string | null;
+     videoUrl?: string | null;
+     images?: Array<string>;
+     elements?: Array<KlingV3ElementInput>;
+     generateAudio?: boolean;
+     voiceIds?: Array<string> | null;
+     keepAudio?: boolean;
+     multiPrompt?: Array<KlingV3MultiPrompt> | null;
+ } & {
+     engine: 'kling-v3';
+ };
  export declare const KlingVideoGenAspectRatio: {
      readonly '16:9': '16:9';
      readonly '9:16': '9:16';
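
A sketch of the Kling v3 fields in isolation. Because KlingV3VideoGenInput intersects a VideoGenInput base whose required members are not part of this diff, the example uses Pick to stay within the fields declared above:

import type { KlingV3VideoGenInput } from '@civitai/client';

const klingV3: Pick<
    KlingV3VideoGenInput,
    'engine' | 'operation' | 'duration' | 'sourceImage' | 'generateAudio'
> = {
    engine: 'kling-v3',
    operation: 'image-to-video', // one of the KlingV3Operation values above
    duration: 5, // seconds (assumed unit; not documented in this diff)
    sourceImage: 'https://example.com/frame.png', // URL, DataURL, or Base64
    generateAudio: true,
};
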
@@ -1588,6 +2005,22 @@ export type MediaRatingInput = {
       * The engine to use for media rating. Valid values: "default" (HiveVLM) or "civitai".
       */
      engine?: string;
+     /**
+      * Include age classification analysis in the results (civitai engine only).
+      */
+     includeAgeClassification?: boolean;
+     /**
+      * Include face recognition and similarity analysis in the results (civitai engine only).
+      */
+     includeFaceRecognition?: boolean;
+     /**
+      * Include AI vs real image detection in the results (civitai engine only, GPU-only).
+      */
+     includeAIRecognition?: boolean;
+     /**
+      * Include anime vs real image detection in the results (civitai engine only, GPU-only).
+      */
+     includeAnimeRecognition?: boolean;
  };
  /**
   * Represents the output information returned from the MediaRating workflow step.
@@ -1606,6 +2039,10 @@ export type MediaRatingOutput = {
       * Detected content labels (e.g., "Animal", "Child", etc.).
       */
      labels?: Array<string> | null;
+     ageClassification?: AgeClassificationResult;
+     faceRecognition?: FaceRecognitionResult;
+     aiRecognition?: AiRecognitionResult;
+     animeRecognition?: AnimeRecognitionResult;
  };
  /**
   * MediaRating
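
These four optional output fields line up with the new opt-in flags on MediaRatingInput (includeAgeClassification and friends, civitai engine only). A small sketch of consuming them defensively, since each is only present when requested:

import type { MediaRatingOutput } from '@civitai/client';

function summarizeRating(output: MediaRatingOutput): string[] {
    const notes: string[] = [];
    if (output.aiRecognition) {
        // label is "ai" or "real" per the type's doc comment
        notes.push(`ai-check: ${output.aiRecognition.label} (${output.aiRecognition.confidence.toFixed(2)})`);
    }
    if (output.ageClassification) {
        const minors = output.ageClassification.detections.filter((d) => d.isMinor);
        notes.push(`age: ${minors.length} minor detection(s)`);
    }
    if (output.faceRecognition) {
        notes.push(`faces: ${output.faceRecognition.faces.length} detected`);
    }
    return notes;
}
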
@@ -2412,6 +2849,8 @@ export declare const SdCppSampleMethod: {
      readonly DDIM_TRAILING: 'ddim_trailing';
      readonly EULER_A: 'euler_a';
      readonly LCM: 'lcm';
+     readonly RES_MULTISTEP: 'res_multistep';
+     readonly RES_2S: 'res_2s';
  };
  export type SdCppSampleMethod = (typeof SdCppSampleMethod)[keyof typeof SdCppSampleMethod];
  export declare const SdCppSchedule: {
@@ -2420,6 +2859,7 @@ export declare const SdCppSchedule: {
      readonly KARRAS: 'karras';
      readonly EXPONENTIAL: 'exponential';
      readonly AYS: 'ays';
+     readonly BONG_TANGENT: 'bong_tangent';
  };
  export type SdCppSchedule = (typeof SdCppSchedule)[keyof typeof SdCppSchedule];
  /**
@@ -2500,6 +2940,23 @@ export type SoraVideoGenInput = VideoGenInput & {
  } & {
      engine: 'sora';
  };
+ /**
+  * A system message that sets the behavior of the assistant.
+  */
+ export type SystemMessage = ChatCompletionMessage & {
+     role: 'system';
+ } & {
+     /**
+      * The system message content (text only).
+      */
+     content: string;
+     /**
+      * Optional name for the participant.
+      */
+     name?: string | null;
+ } & {
+     role: 'system';
+ };
  /**
   * Input for a text to image step.
   */
@@ -2607,6 +3064,52 @@ export type TextToImageStepTemplate = WorkflowStepTemplate & {
  } & {
      $type: 'textToImage';
  };
+ export type TextToSpeechInput = {
+     engine: string | null;
+     /**
+      * The text to synthesize into speech.
+      */
+     text: string;
+     /**
+      * Target language (e.g., "English", "Chinese"). Defaults to "Auto".
+      */
+     language?: string | null;
+ };
+ /**
+  * Output from text-to-speech workflow step.
+  */
+ export type TextToSpeechOutput = {
+     audioBlob: AudioBlob;
+     /**
+      * Whether the TTS used "base" (voice cloning) or "custom_voice" (built-in speaker) mode.
+      */
+     modelType?: string | null;
+     /**
+      * The speaker name used (set for CustomVoice mode, null for Base mode).
+      */
+     speaker?: string | null;
+ };
+ /**
+  * Text-to-Speech
+  */
+ export type TextToSpeechStep = WorkflowStep & {
+     $type: 'textToSpeech';
+ } & {
+     input: TextToSpeechInput;
+     output?: TextToSpeechOutput;
+ } & {
+     $type: 'textToSpeech';
+ };
+ /**
+  * Text-to-Speech
+  */
+ export type TextToSpeechStepTemplate = WorkflowStepTemplate & {
+     $type: 'textToSpeech';
+ } & {
+     input: TextToSpeechInput;
+ } & {
+     $type: 'textToSpeech';
+ };
  /**
   * Represents training data in various formats
   */
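
Putting TextToSpeechInput together with the CustomTextToSpeechInput variant from earlier in this diff, a hedged sketch of the two modes described in the doc comments (the reference URL is a placeholder):

import type { CustomTextToSpeechInput } from '@civitai/client';

// CustomVoice mode: pick a built-in speaker, optionally with a style instruction.
const builtIn: CustomTextToSpeechInput = {
    engine: 'custom',
    text: 'Welcome to the show.',
    speaker: 'serena',
    instruct: 'Warm and conversational',
};

// Base (voice-cloning) mode: refText is required unless xVectorOnlyMode is true.
const cloned: CustomTextToSpeechInput = {
    engine: 'custom',
    text: 'Welcome back.',
    refAudioUrl: 'https://example.com/reference.wav', // placeholder URL
    refText: 'Transcript of the reference audio.',
};
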
@@ -2772,6 +3275,65 @@ export type TranscodeStepTemplate = WorkflowStepTemplate & {
  } & {
      $type: 'transcode';
  };
+ export type TranscriptionInput = {
+     mediaUrl: string;
+     /**
+      * Optional language hint (e.g., "en", "zh") to guide transcription.
+      */
+     language?: string | null;
+     /**
+      * Optional context prompt to improve transcription accuracy.
+      */
+     context?: string | null;
+     /**
+      * Whether to return word-level timestamps. Defaults to true.
+      */
+     returnTimeStamps?: boolean;
+ };
+ export type TranscriptionOutput = {
+     /**
+      * The full transcribed text.
+      */
+     text: string;
+     /**
+      * The detected language of the audio.
+      */
+     language: string;
+     /**
+      * Word-level timestamps (if requested).
+      */
+     timeStamps?: Array<TranscriptionTimeStamp>;
+     /**
+      * Total processing time in seconds.
+      */
+     elapsedSeconds?: number;
+ };
+ /**
+  * Transcription
+  */
+ export type TranscriptionStep = WorkflowStep & {
+     $type: 'transcription';
+ } & {
+     input: TranscriptionInput;
+     output?: TranscriptionOutput;
+ } & {
+     $type: 'transcription';
+ };
+ /**
+  * Transcription
+  */
+ export type TranscriptionStepTemplate = WorkflowStepTemplate & {
+     $type: 'transcription';
+ } & {
+     input: TranscriptionInput;
+ } & {
+     $type: 'transcription';
+ };
+ export type TranscriptionTimeStamp = {
+     text: string;
+     startTime: number;
+     endTime: number;
+ };
  export type TryOnUInput = {
      subjectUrl: string;
      garmentUrl: string;
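
A minimal sketch pairing TranscriptionInput with a consumer of the word-level timestamps; the media URL is a placeholder:

import type { TranscriptionInput, TranscriptionOutput } from '@civitai/client';

const request: TranscriptionInput = {
    mediaUrl: 'https://example.com/interview.mp3', // placeholder
    language: 'en', // optional hint
    returnTimeStamps: true, // already the default, per the doc comment
};

function toCaptions(output: TranscriptionOutput): string[] {
    // timeStamps is only present when word-level timestamps were requested
    return (output.timeStamps ?? []).map(
        (t) => `[${t.startTime.toFixed(2)}s-${t.endTime.toFixed(2)}s] ${t.text}`,
    );
}
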
@@ -2824,6 +3386,24 @@ export type UpdateWorkflowStepRequest = {
          [key: string]: unknown;
      };
  };
+ /**
+  * A user message that can contain text and/or images.
+  */
+ export type UserMessage = ChatCompletionMessage & {
+     role: 'user';
+ } & {
+     /**
+      * The message content - can be a simple string or array of content parts.
+      * When deserialized from a string, it will be converted to a single TextContentPart.
+      */
+     content: Array<ChatCompletionContentPart>;
+     /**
+      * Optional name for the participant.
+      */
+     name?: string | null;
+ } & {
+     role: 'user';
+ };
  export type ValidationProblemDetails = {
      type?: string | null;
      title?: string | null;
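
With ChatCompletionMessage now carrying a role discriminator instead of an empty index signature, the role-specific message types compose naturally. A sketch using only the shapes in this diff:

import type { SystemMessage, UserMessage } from '@civitai/client';

const system: SystemMessage = {
    role: 'system',
    content: 'You are a concise assistant.',
};

const user: UserMessage = {
    role: 'user',
    content: [
        { type: 'text', text: 'What is in this image?' },
        {
            type: 'imageUrl', // snake_case 'image_url' is also accepted, per the doc comment
            imageUrl: { url: 'https://example.com/cat.png', detail: 'auto' },
        },
    ],
};
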
@@ -2845,6 +3425,9 @@ export type ValidationProblemDetails = {
      }
      | undefined;
  };
+ export type ValueTupleOfDoubleAndDouble = {
+     [key: string]: never;
+ };
  export type ValueTupleOfStringAndInt32 = {
      [key: string]: never;
  };
@@ -2865,7 +3448,7 @@ export type Veo3VideoGenInput = VideoGenInput & {
      negativePrompt?: string | null;
      enablePromptEnhancer?: boolean;
      aspectRatio?: Veo3AspectRatio;
-     duration?: number;
+     duration?: 4 | 6 | 8;
      generateAudio?: boolean;
      seed?: number | null;
      fastMode?: boolean;
@@ -3937,6 +4520,9 @@ export type ZImageBaseImageGenInput = ZImageImageGenInput & {
      cfgScale?: number;
      seed?: number | null;
      quantity?: number;
+     loras?: {
+         [key: string]: number;
+     };
  } & {
      model: 'base';
  };
@@ -3967,6 +4553,9 @@ export type ZImageTurboImageGenInput = ZImageImageGenInput & {
      cfgScale?: number;
      seed?: number | null;
      quantity?: number;
+     loras?: {
+         [key: string]: number;
+     };
  } & {
      model: 'turbo';
  };
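
Both ZImage variants gain the same loras map, keyed by resource identifier with a numeric strength. A sketch; the URN key is a made-up placeholder, not a real resource:

import type { ZImageBaseImageGenInput } from '@civitai/client';

const loras: ZImageBaseImageGenInput['loras'] = {
    'urn:air:zimage:lora:civitai:000000@000000': 0.8, // hypothetical AIR URN => weight
};
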
@@ -4173,6 +4762,35 @@ export type GetBlockedContentErrors = {
      403: ProblemDetails;
  };
  export type GetBlockedContentError = GetBlockedContentErrors[keyof GetBlockedContentErrors];
+ export type InvokeAceStepAudioStepTemplateData = {
+     body?: AceStepAudioInput;
+     path?: never;
+     query?: {
+         experimental?: boolean;
+         allowMatureContent?: boolean;
+     };
+     url: '/v2/consumer/recipes/aceStepAudio';
+ };
+ export type InvokeAceStepAudioStepTemplateErrors = {
+     /**
+      * Bad Request
+      */
+     400: ProblemDetails;
+     /**
+      * Unauthorized
+      */
+     401: ProblemDetails;
+ };
+ export type InvokeAceStepAudioStepTemplateError =
+     InvokeAceStepAudioStepTemplateErrors[keyof InvokeAceStepAudioStepTemplateErrors];
+ export type InvokeAceStepAudioStepTemplateResponses = {
+     /**
+      * OK
+      */
+     200: AceStepAudioOutput;
+ };
+ export type InvokeAceStepAudioStepTemplateResponse =
+     InvokeAceStepAudioStepTemplateResponses[keyof InvokeAceStepAudioStepTemplateResponses];
  export type InvokeAgeClassificationStepTemplateData = {
      body?: AgeClassificationInput;
      path?: never;
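
Each new recipe follows the same generated Data/Errors/Responses triple as the existing endpoints. A hedged sketch of what a raw call could look like if you bypass the generated client; the base URL, POST method, and bearer auth are assumptions, since the transport is not part of this diff:

import type {
    InvokeAceStepAudioStepTemplateData,
    InvokeAceStepAudioStepTemplateResponse,
} from '@civitai/client';

// Hypothetical direct call; the generated client normally performs this for you.
async function invokeAceStepAudio(
    baseUrl: string, // e.g. the orchestration API origin (assumption)
    token: string,
    data: InvokeAceStepAudioStepTemplateData,
): Promise<InvokeAceStepAudioStepTemplateResponse> {
    const query = new URLSearchParams();
    if (data.query?.experimental !== undefined) {
        query.set('experimental', String(data.query.experimental));
    }
    const res = await fetch(`${baseUrl}${data.url}?${query}`, {
        method: 'POST', // assumed; the generated client defines the actual verb
        headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${token}` },
        body: JSON.stringify(data.body),
    });
    if (!res.ok) throw new Error(`aceStepAudio request failed: ${res.status}`);
    return (await res.json()) as InvokeAceStepAudioStepTemplateResponse;
}
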
@@ -4550,6 +5168,35 @@ export type InvokeTextToImageStepTemplateResponses = {
  };
  export type InvokeTextToImageStepTemplateResponse =
      InvokeTextToImageStepTemplateResponses[keyof InvokeTextToImageStepTemplateResponses];
+ export type InvokeTextToSpeechStepTemplateData = {
+     body?: TextToSpeechInput;
+     path?: never;
+     query?: {
+         experimental?: boolean;
+         allowMatureContent?: boolean;
+     };
+     url: '/v2/consumer/recipes/textToSpeech';
+ };
+ export type InvokeTextToSpeechStepTemplateErrors = {
+     /**
+      * Bad Request
+      */
+     400: ProblemDetails;
+     /**
+      * Unauthorized
+      */
+     401: ProblemDetails;
+ };
+ export type InvokeTextToSpeechStepTemplateError =
+     InvokeTextToSpeechStepTemplateErrors[keyof InvokeTextToSpeechStepTemplateErrors];
+ export type InvokeTextToSpeechStepTemplateResponses = {
+     /**
+      * OK
+      */
+     200: TextToSpeechOutput;
+ };
+ export type InvokeTextToSpeechStepTemplateResponse =
+     InvokeTextToSpeechStepTemplateResponses[keyof InvokeTextToSpeechStepTemplateResponses];
  export type InvokeTrainingStepTemplateData = {
      body?: TrainingInput;
      path?: never;
@@ -4579,6 +5226,35 @@ export type InvokeTrainingStepTemplateResponses = {
  };
  export type InvokeTrainingStepTemplateResponse =
      InvokeTrainingStepTemplateResponses[keyof InvokeTrainingStepTemplateResponses];
+ export type InvokeTranscriptionStepTemplateData = {
+     body?: TranscriptionInput;
+     path?: never;
+     query?: {
+         experimental?: boolean;
+         allowMatureContent?: boolean;
+     };
+     url: '/v2/consumer/recipes/transcription';
+ };
+ export type InvokeTranscriptionStepTemplateErrors = {
+     /**
+      * Bad Request
+      */
+     400: ProblemDetails;
+     /**
+      * Unauthorized
+      */
+     401: ProblemDetails;
+ };
+ export type InvokeTranscriptionStepTemplateError =
+     InvokeTranscriptionStepTemplateErrors[keyof InvokeTranscriptionStepTemplateErrors];
+ export type InvokeTranscriptionStepTemplateResponses = {
+     /**
+      * OK
+      */
+     200: TranscriptionOutput;
+ };
+ export type InvokeTranscriptionStepTemplateResponse =
+     InvokeTranscriptionStepTemplateResponses[keyof InvokeTranscriptionStepTemplateResponses];
  export type InvokeVideoEnhancementStepTemplateData = {
      body?: VideoEnhancementInput;
      path?: never;