@xiaozhiclaw/provider-core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/aliyun-oss-file-upload-adapter.d.ts +44 -0
- package/dist/adapters/aliyun-oss-file-upload-adapter.js +96 -0
- package/dist/adapters/gemini-file-upload-adapter.d.ts +26 -0
- package/dist/adapters/gemini-file-upload-adapter.js +92 -0
- package/dist/adapters/hub-oss-file-upload-adapter.d.ts +29 -0
- package/dist/adapters/hub-oss-file-upload-adapter.js +53 -0
- package/dist/adapters/index.d.ts +10 -0
- package/dist/adapters/index.js +10 -0
- package/dist/adapters/openai-file-upload-adapter.d.ts +38 -0
- package/dist/adapters/openai-file-upload-adapter.js +56 -0
- package/dist/adapters/volcengine-file-upload-adapter.d.ts +24 -0
- package/dist/adapters/volcengine-file-upload-adapter.js +45 -0
- package/dist/builtin-providers.d.ts +8 -0
- package/dist/builtin-providers.js +2237 -0
- package/dist/constants.d.ts +1 -0
- package/dist/constants.js +1 -0
- package/dist/credentials.d.ts +1 -0
- package/dist/credentials.js +8 -0
- package/dist/debug-transport.d.ts +12 -0
- package/dist/debug-transport.js +99 -0
- package/dist/errors.d.ts +11 -0
- package/dist/errors.js +12 -0
- package/dist/events.d.ts +48 -0
- package/dist/events.js +1 -0
- package/dist/file-upload-service.d.ts +68 -0
- package/dist/file-upload-service.js +110 -0
- package/dist/gemini-schema-utils.d.ts +17 -0
- package/dist/gemini-schema-utils.js +76 -0
- package/dist/index.d.ts +37 -0
- package/dist/index.js +33 -0
- package/dist/llm-client.d.ts +43 -0
- package/dist/llm-client.js +217 -0
- package/dist/media-client.d.ts +42 -0
- package/dist/media-client.js +174 -0
- package/dist/media-transport.d.ts +176 -0
- package/dist/media-transport.js +16 -0
- package/dist/media.d.ts +2 -0
- package/dist/media.js +1 -0
- package/dist/model-detection.d.ts +22 -0
- package/dist/model-detection.js +28 -0
- package/dist/paths.d.ts +2 -0
- package/dist/paths.js +11 -0
- package/dist/provider-def.d.ts +220 -0
- package/dist/provider-def.js +9 -0
- package/dist/provider-registry.d.ts +51 -0
- package/dist/provider-registry.js +130 -0
- package/dist/provider-tool-api.d.ts +44 -0
- package/dist/provider-tool-api.js +9 -0
- package/dist/provider-variant-resolver.d.ts +35 -0
- package/dist/provider-variant-resolver.js +174 -0
- package/dist/retry.d.ts +37 -0
- package/dist/retry.js +71 -0
- package/dist/transport.d.ts +281 -0
- package/dist/transport.js +27 -0
- package/dist/transports/anthropic-messages.d.ts +65 -0
- package/dist/transports/anthropic-messages.js +1004 -0
- package/dist/transports/gemini-cache-api.d.ts +86 -0
- package/dist/transports/gemini-cache-api.js +141 -0
- package/dist/transports/gemini-file-api.d.ts +90 -0
- package/dist/transports/gemini-file-api.js +164 -0
- package/dist/transports/gemini-generatecontent.d.ts +56 -0
- package/dist/transports/gemini-generatecontent.js +688 -0
- package/dist/transports/gemini-lyria-realtime.d.ts +117 -0
- package/dist/transports/gemini-lyria-realtime.js +295 -0
- package/dist/transports/gemini-media.d.ts +53 -0
- package/dist/transports/gemini-media.js +383 -0
- package/dist/transports/media-resolve.d.ts +50 -0
- package/dist/transports/media-resolve.js +91 -0
- package/dist/transports/minimax-media.d.ts +56 -0
- package/dist/transports/minimax-media.js +433 -0
- package/dist/transports/openai-chat.d.ts +81 -0
- package/dist/transports/openai-chat.js +782 -0
- package/dist/transports/openai-media.d.ts +24 -0
- package/dist/transports/openai-media.js +118 -0
- package/dist/transports/openai-responses.d.ts +63 -0
- package/dist/transports/openai-responses.js +778 -0
- package/dist/transports/qwen-media.d.ts +59 -0
- package/dist/transports/qwen-media.js +411 -0
- package/dist/transports/realtime-transport.d.ts +183 -0
- package/dist/transports/realtime-transport.js +332 -0
- package/dist/transports/volcengine-grounding.d.ts +58 -0
- package/dist/transports/volcengine-grounding.js +69 -0
- package/dist/transports/volcengine-media.d.ts +94 -0
- package/dist/transports/volcengine-media.js +801 -0
- package/dist/transports/volcengine-responses.d.ts +64 -0
- package/dist/transports/volcengine-responses.js +797 -0
- package/dist/transports/zhipu-media.d.ts +82 -0
- package/dist/transports/zhipu-media.js +522 -0
- package/dist/transports/zhipu-tool-api.d.ts +35 -0
- package/dist/transports/zhipu-tool-api.js +126 -0
- package/dist/wire-types.d.ts +51 -0
- package/dist/wire-types.js +1 -0
- package/package.json +33 -0
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Gemini Media Transport — unified media generation for all Gemini media APIs.
|
|
3
|
+
*
|
|
4
|
+
* Supported media types and endpoints:
|
|
5
|
+
* image — POST /models/{model}:generateContent (responseModalities: ["TEXT","IMAGE"])
|
|
6
|
+
* video — POST /models/{model}:predictLongRunning → poll operations → download URI
|
|
7
|
+
* music — POST /models/{model}:generateContent (Lyria 3 — inlineData audio)
|
|
8
|
+
* music_realtime — WebSocket session (Lyria RealTime — streaming PCM → WAV)
|
|
9
|
+
* tts — POST /models/{model}:generateContent (speechConfig — inlineData PCM)
|
|
10
|
+
* embedding — POST /models/{model}:embedContent (float vector)
|
|
11
|
+
*
|
|
12
|
+
* Auth: x-goog-api-key header for all endpoints.
|
|
13
|
+
*/
|
|
14
|
+
import { writeFileSync, mkdirSync } from "node:fs";
|
|
15
|
+
import { join } from "node:path";
|
|
16
|
+
import { randomUUID } from "node:crypto";
|
|
17
|
+
import { getUserCacheDir } from "../paths.js";
|
|
18
|
+
import { generateRealtimeMusic } from "./gemini-lyria-realtime.js";
|
|
19
|
+
const DEFAULT_TIMEOUT_MS = 180_000;
|
|
20
|
+
const VIDEO_POLL_INTERVAL_MS = 10_000;
|
|
21
|
+
const VIDEO_MAX_POLL_MS = 600_000; // 10 min
|
|
22
|
+
export class GeminiMediaTransport {
|
|
23
|
+
supportedTypes = ["image", "video", "music", "music_realtime", "tts", "embedding"];
|
|
24
|
+
apiBase;
|
|
25
|
+
timeoutMs;
|
|
26
|
+
constructor(config) {
|
|
27
|
+
this.apiBase = config.baseUrl
|
|
28
|
+
.replace(/\/openai\/?$/, "")
|
|
29
|
+
.replace(/\/+$/, "");
|
|
30
|
+
this.timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
|
|
31
|
+
}
|
|
32
|
+
async generate(request, apiKey, signal) {
|
|
33
|
+
switch (request.mediaType) {
|
|
34
|
+
case "image": return this.generateImage(request, apiKey, signal);
|
|
35
|
+
case "video": return this.generateVideo(request, apiKey, signal);
|
|
36
|
+
case "music": return this.generateMusic(request, apiKey, signal);
|
|
37
|
+
case "music_realtime": return this.generateMusicRealtime(request, apiKey, signal);
|
|
38
|
+
case "tts": return this.generateTTS(request, apiKey, signal);
|
|
39
|
+
case "embedding": return this.generateEmbedding(request, apiKey, signal);
|
|
40
|
+
default:
|
|
41
|
+
throw new Error(`GeminiMediaTransport: unsupported mediaType "${request.mediaType}"`);
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
// 鈹€鈹€ AsyncMediaTransport task management (video only) 鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€
|
|
45
|
+
async deleteVideoTask(_taskId, _apiKey, _signal) {
|
|
46
|
+
// Gemini operations API does not expose a cancel/delete endpoint for video tasks
|
|
47
|
+
throw new Error("Gemini does not support deleting video generation tasks");
|
|
48
|
+
}
|
|
49
|
+
async listVideoTasks(_apiKey, _options, _signal) {
|
|
50
|
+
// Gemini operations API does not expose a list endpoint
|
|
51
|
+
return { data: [] };
|
|
52
|
+
}
|
|
53
|
+
async getTaskStatus(taskId, apiKey, signal) {
|
|
54
|
+
const url = `${this.apiBase}/${taskId}`;
|
|
55
|
+
const res = await fetch(url, {
|
|
56
|
+
method: "GET",
|
|
57
|
+
headers: { "x-goog-api-key": apiKey },
|
|
58
|
+
signal: signal ?? AbortSignal.timeout(this.timeoutMs),
|
|
59
|
+
});
|
|
60
|
+
if (!res.ok) {
|
|
61
|
+
const text = await res.text().catch(() => "");
|
|
62
|
+
throw new Error(`Gemini operation query error ${res.status}: ${text}`);
|
|
63
|
+
}
|
|
64
|
+
const data = await res.json();
|
|
65
|
+
const done = data.done;
|
|
66
|
+
return {
|
|
67
|
+
status: done ? "completed" : "running",
|
|
68
|
+
task: data,
|
|
69
|
+
};
|
|
70
|
+
}
|
|
71
|
+
// 鈹€鈹€ Image (Nano Banana / Gemini generateContent) 鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€
|
|
72
|
+
async generateImage(request, apiKey, signal) {
|
|
73
|
+
const start = Date.now();
|
|
74
|
+
const url = `${this.apiBase}/models/${request.model}:generateContent`;
|
|
75
|
+
const generationConfig = {
|
|
76
|
+
responseModalities: ["TEXT", "IMAGE"],
|
|
77
|
+
};
|
|
78
|
+
if (request.aspectRatio)
|
|
79
|
+
generationConfig.aspectRatio = request.aspectRatio;
|
|
80
|
+
const body = {
|
|
81
|
+
contents: [{
|
|
82
|
+
parts: [{ text: request.prompt }],
|
|
83
|
+
}],
|
|
84
|
+
generationConfig,
|
|
85
|
+
};
|
|
86
|
+
const data = await this.postJson(url, body, apiKey, signal);
|
|
87
|
+
const mediaUrls = this.extractInlineImages(data);
|
|
88
|
+
return {
|
|
89
|
+
mediaUrls,
|
|
90
|
+
model: request.model,
|
|
91
|
+
durationMs: Date.now() - start,
|
|
92
|
+
};
|
|
93
|
+
}
|
|
94
|
+
// 鈹€鈹€ Video (Veo 鈥?predictLongRunning 鈫?poll 鈫?download) 鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€
|
|
95
|
+
async generateVideo(request, apiKey, signal) {
|
|
96
|
+
const start = Date.now();
|
|
97
|
+
const url = `${this.apiBase}/models/${request.model}:predictLongRunning`;
|
|
98
|
+
// Build instances
|
|
99
|
+
const instance = { prompt: request.prompt };
|
|
100
|
+
// Image-to-video: first frame image
|
|
101
|
+
if (request.imageUrl) {
|
|
102
|
+
instance.image = { inlineData: await this.resolveImageData(request.imageUrl) };
|
|
103
|
+
}
|
|
104
|
+
// Reference images (Veo 3.1 鈥?asset references)
|
|
105
|
+
if (request.referenceImages?.length) {
|
|
106
|
+
instance.referenceImages = await Promise.all(request.referenceImages.map(async (imgUrl) => ({
|
|
107
|
+
image: { inlineData: await this.resolveImageData(imgUrl) },
|
|
108
|
+
referenceType: "asset",
|
|
109
|
+
})));
|
|
110
|
+
}
|
|
111
|
+
// Build parameters
|
|
112
|
+
const parameters = {};
|
|
113
|
+
if (request.aspectRatio)
|
|
114
|
+
parameters.aspectRatio = request.aspectRatio;
|
|
115
|
+
if (request.resolution)
|
|
116
|
+
parameters.resolution = request.resolution;
|
|
117
|
+
if (request.duration)
|
|
118
|
+
parameters.durationSeconds = String(request.duration);
|
|
119
|
+
if (request.n)
|
|
120
|
+
parameters.numberOfVideos = request.n;
|
|
121
|
+
if (request.seed !== undefined)
|
|
122
|
+
parameters.seed = request.seed;
|
|
123
|
+
const body = { instances: [instance] };
|
|
124
|
+
if (Object.keys(parameters).length > 0)
|
|
125
|
+
body.parameters = parameters;
|
|
126
|
+
// Submit
|
|
127
|
+
const submitRes = await this.postJson(url, body, apiKey, signal);
|
|
128
|
+
const operationName = submitRes.name;
|
|
129
|
+
if (!operationName) {
|
|
130
|
+
throw new Error("Gemini video submit: no operation name in response");
|
|
131
|
+
}
|
|
132
|
+
const taskId = operationName;
|
|
133
|
+
request.onProgress?.(5, "submitted", taskId);
|
|
134
|
+
// Poll
|
|
135
|
+
const result = await this.pollOperation(operationName, apiKey, signal, request.onProgress, taskId);
|
|
136
|
+
// Extract video URI from completed operation
|
|
137
|
+
const videoResponse = result.response?.generateVideoResponse;
|
|
138
|
+
const samples = (videoResponse?.generatedSamples ?? []);
|
|
139
|
+
const mediaUrls = [];
|
|
140
|
+
for (const sample of samples) {
|
|
141
|
+
if (sample.video?.uri) {
|
|
142
|
+
mediaUrls.push(sample.video.uri);
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
return {
|
|
146
|
+
mediaUrls,
|
|
147
|
+
model: request.model,
|
|
148
|
+
durationMs: Date.now() - start,
|
|
149
|
+
taskId,
|
|
150
|
+
};
|
|
151
|
+
}
|
|
152
|
+
// 鈹€鈹€ Music (Lyria 3 鈥?generateContent 鈫?inlineData audio) 鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€
|
|
153
|
+
async generateMusic(request, apiKey, signal) {
|
|
154
|
+
const start = Date.now();
|
|
155
|
+
const url = `${this.apiBase}/models/${request.model}:generateContent`;
|
|
156
|
+
const parts = [{ text: request.prompt }];
|
|
157
|
+
const generationConfig = {};
|
|
158
|
+
// Lyria defaults to AUDIO+TEXT, but we can be explicit
|
|
159
|
+
if (request.audioFormat === "wav") {
|
|
160
|
+
generationConfig.responseModalities = ["AUDIO", "TEXT"];
|
|
161
|
+
generationConfig.responseFormat = { audio: { mimeType: "audio/wav" } };
|
|
162
|
+
}
|
|
163
|
+
const body = {
|
|
164
|
+
contents: [{ parts }],
|
|
165
|
+
};
|
|
166
|
+
if (Object.keys(generationConfig).length > 0)
|
|
167
|
+
body.generationConfig = generationConfig;
|
|
168
|
+
const data = await this.postJson(url, body, apiKey, signal);
|
|
169
|
+
const mediaUrls = this.extractInlineAudio(data, "gemini-music");
|
|
170
|
+
return {
|
|
171
|
+
mediaUrls,
|
|
172
|
+
model: request.model,
|
|
173
|
+
durationMs: Date.now() - start,
|
|
174
|
+
};
|
|
175
|
+
}
|
|
176
|
+
// 鈹€鈹€ Music RealTime (Lyria RealTime 鈥?WebSocket streaming session) 鈹€鈹€鈹€鈹€
|
|
177
|
+
async generateMusicRealtime(request, apiKey, signal) {
|
|
178
|
+
const result = await generateRealtimeMusic(apiKey, { baseUrl: this.apiBase }, {
|
|
179
|
+
prompts: [{ text: request.prompt, weight: 1.0 }],
|
|
180
|
+
durationSeconds: request.duration ?? 30,
|
|
181
|
+
musicConfig: {
|
|
182
|
+
audioFormat: request.audioFormat === "wav" ? "pcm16" : undefined,
|
|
183
|
+
sampleRateHz: 48000,
|
|
184
|
+
},
|
|
185
|
+
model: request.model,
|
|
186
|
+
signal,
|
|
187
|
+
onProgress: request.onProgress,
|
|
188
|
+
});
|
|
189
|
+
return {
|
|
190
|
+
mediaUrls: [result.filePath],
|
|
191
|
+
model: request.model,
|
|
192
|
+
durationMs: result.durationMs,
|
|
193
|
+
};
|
|
194
|
+
}
|
|
195
|
+
// 鈹€鈹€ TTS (generateContent with speechConfig 鈫?PCM audio) 鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€
|
|
196
|
+
async generateTTS(request, apiKey, signal) {
|
|
197
|
+
const start = Date.now();
|
|
198
|
+
const url = `${this.apiBase}/models/${request.model}:generateContent`;
|
|
199
|
+
const generationConfig = {
|
|
200
|
+
responseModalities: ["AUDIO"],
|
|
201
|
+
};
|
|
202
|
+
// Build speechConfig
|
|
203
|
+
if (request.voice) {
|
|
204
|
+
generationConfig.speechConfig = {
|
|
205
|
+
voiceConfig: {
|
|
206
|
+
prebuiltVoiceConfig: { voiceName: request.voice },
|
|
207
|
+
},
|
|
208
|
+
};
|
|
209
|
+
}
|
|
210
|
+
const body = {
|
|
211
|
+
contents: [{ parts: [{ text: request.text ?? request.prompt }] }],
|
|
212
|
+
generationConfig,
|
|
213
|
+
};
|
|
214
|
+
const data = await this.postJson(url, body, apiKey, signal);
|
|
215
|
+
const mediaUrls = this.extractInlineAudio(data, "gemini-tts");
|
|
216
|
+
return {
|
|
217
|
+
mediaUrls,
|
|
218
|
+
model: request.model,
|
|
219
|
+
durationMs: Date.now() - start,
|
|
220
|
+
};
|
|
221
|
+
}
|
|
222
|
+
// 鈹€鈹€ Embedding (embedContent 鈫?float vector) 鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€
|
|
223
|
+
async generateEmbedding(request, apiKey, signal) {
|
|
224
|
+
const start = Date.now();
|
|
225
|
+
const url = `${this.apiBase}/models/${request.model}:embedContent`;
|
|
226
|
+
const body = {
|
|
227
|
+
content: { parts: [{ text: request.text ?? request.prompt }] },
|
|
228
|
+
};
|
|
229
|
+
// Support output_dimensionality via metadata
|
|
230
|
+
const dims = request.metadata?.outputDimensionality;
|
|
231
|
+
if (dims !== undefined)
|
|
232
|
+
body.output_dimensionality = dims;
|
|
233
|
+
const data = await this.postJson(url, body, apiKey, signal);
|
|
234
|
+
const values = data.embedding?.values ?? data.embeddings?.[0]?.values ?? [];
|
|
235
|
+
return {
|
|
236
|
+
mediaUrls: [],
|
|
237
|
+
model: request.model,
|
|
238
|
+
durationMs: Date.now() - start,
|
|
239
|
+
metadata: { embedding: values, dimensions: values.length },
|
|
240
|
+
};
|
|
241
|
+
}
|
|
242
|
+
// 鈹€鈹€ Shared helpers 鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€
|
|
243
|
+
async postJson(url, body, apiKey, signal) {
|
|
244
|
+
const res = await fetch(url, {
|
|
245
|
+
method: "POST",
|
|
246
|
+
headers: {
|
|
247
|
+
"Content-Type": "application/json",
|
|
248
|
+
"x-goog-api-key": apiKey,
|
|
249
|
+
},
|
|
250
|
+
body: JSON.stringify(body),
|
|
251
|
+
signal: signal ?? AbortSignal.timeout(this.timeoutMs),
|
|
252
|
+
});
|
|
253
|
+
if (!res.ok) {
|
|
254
|
+
const text = await res.text().catch(() => "");
|
|
255
|
+
throw new Error(`Gemini API error ${res.status}: ${text}`);
|
|
256
|
+
}
|
|
257
|
+
return res.json();
|
|
258
|
+
}
|
|
259
|
+
async pollOperation(operationName, apiKey, signal, onProgress, taskId) {
|
|
260
|
+
const deadline = Date.now() + VIDEO_MAX_POLL_MS;
|
|
261
|
+
while (Date.now() < deadline) {
|
|
262
|
+
signal?.throwIfAborted();
|
|
263
|
+
const url = `${this.apiBase}/${operationName}`;
|
|
264
|
+
const res = await fetch(url, {
|
|
265
|
+
method: "GET",
|
|
266
|
+
headers: { "x-goog-api-key": apiKey },
|
|
267
|
+
signal: signal ?? AbortSignal.timeout(this.timeoutMs),
|
|
268
|
+
});
|
|
269
|
+
if (!res.ok) {
|
|
270
|
+
const text = await res.text().catch(() => "");
|
|
271
|
+
throw new Error(`Gemini operation poll error ${res.status}: ${text}`);
|
|
272
|
+
}
|
|
273
|
+
const data = await res.json();
|
|
274
|
+
if (data.done === true) {
|
|
275
|
+
onProgress?.(100, "completed", taskId);
|
|
276
|
+
return data;
|
|
277
|
+
}
|
|
278
|
+
// Check for error
|
|
279
|
+
if (data.error) {
|
|
280
|
+
const errObj = data.error;
|
|
281
|
+
throw new Error(`Gemini video generation failed: ${errObj.message ?? JSON.stringify(errObj)}`);
|
|
282
|
+
}
|
|
283
|
+
// Estimate progress from elapsed time
|
|
284
|
+
const elapsed = Date.now() - (deadline - VIDEO_MAX_POLL_MS);
|
|
285
|
+
const estimatedPercent = Math.min(95, Math.round((elapsed / VIDEO_MAX_POLL_MS) * 100));
|
|
286
|
+
onProgress?.(estimatedPercent, "running", taskId);
|
|
287
|
+
await new Promise(r => setTimeout(r, VIDEO_POLL_INTERVAL_MS));
|
|
288
|
+
}
|
|
289
|
+
throw new Error("Gemini video generation timed out after polling");
|
|
290
|
+
}
|
|
291
|
+
/**
|
|
292
|
+
* Resolve an image URL to inline data for the Veo API.
|
|
293
|
+
* Supports file:// paths and https:// URLs.
|
|
294
|
+
*/
|
|
295
|
+
async resolveImageData(imageUrl) {
|
|
296
|
+
if (imageUrl.startsWith("data:")) {
|
|
297
|
+
// data:image/png;base64,...
|
|
298
|
+
const match = imageUrl.match(/^data:([^;]+);base64,(.+)$/);
|
|
299
|
+
if (match)
|
|
300
|
+
return { mimeType: match[1], data: match[2] };
|
|
301
|
+
}
|
|
302
|
+
if (imageUrl.startsWith("file://")) {
|
|
303
|
+
const { readFileSync } = await import("node:fs");
|
|
304
|
+
const filePath = imageUrl.replace(/^file:\/\//, "");
|
|
305
|
+
const data = readFileSync(filePath).toString("base64");
|
|
306
|
+
const ext = filePath.split(".").pop()?.toLowerCase() ?? "png";
|
|
307
|
+
const mimeType = ext === "jpg" || ext === "jpeg" ? "image/jpeg"
|
|
308
|
+
: ext === "webp" ? "image/webp" : "image/png";
|
|
309
|
+
return { mimeType, data };
|
|
310
|
+
}
|
|
311
|
+
// Fetch from URL
|
|
312
|
+
const res = await fetch(imageUrl, { signal: AbortSignal.timeout(60_000) });
|
|
313
|
+
if (!res.ok)
|
|
314
|
+
throw new Error(`Failed to fetch image ${imageUrl}: ${res.status}`);
|
|
315
|
+
const buf = Buffer.from(await res.arrayBuffer());
|
|
316
|
+
const mimeType = res.headers.get("content-type") ?? "image/png";
|
|
317
|
+
return { mimeType, data: buf.toString("base64") };
|
|
318
|
+
}
|
|
319
|
+
/** Extract base64 image data from generateContent response 鈫?persist to cache files. */
|
|
320
|
+
extractInlineImages(data) {
|
|
321
|
+
const mediaUrls = [];
|
|
322
|
+
const cacheDir = join(getUserCacheDir(), "gemini-images");
|
|
323
|
+
mkdirSync(cacheDir, { recursive: true });
|
|
324
|
+
for (const candidate of data.candidates ?? []) {
|
|
325
|
+
for (const part of candidate.content?.parts ?? []) {
|
|
326
|
+
if (part.inlineData?.data) {
|
|
327
|
+
const ext = part.inlineData.mimeType === "image/png" ? "png"
|
|
328
|
+
: part.inlineData.mimeType === "image/webp" ? "webp" : "jpg";
|
|
329
|
+
const filename = `gemini-${randomUUID()}.${ext}`;
|
|
330
|
+
const filePath = join(cacheDir, filename);
|
|
331
|
+
writeFileSync(filePath, Buffer.from(part.inlineData.data, "base64"));
|
|
332
|
+
mediaUrls.push(`file://${filePath}`);
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
}
|
|
336
|
+
return mediaUrls;
|
|
337
|
+
}
|
|
338
|
+
/** Extract base64 audio data from generateContent response 鈫?persist to cache files. */
|
|
339
|
+
extractInlineAudio(data, subDir) {
|
|
340
|
+
const mediaUrls = [];
|
|
341
|
+
const cacheDir = join(getUserCacheDir(), subDir);
|
|
342
|
+
mkdirSync(cacheDir, { recursive: true });
|
|
343
|
+
for (const candidate of data.candidates ?? []) {
|
|
344
|
+
for (const part of candidate.content?.parts ?? []) {
|
|
345
|
+
if (part.inlineData?.data && part.inlineData.mimeType?.startsWith("audio/")) {
|
|
346
|
+
const ext = part.inlineData.mimeType.includes("wav") ? "wav"
|
|
347
|
+
: part.inlineData.mimeType.includes("mpeg") || part.inlineData.mimeType.includes("mp3") ? "mp3"
|
|
348
|
+
: "pcm";
|
|
349
|
+
const rawBuf = Buffer.from(part.inlineData.data, "base64");
|
|
350
|
+
// For raw PCM (TTS output), wrap in a WAV header
|
|
351
|
+
const audioBuffer = ext === "pcm" ? wrapPcmAsWav(rawBuf, 24000, 1, 16) : rawBuf;
|
|
352
|
+
const filename = `gemini-${randomUUID()}.${ext === "pcm" ? "wav" : ext}`;
|
|
353
|
+
const filePath = join(cacheDir, filename);
|
|
354
|
+
writeFileSync(filePath, audioBuffer);
|
|
355
|
+
mediaUrls.push(`file://${filePath}`);
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
}
|
|
359
|
+
return mediaUrls;
|
|
360
|
+
}
|
|
361
|
+
}
|
|
362
|
+
// ── WAV helper ───────────────────────────────────────────────────────────────
|
|
363
|
+
/** Wrap raw PCM s16le data in a minimal WAV header. */
|
|
364
|
+
function wrapPcmAsWav(pcm, sampleRate, channels, bitsPerSample) {
|
|
365
|
+
const byteRate = sampleRate * channels * (bitsPerSample / 8);
|
|
366
|
+
const blockAlign = channels * (bitsPerSample / 8);
|
|
367
|
+
const dataSize = pcm.length;
|
|
368
|
+
const header = Buffer.alloc(44);
|
|
369
|
+
header.write("RIFF", 0);
|
|
370
|
+
header.writeUInt32LE(36 + dataSize, 4);
|
|
371
|
+
header.write("WAVE", 8);
|
|
372
|
+
header.write("fmt ", 12);
|
|
373
|
+
header.writeUInt32LE(16, 16); // SubChunk1Size (PCM)
|
|
374
|
+
header.writeUInt16LE(1, 20); // AudioFormat (PCM)
|
|
375
|
+
header.writeUInt16LE(channels, 22);
|
|
376
|
+
header.writeUInt32LE(sampleRate, 24);
|
|
377
|
+
header.writeUInt32LE(byteRate, 28);
|
|
378
|
+
header.writeUInt16LE(blockAlign, 30);
|
|
379
|
+
header.writeUInt16LE(bitsPerSample, 32);
|
|
380
|
+
header.write("data", 36);
|
|
381
|
+
header.writeUInt32LE(dataSize, 40);
|
|
382
|
+
return Buffer.concat([header, pcm]);
|
|
383
|
+
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Media URL Resolution — resolves local/private URLs for cloud LLM APIs.
|
|
3
|
+
*
|
|
4
|
+
* Cloud LLM APIs (OpenAI, Anthropic, DeepSeek, etc.) cannot fetch content
|
|
5
|
+
* from localhost or private networks.
|
|
6
|
+
*
|
|
7
|
+
* **Primary approach (upload-based):**
|
|
8
|
+
* Local URLs are fetched → uploaded to provider's File API or Aliyun OSS → public URL returned.
|
|
9
|
+
* This is the ONLY approach for images, video, and documents.
|
|
10
|
+
*
|
|
11
|
+
* **Audio exception (base64):**
|
|
12
|
+
* OpenAI's input_audio.data API field mandates base64 encoding.
|
|
13
|
+
* resolveMediaUrl() is retained ONLY for this case.
|
|
14
|
+
*
|
|
15
|
+
* URL-first design: the gateway stores media as HTTP URLs; this layer handles
|
|
16
|
+
* the last-mile transformation before sending to provider APIs.
|
|
17
|
+
*/
|
|
18
|
+
import type { FileUploadAdapter } from "../file-upload-service.js";
|
|
19
|
+
/** Check if a URL points to a local/private address that cloud APIs cannot reach. */
|
|
20
|
+
export declare function isLocalUrl(url: string): boolean;
|
|
21
|
+
export interface MediaResolveContext {
|
|
22
|
+
/** Provider-specific upload adapter. */
|
|
23
|
+
uploadAdapter: FileUploadAdapter;
|
|
24
|
+
/** API key for the upload. */
|
|
25
|
+
apiKey: string;
|
|
26
|
+
/** Abort signal. */
|
|
27
|
+
signal?: AbortSignal;
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* Resolve a local URL by uploading to the provider's File API.
|
|
31
|
+
* This is the **preferred** method for all media types.
|
|
32
|
+
*
|
|
33
|
+
* Public URLs pass through unchanged.
|
|
34
|
+
* Local URLs are uploaded → public URL or file_id returned.
|
|
35
|
+
*/
|
|
36
|
+
export declare function resolveMediaUrlViaUpload(url: string, ctx: MediaResolveContext): Promise<string>;
|
|
37
|
+
/**
|
|
38
|
+
* Batch-resolve URLs via upload. Best-effort: failures return original URL.
|
|
39
|
+
*/
|
|
40
|
+
export declare function resolveMediaUrlsViaUpload(urls: string[], ctx: MediaResolveContext): Promise<string[]>;
|
|
41
|
+
/**
|
|
42
|
+
* Resolve local audio URL to base64 data URL.
|
|
43
|
+
*
|
|
44
|
+
* ONLY for audio — OpenAI's input_audio.data API field mandates base64 encoding.
|
|
45
|
+
* There is no upload alternative for audio in the OpenAI API.
|
|
46
|
+
*
|
|
47
|
+
* For images/video/documents: ALWAYS use resolveMediaUrlViaUpload() instead.
|
|
48
|
+
* base64 is forbidden for non-audio media per project architecture rules.
|
|
49
|
+
*/
|
|
50
|
+
export declare function resolveMediaUrl(url: string, fallbackMime?: string): Promise<string>;
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Media URL Resolution — resolves local/private URLs for cloud LLM APIs.
|
|
3
|
+
*
|
|
4
|
+
* Cloud LLM APIs (OpenAI, Anthropic, DeepSeek, etc.) cannot fetch content
|
|
5
|
+
* from localhost or private networks.
|
|
6
|
+
*
|
|
7
|
+
* **Primary approach (upload-based):**
|
|
8
|
+
* Local URLs are fetched → uploaded to provider's File API or Aliyun OSS → public URL returned.
|
|
9
|
+
* This is the ONLY approach for images, video, and documents.
|
|
10
|
+
*
|
|
11
|
+
* **Audio exception (base64):**
|
|
12
|
+
* OpenAI's input_audio.data API field mandates base64 encoding.
|
|
13
|
+
* resolveMediaUrl() is retained ONLY for this case.
|
|
14
|
+
*
|
|
15
|
+
* URL-first design: the gateway stores media as HTTP URLs; this layer handles
|
|
16
|
+
* the last-mile transformation before sending to provider APIs.
|
|
17
|
+
*/
|
|
18
|
+
import { resolveLocalMedia } from "../file-upload-service.js";
|
|
19
|
+
/**
 * Matches http(s) URLs whose host is exactly 127.0.0.1, localhost, 0.0.0.0 or [::1],
 * optionally followed by a port. The trailing lookahead requires the authority to
 * end there (path, query, fragment, or end of string), so look-alike hosts such as
 * `localhost.example.com` or `127.0.0.1.nip.io` are NOT treated as local.
 */
const LOCAL_URL_RE = /^https?:\/\/(127\.0\.0\.1|localhost|0\.0\.0\.0|\[::1\])(:\d+)?(?=[/?#]|$)/i;
/**
 * Check if a URL points to a loopback/unspecified address that cloud APIs cannot reach.
 *
 * @param url Absolute URL string to test.
 * @returns `true` only for http/https URLs on 127.0.0.1, localhost, 0.0.0.0 or [::1].
 */
export function isLocalUrl(url) {
    return LOCAL_URL_RE.test(url);
}
|
|
24
|
+
/**
 * Resolve a URL by handing it to `resolveLocalMedia` together with the
 * provider upload adapter, API key and abort signal carried by `ctx`.
 * This is the **preferred** method for all media types.
 *
 * Public URLs pass through unchanged.
 * Local URLs are uploaded → public URL or file_id returned.
 *
 * @param url URL to resolve.
 * @param ctx Upload context (`uploadAdapter`, `apiKey`, optional `signal`).
 * @returns Whatever `resolveLocalMedia` resolves to for this URL.
 */
export async function resolveMediaUrlViaUpload(url, ctx) {
    const adapter = ctx.uploadAdapter;
    const key = ctx.apiKey;
    const abortSignal = ctx.signal;
    return resolveLocalMedia(url, adapter, key, abortSignal);
}
|
|
34
|
+
/**
|
|
35
|
+
* Batch-resolve URLs via upload. Best-effort: failures return original URL.
|
|
36
|
+
*/
|
|
37
|
+
export async function resolveMediaUrlsViaUpload(urls, ctx) {
|
|
38
|
+
return Promise.all(urls.map(async (url) => {
|
|
39
|
+
try {
|
|
40
|
+
return await resolveMediaUrlViaUpload(url, ctx);
|
|
41
|
+
}
|
|
42
|
+
catch {
|
|
43
|
+
return url;
|
|
44
|
+
}
|
|
45
|
+
}));
|
|
46
|
+
}
|
|
47
|
+
// 鈹€鈹€ Legacy base64 Resolution (fallback only) 鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€鈹€
|
|
48
|
+
/** Maximum file size for base64 fallback (20MB). Beyond this, upload is required. */
|
|
49
|
+
const BASE64_MAX_BYTES = 20 * 1024 * 1024;
|
|
50
|
+
/**
|
|
51
|
+
* Resolve local audio URL to base64 data URL.
|
|
52
|
+
*
|
|
53
|
+
* ONLY for audio 鈥?OpenAI's input_audio.data API field mandates base64 encoding.
|
|
54
|
+
* There is no upload alternative for audio in the OpenAI API.
|
|
55
|
+
*
|
|
56
|
+
* For images/video/documents: ALWAYS use resolveMediaUrlViaUpload() instead.
|
|
57
|
+
* base64 is forbidden for non-audio media per project architecture rules.
|
|
58
|
+
*/
|
|
59
|
+
export async function resolveMediaUrl(url, fallbackMime) {
|
|
60
|
+
if (url.startsWith("data:"))
|
|
61
|
+
return url;
|
|
62
|
+
if (!isLocalUrl(url))
|
|
63
|
+
return url;
|
|
64
|
+
const res = await fetch(url);
|
|
65
|
+
if (!res.ok) {
|
|
66
|
+
throw new Error(`Failed to fetch local media ${url}: ${res.status}`);
|
|
67
|
+
}
|
|
68
|
+
const buffer = Buffer.from(await res.arrayBuffer());
|
|
69
|
+
if (buffer.byteLength > BASE64_MAX_BYTES) {
|
|
70
|
+
throw new Error(`Local media too large for base64 (${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB). ` +
|
|
71
|
+
`Use file upload API instead.`);
|
|
72
|
+
}
|
|
73
|
+
const contentType = res.headers.get("content-type") || fallbackMime || guessMimeFromExt(url);
|
|
74
|
+
return `data:${contentType};base64,${buffer.toString("base64")}`;
|
|
75
|
+
}
|
|
76
|
+
// ── Helpers ──────────────────────────────────────────────────────────────────
/**
 * Guess a MIME type from the file extension of a URL.
 *
 * The query string and fragment are ignored, and the extension is matched
 * case-insensitively from the last "." onward.
 *
 * @param url URL (or path) to inspect.
 * @returns The mapped MIME type, or "application/octet-stream" when the
 *          extension is missing or unknown.
 */
function guessMimeFromExt(url) {
    const knownTypes = new Map([
        // images
        [".png", "image/png"],
        [".jpg", "image/jpeg"],
        [".jpeg", "image/jpeg"],
        [".gif", "image/gif"],
        [".webp", "image/webp"],
        [".svg", "image/svg+xml"],
        [".bmp", "image/bmp"],
        [".avif", "image/avif"],
        [".heic", "image/heic"],
        // video
        [".mp4", "video/mp4"],
        [".webm", "video/webm"],
        [".mov", "video/quicktime"],
        // audio
        [".mp3", "audio/mpeg"],
        [".wav", "audio/wav"],
        [".ogg", "audio/ogg"],
        [".aac", "audio/aac"],
        [".flac", "audio/flac"],
        [".m4a", "audio/mp4"],
        [".opus", "audio/opus"],
        // documents
        [".pdf", "application/pdf"],
    ]);
    // Keep only what precedes the first "?" or "#".
    const cutAt = url.search(/[?#]/);
    const pathOnly = cutAt === -1 ? url : url.slice(0, cutAt);
    const suffix = pathOnly.slice(pathOnly.lastIndexOf(".")).toLowerCase();
    return knownTypes.get(suffix) ?? "application/octet-stream";
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MiniMax Media Transport — Music + Video + TTS Generation.
|
|
3
|
+
*
|
|
4
|
+
* Music: POST /v1/music_generation (sync or async poll)
|
|
5
|
+
* Video: POST /v1/video_generation (4 modes: text, image, first-last-frame, subject-ref)
|
|
6
|
+
* Video Query: GET /v1/query/video_generation?task_id=XXX
|
|
7
|
+
* File Retrieve: GET /v1/files/retrieve?file_id=XXX (get download_url)
|
|
8
|
+
*
|
|
9
|
+
* Auth: Authorization: Bearer $MINIMAX_API_KEY
|
|
10
|
+
* Docs: minimax-ProviderMax.md §13-18 (video), §21 (music), §24-28 (files)
|
|
11
|
+
*/
|
|
12
|
+
import type { AsyncMediaTransport, MediaRequest, MediaResult, MediaType } from "../media-transport.js";
|
|
13
|
+
export interface MiniMaxMediaConfig {
|
|
14
|
+
/** Base URL, e.g. "https://api.minimaxi.com" */
|
|
15
|
+
baseUrl: string;
|
|
16
|
+
timeoutMs?: number;
|
|
17
|
+
}
|
|
18
|
+
export declare class MiniMaxMediaTransport implements AsyncMediaTransport {
|
|
19
|
+
readonly supportedTypes: readonly MediaType[];
|
|
20
|
+
private baseUrl;
|
|
21
|
+
private timeoutMs;
|
|
22
|
+
constructor(config: MiniMaxMediaConfig);
|
|
23
|
+
generate(request: MediaRequest, apiKey: string, signal?: AbortSignal): Promise<MediaResult>;
|
|
24
|
+
private generateTTS;
|
|
25
|
+
private generateMusic;
|
|
26
|
+
private pollTask;
|
|
27
|
+
/**
|
|
28
|
+
* Generate lyrics from a text prompt via MiniMax Lyrics Generation API.
|
|
29
|
+
* POST /v1/lyrics_generation — returns structured lyrics with tags.
|
|
30
|
+
*/
|
|
31
|
+
generateLyrics(prompt: string, apiKey: string, signal?: AbortSignal): Promise<string>;
|
|
32
|
+
private generateVideo;
|
|
33
|
+
private pollVideoTask;
|
|
34
|
+
private getFileDownloadUrl;
|
|
35
|
+
/**
|
|
36
|
+
* Query a single video task by ID.
|
|
37
|
+
* GET /v1/query/video_generation?task_id=XXX
|
|
38
|
+
*/
|
|
39
|
+
getTaskStatus(taskId: string, apiKey: string, signal?: AbortSignal): Promise<{
|
|
40
|
+
status: string;
|
|
41
|
+
task: Record<string, unknown>;
|
|
42
|
+
}>;
|
|
43
|
+
/**
|
|
44
|
+
* List tasks — MiniMax does not have a bulk list endpoint.
|
|
45
|
+
* Each task must be queried individually with getTaskStatus().
|
|
46
|
+
*/
|
|
47
|
+
listVideoTasks(_apiKey: string, _options?: {
|
|
48
|
+
after?: string;
|
|
49
|
+
limit?: number;
|
|
50
|
+
status?: string;
|
|
51
|
+
}, _signal?: AbortSignal): Promise<Record<string, unknown>>;
|
|
52
|
+
/**
|
|
53
|
+
* Cancel/delete is not natively supported by MiniMax video API.
|
|
54
|
+
*/
|
|
55
|
+
deleteVideoTask(_taskId: string, _apiKey: string, _signal?: AbortSignal): Promise<void>;
|
|
56
|
+
}
|