qlogicagent 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. package/dist/agent.js +8 -6
  2. package/dist/cli.js +258 -214
  3. package/dist/index.js +258 -214
  4. package/dist/orchestration.js +12 -9
  5. package/dist/types/agent/tool-loop.d.ts +22 -0
  6. package/dist/types/agent/types.d.ts +32 -0
  7. package/dist/types/cli/stdio-server.d.ts +96 -1
  8. package/dist/types/cli/tool-bootstrap.d.ts +8 -1
  9. package/dist/types/llm/gemini-schema-utils.d.ts +17 -0
  10. package/dist/types/llm/index.d.ts +11 -2
  11. package/dist/types/llm/media-transport.d.ts +28 -3
  12. package/dist/types/llm/model-detection.d.ts +22 -0
  13. package/dist/types/llm/provider-def.d.ts +17 -4
  14. package/dist/types/llm/transport.d.ts +60 -2
  15. package/dist/types/llm/transports/gemini-cache-api.d.ts +86 -0
  16. package/dist/types/llm/transports/gemini-file-api.d.ts +90 -0
  17. package/dist/types/llm/transports/gemini-generatecontent.d.ts +52 -0
  18. package/dist/types/llm/transports/gemini-lyria-realtime.d.ts +117 -0
  19. package/dist/types/llm/transports/gemini-media.d.ts +40 -8
  20. package/dist/types/llm/transports/minimax-media.d.ts +34 -5
  21. package/dist/types/llm/transports/openai-responses.d.ts +60 -0
  22. package/dist/types/llm/transports/qwen-media.d.ts +32 -7
  23. package/dist/types/llm/transports/realtime-transport.d.ts +1 -1
  24. package/dist/types/llm/transports/volcengine-media.d.ts +10 -2
  25. package/dist/types/llm/transports/zhipu-media.d.ts +24 -2
  26. package/dist/types/orchestration/agent-instance.d.ts +58 -0
  27. package/dist/types/orchestration/dag-scheduler.d.ts +72 -0
  28. package/dist/types/orchestration/product-budget.d.ts +56 -0
  29. package/dist/types/orchestration/product-checkpoint.d.ts +46 -0
  30. package/dist/types/orchestration/product-persistence.d.ts +40 -0
  31. package/dist/types/orchestration/product-worktree.d.ts +13 -0
  32. package/dist/types/orchestration/solo-evaluator.d.ts +59 -0
  33. package/dist/types/orchestration/subagent/fork-subagent.d.ts +2 -0
  34. package/dist/types/orchestration/subagent/task-types.d.ts +4 -0
  35. package/dist/types/orchestration/tool-cascade.d.ts +2 -2
  36. package/dist/types/protocol/methods.d.ts +92 -0
  37. package/dist/types/protocol/notifications.d.ts +162 -0
  38. package/dist/types/runtime/infra/acp-detector.d.ts +36 -0
  39. package/dist/types/runtime/infra/acp-detector.test.d.ts +1 -0
  40. package/dist/types/runtime/infra/acp-protocol-adapter.d.ts +73 -0
  41. package/dist/types/runtime/infra/acp-protocol-adapter.test.d.ts +1 -0
  42. package/dist/types/runtime/infra/acp-types.d.ts +397 -0
  43. package/dist/types/runtime/infra/acp-usage-tracker.d.ts +46 -0
  44. package/dist/types/runtime/infra/acp-usage-tracker.test.d.ts +1 -0
  45. package/dist/types/runtime/infra/agent-config-store.d.ts +30 -0
  46. package/dist/types/runtime/infra/agent-config-store.test.d.ts +1 -0
  47. package/dist/types/runtime/infra/agent-paths.d.ts +8 -0
  48. package/dist/types/runtime/infra/agent-process.d.ts +280 -0
  49. package/dist/types/runtime/infra/agent-process.test.d.ts +1 -0
  50. package/dist/types/runtime/infra/index.d.ts +10 -0
  51. package/dist/types/runtime/infra/mcp-bridge.d.ts +166 -0
  52. package/dist/types/runtime/infra/mcp-bridge.test.d.ts +1 -0
  53. package/dist/types/runtime/infra/model-id-translator.d.ts +22 -0
  54. package/dist/types/runtime/infra/model-id-translator.test.d.ts +1 -0
  55. package/dist/types/runtime/infra/skill-injector.d.ts +51 -0
  56. package/dist/types/runtime/infra/skill-injector.test.d.ts +1 -0
  57. package/dist/types/runtime/infra/worktree-backend.d.ts +1 -0
  58. package/dist/types/runtime/prompt/environment-context.d.ts +6 -0
  59. package/dist/types/runtime/session/session-persistence.d.ts +9 -8
  60. package/dist/types/runtime/session/session-state.d.ts +3 -31
  61. package/dist/types/skills/index.d.ts +2 -10
  62. package/dist/types/skills/tools/skill-tool.d.ts +101 -0
  63. package/dist/types/skills/tools/team-tool.d.ts +23 -1
  64. package/package.json +1 -1
  65. package/dist/types/runtime/session/session-memory.d.ts +0 -90
  66. package/dist/types/skills/memory/memory-extractor.d.ts +0 -64
@@ -9,12 +9,12 @@
9
9
  * Layer 2: model-catalog.ts remote (models.dev)
10
10
  * Layer 3: user config (from agent.turn.config)
11
11
  */
12
- export type TransportType = "openai-chat" | "anthropic-messages" | "volcengine-responses";
12
+ export type TransportType = "openai-chat" | "openai-responses" | "anthropic-messages" | "volcengine-responses" | "gemini-generatecontent";
13
13
  export type AuthType = "bearer" | "x-api-key" | "none";
14
- export type MediaCapability = "image" | "video" | "music" | "tts" | "3d" | "stt" | "embedding" | "video_understanding" | "image_understanding" | "voice_clone" | "rerank" | "document_parsing";
14
+ export type MediaCapability = "image" | "video" | "music" | "music_realtime" | "tts" | "3d" | "stt" | "embedding" | "video_understanding" | "image_understanding" | "voice_clone" | "rerank" | "document_parsing" | "realtime_audio";
15
15
  export type VideoOperation = "text2video" | "img2video" | "video2video" | "edit" | "merge" | "upscale";
16
16
  export type ImageOperation = "text2image" | "img2img" | "inpainting" | "outpainting";
17
- export type MusicOperation = "text2music" | "cover";
17
+ export type MusicOperation = "text2music" | "cover" | "realtime";
18
18
  export type TtsOperation = "text2speech" | "voice_clone";
19
19
  export type ThreeDOperation = "text2_3d" | "img2_3d";
20
20
  export interface VideoCapabilities {
@@ -87,7 +87,14 @@ export interface DocumentParsingCapabilities {
87
87
  maxPageCount?: number;
88
88
  maxFileSizeMB?: number;
89
89
  }
90
- export type MediaCapabilities = VideoCapabilities | ImageCapabilities | MusicCapabilities | TtsCapabilities | ThreeDCapabilities | SttCapabilities | EmbeddingCapabilities | VideoUnderstandingCapabilities | ImageUnderstandingCapabilities | VoiceCloneCapabilities | RerankCapabilities | DocumentParsingCapabilities;
90
+ export interface RealtimeAudioCapabilities {
91
+ type: "realtime_audio";
92
+ voices?: string[];
93
+ modalities?: Array<"text" | "audio">;
94
+ vad?: boolean;
95
+ toolCalling?: boolean;
96
+ }
97
+ export type MediaCapabilities = VideoCapabilities | ImageCapabilities | MusicCapabilities | TtsCapabilities | ThreeDCapabilities | SttCapabilities | EmbeddingCapabilities | VideoUnderstandingCapabilities | ImageUnderstandingCapabilities | VoiceCloneCapabilities | RerankCapabilities | DocumentParsingCapabilities | RealtimeAudioCapabilities;
91
98
  /**
92
99
  * Provider-specific quirks — drives conditional logic in transports.
93
100
  * CC parity: provider detection via quirks flags instead of hardcoded if/else.
@@ -106,6 +113,12 @@ export interface ProviderQuirks {
106
113
  builtinWebSearch?: boolean;
107
114
  /** Provider has built-in code interpreter */
108
115
  builtinCodeInterpreter?: boolean;
116
+ /** Provider supports native URL context fetching (Gemini urlContext tool) */
117
+ builtinUrlContext?: boolean;
118
+ /** Provider supports Google Maps Grounding (Gemini googleMaps tool) */
119
+ builtinMapsGrounding?: boolean;
120
+ /** Provider supports native file search (Gemini fileSearch tool) */
121
+ builtinFileSearch?: boolean;
109
122
  /** Supports thinking.type="enabled"/"disabled" body param (Kimi K2, GLM).
110
123
  * Disambiguation: GLM also sets supportsToolStream; Kimi does not. */
111
124
  supportsThinkingParam?: boolean;
@@ -57,7 +57,7 @@ export interface LLMRequest {
57
57
  topP?: number;
58
58
  maxTokens?: number;
59
59
  reasoning?: {
60
- effort: "minimal" | "low" | "medium" | "high";
60
+ effort: "minimal" | "low" | "medium" | "high" | "xhigh";
61
61
  /** Request encrypted original reasoning content (Volcengine §17.7). */
62
62
  includeEncryptedReasoning?: boolean;
63
63
  };
@@ -123,6 +123,64 @@ export interface LLMRequest {
123
123
  * Server-side trimming of historical thinking chains and tool call traces.
124
124
  */
125
125
  contextManagement?: ContextManagementConfig;
126
+ /**
127
+ * Gemini explicit cache reference (gemini-ProviderMax §8).
128
+ * Passes a pre-created cache name (e.g. "cachedContents/abc123") to
129
+ * generateContent so the server uses cached tokens instead of re-processing.
130
+ * Create caches via GeminiCacheAPI.createCache() first.
131
+ */
132
+ cachedContent?: string;
133
+ /**
134
+ * Predicted output for speculative decoding (openai-ProviderMax §11).
135
+ * When editing code, pass the existing content so the model can diff efficiently.
136
+ * Reduces latency by 3-5x when prediction matches. Falls back when it doesn't.
137
+ * Works with OpenAI GPT-5.x models via Responses API and Chat Completions.
138
+ */
139
+ prediction?: {
140
+ type: "content";
141
+ content: string;
142
+ };
143
+ /**
144
+ * Prompt cache bucketing key (openai-ProviderMax §11).
145
+ * Replaces the deprecated `user` field. Helps OpenAI group similar requests
146
+ * for higher cache hit rates.
147
+ */
148
+ promptCacheKey?: string;
149
+ /**
150
+ * Prompt cache retention policy (openai-ProviderMax §11).
151
+ * "in_memory" = default 5-10 min, "24h" = extended up to 24 hours.
152
+ */
153
+ promptCacheRetention?: "in_memory" | "24h";
154
+ /**
155
+ * Service tier for request scheduling (openai-ProviderMax §14).
156
+ * "auto" = project default, "default" = standard tier, "flex" = 50% cheaper / higher latency,
157
+ * "priority" = guaranteed low latency.
158
+ */
159
+ serviceTier?: "auto" | "default" | "flex" | "priority";
160
+ /**
161
+ * OpenAI Responses API built-in tools (openai-ProviderMax §7).
162
+ * Platform-executed tools like web_search, file_search, code_interpreter, etc.
163
+ */
164
+ openaiBuiltinTools?: Array<{
165
+ type: "web_search_preview" | "file_search" | "code_interpreter" | "computer_use_preview";
166
+ [key: string]: unknown;
167
+ }>;
168
+ /**
169
+ * OpenAI Responses API conversation ID (openai-ProviderMax §2.1).
170
+ * Alternative to previous_response_id — persistent server-side conversation.
171
+ * Cannot be used together with previousResponseId.
172
+ */
173
+ conversationId?: string;
174
+ /**
175
+ * Enable or disable parallel tool calling for this request.
176
+ * When false, the model must call tools sequentially.
177
+ */
178
+ parallelToolCalls?: boolean;
179
+ /**
180
+ * Text output verbosity hint (openai-ProviderMax §5).
181
+ * Controls how detailed the model's textual output should be.
182
+ */
183
+ textVerbosity?: "low" | "medium" | "high";
126
184
  }
127
185
  /**
128
186
  * FIM completion request — DeepSeek Beta Completions API.
@@ -182,7 +240,7 @@ export type LLMChunk = {
182
240
  event: string;
183
241
  data?: Record<string, unknown>;
184
242
  } | {
185
- /** Web search citation annotations from Volcengine web_search results. */
243
+ /** Web search citation annotations from provider builtin tools (Volcengine web_search, Gemini grounding). */
186
244
  type: "annotations";
187
245
  annotations: Array<{
188
246
  type: string;
@@ -0,0 +1,86 @@
1
+ /**
2
+ * GeminiCacheAPI — Explicit Context Caching for Gemini generateContent.
3
+ *
4
+ * Manages named cached content resources that can be referenced in
5
+ * generateContent requests via the `cachedContent` field.
6
+ *
7
+ * REST endpoints:
8
+ * POST /v1beta/cachedContents — create cache
9
+ * GET /v1beta/cachedContents — list caches
10
+ * GET /v1beta/cachedContents/{name} — get cache metadata
11
+ * PATCH /v1beta/cachedContents/{name} — update TTL
12
+ * DELETE /v1beta/cachedContents/{name} — delete cache
13
+ *
14
+ * Minimum cacheable content: 1024 tokens (Flash) / 4096 tokens (Pro).
15
+ * TTL default: 1 hour. Storage: ~$1.00/hour/MTok (Flash series).
16
+ *
17
+ * Docs: https://ai.google.dev/gemini-api/docs/caching
18
+ */
19
+ export interface GeminiCacheCreateOptions {
20
+ /** Model to use, e.g. "models/gemini-3-flash-preview" */
21
+ model: string;
22
+ /** Contents to cache (same format as generateContent contents) */
23
+ contents: Array<Record<string, unknown>>;
24
+ /** Optional system instruction to include in cache */
25
+ systemInstruction?: Record<string, unknown>;
26
+ /** Time-to-live, e.g. "300s" for 5 minutes. Default: "3600s" (1 hour) */
27
+ ttl?: string;
28
+ /** Human-readable name for identifying the cache */
29
+ displayName?: string;
30
+ }
31
+ export interface GeminiCachedContent {
32
+ /** Resource name, e.g. "cachedContents/abc123" */
33
+ name: string;
34
+ /** Model this cache is bound to */
35
+ model: string;
36
+ /** Display name (if set) */
37
+ displayName?: string;
38
+ /** Token usage metadata */
39
+ usageMetadata?: {
40
+ totalTokenCount?: number;
41
+ };
42
+ /** Creation time (ISO 8601) */
43
+ createTime?: string;
44
+ /** Last update time (ISO 8601) */
45
+ updateTime?: string;
46
+ /** Expiration time (ISO 8601) */
47
+ expireTime?: string;
48
+ }
49
+ export declare class GeminiCacheAPI {
50
+ private baseUrl;
51
+ private timeoutMs;
52
+ constructor(config: {
53
+ baseUrl: string;
54
+ timeoutMs?: number;
55
+ });
56
+ /**
57
+ * Create a new cached content resource.
58
+ * The cache name returned can be passed as `cachedContent` in generateContent.
59
+ */
60
+ createCache(options: GeminiCacheCreateOptions, apiKey: string, signal?: AbortSignal): Promise<GeminiCachedContent>;
61
+ /**
62
+ * Get metadata for a cached content resource.
63
+ */
64
+ getCache(name: string, apiKey: string, signal?: AbortSignal): Promise<GeminiCachedContent>;
65
+ /**
66
+ * List all cached content resources.
67
+ */
68
+ listCaches(apiKey: string, options?: {
69
+ pageSize?: number;
70
+ pageToken?: string;
71
+ }, signal?: AbortSignal): Promise<{
72
+ cachedContents: GeminiCachedContent[];
73
+ nextPageToken?: string;
74
+ }>;
75
+ /**
76
+ * Update a cache's TTL or expiration time.
77
+ */
78
+ updateCache(name: string, update: {
79
+ ttl?: string;
80
+ expireTime?: string;
81
+ }, apiKey: string, signal?: AbortSignal): Promise<GeminiCachedContent>;
82
+ /**
83
+ * Delete a cached content resource.
84
+ */
85
+ deleteCache(name: string, apiKey: string, signal?: AbortSignal): Promise<void>;
86
+ }
@@ -0,0 +1,90 @@
1
+ /**
2
+ * GeminiFileAPI — Gemini File API for uploading and managing files.
3
+ *
4
+ * Files uploaded via this API can be referenced in generateContent requests
5
+ * using `file_data: { file_uri, mime_type }` parts.
6
+ *
7
+ * Upload uses the resumable upload protocol (2-step):
8
+ * 1. POST /upload/v1beta/files → get upload URL (response header)
9
+ * 2. PUT {upload_url} with file bytes → get file info
10
+ *
11
+ * Other operations:
12
+ * GET /v1beta/files — list files
13
+ * GET /v1beta/files/{name} — get file metadata
14
+ * DELETE /v1beta/files/{name} — delete file
15
+ *
16
+ * Files expire after 48 hours. Max 2GB per file, 20GB per project.
17
+ *
18
+ * Docs: https://ai.google.dev/gemini-api/docs/files
19
+ */
20
+ export interface GeminiFileInfo {
21
+ /** Resource name, e.g. "files/abc123" */
22
+ name: string;
23
+ /** Display name (set during upload) */
24
+ displayName?: string;
25
+ /** MIME type */
26
+ mimeType: string;
27
+ /** File size in bytes */
28
+ sizeBytes?: string;
29
+ /** File URI for use in generateContent, e.g. "https://generativelanguage.googleapis.com/v1beta/files/abc123" */
30
+ uri: string;
31
+ /** Processing state: PROCESSING | ACTIVE | FAILED */
32
+ state: string;
33
+ /** Creation time (ISO 8601) */
34
+ createTime?: string;
35
+ /** Last update time (ISO 8601) */
36
+ updateTime?: string;
37
+ /** Expiration time (ISO 8601) */
38
+ expirationTime?: string;
39
+ /** Error info if state is FAILED */
40
+ error?: {
41
+ code: number;
42
+ message: string;
43
+ };
44
+ }
45
+ export declare class GeminiFileAPI {
46
+ private baseUrl;
47
+ private timeoutMs;
48
+ constructor(config: {
49
+ baseUrl: string;
50
+ timeoutMs?: number;
51
+ });
52
+ /**
53
+ * Upload a file using the resumable upload protocol.
54
+ *
55
+ * Step 1: Initiate upload → get upload URL from response header
56
+ * Step 2: Upload bytes to that URL → get file metadata
57
+ *
58
+ * @returns GeminiFileInfo with .uri for use in generateContent
59
+ */
60
+ uploadFile(file: Blob | Buffer, apiKey: string, options?: {
61
+ mimeType?: string;
62
+ displayName?: string;
63
+ }, signal?: AbortSignal): Promise<GeminiFileInfo>;
64
+ /**
65
+ * Wait for a file to finish processing (state → ACTIVE).
66
+ * Some file types (video, audio) require server-side processing.
67
+ */
68
+ waitForProcessing(name: string, apiKey: string, options?: {
69
+ pollIntervalMs?: number;
70
+ maxWaitMs?: number;
71
+ }): Promise<GeminiFileInfo>;
72
+ /**
73
+ * Get metadata for an uploaded file.
74
+ */
75
+ getFile(name: string, apiKey: string, signal?: AbortSignal): Promise<GeminiFileInfo>;
76
+ /**
77
+ * List uploaded files.
78
+ */
79
+ listFiles(apiKey: string, options?: {
80
+ pageSize?: number;
81
+ pageToken?: string;
82
+ }, signal?: AbortSignal): Promise<{
83
+ files: GeminiFileInfo[];
84
+ nextPageToken?: string;
85
+ }>;
86
+ /**
87
+ * Delete an uploaded file.
88
+ */
89
+ deleteFile(name: string, apiKey: string, signal?: AbortSignal): Promise<void>;
90
+ }
@@ -0,0 +1,52 @@
1
+ /**
2
+ * Gemini generateContent Transport — Native Gemini API streaming implementation.
3
+ *
4
+ * Targets Gemini 3 series exclusively (3.1 Pro, 3 Flash, 3.1 Flash-Lite).
5
+ * Uses the native Gemini REST API instead of the OpenAI compatibility layer,
6
+ * unlocking Gemini-exclusive features unavailable via the compat endpoint:
7
+ * - thinkingConfig (thinkingLevel — G3 native control)
8
+ * - Google Search / Maps Grounding
9
+ * - Code Execution
10
+ * - Safety Settings fine-grained control
11
+ * - Thought Signatures for multi-turn reasoning continuity
12
+ * - URL Context / File Search
13
+ * - systemInstruction top-level field
14
+ *
15
+ * Streaming endpoint: POST .../models/{model}:streamGenerateContent?alt=sse
16
+ * Non-streaming: POST .../models/{model}:generateContent
17
+ * Auth: x-goog-api-key header
18
+ *
19
+ * Protocol reference: https://ai.google.dev/gemini-api/docs
20
+ * Aligned with gemini-ProviderMax.md native protocol strategy.
21
+ */
22
+ import type { LLMChunk, LLMRequest, LLMTransport } from "../transport.js";
23
+ import type { ProviderQuirks } from "../provider-def.js";
24
+ export interface GeminiGenerateContentTransportConfig {
25
+ /** Base URL, e.g. "https://generativelanguage.googleapis.com/v1beta" */
26
+ baseUrl: string;
27
+ /** Per-request timeout in ms (default 180_000) */
28
+ timeoutMs?: number;
29
+ /** Provider-specific quirks */
30
+ quirks?: ProviderQuirks;
31
+ }
32
+ export declare class GeminiGenerateContentTransport implements LLMTransport {
33
+ private baseUrl;
34
+ private timeoutMs;
35
+ private quirks;
36
+ constructor(config: GeminiGenerateContentTransportConfig);
37
+ stream(request: LLMRequest, apiKey: string, signal?: AbortSignal): AsyncGenerator<LLMChunk>;
38
+ private buildRequestBody;
39
+ private buildTools;
40
+ private buildToolConfig;
41
+ private buildGenerationConfig;
42
+ private fetchAndStream;
43
+ /**
44
+ * Parse SSE stream with 90s idle watchdog (CC parity).
45
+ */
46
+ private parseSSEStreamWithWatchdog;
47
+ /**
48
+ * Process a single Gemini response chunk, yielding LLMChunk events.
49
+ */
50
+ private processResponse;
51
+ private nonStreamingFallback;
52
+ }
@@ -0,0 +1,117 @@
1
+ /**
2
+ * GeminiLyriaRealtimeSession — WebSocket-based real-time streaming music generation.
3
+ *
4
+ * Uses Lyria RealTime (`lyria-realtime-exp`) via the Gemini Live API WebSocket.
5
+ * Provides both a full interactive session API and a one-shot convenience method.
6
+ *
7
+ * Protocol:
8
+ * - WebSocket URL: wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent
9
+ * - Auth: API key as query parameter
10
+ * - Client → Server: setup, musicInput (weightedPrompts, musicGenerationConfig, playbackControl)
11
+ * - Server → Client: serverContent.audioChunks (base64 PCM s16le, 48kHz, stereo)
12
+ *
13
+ * Docs: https://ai.google.dev/gemini-api/docs/realtime-music-generation
14
+ */
15
+ export interface WeightedPrompt {
16
+ text: string;
17
+ weight: number;
18
+ }
19
+ export type MusicScale = "C_MAJOR_A_MINOR" | "D_FLAT_MAJOR_B_FLAT_MINOR" | "D_MAJOR_B_MINOR" | "E_FLAT_MAJOR_C_MINOR" | "E_MAJOR_D_FLAT_MINOR" | "F_MAJOR_D_MINOR" | "G_FLAT_MAJOR_E_FLAT_MINOR" | "G_MAJOR_E_MINOR" | "A_FLAT_MAJOR_F_MINOR" | "A_MAJOR_G_FLAT_MINOR" | "B_FLAT_MAJOR_G_MINOR" | "B_MAJOR_A_FLAT_MINOR" | "SCALE_UNSPECIFIED";
20
+ export type MusicGenerationMode = "QUALITY" | "DIVERSITY" | "VOCALIZATION";
21
+ export interface MusicGenerationConfig {
22
+ bpm?: number;
23
+ density?: number;
24
+ brightness?: number;
25
+ guidance?: number;
26
+ scale?: MusicScale;
27
+ temperature?: number;
28
+ topK?: number;
29
+ seed?: number;
30
+ muteBass?: boolean;
31
+ muteDrums?: boolean;
32
+ onlyBassAndDrums?: boolean;
33
+ musicGenerationMode?: MusicGenerationMode;
34
+ audioFormat?: string;
35
+ sampleRateHz?: number;
36
+ }
37
+ export interface LyriaRealtimeConfig {
38
+ /** Base URL (REST), e.g. "https://generativelanguage.googleapis.com/v1beta" */
39
+ baseUrl: string;
40
+ }
41
+ export interface LyriaRealtimeSessionOptions {
42
+ model?: string;
43
+ prompts: WeightedPrompt[];
44
+ config?: MusicGenerationConfig;
45
+ }
46
+ /** Audio chunk received from the server. */
47
+ export interface AudioChunk {
48
+ /** Raw PCM s16le data (48kHz, stereo) */
49
+ data: Buffer;
50
+ }
51
+ /**
52
+ * Interactive Lyria RealTime session over WebSocket.
53
+ *
54
+ * Usage:
55
+ * const session = new GeminiLyriaRealtimeSession({ baseUrl: "..." });
56
+ * await session.connect(apiKey, { prompts: [{ text: "jazz", weight: 1 }] });
57
+ * session.onAudioChunk = (chunk) => { ... };
58
+ * await session.play();
59
+ * // ... later
60
+ * await session.stop();
61
+ * session.close();
62
+ */
63
+ export declare class GeminiLyriaRealtimeSession {
64
+ private ws;
65
+ private wsUrl;
66
+ private model;
67
+ /** Called for each audio chunk received from the server. */
68
+ onAudioChunk: ((chunk: AudioChunk) => void) | null;
69
+ /** Called when the server reports a filtered prompt. */
70
+ onFilteredPrompt: ((reason: string) => void) | null;
71
+ /** Called on WebSocket error. */
72
+ onError: ((error: Error) => void) | null;
73
+ /** Called when the WebSocket connection closes. */
74
+ onClose: (() => void) | null;
75
+ constructor(config: LyriaRealtimeConfig);
76
+ /**
77
+ * Connect to the Lyria RealTime WebSocket and send setup + initial config.
78
+ */
79
+ connect(apiKey: string, options: LyriaRealtimeSessionOptions): Promise<void>;
80
+ /** Set or update weighted prompts (smooth transition). */
81
+ setWeightedPrompts(prompts: WeightedPrompt[]): Promise<void>;
82
+ /** Set or update music generation config. */
83
+ setMusicGenerationConfig(config: MusicGenerationConfig): Promise<void>;
84
+ /** Start streaming music. */
85
+ play(): Promise<void>;
86
+ /** Pause music streaming (can resume with play). */
87
+ pause(): Promise<void>;
88
+ /** Stop music streaming (terminates the current piece). */
89
+ stop(): Promise<void>;
90
+ /** Reset model context (for BPM/scale changes). */
91
+ resetContext(): Promise<void>;
92
+ /** Close the WebSocket connection. */
93
+ close(): void;
94
+ /** Whether the session is connected. */
95
+ get connected(): boolean;
96
+ private send;
97
+ private handleMessage;
98
+ }
99
+ /**
100
+ * Generate a fixed-duration music clip using Lyria RealTime.
101
+ *
102
+ * Connects, plays for the specified duration (default 30s), collects all
103
+ * audio chunks, assembles into a WAV file, and returns the file path.
104
+ *
105
+ * Output: 48kHz, stereo, 16-bit PCM wrapped in WAV.
106
+ */
107
+ export declare function generateRealtimeMusic(apiKey: string, config: LyriaRealtimeConfig, options: {
108
+ prompts: WeightedPrompt[];
109
+ durationSeconds?: number;
110
+ musicConfig?: MusicGenerationConfig;
111
+ model?: string;
112
+ signal?: AbortSignal;
113
+ onProgress?: (percent: number, status: string) => void;
114
+ }): Promise<{
115
+ filePath: string;
116
+ durationMs: number;
117
+ }>;
@@ -1,21 +1,53 @@
1
1
  /**
2
- * Gemini Media Transport — Image generation via Gemini generateContent.
2
+ * Gemini Media Transport — unified media generation for all Gemini media APIs.
3
3
  *
4
- * Uses responseModalities: ["TEXT", "IMAGE"] with the Gemini REST API.
5
- * POST /v1beta/models/{model}:generateContent
6
- * Auth: key= query param or x-goog-api-key header
7
- * Docs: https://ai.google.dev/gemini-api/docs/image-generation
4
+ * Supported media types and endpoints:
5
+ * image — POST /models/{model}:generateContent (responseModalities: ["TEXT","IMAGE"])
6
+ * video — POST /models/{model}:predictLongRunning → poll operations → download URI
7
+ * music — POST /models/{model}:generateContent (Lyria 3 — inlineData audio)
8
+ * music_realtime — WebSocket session (Lyria RealTime — streaming PCM → WAV)
9
+ * tts — POST /models/{model}:generateContent (speechConfig — inlineData PCM)
10
+ * embedding — POST /models/{model}:embedContent (float vector)
11
+ *
12
+ * Auth: x-goog-api-key header for all endpoints.
8
13
  */
9
- import type { MediaTransport, MediaRequest, MediaResult, MediaType } from "../media-transport.js";
14
+ import type { AsyncMediaTransport, MediaRequest, MediaResult, MediaType } from "../media-transport.js";
10
15
  export interface GeminiMediaConfig {
11
- /** Base URL, e.g. "https://generativelanguage.googleapis.com/v1beta/openai" */
16
+ /** Base URL, e.g. "https://generativelanguage.googleapis.com/v1beta" */
12
17
  baseUrl: string;
13
18
  timeoutMs?: number;
14
19
  }
15
- export declare class GeminiMediaTransport implements MediaTransport {
20
+ export declare class GeminiMediaTransport implements AsyncMediaTransport {
16
21
  readonly supportedTypes: readonly MediaType[];
17
22
  private apiBase;
18
23
  private timeoutMs;
19
24
  constructor(config: GeminiMediaConfig);
20
25
  generate(request: MediaRequest, apiKey: string, signal?: AbortSignal): Promise<MediaResult>;
26
+ deleteVideoTask(_taskId: string, _apiKey: string, _signal?: AbortSignal): Promise<void>;
27
+ listVideoTasks(_apiKey: string, _options?: {
28
+ after?: string;
29
+ limit?: number;
30
+ status?: string;
31
+ }, _signal?: AbortSignal): Promise<Record<string, unknown>>;
32
+ getTaskStatus(taskId: string, apiKey: string, signal?: AbortSignal): Promise<{
33
+ status: string;
34
+ task: Record<string, unknown>;
35
+ }>;
36
+ private generateImage;
37
+ private generateVideo;
38
+ private generateMusic;
39
+ private generateMusicRealtime;
40
+ private generateTTS;
41
+ private generateEmbedding;
42
+ private postJson;
43
+ private pollOperation;
44
+ /**
45
+ * Resolve an image URL to inline data for the Veo API.
46
+ * Supports file:// paths and https:// URLs.
47
+ */
48
+ private resolveImageData;
49
+ /** Extract base64 image data from generateContent response → persist to cache files. */
50
+ private extractInlineImages;
51
+ /** Extract base64 audio data from generateContent response → persist to cache files. */
52
+ private extractInlineAudio;
21
53
  }
@@ -1,26 +1,55 @@
1
1
  /**
2
- * MiniMax Media Transport — Music Generation API (music-2.6, music-cover).
2
+ * MiniMax Media Transport — Music + Video Generation.
3
+ *
4
+ * Music: POST /v1/music_generation (sync or async poll)
5
+ * Video: POST /v1/video_generation (4 modes: text, image, first-last-frame, subject-ref)
6
+ * Video Query: GET /v1/query/video_generation?task_id=XXX
7
+ * File Retrieve: GET /v1/files/retrieve?file_id=XXX (get download_url)
3
8
  *
4
- * POST /v1/music_generation (async job: submit → poll → result)
5
9
  * Auth: Authorization: Bearer $MINIMAX_API_KEY
6
- * Docs: https://platform.minimaxi.com/document/Music
10
+ * Docs: minimax-ProviderMax.md §13-18 (video), §21 (music), §24-28 (files)
7
11
  */
8
- import type { MediaTransport, MediaRequest, MediaResult, MediaType } from "../media-transport.js";
12
+ import type { AsyncMediaTransport, MediaRequest, MediaResult, MediaType } from "../media-transport.js";
9
13
  export interface MiniMaxMediaConfig {
10
14
  /** Base URL, e.g. "https://api.minimaxi.com" */
11
15
  baseUrl: string;
12
16
  timeoutMs?: number;
13
17
  }
14
- export declare class MiniMaxMediaTransport implements MediaTransport {
18
+ export declare class MiniMaxMediaTransport implements AsyncMediaTransport {
15
19
  readonly supportedTypes: readonly MediaType[];
16
20
  private baseUrl;
17
21
  private timeoutMs;
18
22
  constructor(config: MiniMaxMediaConfig);
19
23
  generate(request: MediaRequest, apiKey: string, signal?: AbortSignal): Promise<MediaResult>;
24
+ private generateMusic;
20
25
  private pollTask;
21
26
  /**
22
27
  * Generate lyrics from a text prompt via MiniMax Lyrics Generation API.
23
28
  * POST /v1/lyrics_generation — returns structured lyrics with tags.
24
29
  */
25
30
  generateLyrics(prompt: string, apiKey: string, signal?: AbortSignal): Promise<string>;
31
+ private generateVideo;
32
+ private pollVideoTask;
33
+ private getFileDownloadUrl;
34
+ /**
35
+ * Query a single video task by ID.
36
+ * GET /v1/query/video_generation?task_id=XXX
37
+ */
38
+ getTaskStatus(taskId: string, apiKey: string, signal?: AbortSignal): Promise<{
39
+ status: string;
40
+ task: Record<string, unknown>;
41
+ }>;
42
+ /**
43
+ * List tasks — MiniMax does not have a bulk list endpoint.
44
+ * Each task must be queried individually with getTaskStatus().
45
+ */
46
+ listVideoTasks(_apiKey: string, _options?: {
47
+ after?: string;
48
+ limit?: number;
49
+ status?: string;
50
+ }, _signal?: AbortSignal): Promise<Record<string, unknown>>;
51
+ /**
52
+ * Cancel/delete is not natively supported by MiniMax video API.
53
+ */
54
+ deleteVideoTask(_taskId: string, _apiKey: string, _signal?: AbortSignal): Promise<void>;
26
55
  }
@@ -0,0 +1,60 @@
1
+ /**
2
+ * OpenAI Responses API Transport — SSE streaming implementation.
3
+ *
4
+ * Implements the OpenAI Responses API (`POST /v1/responses`),
5
+ * the officially recommended path for GPT-5.x text generation.
6
+ *
7
+ * Key differences from OpenAI Chat Completions:
8
+ * - Endpoint: POST {baseUrl}/v1/responses
9
+ * - Request body uses `input` (not `messages`), `instructions`, `reasoning`
10
+ * - SSE events: response.output_text.delta, response.function_call_arguments.delta,
11
+ * response.completed, etc.
12
+ * - Tool defs: { type: "function", name, parameters } (not nested under `function:`)
13
+ * - Tool results: { type: "function_call_output", call_id, output }
14
+ * - Context persistence: previous_response_id for server-side session continuation
15
+ * - Structured output: `text: { format: { type: "json_schema", ... } }`
16
+ * - Reasoning: `reasoning: { effort, summary }` for GPT-5.x models
17
+ *
18
+ * Wire format reference:
19
+ * https://developers.openai.com/api/docs/api-reference/responses/create
20
+ * https://developers.openai.com/api/docs/api-reference/responses/streaming-events
21
+ *
22
+ * Design: Closely mirrors volcengine-responses.ts patterns while adapting to
23
+ * OpenAI-specific wire format. Shared LLMChunk output makes upper layers
24
+ * transport-agnostic.
25
+ */
26
+ import type { LLMChunk, LLMRequest, LLMTransport } from "../transport.js";
27
+ import type { ProviderQuirks } from "../provider-def.js";
28
+ export interface OpenAIResponsesTransportConfig {
29
+ baseUrl: string;
30
+ extraHeaders?: Record<string, string>;
31
+ timeoutMs?: number;
32
+ quirks?: ProviderQuirks;
33
+ }
34
+ export declare class OpenAIResponsesTransport implements LLMTransport {
35
+ private baseUrl;
36
+ private extraHeaders;
37
+ private timeoutMs;
38
+ private quirks;
39
+ constructor(config: OpenAIResponsesTransportConfig);
40
+ stream(request: LLMRequest, apiKey: string, signal?: AbortSignal): AsyncGenerator<LLMChunk>;
41
+ private buildRequestBody;
42
+ private fetchAndStream;
43
+ private handleNonStreamingResponse;
44
+ /**
45
+ * Parse OpenAI Responses API SSE stream.
46
+ *
47
+ * Event format: "event: <type>\ndata: <json>\n\n"
48
+ * Key events:
49
+ * - response.output_text.delta → text content delta
50
+ * - response.reasoning_summary_text.delta → reasoning summary text
51
+ * - response.function_call_arguments.delta → tool call arguments streaming
52
+ * - response.output_item.added → new output item started
53
+ * - response.output_item.done → output item completed
54
+ * - response.content_part.done → content part completed (annotations)
55
+ * - response.completed → full response complete with usage
56
+ * - response.failed → error
57
+ */
58
+ private parseSSEStream;
59
+ private processEvent;
60
+ }