@agentfield/sdk 0.1.85-rc.1 → 0.1.85-rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1606,27 +1606,42 @@ declare class MediaProviderError extends Error {
1606
1606
  cause?: unknown;
1607
1607
  });
1608
1608
  }
1609
+ /** Frame guidance for image-to-video models (e.g. Veo). */
1610
+ interface VideoFrameImage {
1611
+ /** Image content type — usually "image_url". */
1612
+ type?: string;
1613
+ /** Image URL or `data:` URL. */
1614
+ imageUrl: {
1615
+ url: string;
1616
+ };
1617
+ /** Which frame this image controls. */
1618
+ frameType?: 'first_frame' | 'last_frame';
1619
+ }
1620
+ /** Reference image for style / subject guidance (Veo "reference-to-video"). */
1621
+ interface VideoInputReference {
1622
+ type?: string;
1623
+ imageUrl: {
1624
+ url: string;
1625
+ };
1626
+ }
1609
1627
  interface VideoRequest {
1610
1628
  prompt: string;
1611
1629
  model?: string;
1630
+ /** Duration in seconds (model-dependent — typically 4, 6, or 8). */
1612
1631
  duration?: number;
1613
1632
  resolution?: '480p' | '720p' | '1080p' | '1K' | '2K' | '4K';
1614
1633
  aspectRatio?: '16:9' | '9:16' | '1:1' | '4:3' | '3:4' | '21:9' | '9:21';
1634
+ /** Toggle synchronized audio track (when model supports it). */
1615
1635
  generateAudio?: boolean;
1616
1636
  seed?: number;
1617
- frameImages?: Array<{
1618
- type: string;
1619
- imageUrl: {
1620
- url: string;
1621
- };
1622
- frameType?: string;
1623
- }>;
1624
- inputReferences?: Array<{
1625
- type: string;
1626
- imageUrl: {
1627
- url: string;
1628
- };
1629
- }>;
1637
+ /** Single input image for image-to-video (legacy convenience field). */
1638
+ imageUrl?: string;
1639
+ /** Per-frame guidance — first_frame / last_frame. Takes precedence over `imageUrl`. */
1640
+ frameImages?: VideoFrameImage[];
1641
+ /** Reference images for style/subject guidance. */
1642
+ inputReferences?: VideoInputReference[];
1643
+ /** Model-specific passthrough parameters (e.g. Veo's `personGeneration`). */
1644
+ extra?: Record<string, unknown>;
1630
1645
  pollInterval?: number;
1631
1646
  timeout?: number;
1632
1647
  }
@@ -1635,21 +1650,37 @@ interface ImageRequest {
1635
1650
  model?: string;
1636
1651
  size?: string;
1637
1652
  quality?: string;
1653
+ /** Reference / source image(s) for image+text→image models (e.g. grok-imagine). */
1654
+ imageUrls?: string[];
1638
1655
  imageConfig?: {
1639
1656
  aspectRatio?: string;
1640
1657
  imageSize?: string;
1658
+ /** Image-to-image blend strength (model-dependent, 0–1). */
1659
+ strength?: number;
1660
+ /** Style hint — Recraft V3 etc. */
1661
+ style?: string;
1662
+ /** RGB color palette — array of [r,g,b]. */
1663
+ rgbColors?: number[][];
1664
+ /** Background color hint as [r,g,b]. */
1665
+ backgroundRgbColor?: number[];
1641
1666
  superResolutionReferences?: string[];
1642
1667
  fontInputs?: Array<{
1643
1668
  fontUrl: string;
1644
1669
  text: string;
1645
1670
  }>;
1646
1671
  };
1672
+ /** Model-specific passthrough parameters. */
1673
+ extra?: Record<string, unknown>;
1647
1674
  }
1648
1675
  interface AudioRequest {
1649
1676
  text: string;
1650
1677
  model?: string;
1651
1678
  voice?: string;
1652
1679
  format?: string;
1680
+ /** Playback speed multiplier (OpenAI TTS only — other models ignore). */
1681
+ speed?: number;
1682
+ /** Model-specific passthrough parameters. */
1683
+ extra?: Record<string, unknown>;
1653
1684
  }
1654
1685
  interface MediaResponse {
1655
1686
  text: string;
@@ -1713,11 +1744,32 @@ declare class OpenRouterMediaProvider implements MediaProvider {
1713
1744
  readonly supportedModalities: string[];
1714
1745
  private readonly baseUrl;
1715
1746
  constructor(options?: OpenRouterMediaProviderOptions);
1747
+ /**
1748
+ * Seed the metadata cache for a model. Useful when running against test
1749
+ * servers that don't expose `GET /models/{id}/endpoints`, or when callers
1750
+ * already know the routing they want.
1751
+ *
1752
+ * Output modalities follow OpenRouter's convention — `["speech"]` for
1753
+ * TTS-only (Kokoro etc.), `["text","audio"]` for chat-audio (gpt-audio
1754
+ * family), `["video"]`, `["image"]`, etc.
1755
+ */
1756
+ seedModelMeta(model: string, outputModalities: string[], inputModalities?: string[]): void;
1757
+ /**
1758
+ * Fetch + cache OpenRouter model metadata so we can route requests to the
1759
+ * right endpoint. On any error returns an empty meta object so callers can
1760
+ * fall back to defaults.
1761
+ */
1762
+ private fetchModelMeta;
1716
1763
  /** Prevent API key from leaking via JSON.stringify (CR-03). */
1717
1764
  toJSON(): Record<string, unknown>;
1718
1765
  generateVideo(request: VideoRequest): Promise<MediaResponse>;
1719
1766
  generateImage(request: ImageRequest): Promise<MediaResponse>;
1720
1767
  generateAudio(request: AudioRequest): Promise<MediaResponse>;
1768
+ /**
1769
+ * Call OpenRouter's OpenAI-compatible TTS endpoint (`POST /audio/speech`).
1770
+ * Returns raw bytes for the requested format; wraps PCM → WAV when needed.
1771
+ */
1772
+ private generateAudioViaSpeechEndpoint;
1721
1773
  private post;
1722
1774
  private get;
1723
1775
  }
@@ -1827,4 +1879,4 @@ declare class ApprovalClient {
1827
1879
  waitForApproval(executionId: string, opts?: WaitForApprovalOptions): Promise<ApprovalStatusResponse>;
1828
1880
  }
1829
1881
 
1830
- export { ACTIVE_STATUSES, AIClient, type AIConfig, type AIEmbeddingOptions, type AIRequestOptions, type AIStream, type AIToolRequestOptions, Agent, type AgentCapability, type AgentConfig, type AgentHandler, AgentRouter, type AgentRouterOptions, type AgentState, ApprovalClient, type ApprovalRequestResponse, type ApprovalStatusResponse, Audio, type AudioOutput, type AudioRequest, type AuditTrailExport, type AuditTrailFilters, type Awaitable, CANONICAL_STATUSES, type CompactCapability, type CompactDiscoveryResponse, DIDAuthenticator, type DIDIdentity, type DIDIdentityPackage, type DIDRegistrationRequest, type DIDRegistrationResponse, type DeploymentType, DidClient, DidInterface, DidManager, type DiscoveryFormat, type DiscoveryOptions, type DiscoveryPagination, type DiscoveryResponse, type DiscoveryResult, ExecutionContext, type ExecutionCredential, type ExecutionLogAttributes, type ExecutionLogBatchPayload, type ExecutionLogContext, type ExecutionLogEmitOptions, type ExecutionLogEntry, type ExecutionLogLevel, type ExecutionLogTransport, type ExecutionLogTransportPayload, type ExecutionLogWireEntry, ExecutionLogger, type ExecutionLoggerOptions, type ExecutionMetadata, ExecutionStatus, type ExecutionStatusValue, File, type FileOutput, type GenerateCredentialOptions, type GenerateCredentialParams, HEADER_CALLER_DID, HEADER_DID_NONCE, HEADER_DID_SIGNATURE, HEADER_DID_TIMESTAMP, type HarnessConfig, type HarnessOptions, type HarnessProvider, type HarnessResult, HarnessRunner, type HealthStatus, Image, type ImageOutput, type ImageRequest, type MediaProvider, MediaProviderError, type MediaResponse, MediaRouter, type MemoryChangeEvent, MemoryClient, MemoryClientBase, type MemoryConfig, MemoryEventClient, type MemoryEventHandler, type MemoryEventHistoryOptions, MemoryInterface, type MemoryRequestMetadata, type MemoryRequestOptions, type MemoryScope, type MemoryWatchHandler, type Metrics, type MultimodalContent, MultimodalResponse, OpenRouterMediaProvider, type OpenRouterMediaProviderOptions, RateLimitError, type RateLimiterOptions, type RawExecutionContext, type RawResult, type ReasonerCapability, ReasonerContext, type ReasonerDefinition, type ReasonerHandler, type ReasonerOptions, type RequestApprovalPayload, SUPPORTED_PROVIDERS, type ServerlessAdapter, type ServerlessEvent, type ServerlessResponse, type SkillCapability, SkillContext, type SkillDefinition, type SkillHandler, type SkillOptions, StatelessRateLimiter, TERMINAL_STATUSES, Text, type ToolCallConfig, type ToolCallRecord, type ToolCallTrace, type ToolsOption, type VectorSearchOptions, type VectorSearchResult, type VideoRequest, type WaitForApprovalOptions, type WorkflowCredential, type WorkflowMetadata, type WorkflowProgressOptions, WorkflowReporter, type ZodSchema, audioFromBase64, audioFromBuffer, audioFromFile, audioFromUrl, buildProvider, buildToolConfig, capabilitiesToTools, capabilityToMetadataTool, capabilityToTool, createExecutionLogger, createHarnessResult, createMetrics, createMultimodalResponse, createRawResult, executeToolCallLoop, fileFromBase64, fileFromBuffer, fileFromPath, fileFromUrl, getCurrentContext, getCurrentSkillContext, imageFromBase64, imageFromBuffer, imageFromFile, imageFromUrl, isActive, isExecutionLogBatchPayload, isTerminal, normalizeExecutionLogEntry, normalizeStatus, serializeExecutionLogEntry, text };
1882
+ export { ACTIVE_STATUSES, AIClient, type AIConfig, type AIEmbeddingOptions, type AIRequestOptions, type AIStream, type AIToolRequestOptions, Agent, type AgentCapability, type AgentConfig, type AgentHandler, AgentRouter, type AgentRouterOptions, type AgentState, ApprovalClient, type ApprovalRequestResponse, type ApprovalStatusResponse, Audio, type AudioOutput, type AudioRequest, type AuditTrailExport, type AuditTrailFilters, type Awaitable, CANONICAL_STATUSES, type CompactCapability, type CompactDiscoveryResponse, DIDAuthenticator, type DIDIdentity, type DIDIdentityPackage, type DIDRegistrationRequest, type DIDRegistrationResponse, type DeploymentType, DidClient, DidInterface, DidManager, type DiscoveryFormat, type DiscoveryOptions, type DiscoveryPagination, type DiscoveryResponse, type DiscoveryResult, ExecutionContext, type ExecutionCredential, type ExecutionLogAttributes, type ExecutionLogBatchPayload, type ExecutionLogContext, type ExecutionLogEmitOptions, type ExecutionLogEntry, type ExecutionLogLevel, type ExecutionLogTransport, type ExecutionLogTransportPayload, type ExecutionLogWireEntry, ExecutionLogger, type ExecutionLoggerOptions, type ExecutionMetadata, ExecutionStatus, type ExecutionStatusValue, File, type FileOutput, type GenerateCredentialOptions, type GenerateCredentialParams, HEADER_CALLER_DID, HEADER_DID_NONCE, HEADER_DID_SIGNATURE, HEADER_DID_TIMESTAMP, type HarnessConfig, type HarnessOptions, type HarnessProvider, type HarnessResult, HarnessRunner, type HealthStatus, Image, type ImageOutput, type ImageRequest, type MediaProvider, MediaProviderError, type MediaResponse, MediaRouter, type MemoryChangeEvent, MemoryClient, MemoryClientBase, type MemoryConfig, MemoryEventClient, type MemoryEventHandler, type MemoryEventHistoryOptions, MemoryInterface, type MemoryRequestMetadata, type MemoryRequestOptions, type MemoryScope, type MemoryWatchHandler, type Metrics, type MultimodalContent, MultimodalResponse, OpenRouterMediaProvider, type OpenRouterMediaProviderOptions, RateLimitError, type RateLimiterOptions, type RawExecutionContext, type RawResult, type ReasonerCapability, ReasonerContext, type ReasonerDefinition, type ReasonerHandler, type ReasonerOptions, type RequestApprovalPayload, SUPPORTED_PROVIDERS, type ServerlessAdapter, type ServerlessEvent, type ServerlessResponse, type SkillCapability, SkillContext, type SkillDefinition, type SkillHandler, type SkillOptions, StatelessRateLimiter, TERMINAL_STATUSES, Text, type ToolCallConfig, type ToolCallRecord, type ToolCallTrace, type ToolsOption, type VectorSearchOptions, type VectorSearchResult, type VideoFrameImage, type VideoInputReference, type VideoRequest, type WaitForApprovalOptions, type WorkflowCredential, type WorkflowMetadata, type WorkflowProgressOptions, WorkflowReporter, type ZodSchema, audioFromBase64, audioFromBuffer, audioFromFile, audioFromUrl, buildProvider, buildToolConfig, capabilitiesToTools, capabilityToMetadataTool, capabilityToTool, createExecutionLogger, createHarnessResult, createMetrics, createMultimodalResponse, createRawResult, executeToolCallLoop, fileFromBase64, fileFromBuffer, fileFromPath, fileFromUrl, getCurrentContext, getCurrentSkillContext, imageFromBase64, imageFromBuffer, imageFromFile, imageFromUrl, isActive, isExecutionLogBatchPayload, isTerminal, normalizeExecutionLogEntry, normalizeStatus, serializeExecutionLogEntry, text };
package/dist/index.js CHANGED
@@ -572,15 +572,15 @@ var init_opencode = __esm({
572
572
  this.bin = binPath;
573
573
  }
574
574
  async execute(prompt, options) {
575
- const cmd = [this.bin];
575
+ const cmd = [this.bin, "run"];
576
576
  if (options.cwd && typeof options.cwd === "string") {
577
- cmd.push("-c", options.cwd);
577
+ cmd.push("--dir", options.cwd);
578
578
  } else if (options.project_dir && typeof options.project_dir === "string") {
579
- cmd.push("-c", options.project_dir);
579
+ cmd.push("--dir", options.project_dir);
580
580
  }
581
581
  const env = { ...options.env };
582
582
  if (options.model) {
583
- env["MODEL"] = String(options.model);
583
+ cmd.push("-m", String(options.model));
584
584
  }
585
585
  let effectivePrompt = prompt;
586
586
  if (options.system_prompt && typeof options.system_prompt === "string" && options.system_prompt.trim()) {
@@ -592,7 +592,7 @@ ${options.system_prompt.trim()}
592
592
  USER REQUEST:
593
593
  ${prompt}`;
594
594
  }
595
- cmd.push("-p", effectivePrompt);
595
+ cmd.push(effectivePrompt);
596
596
  const startApi = Date.now();
597
597
  try {
598
598
  const { stdout, stderr, exitCode } = await runCli(cmd, { env });
@@ -5578,12 +5578,55 @@ var API_TIMEOUT = 3e4;
5578
5578
  var DOWNLOAD_TIMEOUT = 12e4;
5579
5579
  var MAX_CONSECUTIVE_PARSE_ERRORS = 50;
5580
5580
  var apiKeyStore = /* @__PURE__ */ new WeakMap();
5581
+ var modelMetaStore = /* @__PURE__ */ new WeakMap();
5581
5582
  function emptyMediaResponse(raw) {
5582
5583
  return { text: "", images: [], audio: null, files: [], videos: [], rawResponse: raw };
5583
5584
  }
5584
5585
  function stripPrefix(model) {
5585
5586
  return model.startsWith("openrouter/") ? model.slice("openrouter/".length) : model;
5586
5587
  }
5588
+ function wrapPcm16AsWav(pcm, sampleRate = 24e3) {
5589
+ const channels = 1;
5590
+ const bitsPerSample = 16;
5591
+ const byteRate = sampleRate * channels * bitsPerSample / 8;
5592
+ const blockAlign = channels * bitsPerSample / 8;
5593
+ const dataSize = pcm.byteLength;
5594
+ const buffer = new ArrayBuffer(44 + dataSize);
5595
+ const view = new DataView(buffer);
5596
+ view.setUint8(0, 82);
5597
+ view.setUint8(1, 73);
5598
+ view.setUint8(2, 70);
5599
+ view.setUint8(3, 70);
5600
+ view.setUint32(4, 36 + dataSize, true);
5601
+ view.setUint8(8, 87);
5602
+ view.setUint8(9, 65);
5603
+ view.setUint8(10, 86);
5604
+ view.setUint8(11, 69);
5605
+ view.setUint8(12, 102);
5606
+ view.setUint8(13, 109);
5607
+ view.setUint8(14, 116);
5608
+ view.setUint8(15, 32);
5609
+ view.setUint32(16, 16, true);
5610
+ view.setUint16(20, 1, true);
5611
+ view.setUint16(22, channels, true);
5612
+ view.setUint32(24, sampleRate, true);
5613
+ view.setUint32(28, byteRate, true);
5614
+ view.setUint16(32, blockAlign, true);
5615
+ view.setUint16(34, bitsPerSample, true);
5616
+ view.setUint8(36, 100);
5617
+ view.setUint8(37, 97);
5618
+ view.setUint8(38, 116);
5619
+ view.setUint8(39, 97);
5620
+ view.setUint32(40, dataSize, true);
5621
+ new Uint8Array(buffer, 44).set(pcm);
5622
+ return new Uint8Array(buffer);
5623
+ }
5624
+ function bytesToBase64(bytes) {
5625
+ return Buffer.from(bytes).toString("base64");
5626
+ }
5627
+ function base64ToBytes(b64) {
5628
+ return new Uint8Array(Buffer.from(b64, "base64"));
5629
+ }
5587
5630
  function assertSafeUrl(urlStr) {
5588
5631
  let parsed;
5589
5632
  try {
@@ -5623,6 +5666,56 @@ var OpenRouterMediaProvider = class {
5623
5666
  });
5624
5667
  }
5625
5668
  apiKeyStore.set(this, key);
5669
+ modelMetaStore.set(this, /* @__PURE__ */ new Map());
5670
+ }
5671
+ /**
5672
+ * Seed the metadata cache for a model. Useful when running against test
5673
+ * servers that don't expose `GET /models/{id}/endpoints`, or when callers
5674
+ * already know the routing they want.
5675
+ *
5676
+ * Output modalities follow OpenRouter's convention — `["speech"]` for
5677
+ * TTS-only (Kokoro etc.), `["text","audio"]` for chat-audio (gpt-audio
5678
+ * family), `["video"]`, `["image"]`, etc.
5679
+ */
5680
+ seedModelMeta(model, outputModalities, inputModalities = []) {
5681
+ const stripped = stripPrefix(model);
5682
+ const cache = modelMetaStore.get(this);
5683
+ cache.set(stripped, {
5684
+ outputModalities: [...outputModalities],
5685
+ inputModalities: [...inputModalities]
5686
+ });
5687
+ }
5688
+ /**
5689
+ * Fetch + cache OpenRouter model metadata so we can route requests to the
5690
+ * right endpoint. On any error returns an empty meta object so callers can
5691
+ * fall back to defaults.
5692
+ */
5693
+ async fetchModelMeta(model) {
5694
+ const stripped = stripPrefix(model);
5695
+ const cache = modelMetaStore.get(this);
5696
+ const cached = cache.get(stripped);
5697
+ if (cached) return cached;
5698
+ const url = `${this.baseUrl}/models/${stripped}/endpoints`;
5699
+ try {
5700
+ const res = await this.get(url);
5701
+ if (!res.ok) {
5702
+ const meta2 = { outputModalities: [], inputModalities: [] };
5703
+ cache.set(stripped, meta2);
5704
+ return meta2;
5705
+ }
5706
+ const data = await res.json();
5707
+ const arch = data?.data?.architecture ?? {};
5708
+ const meta = {
5709
+ outputModalities: arch.output_modalities ?? [],
5710
+ inputModalities: arch.input_modalities ?? []
5711
+ };
5712
+ cache.set(stripped, meta);
5713
+ return meta;
5714
+ } catch {
5715
+ const meta = { outputModalities: [], inputModalities: [] };
5716
+ cache.set(stripped, meta);
5717
+ return meta;
5718
+ }
5626
5719
  }
5627
5720
  /** Prevent API key from leaking via JSON.stringify (CR-03). */
5628
5721
  toJSON() {
@@ -5646,8 +5739,21 @@ var OpenRouterMediaProvider = class {
5646
5739
  if (request.aspectRatio) body.aspect_ratio = request.aspectRatio;
5647
5740
  if (request.generateAudio != null) body.generate_audio = request.generateAudio;
5648
5741
  if (request.seed != null) body.seed = request.seed;
5649
- if (request.frameImages) body.frame_images = request.frameImages;
5650
- if (request.inputReferences) body.input_references = request.inputReferences;
5742
+ if (request.imageUrl) body.image_url = request.imageUrl;
5743
+ if (request.frameImages) {
5744
+ body.frame_images = request.frameImages.map((fi) => ({
5745
+ type: fi.type ?? "image_url",
5746
+ image_url: fi.imageUrl,
5747
+ ...fi.frameType ? { frame_type: fi.frameType } : {}
5748
+ }));
5749
+ }
5750
+ if (request.inputReferences) {
5751
+ body.input_references = request.inputReferences.map((ref) => ({
5752
+ type: ref.type ?? "image_url",
5753
+ image_url: ref.imageUrl
5754
+ }));
5755
+ }
5756
+ if (request.extra) Object.assign(body, request.extra);
5651
5757
  const submitEndpoint = `${this.baseUrl}/videos`;
5652
5758
  const submitRes = await this.post(submitEndpoint, body);
5653
5759
  if (!submitRes.ok) {
@@ -5696,13 +5802,24 @@ var OpenRouterMediaProvider = class {
5696
5802
  { provider: "openrouter", model }
5697
5803
  );
5698
5804
  }
5805
+ const unsignedUrls = jobData.unsigned_urls;
5699
5806
  const unsignedUrl = jobData.unsigned_url;
5700
5807
  const signedUrl = jobData.url;
5701
- const videoUrl = unsignedUrl ?? signedUrl;
5808
+ const videoUrl = unsignedUrls?.[0] ?? unsignedUrl ?? signedUrl;
5702
5809
  let videoData;
5703
5810
  if (videoUrl) {
5704
5811
  assertSafeUrl(videoUrl);
5812
+ const downloadHeaders = {};
5813
+ try {
5814
+ const host = new URL(videoUrl).hostname.toLowerCase();
5815
+ if (host === "openrouter.ai" || host.endsWith(".openrouter.ai")) {
5816
+ const key = apiKeyStore.get(this);
5817
+ if (key) downloadHeaders.Authorization = `Bearer ${key}`;
5818
+ }
5819
+ } catch {
5820
+ }
5705
5821
  const dlRes = await fetch(videoUrl, {
5822
+ headers: downloadHeaders,
5706
5823
  signal: AbortSignal.timeout(DOWNLOAD_TIMEOUT),
5707
5824
  redirect: "error"
5708
5825
  });
@@ -5727,15 +5844,43 @@ var OpenRouterMediaProvider = class {
5727
5844
  // ── Image ──────────────────────────────────────────────────────────
5728
5845
  async generateImage(request) {
5729
5846
  const model = stripPrefix(request.model ?? "openai/gpt-image-1");
5730
- const messages = [{ role: "user", content: request.prompt }];
5847
+ let userContent = request.prompt;
5848
+ if (request.imageUrls && request.imageUrls.length > 0) {
5849
+ userContent = [
5850
+ { type: "text", text: request.prompt },
5851
+ ...request.imageUrls.map((url) => ({
5852
+ type: "image_url",
5853
+ image_url: { url }
5854
+ }))
5855
+ ];
5856
+ }
5857
+ const messages = [{ role: "user", content: userContent }];
5731
5858
  const body = {
5732
5859
  model,
5733
5860
  messages,
5734
- modalities: ["image", "text"]
5861
+ modalities: ["image"]
5735
5862
  };
5736
5863
  if (request.size) body.size = request.size;
5737
5864
  if (request.quality) body.quality = request.quality;
5738
- if (request.imageConfig) body.image_config = request.imageConfig;
5865
+ if (request.imageConfig) {
5866
+ const ic = request.imageConfig;
5867
+ const out = {};
5868
+ if (ic.aspectRatio) out.aspect_ratio = ic.aspectRatio;
5869
+ if (ic.imageSize) out.image_size = ic.imageSize;
5870
+ if (ic.strength != null) out.strength = ic.strength;
5871
+ if (ic.style) out.style = ic.style;
5872
+ if (ic.rgbColors) out.rgb_colors = ic.rgbColors;
5873
+ if (ic.backgroundRgbColor) out.background_rgb_color = ic.backgroundRgbColor;
5874
+ if (ic.superResolutionReferences) out.super_resolution_references = ic.superResolutionReferences;
5875
+ if (ic.fontInputs) {
5876
+ out.font_inputs = ic.fontInputs.map((fi) => ({
5877
+ font_url: fi.fontUrl,
5878
+ text: fi.text
5879
+ }));
5880
+ }
5881
+ body.image_config = out;
5882
+ }
5883
+ if (request.extra) Object.assign(body, request.extra);
5739
5884
  const endpoint = `${this.baseUrl}/chat/completions`;
5740
5885
  const res = await this.post(endpoint, body);
5741
5886
  if (!res.ok) {
@@ -5746,6 +5891,15 @@ var OpenRouterMediaProvider = class {
5746
5891
  }
5747
5892
  const data = await res.json();
5748
5893
  const resp = emptyMediaResponse(data);
5894
+ const pushImageFromUrl = (url) => {
5895
+ if (!url) return;
5896
+ if (url.startsWith("data:")) {
5897
+ const b64 = url.split(",", 2)[1];
5898
+ resp.images.push({ url, b64Json: b64 });
5899
+ } else {
5900
+ resp.images.push({ url });
5901
+ }
5902
+ };
5749
5903
  const choices = data.choices;
5750
5904
  if (choices) {
5751
5905
  for (const choice of choices) {
@@ -5761,16 +5915,17 @@ var OpenRouterMediaProvider = class {
5761
5915
  resp.text += p.text;
5762
5916
  } else if (p.type === "image_url") {
5763
5917
  const imgUrl = p.image_url;
5764
- const url = imgUrl?.url;
5765
- if (url?.startsWith("data:")) {
5766
- const b64 = url.split(",", 2)[1];
5767
- resp.images.push({ url, b64Json: b64 });
5768
- } else if (url) {
5769
- resp.images.push({ url });
5770
- }
5918
+ pushImageFromUrl(imgUrl?.url);
5771
5919
  }
5772
5920
  }
5773
5921
  }
5922
+ const images = msg.images;
5923
+ if (Array.isArray(images)) {
5924
+ for (const img of images) {
5925
+ const imgUrl = img.image_url;
5926
+ pushImageFromUrl(imgUrl?.url);
5927
+ }
5928
+ }
5774
5929
  }
5775
5930
  }
5776
5931
  return resp;
@@ -5778,6 +5933,20 @@ var OpenRouterMediaProvider = class {
5778
5933
  // ── Audio ──────────────────────────────────────────────────────────
5779
5934
  async generateAudio(request) {
5780
5935
  const model = stripPrefix(request.model ?? "openai/gpt-4o-mini-tts");
5936
+ const requestedFormat = request.format ?? "wav";
5937
+ const meta = await this.fetchModelMeta(model);
5938
+ const outMods = meta.outputModalities;
5939
+ const useSpeechEndpoint = outMods.includes("speech") || outMods.length === 0 || !outMods.includes("audio");
5940
+ if (useSpeechEndpoint) {
5941
+ return this.generateAudioViaSpeechEndpoint(
5942
+ model,
5943
+ request.text,
5944
+ request.voice ?? "alloy",
5945
+ requestedFormat,
5946
+ request
5947
+ );
5948
+ }
5949
+ const wireFormat = requestedFormat === "wav" ? "pcm16" : requestedFormat;
5781
5950
  const messages = [{ role: "user", content: request.text }];
5782
5951
  const body = {
5783
5952
  model,
@@ -5786,7 +5955,7 @@ var OpenRouterMediaProvider = class {
5786
5955
  stream: true,
5787
5956
  audio: {
5788
5957
  voice: request.voice ?? "alloy",
5789
- format: request.format ?? "wav"
5958
+ format: wireFormat
5790
5959
  }
5791
5960
  };
5792
5961
  const endpoint = `${this.baseUrl}/chat/completions`;
@@ -5877,13 +6046,65 @@ var OpenRouterMediaProvider = class {
5877
6046
  const resp = emptyMediaResponse(null);
5878
6047
  resp.text = textContent;
5879
6048
  if (audioChunks.length > 0) {
6049
+ let b64 = audioChunks.join("");
6050
+ try {
6051
+ const parts = audioChunks.map(base64ToBytes);
6052
+ const total = parts.reduce((n, p) => n + p.byteLength, 0);
6053
+ const merged = new Uint8Array(total);
6054
+ let off = 0;
6055
+ for (const p of parts) {
6056
+ merged.set(p, off);
6057
+ off += p.byteLength;
6058
+ }
6059
+ b64 = bytesToBase64(merged);
6060
+ if (requestedFormat === "wav") {
6061
+ b64 = bytesToBase64(wrapPcm16AsWav(merged));
6062
+ }
6063
+ } catch {
6064
+ }
5880
6065
  resp.audio = {
5881
- data: audioChunks.join(""),
5882
- format: request.format ?? "wav"
6066
+ data: b64,
6067
+ format: requestedFormat
5883
6068
  };
5884
6069
  }
5885
6070
  return resp;
5886
6071
  }
6072
+ /**
6073
+ * Call OpenRouter's OpenAI-compatible TTS endpoint (`POST /audio/speech`).
6074
+ * Returns raw bytes for the requested format; wraps PCM → WAV when needed.
6075
+ */
6076
+ async generateAudioViaSpeechEndpoint(model, text2, voice, requestedFormat, request) {
6077
+ const wireFormat = requestedFormat === "wav" || requestedFormat === "pcm" || requestedFormat === "pcm16" ? "pcm" : requestedFormat;
6078
+ const endpoint = `${this.baseUrl}/audio/speech`;
6079
+ const body = {
6080
+ model,
6081
+ input: text2,
6082
+ voice,
6083
+ response_format: wireFormat
6084
+ };
6085
+ if (request?.speed != null) body.speed = request.speed;
6086
+ if (request?.extra) Object.assign(body, request.extra);
6087
+ const res = await this.post(endpoint, body);
6088
+ if (!res.ok) {
6089
+ throw new MediaProviderError(
6090
+ `Audio generation failed [model=${model}] [endpoint=${endpoint}]: ${res.status} ${await res.text()}`,
6091
+ { provider: "openrouter", model, endpoint }
6092
+ );
6093
+ }
6094
+ const buf = new Uint8Array(await res.arrayBuffer());
6095
+ const finalBytes = requestedFormat === "wav" ? wrapPcm16AsWav(buf) : buf;
6096
+ const resp = emptyMediaResponse({
6097
+ endpoint: "audio/speech",
6098
+ model,
6099
+ mime_type: res.headers.get("content-type") ?? ""
6100
+ });
6101
+ resp.text = text2;
6102
+ resp.audio = {
6103
+ data: bytesToBase64(finalBytes),
6104
+ format: requestedFormat
6105
+ };
6106
+ return resp;
6107
+ }
5887
6108
  // ── Helpers ────────────────────────────────────────────────────────
5888
6109
  post(url, body) {
5889
6110
  const key = apiKeyStore.get(this);