@bitkyc08/opencodex 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/LICENSE +21 -0
  2. package/README.ko.md +164 -0
  3. package/README.md +165 -0
  4. package/README.zh-CN.md +162 -0
  5. package/gui/README.md +73 -0
  6. package/gui/dist/assets/index-C1wlp1SM.css +1 -0
  7. package/gui/dist/assets/index-C9y3iMF1.js +9 -0
  8. package/gui/dist/favicon.png +0 -0
  9. package/gui/dist/icons.svg +24 -0
  10. package/gui/dist/index.html +15 -0
  11. package/gui/dist/logo.png +0 -0
  12. package/package.json +56 -0
  13. package/scripts/postinstall.mjs +57 -0
  14. package/src/adapters/anthropic.ts +306 -0
  15. package/src/adapters/azure.ts +31 -0
  16. package/src/adapters/base.ts +20 -0
  17. package/src/adapters/google.ts +195 -0
  18. package/src/adapters/image.ts +23 -0
  19. package/src/adapters/openai-chat.ts +265 -0
  20. package/src/adapters/openai-responses.ts +43 -0
  21. package/src/bridge.ts +296 -0
  22. package/src/cli.ts +183 -0
  23. package/src/codex-catalog.ts +318 -0
  24. package/src/codex-inject.ts +186 -0
  25. package/src/config.ts +108 -0
  26. package/src/index.ts +20 -0
  27. package/src/init.ts +163 -0
  28. package/src/model-cache.ts +42 -0
  29. package/src/oauth/anthropic.ts +151 -0
  30. package/src/oauth/callback-server.ts +249 -0
  31. package/src/oauth/index.ts +235 -0
  32. package/src/oauth/key-providers.ts +126 -0
  33. package/src/oauth/kimi.ts +160 -0
  34. package/src/oauth/local-token-detect.ts +71 -0
  35. package/src/oauth/login-cli.ts +90 -0
  36. package/src/oauth/pkce.ts +15 -0
  37. package/src/oauth/store.ts +39 -0
  38. package/src/oauth/types.ts +22 -0
  39. package/src/oauth/xai.ts +234 -0
  40. package/src/responses/parser.ts +402 -0
  41. package/src/responses/schema.ts +145 -0
  42. package/src/router.ts +86 -0
  43. package/src/server.ts +522 -0
  44. package/src/service.ts +130 -0
  45. package/src/star-prompt.ts +50 -0
  46. package/src/types.ts +228 -0
  47. package/src/update.ts +64 -0
  48. package/src/vision/describe.ts +98 -0
  49. package/src/vision/index.ts +141 -0
  50. package/src/web-search/executor.ts +75 -0
  51. package/src/web-search/format-result.ts +45 -0
  52. package/src/web-search/index.ts +62 -0
  53. package/src/web-search/loop.ts +188 -0
  54. package/src/web-search/parse.ts +128 -0
  55. package/src/web-search/synthetic-tool.ts +42 -0
@@ -0,0 +1,50 @@
1
+ import { existsSync, mkdirSync, writeFileSync } from "node:fs";
2
+ import { join } from "node:path";
3
+ import { spawnSync } from "node:child_process";
4
+ import { createInterface } from "node:readline/promises";
5
+ import { getConfigDir } from "./config";
6
+
7
+ const REPO = "lidge-jun/opencodex";
8
+ /** Shared with scripts/postinstall.mjs so the prompt fires exactly once across install + first start. */
9
+ const MARKER = ".star-prompted";
10
+
11
/**
 * Probe for the GitHub CLI by running `gh --version` (output discarded, 3 s timeout).
 * Returns true only when the process could be spawned and exited with status 0.
 */
function ghAvailable(): boolean {
  const probe = spawnSync("gh", ["--version"], {
    stdio: "ignore",
    timeout: 3000,
    windowsHide: true,
  });
  if (probe.error) return false;
  return probe.status === 0;
}
15
+
16
/**
 * Star the repository through `gh api -X PUT /user/starred/<REPO>` (10 s timeout).
 * Returns `{ ok: true }` on exit status 0; otherwise `{ ok: false, error }` where `error` is the
 * spawn error message, or gh's trimmed stderr/stdout, or a generic "gh exited <status>" fallback.
 */
function starRepo(): { ok: boolean; error?: string } {
  const endpoint = `/user/starred/${REPO}`;
  const result = spawnSync("gh", ["api", "-X", "PUT", endpoint], {
    encoding: "utf8",
    stdio: ["ignore", "pipe", "pipe"],
    timeout: 10000,
    windowsHide: true,
  });

  if (result.error) {
    return { ok: false, error: result.error.message };
  }
  if (result.status === 0) {
    return { ok: true };
  }
  const output = (result.stderr || result.stdout || "").trim();
  return { ok: false, error: output || `gh exited ${result.status}` };
}
23
+
24
/**
 * One-time `[Y/n]` "star on GitHub?" prompt shown on an interactive start.
 *
 * Does nothing when `OCX_SERVICE` is set, when stdin or stdout is not a TTY, when the marker file
 * already exists in the config directory, or when `gh` cannot be run. The marker is written
 * (best-effort) before the question is asked, so the prompt is not repeated whatever the answer.
 * An empty answer, "y" or "yes" stars the repo via `gh`. Every failure is swallowed so that the
 * prompt can never disrupt startup.
 */
export async function maybeShowStarPrompt(): Promise<void> {
  try {
    if (process.env.OCX_SERVICE) return;
    if (!process.stdin.isTTY || !process.stdout.isTTY) return;

    const configDir = getConfigDir();
    const markerPath = join(configDir, MARKER);
    // Marker check comes first; without gh we stay silent and re-check on a later start.
    if (existsSync(markerPath) || !ghAvailable()) return;

    try {
      mkdirSync(configDir, { recursive: true });
      writeFileSync(markerPath, new Date().toISOString());
    } catch {
      /* best-effort */
    }

    const prompt = createInterface({ input: process.stdin, output: process.stdout });
    let accepted = false;
    try {
      const reply = await prompt.question("\n \x1b[38;5;141m⭐ Enjoying opencodex? Star it on GitHub?\x1b[0m [Y/n] ");
      const normalized = reply.trim().toLowerCase();
      accepted = ["", "y", "yes"].includes(normalized);
    } finally {
      prompt.close();
    }
    if (!accepted) return;

    const outcome = starRepo();
    if (outcome.ok) {
      console.log(" Thanks for the star! ⭐\n");
    } else {
      console.log(` Couldn't star automatically (${outcome.error}) — ${REPO}\n`);
    }
  } catch {
    /* never let the star prompt disrupt startup */
  }
}
package/src/types.ts ADDED
@@ -0,0 +1,228 @@
1
+ export interface OcxParsedRequest {
2
+ modelId: string;
3
+ context: OcxContext;
4
+ stream: boolean;
5
+ options: OcxRequestOptions;
6
+ _rawBody?: unknown;
7
+ /**
8
+ * The hosted `{type:"web_search", ...}` tool config, stashed when Codex enables web search. Routed
9
+ * (non-OpenAI) providers can't run it server-side, so the proxy re-exposes it as a function tool and
10
+ * executes searches via the gpt-5.4-mini sidecar (see src/web-search). Absent when not requested.
11
+ */
12
+ _webSearch?: Record<string, unknown>;
13
+ /**
14
+ * True when Codex requested structured output (`text.format` = json_schema/json_object). The
15
+ * web-search tool_result is then rendered as compact JSON instead of markdown prose, so its
16
+ * answer/"Sources:" text can't bleed into and corrupt the model's schema-constrained output.
17
+ */
18
+ _structuredOutput?: boolean;
19
+ }
20
+
21
+ export interface OcxContext {
22
+ systemPrompt?: string[];
23
+ messages: OcxMessage[];
24
+ tools?: OcxTool[];
25
+ }
26
+
27
+ export type OcxMessage =
28
+ | OcxUserMessage
29
+ | OcxAssistantMessage
30
+ | OcxDeveloperMessage
31
+ | OcxToolResultMessage;
32
+
33
+ export interface OcxUserMessage {
34
+ role: "user";
35
+ content: string | OcxContentPart[];
36
+ timestamp: number;
37
+ }
38
+
39
+ export interface OcxAssistantMessage {
40
+ role: "assistant";
41
+ content: OcxAssistantContentPart[];
42
+ model?: string;
43
+ timestamp: number;
44
+ }
45
+
46
+ export interface OcxDeveloperMessage {
47
+ role: "developer";
48
+ content: string | OcxContentPart[];
49
+ timestamp: number;
50
+ }
51
+
52
+ export interface OcxToolResultMessage {
53
+ role: "toolResult";
54
+ toolCallId: string;
55
+ toolName: string;
56
+ /** Text, or content parts when a tool (e.g. Codex view_image) returns an image in its output. */
57
+ content: string | OcxContentPart[];
58
+ isError: boolean;
59
+ timestamp: number;
60
+ }
61
+
62
+ export interface OcxTextContent {
63
+ type: "text";
64
+ text: string;
65
+ }
66
+
67
+ export interface OcxImageContent {
68
+ type: "image";
69
+ /** A `data:` URL (base64) or a remote https URL — passed through from Codex verbatim, NEVER inlined as text. */
70
+ imageUrl: string;
71
+ /** Fidelity hint from Codex: "low" | "high" | "auto". */
72
+ detail?: string;
73
+ }
74
+
75
+ /** A user/developer message content part: text or an image (vision). */
76
+ export type OcxContentPart = OcxTextContent | OcxImageContent;
77
+
78
+ export interface OcxThinkingContent {
79
+ type: "thinking";
80
+ thinking: string;
81
+ signature?: string;
82
+ itemId?: string;
83
+ }
84
+
85
+ export interface OcxToolCall {
86
+ type: "toolCall";
87
+ id: string;
88
+ name: string;
89
+ arguments: Record<string, unknown>;
90
+ customWireName?: string;
91
+ thoughtSignature?: string;
92
+ /** MCP namespace (e.g. "mcp__context7") when this call targets a namespaced tool. */
93
+ namespace?: string;
94
+ }
95
+
96
+ export type OcxAssistantContentPart = OcxTextContent | OcxThinkingContent | OcxToolCall;
97
+
98
+ export interface OcxTool {
99
+ name: string;
100
+ description: string;
101
+ parameters: Record<string, unknown>;
102
+ strict?: boolean;
103
+ /** MCP namespace (e.g. "mcp__context7") for tools flattened out of a Responses "namespace" tool. */
104
+ namespace?: string;
105
+ /** Freeform/custom tool (e.g. apply_patch): the model's call must be relayed as a custom_tool_call. */
106
+ freeform?: boolean;
107
+ /** Client-executed tool discovery (tool_search): the model's call must be relayed as a tool_search_call. */
108
+ toolSearch?: boolean;
109
+ /** Synthetic web_search tool: the model's call is executed by the gpt-5.4-mini sidecar, not relayed to Codex. */
110
+ webSearch?: boolean;
111
+ }
112
+
113
/**
 * Build the wire name a chat model sees for a tool.
 *
 * A tool with a (non-empty) namespace is flattened to "<namespace>__<name>" so it fits the
 * chat-completions function-tool format; a tool without a namespace keeps its plain name.
 */
export function namespacedToolName(namespace: string | undefined, name: string): string {
  if (!namespace) {
    return name;
  }
  return [namespace, name].join("__");
}
122
+
123
/**
 * Test whether `modelId` appears in a classification list (e.g. `noVisionModels`).
 *
 * An entry matches when it equals the full id, or — for ids with a ":tag" suffix — when it equals
 * the non-empty part before the first colon (so `gpt-oss` covers `gpt-oss:120b`). Ids without a
 * colon only match exactly. A missing or empty list never matches.
 */
export function modelInList(list: string[] | undefined, modelId: string): boolean {
  if (!list?.length) return false;

  const tagStart = modelId.indexOf(":");
  // Only a colon past position 0 yields a usable family prefix.
  const family = tagStart > 0 ? modelId.slice(0, tagStart) : undefined;

  return list.some(entry => entry === modelId || entry === family);
}
134
+
135
+ export interface OcxRequestOptions {
136
+ maxOutputTokens?: number;
137
+ temperature?: number;
138
+ topP?: number;
139
+ stopSequences?: string[];
140
+ toolChoice?: "auto" | "none" | "required" | { name: string };
141
+ reasoning?: string;
142
+ hideThinkingSummary?: boolean;
143
+ serviceTier?: string;
144
+ presencePenalty?: number;
145
+ frequencyPenalty?: number;
146
+ promptCacheKey?: string;
147
+ }
148
+
149
+ export type AdapterEvent =
150
+ | { type: "text_delta"; text: string }
151
+ | { type: "thinking_delta"; thinking: string }
152
+ | { type: "tool_call_start"; id: string; name: string }
153
+ | { type: "tool_call_delta"; arguments: string }
154
+ | { type: "tool_call_end" }
155
+ | { type: "done"; usage?: OcxUsage }
156
+ | { type: "error"; message: string };
157
+
158
+ export interface OcxUsage {
159
+ inputTokens: number;
160
+ outputTokens: number;
161
+ }
162
+
163
+ export interface OcxConfig {
164
+ port: number;
165
+ providers: Record<string, OcxProviderConfig>;
166
+ defaultProvider: string;
167
+ /**
168
+ * Up to 5 routed model ids ("<provider>/<model>") to feature FIRST in the injected Codex catalog.
169
+ * Codex's spawn_agent only advertises the first 5 routed models, so this picks which 5 appear.
170
+ */
171
+ subagentModels?: string[];
172
+ /** Routed model ids ("<provider>/<model>") hidden from Codex (excluded from the catalog + /v1/models). */
173
+ disabledModels?: string[];
174
+ /** Freshness window (ms) for the per-provider live `/models` cache. Defaults to 5 min. */
175
+ modelCacheTtlMs?: number;
176
+ /** Web-search sidecar: route web_search for non-OpenAI models through a gpt-mini via ChatGPT passthrough. */
177
+ webSearchSidecar?: OcxWebSearchSidecarConfig;
178
+ /** Vision sidecar: describe images via a gpt vision model so text-only models can "see" them. */
179
+ visionSidecar?: OcxVisionSidecarConfig;
180
+ }
181
+
182
+ export interface OcxVisionSidecarConfig {
183
+ /** Master switch. Default: enabled when a forward (ChatGPT) provider exists and the caller is logged in. */
184
+ enabled?: boolean;
185
+ /** Vision model that describes images (must be a native ChatGPT model with image input). */
186
+ model?: string;
187
+ /** Sidecar fetch timeout (ms). */
188
+ timeoutMs?: number;
189
+ }
190
+
191
+ export interface OcxWebSearchSidecarConfig {
192
+ /** Master switch. Default: enabled when a forward (ChatGPT) provider exists and the caller is logged in. */
193
+ enabled?: boolean;
194
+ /** Sidecar model that runs the real server-side web_search (must be a native ChatGPT model). */
195
+ model?: string;
196
+ /** Reasoning effort for the sidecar — "minimal" (non-thinking) keeps it fast/cheap. */
197
+ reasoning?: string;
198
+ /** Max searches executed per main-model turn (loop guard). */
199
+ maxSearchesPerTurn?: number;
200
+ /** Sidecar fetch timeout (ms). */
201
+ timeoutMs?: number;
202
+ }
203
+
204
+ export interface OcxProviderConfig {
205
+ adapter: string;
206
+ baseUrl: string;
207
+ apiKey?: string;
208
+ defaultModel?: string;
209
+ models?: string[];
210
+ headers?: Record<string, string>;
211
+ /**
212
+ * "key" (default): authenticate upstream with `apiKey`.
213
+ * "forward": relay the caller's incoming auth headers verbatim (OAuth passthrough; gpt only).
214
+ * "oauth": resolve a stored OAuth access token (auto-refreshed) and use it as the Bearer key.
215
+ * Only the openai-responses adapter implements "forward"; openai-chat uses its own key/token.
216
+ */
217
+ authMode?: "key" | "forward" | "oauth";
218
+ /**
219
+ * Model ids that do NOT support a reasoning/thinking parameter. The openai-chat adapter drops
220
+ * reasoning_effort for these even when Codex selects a reasoning level (e.g. xAI grok-build-0.1).
221
+ */
222
+ noReasoningModels?: string[];
223
+ /**
224
+ * Model ids that do NOT accept image inputs. The proxy gives them "eyes" via the vision sidecar:
225
+ * attached images are described by a gpt vision model and replaced with text before the call.
226
+ */
227
+ noVisionModels?: string[];
228
+ }
package/src/update.ts ADDED
@@ -0,0 +1,64 @@
1
+ import { spawnSync } from "node:child_process";
2
+ import { readFileSync } from "node:fs";
3
+ import { fileURLToPath } from "node:url";
4
+ import { dirname, join } from "node:path";
5
+
6
+ const PKG = "@bitkyc08/opencodex";
7
+ const HERE = dirname(fileURLToPath(import.meta.url)); // .../opencodex/src
8
+
9
+ type Installer = "bun" | "npm" | "source";
10
+
11
/**
 * Infer the installer from this module's directory path: no "node_modules" segment means a
 * source checkout; otherwise a path containing ".bun" means bun, and anything else means npm.
 */
function detectInstall(): Installer {
  const insideNodeModules = HERE.includes("node_modules");
  if (!insideNodeModules) {
    return "source"; // a git checkout, not a global install
  }
  if (HERE.includes(".bun")) {
    return "bun";
  }
  return "npm";
}
16
+
17
/**
 * Read the `version` field from the package.json one directory above this module.
 * Returns "?" when the file cannot be read or parsed, or when the field is absent.
 */
function currentVersion(): string {
  const manifestPath = join(HERE, "..", "package.json");
  try {
    const manifest = JSON.parse(readFileSync(manifestPath, "utf8"));
    return (manifest.version as string) ?? "?";
  } catch {
    return "?";
  }
}
24
+
25
/**
 * Ask the registry for the latest published version via `npm view <PKG> version` (12 s timeout).
 * Best-effort: returns null whenever the command does not exit with status 0.
 */
function latestVersion(): string | null {
  const lookup = spawnSync("npm", ["view", PKG, "version"], {
    encoding: "utf8",
    timeout: 12000,
    windowsHide: true,
  });
  if (lookup.status !== 0) {
    return null;
  }
  return lookup.stdout.trim();
}
30
+
31
/**
 * `ocx update` — self-update to the latest published version with the package manager that
 * installed opencodex (bun or npm, global install).
 *
 * Prints the current version and detected installer first. A source checkout only gets a hint to
 * `git pull`. If the registry reports the version already installed, nothing is run. Otherwise the
 * install command is executed with inherited stdio (3 min timeout); on failure a manual command is
 * printed and the process exits with code 1.
 */
export function runUpdate(): void {
  const installer = detectInstall();
  const current = currentVersion();
  console.log(`opencodex v${current} (installed via ${installer})`);

  if (installer === "source") {
    console.log("Running from a source checkout — update with: git pull && bun install");
    return;
  }

  const latest = latestVersion();
  if (latest && latest === current) {
    console.log(`Already on the latest version (v${latest}).`);
    return;
  }

  // Past the "source" guard the installer name doubles as the binary to run.
  const bin = installer === "bun" ? "bun" : "npm";
  const verb = installer === "bun" ? "add" : "install";
  const cmdArgs = [verb, "-g", `${PKG}@latest`];
  const commandLine = `${bin} ${cmdArgs.join(" ")}`;
  const target = latest ? ` to v${latest}` : "";
  console.log(`Updating${target}…\n$ ${commandLine}`);

  const run = spawnSync(bin, cmdArgs, { stdio: "inherit", timeout: 180000, windowsHide: true });
  if (run.status !== 0) {
    console.error(`\n⚠️ Update failed (${bin} exit ${run.status ?? "?"}). Try manually: ${commandLine}`);
    process.exit(1);
  }
  console.log(`\n✅ Updated${target}. Restart the proxy: ocx stop && ocx start`);
}
@@ -0,0 +1,98 @@
1
+ import type { OcxProviderConfig } from "../types";
2
+ import { FORWARD_HEADERS } from "../adapters/openai-responses";
3
+ import { parseSidecarSSE } from "../web-search/parse";
4
+
5
+ export interface VisionSettings {
6
+ model: string;
7
+ timeoutMs: number;
8
+ }
9
+
10
+ /** A description, or an `error` string when it couldn't run (caller injects a graceful marker). */
11
+ export type DescribeOutcome = { text: string; error?: string };
12
+
13
const ALLOWED_IMAGE_MIME = new Set(["image/png", "image/jpeg", "image/jpg", "image/webp", "image/gif"]);
/** ~20 MB — generous enough for screenshots; rejects pathological payloads before forwarding. */
const MAX_IMAGE_BYTES = 20 * 1024 * 1024;

/**
 * Validate an image URL before forwarding it.
 *
 * Data URLs must have a `data:<mime>[;param…][;base64],<payload>` shape, an allowed image media
 * type, and a payload no larger than `MAX_IMAGE_BYTES`. The size cap applies to every data URL:
 * base64 payloads are estimated at 3/4 of their length, other payloads are bounded by their raw
 * length (percent-decoding can only shrink them). Media-type parameters between the type and
 * `;base64` (e.g. `;charset=utf-8`) are tolerated rather than reported as malformed.
 *
 * Remote `https://` URLs are passed through unchecked; any other scheme is rejected.
 *
 * @param url The image URL to check (a `data:` URL or an `https:` URL).
 * @returns An error string when the URL must be rejected, else null.
 */
function validateImageUrl(url: string): string | null {
  if (url.startsWith("data:")) {
    // Split on the first comma by index instead of running a regex over a multi-megabyte payload.
    const comma = url.indexOf(",");
    if (comma < 0) return "malformed data URL";

    const [rawMime = "", ...params] = url.slice("data:".length, comma).split(";");
    if (rawMime === "") return "malformed data URL";

    const mime = rawMime.toLowerCase();
    if (!ALLOWED_IMAGE_MIME.has(mime)) return `unsupported image type "${mime}"`;

    // Per the data URL syntax, the base64 marker is the last item before the comma.
    const lastParam = params[params.length - 1] ?? "";
    const isBase64 = lastParam.toLowerCase() === "base64";

    const payloadLength = url.length - comma - 1;
    const bytes = isBase64 ? Math.floor((payloadLength * 3) / 4) : payloadLength;
    if (bytes > MAX_IMAGE_BYTES) return `image too large (~${Math.round(bytes / 1024 / 1024)}MB)`;
    return null;
  }
  if (url.startsWith("https://")) return null;
  return "unsupported image URL scheme (expected data: or https:)";
}
38
+
39
/**
 * Describe ONE image by POSTing a streaming request to `<forwardProvider.baseUrl>/responses`.
 *
 * The image URL is validated first. Request headers are JSON content type, then the provider's
 * configured headers, then any of `FORWARD_HEADERS` present on the incoming request. When
 * `contextText` is non-empty it is sent ahead of the image as the user's request about it. The
 * image `detail` defaults to "high". Never throws — every failure comes back as `{ text: "", error }`.
 *
 * @param imageUrl        `data:` or `https:` URL of the image.
 * @param detail          Fidelity hint forwarded with the image; "high" when undefined.
 * @param contextText     The user's own text, used to focus the description (may be empty).
 * @param forwardProvider Provider whose `baseUrl`/`headers` are used for the call.
 * @param incomingHeaders Headers of the caller's request (source of forwarded auth).
 * @param settings        Sidecar model and fetch timeout.
 * @returns The description text, or an `error` string when it could not be produced.
 */
export async function describeImage(
  imageUrl: string,
  detail: string | undefined,
  contextText: string,
  forwardProvider: OcxProviderConfig,
  incomingHeaders: Headers,
  settings: VisionSettings,
): Promise<DescribeOutcome> {
  const rejection = validateImageUrl(imageUrl);
  if (rejection) {
    return { text: "", error: rejection };
  }

  const headers: Record<string, string> = {
    "Content-Type": "application/json",
    ...(forwardProvider.headers ?? {}),
  };
  for (const name of FORWARD_HEADERS) {
    const value = incomingHeaders.get(name);
    if (value) headers[name] = value;
  }

  const imagePart = { type: "input_image", image_url: imageUrl, detail: detail ?? "high" };
  const content: unknown[] = contextText
    ? [{ type: "input_text", text: `The user's request about this image: ${contextText}` }, imagePart]
    : [imagePart];

  const instructions =
    "You are a vision describer for a text-only model that cannot see the image. Describe the image " +
    "thoroughly and factually so that model can fully reason about it: transcribe any visible text " +
    "verbatim, and note UI/layout, colors, branding/logos, charts, and notable details. Focus on " +
    "what's relevant to the user's request. Output only the description.";

  // The ChatGPT (codex) backend rejects `max_output_tokens` ("Unsupported parameter"); the
  // description is clamped downstream (DESC_MAX_CHARS) instead.
  const payload = {
    model: settings.model,
    instructions,
    input: [{ type: "message", role: "user", content }],
    reasoning: { effort: "low" },
    store: false,
    stream: true,
  };

  try {
    const response = await fetch(`${forwardProvider.baseUrl}/responses`, {
      method: "POST",
      headers,
      body: JSON.stringify(payload),
      signal: AbortSignal.timeout(settings.timeoutMs),
    });
    if (!response.ok) {
      const detailText = await response.text().catch(() => "");
      return { text: "", error: `vision sidecar HTTP ${response.status}: ${detailText.slice(0, 200)}` };
    }

    const parsed = await parseSidecarSSE(response);
    // The backend can return HTTP 200 then stream a `response.failed`/`error` event with no text;
    // surface that as a describe error instead of an empty (silently-blank) description.
    const blankWithError = !parsed.text.trim() && parsed.error;
    if (blankWithError) {
      return { text: "", error: parsed.error };
    }
    return { text: parsed.text };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return { text: "", error: message };
  }
}
@@ -0,0 +1,141 @@
1
+ import type { OcxConfig, OcxContentPart, OcxMessage, OcxParsedRequest, OcxProviderConfig, OcxTextContent } from "../types";
2
+ import { modelInList } from "../types";
3
+ import { describeImage, type VisionSettings } from "./describe";
4
+
5
+ export { describeImage } from "./describe";
6
+
7
+ const DEFAULT_VISION_MODEL = "gpt-5.4-mini";
8
+ const DEFAULT_TIMEOUT_MS = 45_000;
9
+ /** Max images described in parallel — keeps first-token latency bounded without flooding the backend. */
10
+ const VISION_CONCURRENCY = 3;
11
+ /** Per-image description hard cap (chars) so multi-image turns can't blow the main model's context. */
12
+ const DESC_MAX_CHARS = 2000;
13
+ /** User-text context passed to the describer, capped. */
14
+ const CONTEXT_MAX_CHARS = 800;
15
+
16
/**
 * Run `worker` over `items` with bounded concurrency, preserving input order in the result array.
 *
 * Up to `limit` runners pull the next unclaimed index from a shared cursor, so at most `limit`
 * workers are in flight at once. A `limit` that is zero, negative, fractional below 1 or NaN is
 * treated as 1: otherwise no runner would start and the function would resolve to an array of
 * holes without ever calling `worker`.
 *
 * @param items  Inputs to process; the array is not modified.
 * @param limit  Maximum number of concurrent workers (floored; minimum 1).
 * @param worker Async function applied to each item.
 * @returns Results in the same order as `items`. Rejects if any worker rejects.
 */
async function runBounded<T, R>(items: T[], limit: number, worker: (item: T) => Promise<R>): Promise<R[]> {
  const results = new Array<R>(items.length);
  if (items.length === 0) return results;

  // `|| 1` catches both NaN and a floor of 0; Math.max catches negatives.
  const lanes = Math.min(items.length, Math.max(1, Math.floor(limit) || 1));

  let next = 0;
  const runner = async (): Promise<void> => {
    while (next < items.length) {
      // Claim the index synchronously, before awaiting, so no two runners share an item.
      const i = next++;
      results[i] = await worker(items[i] as T);
    }
  };

  await Promise.all(Array.from({ length: lanes }, () => runner()));
  return results;
}
29
+
30
/** Cut `s` to at most `max` characters, appending a truncation marker when anything was removed. */
function clamp(s: string, max: number): string {
  if (s.length > max) {
    return s.slice(0, max) + "\n…[description truncated]";
  }
  return s;
}
33
+
34
/**
 * Return the first provider in `config.providers` (object iteration order) whose `authMode` is
 * "forward", or undefined when there is none.
 */
function findForwardProvider(config: OcxConfig): OcxProviderConfig | undefined {
  return Object.values(config.providers).find(candidate => candidate.authMode === "forward");
}
41
+
42
/** True for the message roles that may carry image parts: "user", "developer" and "toolResult". */
function carriesImages(role: string): boolean {
  return ["user", "developer", "toolResult"].includes(role);
}
46
+
47
/** Whether any image-capable message in the request has array content with at least one image part. */
function messagesHaveImage(parsed: OcxParsedRequest): boolean {
  for (const message of parsed.context.messages) {
    if (!carriesImages(message.role) || !Array.isArray(message.content)) continue;
    for (const part of message.content as OcxContentPart[]) {
      if (part.type === "image") return true;
    }
  }
  return false;
}
51
+
52
+ export interface VisionPlan {
53
+ forwardProvider: OcxProviderConfig;
54
+ settings: VisionSettings;
55
+ }
56
+
57
/**
 * Decide whether images in this request should be pre-described by the vision sidecar.
 *
 * A plan is returned only when all of these hold, checked in this order: the routed model is
 * listed in `provider.noVisionModels`; the request carries at least one image; the sidecar is not
 * explicitly disabled (`visionSidecar.enabled === false`); the incoming request has an
 * `authorization` header; and a forward provider is configured. Otherwise undefined is returned.
 *
 * @returns The forward provider plus sidecar settings (model and timeout, with defaults applied).
 */
export function planVisionSidecar(
  config: OcxConfig,
  provider: OcxProviderConfig,
  modelId: string,
  parsed: OcxParsedRequest,
  incomingHeaders: Headers,
): VisionPlan | undefined {
  const modelIsTextOnly = modelInList(provider.noVisionModels, modelId);
  if (!modelIsTextOnly || !messagesHaveImage(parsed)) {
    return undefined;
  }

  const sidecar = config.visionSidecar ?? {};
  if (sidecar.enabled === false || !incomingHeaders.get("authorization")) {
    return undefined;
  }

  const forwardProvider = findForwardProvider(config);
  if (!forwardProvider) {
    return undefined;
  }

  const settings: VisionSettings = {
    model: sidecar.model ?? DEFAULT_VISION_MODEL,
    timeoutMs: sidecar.timeoutMs ?? DEFAULT_TIMEOUT_MS,
  };
  return { forwardProvider, settings };
}
82
+
83
+ interface ImageJob {
84
+ imageUrl: string;
85
+ detail?: string;
86
+ contextText: string;
87
+ }
88
+
89
/**
 * Render one describe outcome as the text part that replaces an image.
 *
 * - A non-empty `error` yields a short "could not be processed" marker carrying that error.
 * - A description that is empty after trimming yields the same marker. Without this, the
 *   replacement would claim the image was described while containing no description at all.
 * - Otherwise the trimmed description is embedded, clamped to `DESC_MAX_CHARS`.
 *
 * @param out Outcome of describing one image.
 * @returns The replacement text content part.
 */
function renderDescription(out: { text: string; error?: string }): OcxTextContent {
  if (out.error) {
    return { type: "text", text: `[An image was attached but could not be processed: ${out.error}]` };
  }

  const description = out.text.trim();
  if (description === "") {
    return {
      type: "text",
      text: "[An image was attached but could not be processed: the vision model returned an empty description]",
    };
  }

  return {
    type: "text",
    text: `[Image content — described by a vision model because you cannot see images directly:\n${clamp(description, DESC_MAX_CHARS)}]`,
  };
}
98
+
99
/**
 * Replace every image part in the request with a text part describing it.
 *
 * Works in three passes: collect one job per image (each carrying its own message's text, joined
 * with spaces and capped at `CONTEXT_MAX_CHARS`, as context); describe all jobs with at most
 * `VISION_CONCURRENCY` in flight; then rebuild each affected message, swapping images for their
 * rendered descriptions in the original order. Mutates `parsed.context.messages` in place.
 */
export async function describeImagesInPlace(
  parsed: OcxParsedRequest,
  forwardProvider: OcxProviderConfig,
  incomingHeaders: Headers,
  settings: VisionSettings,
): Promise<void> {
  // Pass 1: gather jobs and remember which messages need rewriting.
  const jobs: ImageJob[] = [];
  const targets: { msg: OcxMessage; parts: OcxContentPart[] }[] = [];

  for (const msg of parsed.context.messages) {
    if (!carriesImages(msg.role)) continue;
    if (!Array.isArray(msg.content)) continue;
    const parts = msg.content as OcxContentPart[];

    const texts: string[] = [];
    const images: Extract<OcxContentPart, { type: "image" }>[] = [];
    for (const part of parts) {
      if (part.type === "text") texts.push(part.text);
      else if (part.type === "image") images.push(part);
    }
    if (images.length === 0) continue;

    const contextText = texts.join(" ").slice(0, CONTEXT_MAX_CHARS);
    for (const image of images) {
      jobs.push({ imageUrl: image.imageUrl, detail: image.detail, contextText });
    }
    targets.push({ msg, parts });
  }
  if (jobs.length === 0) return;

  // Pass 2: describe everything; the result order matches the job order.
  const outcomes = await runBounded(jobs, VISION_CONCURRENCY, job =>
    describeImage(job.imageUrl, job.detail, job.contextText, forwardProvider, incomingHeaders, settings),
  );

  // Pass 3: walk the targets in the same order the jobs were queued, consuming outcomes one by one.
  let cursor = 0;
  for (const { msg, parts } of targets) {
    const rebuilt: OcxContentPart[] = parts.map(part =>
      part.type === "image" ? renderDescription(outcomes[cursor++]) : part,
    );
    msg.content = rebuilt;
  }
}
@@ -0,0 +1,75 @@
1
+ import type { OcxProviderConfig } from "../types";
2
+ import { FORWARD_HEADERS } from "../adapters/openai-responses";
3
+ import { parseSidecarSSE, type WebSearchResult } from "./parse";
4
+
5
+ export interface SidecarSettings {
6
+ model: string;
7
+ reasoning: string;
8
+ timeoutMs: number;
9
+ /**
10
+ * True when the routed (downstream) model is text-only. The search model CAN see images, so it's
11
+ * told to verbalize any relevant image results and include their URLs — otherwise a non-vision model
12
+ * would receive bare image links it cannot interpret (the image-web-search gap).
13
+ */
14
+ describeImages?: boolean;
15
+ }
16
+
17
+ const BASE_INSTRUCTION =
18
+ "You are a web-search assistant. Use the web_search tool to find current information for the " +
19
+ "user's query, then reply with a concise, factual answer and cite the sources you used.";
20
+ const IMAGE_INSTRUCTION =
21
+ " The model that will read your answer is TEXT-ONLY and cannot see images: if the results include " +
22
+ "relevant images, describe what they show in words and include their source URLs in your answer.";
23
+
24
+ /** A search result, or an `error` string when the search couldn't run (surfaced as a tool result). */
25
+ export type SidecarOutcome = WebSearchResult & { error?: string };
26
+
27
/**
 * Execute ONE web search by POSTing a streaming request to `<forwardProvider.baseUrl>/responses`.
 *
 * Request headers are JSON content type, then the provider's configured headers, then any of
 * `FORWARD_HEADERS` present on the incoming request. The hosted web_search tool config is sent
 * unchanged as the only tool. When `settings.describeImages` is set, the instructions also ask
 * for image results to be described in words. Never throws — failures come back as
 * `{ text: "", sources: [], error }`.
 *
 * @param query           The search query, sent as the user message.
 * @param hostedTool      The hosted `web_search` tool config to replay.
 * @param forwardProvider Provider whose `baseUrl`/`headers` are used for the call.
 * @param incomingHeaders Headers of the caller's request (source of forwarded auth).
 * @param settings        Sidecar model, reasoning effort, timeout and image-description flag.
 * @returns The parsed search result, or an `error` string when the search could not run.
 */
export async function runWebSearch(
  query: string,
  hostedTool: Record<string, unknown>,
  forwardProvider: OcxProviderConfig,
  incomingHeaders: Headers,
  settings: SidecarSettings,
): Promise<SidecarOutcome> {
  const headers: Record<string, string> = {
    "Content-Type": "application/json",
    ...(forwardProvider.headers ?? {}),
  };
  for (const name of FORWARD_HEADERS) {
    const value = incomingHeaders.get(name);
    if (value) headers[name] = value;
  }

  let instructions = BASE_INSTRUCTION;
  if (settings.describeImages) {
    instructions += IMAGE_INSTRUCTION;
  }

  // NOTE: the ChatGPT (codex) backend rejects `max_output_tokens` ("Unsupported parameter") and
  // requires `store: false` — keep this body minimal. Answer length is capped downstream
  // (format-result clamps the injected tool_result), so no upstream cap is needed.
  const payload = {
    model: settings.model,
    instructions,
    input: [{ type: "message", role: "user", content: [{ type: "input_text", text: query }] }],
    tools: [hostedTool],
    tool_choice: "auto",
    reasoning: { effort: settings.reasoning },
    store: false,
    stream: true,
  };

  const endpoint = `${forwardProvider.baseUrl}/responses`;
  try {
    const response = await fetch(endpoint, {
      method: "POST",
      headers,
      body: JSON.stringify(payload),
      signal: AbortSignal.timeout(settings.timeoutMs),
    });
    if (response.ok) {
      return await parseSidecarSSE(response);
    }
    const detailText = await response.text().catch(() => "");
    return { text: "", sources: [], error: `sidecar HTTP ${response.status}: ${detailText.slice(0, 200)}` };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return { text: "", sources: [], error: message };
  }
}