@loreai/gateway 0.14.0 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,222 +0,0 @@
1
- /**
2
- * Internal representation types for the Lore gateway.
3
- *
4
- * The gateway accepts both Anthropic (`/v1/messages`) and OpenAI
5
- * (`/v1/chat/completions`) protocol requests, normalizes them into these
6
- * types for Lore pipeline processing, then translates back to the original
7
- * protocol for the upstream response.
8
- *
9
- * Design: types are intentionally minimal — only fields that Lore's context
10
- * management (gradient, LTM, distillation) actually reads/writes. Protocol-
11
- * specific fields the gateway doesn't process live in `metadata`.
12
- */
13
-
14
- // ---------------------------------------------------------------------------
15
- // Content blocks — discriminated union on `type`
16
- // ---------------------------------------------------------------------------
17
-
18
/** Plain-text content block, tagged with `type: "text"`. */
export type GatewayTextBlock = { type: "text"; text: string };
22
-
23
/** Thinking content block, tagged with `type: "thinking"`. */
export type GatewayThinkingBlock = {
  type: "thinking";
  /** Opaque signature that accompanies Anthropic extended thinking, when present. */
  signature?: string;
  thinking: string;
};
29
-
30
/** Tool-call content block, tagged with `type: "tool_use"`. */
export type GatewayToolUseBlock = {
  type: "tool_use";
  name: string;
  /** Tool call ID assigned by the provider (Anthropic IDs look like `toolu_…`). */
  id: string;
  /** Tool arguments; left as `unknown`, so consumers must narrow before use. */
  input: unknown;
};
37
-
38
/** Tool-result content block, tagged with `type: "tool_result"`. */
export type GatewayToolResultBlock = {
  type: "tool_result";
  /** The `id` of the tool_use block that this result answers. */
  toolUseId: string;
  isError?: boolean;
  content: string;
};
45
-
46
/** Any gateway content block; narrow on the `type` discriminator. */
export type GatewayContentBlock =
  | GatewayToolResultBlock
  | GatewayToolUseBlock
  | GatewayThinkingBlock
  | GatewayTextBlock;
51
-
52
- // ---------------------------------------------------------------------------
53
- // Messages
54
- // ---------------------------------------------------------------------------
55
-
56
/**
 * A normalized conversation message.
 *
 * Only `user` and `assistant` roles appear here: system messages are pulled
 * out into `GatewayRequest.system`.
 */
export type GatewayMessage = {
  role: "user" | "assistant";
  content: Array<GatewayContentBlock>;
};
61
-
62
- // ---------------------------------------------------------------------------
63
- // Tools
64
- // ---------------------------------------------------------------------------
65
-
66
/**
 * A normalized tool definition.
 *
 * `inputSchema` holds the tool's input as JSON Schema, which both supported
 * protocols use.
 */
export type GatewayTool = {
  name: string;
  description: string;
  inputSchema: { [key: string]: unknown };
};
72
-
73
- // ---------------------------------------------------------------------------
74
- // Request — the normalized form after ingress translation
75
- // ---------------------------------------------------------------------------
76
-
77
/** Wire protocols the gateway accepts. */
export type GatewayProtocol = "openai" | "anthropic";
78
-
79
/** The protocol-independent request produced by ingress translation. */
export type GatewayRequest = {
  /** Protocol the request arrived as; this determines egress translation. */
  protocol: GatewayProtocol;
  /** Model identifier, e.g. `claude-sonnet-4-20250514` or `gpt-4o`. */
  model: string;
  /**
   * The extracted system prompt.
   * - Anthropic: taken from the top-level `system` field.
   * - OpenAI: taken from the first `role: "system"` message, which is then
   *   removed from `messages`.
   */
  system: string;
  messages: Array<GatewayMessage>;
  tools: Array<GatewayTool>;
  stream: boolean;
  maxTokens: number;
  /**
   * Protocol-specific parameters that the gateway does not process itself but
   * must forward upstream (e.g. `temperature`, `top_p`, `stop_sequences`,
   * `tool_choice`).
   */
  metadata: Record<string, unknown>;
  /** The original request headers, passed through for auth, tracing, etc. */
  rawHeaders: Record<string, string>;
  /**
   * Extra OpenAI-compatible parameters kept for upstream forwarding.
   * Populated by `parseOpenAIRequest`.
   */
  extras?: Partial<{
    temperature: number;
    top_p: number;
    frequency_penalty: number;
    presence_penalty: number;
    user: string;
    logprobs: boolean;
    top_logprobs: number;
  }>;
};
117
-
118
- // ---------------------------------------------------------------------------
119
- // Response — accumulated from upstream streaming/non-streaming response
120
- // ---------------------------------------------------------------------------
121
-
122
/** Token usage figures taken from an upstream response. */
export type GatewayUsage = {
  inputTokens: number;
  outputTokens: number;
  /** Anthropic prompt caching: tokens written to the cache by this request. */
  cacheCreationInputTokens?: number;
  /** Anthropic prompt caching: present when cache hits occur. */
  cacheReadInputTokens?: number;
};
130
-
131
/** The upstream provider's response, accumulated into a single value. */
export type GatewayResponse = {
  id: string;
  model: string;
  content: Array<GatewayContentBlock>;
  /** The provider's stop reason, e.g. `end_turn`, `stop`, `tool_use`, `length`. */
  stopReason: string;
  usage: GatewayUsage;
};
140
-
141
- // ---------------------------------------------------------------------------
142
- // Recall store (cross-request, gateway recall interception)
143
- // ---------------------------------------------------------------------------
144
-
145
/** A recall result kept for marker-based round-trip expansion. */
export type StoredRecall = {
  /** The tool_use ID to reconstruct in the upstream request. */
  toolUseId: string;
  /** The original recall input: a query plus an optional scope. */
  input: {
    query: string;
    scope?: string;
  };
  /** Content block index of the recall within the original assistant message. */
  position: number;
  /** The executed recall result, as formatted markdown. */
  result: string;
};
156
-
157
/**
 * Stored recalls keyed by marker key.
 *
 * Keys have the form `${scope}:${query}`.
 */
export type RecallStore = Map<string, StoredRecall>;
159
-
160
- // ---------------------------------------------------------------------------
161
- // Session state — per-session tracking for Lore pipeline integration
162
- // ---------------------------------------------------------------------------
163
-
164
/** Cache analysis for a single turn, emitted as structured log data. */
export type CacheTurnAnalysis = {
  /** Turn number within this session. */
  turn: number;

  // --- Ground truth reported by the API response ---

  /** Tokens served from the prompt cache (hit). */
  cacheRead: number;
  /** Tokens written to the prompt cache (miss / new). */
  cacheCreation: number;
  /** Uncached input tokens. */
  inputTokens: number;
  /** `cacheRead` divided by total input, in the range 0..1. */
  cacheHitRate: number;

  // --- Comparison of the request body against the previous turn ---

  /** Bytes that match the previous turn's serialized body, counted from the start. */
  prefixMatchBytes: number;
  /** `prefixMatchBytes` divided by min(previous, current) body length, 0..1. */
  prefixMatchPercent: number;
  /** Semantic location of the first divergence, e.g. "messages[3].content[1]". */
  divergencePoint: string;
  /** Human-readable reason, e.g. "system prompt changed", "new message appended". */
  divergenceReason: string;
};
189
-
190
/** Cache analytics state kept for each session. */
export type CacheAnalytics = {
  /** The last turn's serialized request body, deflate-compressed. */
  lastRequestBody: null | Uint8Array;
  /** Uncompressed byte length of `lastRequestBody`, used for the prefix match %. */
  lastRequestBodyLength: number;
  /** `cache_read_input_tokens` from the last API response. */
  lastCacheRead: number;
  /** `cache_creation_input_tokens` from the last API response. */
  lastCacheCreation: number;
  /** Total turns observed. */
  turnCount: number;
  /** Confirmed busts: the API returned cacheRead=0 with cacheCreation>0. */
  bustCount: number;
};
205
-
206
/** State the gateway tracks for each session to drive Lore pipeline decisions. */
export type SessionState = {
  sessionID: string;
  projectPath: string;
  /** SHA-256 fingerprint of the first user message, used for session correlation. */
  fingerprint: string;
  /** Unix timestamp in milliseconds of this session's last request. */
  lastRequestTime: number;
  /** Total user and assistant messages seen in this session. */
  messageCount: number;
  /** Turns since the last curation run; this triggers background curation. */
  turnsSinceCuration: number;
  /** Recall results stored for marker-based round-trip expansion. */
  recallStore: RecallStore;
  /** Cache analytics: request body prefix comparison plus API cache fields. */
  cacheAnalytics: CacheAnalytics;
};
@@ -1,408 +0,0 @@
1
- /**
2
- * Gateway worker model discovery and resolution.
3
- *
4
- * Discovers available models from the upstream Anthropic `/v1/models` API,
5
- * fetches per-model pricing from models.dev (open-source model database),
6
- * and integrates with core's worker model validation/resolution pipeline.
7
- *
8
- * This replaces the OpenCode adapter's `getProviderModels()` +
9
- * `maybeValidateWorkerModel()` — the gateway is the universal path and
10
- * doesn't depend on the OpenCode SDK's model listing (which can report
11
- * deprecated models as "active").
12
- */
13
-
14
- import {
15
- workerModel,
16
- temporal,
17
- distillation as distillationMod,
18
- config as loreConfig,
19
- log,
20
- } from "@loreai/core";
21
- import type { LLMClient } from "@loreai/core";
22
- import type { AuthCredential } from "./auth";
23
- import { authHeaders } from "./auth";
24
-
25
- // ---------------------------------------------------------------------------
26
- // Cost lookup — models.dev with hardcoded fallback
27
- // ---------------------------------------------------------------------------
28
-
29
/**
 * URL of the models.dev JSON API, which lists every provider and model with
 * pricing in one document, so a single request replaces N per-model TOML
 * fetches.
 *
 * Response shape:
 *   { anthropic: { models: { "claude-sonnet-4-20250514": { cost: { input: 3 }, ... }, ... } } }
 * Cost values are USD per million tokens.
 */
const MODELS_DEV_API = "https://models.dev/api.json";

/** How long a fetched cost map counts as fresh: one hour, in milliseconds. */
const COST_CACHE_TTL_MS = 60 * 60 * 1000;

/** Last fetched models.dev data (modelID → input cost per million tokens), if any. */
let cachedCostMap: Map<string, number> | null = null;

/** `Date.now()` reading taken when `cachedCostMap` was stored; 0 until then. */
let cachedCostMapAt = 0;
42
-
43
/**
 * Hardcoded fallback costs (USD per input token) used when models.dev has no
 * price for a model. Matched by prefix against the model ID; the first
 * matching entry wins, so a more specific prefix must come before a less
 * specific one.
 *
 * The 3.x generation is listed under both spellings. Anthropic's published
 * IDs put the version first (`claude-3-5-haiku-…`, `claude-3-5-sonnet-…`,
 * `claude-3-7-sonnet-…`), which the family-first prefixes
 * (`claude-haiku-3-5`, `claude-sonnet-3-5`) never match. Without the
 * version-first entries those models fell through to the "unknown, expensive"
 * price and could not be chosen as a cheap worker. The family-first entries
 * are kept for backward compatibility.
 *
 * This table is only a safety net: runtime pricing from models.dev is
 * preferred.
 */
const FALLBACK_COSTS: Array<{ prefix: string; inputCostPerToken: number }> = [
  { prefix: "claude-opus-4", inputCostPerToken: 15 / 1_000_000 },
  { prefix: "claude-sonnet-4", inputCostPerToken: 3 / 1_000_000 },
  { prefix: "claude-haiku-4", inputCostPerToken: 1 / 1_000_000 },
  // Version-first IDs, as returned by the Anthropic API for 3.x models.
  { prefix: "claude-3-7-sonnet", inputCostPerToken: 3 / 1_000_000 },
  { prefix: "claude-3-5-sonnet", inputCostPerToken: 3 / 1_000_000 },
  { prefix: "claude-3-5-haiku", inputCostPerToken: 0.8 / 1_000_000 },
  // Family-first spellings from the original table, kept for compatibility.
  { prefix: "claude-haiku-3-5", inputCostPerToken: 0.8 / 1_000_000 },
  { prefix: "claude-sonnet-3-5", inputCostPerToken: 3 / 1_000_000 },
  { prefix: "claude-3-haiku", inputCostPerToken: 0.25 / 1_000_000 },
  { prefix: "claude-3-sonnet", inputCostPerToken: 3 / 1_000_000 },
  { prefix: "claude-3-opus", inputCostPerToken: 15 / 1_000_000 },
];

/**
 * Look up the hardcoded per-input-token cost (USD) for a model ID.
 *
 * @param modelID Model identifier, matched by prefix against `FALLBACK_COSTS`.
 * @returns The cost of the first matching prefix. If nothing matches, returns
 *   a deliberately high price so that an unknown model is not picked as a
 *   worker.
 */
function fallbackCost(modelID: string): number {
  const match = FALLBACK_COSTS.find(({ prefix }) => modelID.startsWith(prefix));
  // Unknown model: assume expensive so it doesn't get picked as a worker.
  return match ? match.inputCostPerToken : 100 / 1_000_000;
}
68
-
69
/** One model entry in the models.dev JSON API (only the fields read here). */
type ModelsDevEntry = {
  id: string;
  cost?: {
    input?: number;
  };
};
74
-
75
/**
 * The models.dev JSON API response, reduced to the subset used here: provider
 * name → optional `models` table keyed by model ID.
 */
type ModelsDevResponse = Record<
  string,
  { models?: Record<string, ModelsDevEntry> }
>;
81
-
82
/**
 * Fetch the models.dev cost map for Anthropic models.
 *
 * Makes one HTTP request to the JSON API; a successful result is cached for
 * `COST_CACHE_TTL_MS` (1 hour). The whole request, including reading the
 * body, is bounded by a 10 second abort timer, and the timer is always
 * cleared so a failed request leaves no pending timeout behind.
 *
 * This function does not throw. On an HTTP error, a missing `anthropic`
 * provider, or any exception, it logs a warning and returns the previous
 * cached map if there is one, otherwise an empty map. Failures do not refresh
 * the cache timestamp, so the next call retries.
 *
 * @returns Map of modelID → input cost in USD per million tokens.
 */
export async function fetchCostMap(): Promise<Map<string, number>> {
  // Return cache if fresh
  if (cachedCostMap && Date.now() - cachedCostMapAt < COST_CACHE_TTL_MS) {
    return cachedCostMap;
  }

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 10_000);

  try {
    const response = await fetch(MODELS_DEV_API, { signal: controller.signal });

    if (!response.ok) {
      log.warn(`models.dev API failed: ${response.status} ${response.statusText}`);
      return cachedCostMap ?? new Map();
    }

    // The abort signal is still armed here, so a stalled body read is cut off too.
    const data = (await response.json()) as ModelsDevResponse;
    const anthropic = data.anthropic?.models;
    if (!anthropic) {
      log.warn("models.dev API: no anthropic provider found");
      return cachedCostMap ?? new Map();
    }

    const costMap = new Map<string, number>();
    for (const [modelId, entry] of Object.entries(anthropic)) {
      // The payload is unvalidated JSON: accept only real, finite numbers so a
      // malformed entry cannot poison the later per-token division.
      const input = entry?.cost?.input;
      if (typeof input === "number" && Number.isFinite(input)) {
        costMap.set(modelId, input);
      }
    }

    cachedCostMap = costMap;
    cachedCostMapAt = Date.now();

    log.info(`models.dev: loaded costs for ${costMap.size} anthropic models`);
    return costMap;
  } catch (e) {
    log.warn("models.dev API error:", e);
    return cachedCostMap ?? new Map();
  } finally {
    // Runs on every path (success, early return, failure).
    clearTimeout(timeout);
  }
}
130
-
131
/**
 * Drop the cached models.dev cost map and reset its timestamp, so the next
 * `fetchCostMap()` call goes to the network. Intended for tests.
 */
export function clearCostCache(): void {
  cachedCostMapAt = 0;
  cachedCostMap = null;
}
136
-
137
/**
 * Resolve a per-token input cost for each of the given model IDs.
 *
 * Loads the models.dev cost map once via `fetchCostMap()`. Each per-million
 * price found there is converted to a per-token price; IDs that models.dev
 * does not list get the hardcoded `fallbackCost()` price instead.
 *
 * @param modelIDs Model identifiers to price.
 * @returns Map of modelID → input cost in USD per token, with one entry per
 *   distinct ID.
 */
export async function fetchModelCosts(
  modelIDs: string[],
): Promise<Map<string, number>> {
  const perMillion = await fetchCostMap();

  return new Map(
    modelIDs.map((id): [string, number] => {
      const listed = perMillion.get(id);
      return [id, listed != null ? listed / 1_000_000 : fallbackCost(id)];
    }),
  );
}
160
-
161
- // ---------------------------------------------------------------------------
162
- // Anthropic /v1/models API types (subset we care about)
163
- // ---------------------------------------------------------------------------
164
-
165
/** One entry from the Anthropic `/v1/models` listing (only the fields read here). */
type AnthropicModelEntry = {
  id: string;
  display_name: string;
  created_at: string;
  capabilities?: { thinking?: { supported: boolean } };
};
173
-
174
/**
 * One page of the Anthropic `/v1/models` listing. `has_more` and `last_id`
 * drive pagination.
 */
type AnthropicModelsResponse = {
  data: Array<AnthropicModelEntry>;
  has_more: boolean;
  last_id?: string;
};
179
-
180
- // ---------------------------------------------------------------------------
181
- // Model discovery — fetch from upstream /v1/models
182
- // ---------------------------------------------------------------------------
183
-
184
/** Most recent successful model listing, or null before the first success. */
let cachedModels: workerModel.ModelInfo[] | null = null;
/** `Date.now()` reading taken when `cachedModels` was stored; 0 until then. */
let cachedModelsAt = 0;
const MODEL_CACHE_TTL_MS = 60 * 60 * 1000; // 1 hour

/**
 * Fetch the available Anthropic models from the upstream `/v1/models` API.
 *
 * A successful listing is cached for `MODEL_CACHE_TTL_MS` (1 hour), so idle
 * cycles do not hit the API repeatedly. The cache is module-wide and is not
 * keyed by `upstreamUrl` or `cred`.
 *
 * Every page is requested with `limit=1000`, following `last_id` while the
 * server reports `has_more`. Once the listing is complete, per-model input
 * costs are resolved through `fetchModelCosts()`.
 *
 * This function does not throw. On a non-OK response or any exception it logs
 * a warning and returns the previous cached list if there is one, otherwise
 * an empty array.
 *
 * @param upstreamUrl Base URL of the upstream API; trailing slashes are tolerated.
 * @param cred        Credential turned into request headers via `authHeaders()`.
 * @returns The discovered models, each marked `status: "active"` with
 *   `providerID: "anthropic"`.
 */
export async function discoverModels(
  upstreamUrl: string,
  cred: AuthCredential,
): Promise<workerModel.ModelInfo[]> {
  // Return cache if fresh
  if (cachedModels && Date.now() - cachedModelsAt < MODEL_CACHE_TTL_MS) {
    return cachedModels;
  }

  try {
    // A base URL configured with a trailing slash would otherwise produce
    // "…//v1/models".
    const baseUrl = upstreamUrl.replace(/\/+$/, "");
    const entries: AnthropicModelEntry[] = [];
    // Cursors already followed. A server that repeats one would otherwise keep
    // this loop running forever.
    const seenCursors = new Set<string>();
    let afterId: string | undefined;

    // Paginate through all models
    do {
      const url = new URL(`${baseUrl}/v1/models`);
      url.searchParams.set("limit", "1000");
      if (afterId) url.searchParams.set("after_id", afterId);

      const response = await fetch(url.toString(), {
        headers: {
          "content-type": "application/json",
          "anthropic-version": "2023-06-01",
          ...authHeaders(cred),
        },
      });

      if (!response.ok) {
        const text = await response.text().catch(() => "(no body)");
        log.warn(
          `model discovery failed: ${response.status} ${response.statusText} — ${text}`,
        );
        return cachedModels ?? [];
      }

      const data = (await response.json()) as AnthropicModelsResponse;
      const nextId = data.has_more ? data.last_id : undefined;

      if (nextId && seenCursors.has(nextId)) {
        // The cursor did not advance: drop this page (it repeats earlier
        // results) and stop paginating.
        log.warn(`model discovery: pagination cursor repeated (${nextId}) — stopping`);
        break;
      }

      for (const entry of data.data) {
        entries.push(entry);
      }

      if (nextId) seenCursors.add(nextId);
      afterId = nextId;
    } while (afterId);

    // Resolve costs via models.dev; unlisted models get hardcoded fallback prices.
    const modelIDs = entries.map((e) => e.id);
    const costs = await fetchModelCosts(modelIDs);

    const models: workerModel.ModelInfo[] = entries.map((entry) => ({
      id: entry.id,
      providerID: "anthropic",
      cost: { input: costs.get(entry.id) ?? fallbackCost(entry.id) },
      status: "active", // Only active models are returned by the API
      capabilities: {
        input: { text: true }, // All Anthropic models accept text
        reasoning: entry.capabilities?.thinking?.supported ?? false,
      },
    }));

    cachedModels = models;
    cachedModelsAt = Date.now();

    log.info(
      `model discovery: found ${models.length} models (${models.map((m) => m.id).join(", ")})`,
    );

    return models;
  } catch (e) {
    log.warn("model discovery error:", e);
    return cachedModels ?? [];
  }
}
271
-
272
/**
 * Drop the cached model listing and reset its timestamp, so the next
 * `discoverModels()` call queries upstream again. Intended for tests.
 */
export function clearModelCache(): void {
  cachedModelsAt = 0;
  cachedModels = null;
}
277
-
278
- // ---------------------------------------------------------------------------
279
- // Worker model validation — gateway version of maybeValidateWorkerModel
280
- // ---------------------------------------------------------------------------
281
-
282
/**
 * Guard against concurrent validation runs. True from the moment a run passes
 * its entry checks until it finishes.
 */
let validating = false;

/**
 * Run worker model validation if needed.
 *
 * Called on session idle. Discovers available models, selects candidates,
 * checks whether the stored validation is stale for the current model
 * fingerprint, and if so runs `workerModel.runValidation` against the most
 * recent generation-0 distillation of the session.
 *
 * The function returns without validating when any of the following holds:
 * another run is in progress; `workerModel` is set explicitly in config; no
 * models were discovered; there are no candidates, or the only candidate is
 * the session model; the stored validation is not stale; or no reference
 * distillation or source messages are available.
 *
 * Concurrency: the `validating` flag is taken before the first `await` and
 * released in `finally`. Overlapping calls therefore cannot both get past the
 * entry check while discovery is still pending, and an early return or a
 * thrown error cannot leave the flag stuck.
 *
 * Errors thrown by `runValidation` are logged and swallowed. Errors from the
 * preparation steps propagate to the caller.
 *
 * @param sessionModel The model ID being used for conversation (frontier)
 * @param upstreamUrl  Anthropic API base URL
 * @param cred         Auth credential for API calls
 * @param llm          LLM client for validation prompts
 * @param projectPath  Project directory path
 * @param sessionID    Session ID for loading reference distillation data
 */
export async function maybeValidateWorkerModel(
  sessionModel: string,
  upstreamUrl: string,
  cred: AuthCredential,
  llm: LLMClient,
  projectPath: string,
  sessionID: string,
): Promise<void> {
  if (validating) return;

  const cfg = loreConfig();
  if (cfg.workerModel) return; // explicit override — skip auto-selection

  // Take the guard BEFORE the first await. No check-and-set can interleave
  // across calls, because everything up to this point is synchronous.
  validating = true;
  try {
    const models = await discoverModels(upstreamUrl, cred);
    if (models.length === 0) return;

    // Build the session model info for candidate selection.
    // Use cost from discovered models if available, otherwise fallback.
    const discoveredModel = models.find((m) => m.id === sessionModel);
    const sessionModelInfo: Parameters<typeof workerModel.selectWorkerCandidates>[0] = {
      id: sessionModel,
      providerID: "anthropic",
      cost: { input: discoveredModel?.cost.input ?? fallbackCost(sessionModel) },
    };

    const candidates = workerModel.selectWorkerCandidates(sessionModelInfo, models);
    if (candidates.length === 0) return;
    // If session model is already the cheapest, no comparison needed
    if (candidates.length === 1 && candidates[0].id === sessionModel) return;

    const fingerprint = workerModel.computeModelFingerprint(
      "anthropic",
      sessionModel,
      models.filter((m) => m.providerID === "anthropic").map((m) => m.id),
    );

    const stored = workerModel.getValidatedWorkerModel("anthropic");
    if (!workerModel.isValidationStale(stored, fingerprint)) return;

    // Need reference distillation data
    const distillations = distillationMod.loadForSession(projectPath, sessionID, true);
    const gen0 = distillations.filter((d) => d.generation === 0);
    if (gen0.length === 0) return;

    const reference = gen0[gen0.length - 1]; // most recent gen-0
    const sourceIds = reference.source_ids;
    if (sourceIds.length === 0) return;

    // Load source temporal messages
    const allMessages = temporal.bySession(projectPath, sessionID);
    const sourceSet = new Set(sourceIds);
    const sourceMessages = allMessages.filter((m) => sourceSet.has(m.id));
    if (sourceMessages.length === 0) return;

    const messagesText = sourceMessages.map((m) => m.content).join("\n");
    const date = new Date(sourceMessages[0].created_at).toLocaleDateString(
      "en-US",
      { year: "numeric", month: "long", day: "numeric" },
    );

    try {
      const result = await workerModel.runValidation({
        llm,
        providerID: "anthropic",
        sessionModelID: sessionModel,
        candidates,
        referenceObservations: reference.observations,
        sourceMessagesText: messagesText,
        date,
      });
      if (result) {
        log.info(
          `worker model validated: ${result.modelID} (judge=${result.judgeScore}) — saving 50%+ on worker calls`,
        );
      }
    } catch (e) {
      log.error("worker model validation error:", e);
    }
  } finally {
    validating = false;
  }
}
381
-
382
- // ---------------------------------------------------------------------------
383
- // Resolution — wrapper around core's resolveWorkerModel
384
- // ---------------------------------------------------------------------------
385
-
386
/**
 * Resolve the effective worker model for background calls.
 *
 * Reads the current Lore config and delegates to core's
 * `workerModel.resolveWorkerModel` for the `"anthropic"` provider, passing
 * the configured `workerModel` override and the configured `model`.
 *
 * Precedence is decided by core. The original notes describe it as:
 *   1. Explicit config override (`workerModel` in lore config)
 *   2. Validated auto-selection from kv_meta (with 24h TTL)
 *   3. Config model fallback (frontier model)
 *
 * @returns The resolved provider/model pair, or `undefined` when core
 *   resolves nothing.
 */
export function getWorkerModel(): { providerID: string; modelID: string } | undefined {
  const { workerModel: explicitOverride, model: configuredModel } = loreConfig();
  return workerModel.resolveWorkerModel("anthropic", explicitOverride, configuredModel);
}
402
-
403
/**
 * Return this module to its initial state: release the validation guard and
 * empty both the cost cache and the model cache. Intended for tests.
 */
export function resetWorkerModelState(): void {
  validating = false;
  clearCostCache();
  clearModelCache();
}