prism-mcp-server 4.2.0 → 4.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,240 @@
1
+ /**
2
+ * Image Captioner (v4.5)
3
+ * ─────────────────────────────────────────────────────────────────────────────
4
+ * PURPOSE:
5
+ * Fire-and-forget background pipeline that auto-captions images saved via
6
+ * `session_save_image`. Connects the VLM adapter → handoff visual_memory →
7
+ * session ledger → embedding backfill so images become semantically
8
+ * searchable without any changes to the tool surface area.
9
+ *
10
+ * PIPELINE (all async, never blocks the MCP response):
11
+ * 1. Read vault file → base64
12
+ * 2. Size check (Anthropic: 5MB hard cap; all others: 20MB soft cap)
13
+ * 3. getLLMProvider().generateImageDescription(base64, mimeType, context)
14
+ * 4. updateHandoffCaption(project, imageId, caption, vaultPath) — patches handoff
15
+ * 5. storage.saveLedger(...) — makes caption searchable
16
+ * 6. llm.generateEmbedding(...) + storage.patchLedger(...) — vector-indexes caption inline
17
+ *
18
+ * DESIGN DECISIONS:
19
+ * - `generateImageDescription` is optional on LLMProvider. If the active
20
+ * provider doesn't support VLM (e.g. a text-only Ollama model), captioning
21
+ * is skipped gracefully with a single log line.
22
+ * - Errors are caught and logged; they never propagate. `session_save_image`
23
+ * already returned successfully when this runs.
24
+ * - The Anthropic 5MB limit is checked before calling the API. Gemini and
25
+ * OpenAI accept up to ~20MB.
26
+ * - The ledger entry embeds image ID and path in the `summary` string because
27
+ * the ledger schema has no generic metadata column.
28
+ */
29
+ import * as fs from "fs";
30
+ import * as nodePath from "path";
31
+ import { getLLMProvider } from "./llm/factory.js";
32
+ import { getStorage } from "../storage/index.js";
33
+ import { debugLog } from "./logger.js";
34
+ import { PRISM_USER_ID } from "../config.js";
35
+ import { getTracer } from "./telemetry.js";
36
+ import { SpanStatusCode, context as otelContext, trace } from "@opentelemetry/api";
37
// ─── Size Caps ────────────────────────────────────────────────────────────────
/** Byte ceiling applied when the active text provider is Anthropic (5 MiB). */
const ANTHROPIC_MAX_BYTES = 5 * 1024 ** 2;
/** Byte ceiling applied for every other provider (20 MiB, a conservative cap). */
const DEFAULT_MAX_BYTES = 20 * 1024 ** 2;
// ─── MIME Type Detection ──────────────────────────────────────────────────────
/** Lower-cased file extension (including the leading dot) → MIME type. */
const EXT_TO_MIME = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".webp": "image/webp",
    ".gif": "image/gif",
    ".svg": "image/svg+xml",
};
/**
 * Maps a file path to a MIME type using its extension (case-insensitive).
 *
 * @param filePath Path or bare file name of the image.
 * @returns The MIME type from EXT_TO_MIME, or "image/png" when the extension
 *          is missing or not in the table.
 */
function getMimeType(filePath) {
    const extension = nodePath.extname(filePath).toLowerCase();
    const knownType = EXT_TO_MIME[extension];
    return knownType ?? "image/png";
}
55
+ // ─── Public API ───────────────────────────────────────────────────────────────
56
/**
 * Fire-and-forget entry point for image captioning. Intended to be called
 * from `session_save_image` once the file is in the vault and the handoff has
 * been saved. Returns immediately; pipeline failures are recorded on the span
 * and logged to stderr, never thrown to the caller.
 *
 * @param project     Project identifier
 * @param imageId     Short UUID assigned by session_save_image (e.g. "a3f1b2c9")
 * @param vaultPath   Absolute path to the copied file in ~/.prism-mcp/media/
 * @param userContext User-provided description (passed as hint to the VLM)
 */
export function fireCaptionAsync(project, imageId, vaultPath, userContext) {
    // ── v4.6.0: OTel worker span ──────────────────────────────────────────────
    // The span is created here, while the caller's OTel context is still
    // active, so it is parented to whatever span invoked us. It deliberately
    // outlives that parent: the MCP response is sent long before captioning
    // finishes, which is the expected shape for fire-and-forget work.
    const tracer = getTracer();
    const workerSpan = tracer.startSpan("worker.vlm_caption", {
        attributes: {
            "worker.image_id": imageId,
            "worker.project": project,
        },
    });
    const runPipeline = async () => {
        try {
            await captionImageAsync(project, imageId, vaultPath, userContext);
            workerSpan.setStatus({ code: SpanStatusCode.OK });
        }
        catch (err) {
            const isError = err instanceof Error;
            workerSpan.recordException(isError ? err : new Error(String(err)));
            workerSpan.setStatus({
                code: SpanStatusCode.ERROR,
                message: isError ? err.message : String(err),
            });
            console.error(`[ImageCaptioner] Failed for [${imageId}]: ${err}`);
        }
        finally {
            // Always end the span — even on VLM failure — so it gets exported.
            workerSpan.end();
        }
    };
    // Run the pipeline with the worker span set as the active span so that any
    // spans started further down the async chain are parented beneath it.
    const spanContext = trace.setSpan(otelContext.active(), workerSpan);
    otelContext.with(spanContext, () => {
        // Intentionally not awaited: this is fire-and-forget.
        void runPipeline();
    });
}
104
+ // ─── Core Pipeline ────────────────────────────────────────────────────────────
105
/**
 * Core caption pipeline for one vault image.
 *
 * Steps: resolve the LLM provider → size-check and read the file → ask the
 * VLM for a caption → patch the handoff's visual_memory entry → save a ledger
 * entry → embed that ledger entry inline.
 *
 * Returns early (without throwing) when the provider has no
 * `generateImageDescription`, the vault file is missing, the image exceeds
 * the provider's size cap, or the VLM returns an empty caption. Embedding
 * failures are logged and swallowed; every other error propagates to the
 * caller.
 *
 * @param project     Project identifier
 * @param imageId     Short image ID used to locate the visual_memory entry
 * @param vaultPath   Path of the image file to caption
 * @param userContext Optional user description passed to the VLM as a hint
 */
async function captionImageAsync(project, imageId, vaultPath, userContext) {
    // ── Step 1: Resolve provider ─────────────────────────────────────────────
    const llm = getLLMProvider();
    if (!llm.generateImageDescription) {
        debugLog(`[ImageCaptioner] Active LLM provider does not support VLM — ` +
            `captioning skipped for [${imageId}]. ` +
            `Switch to Gemini, OpenAI (gpt-4o-mini+), or Anthropic to enable.`);
        return;
    }
    // ── Step 2: Size check, then read ────────────────────────────────────────
    // stat() first so an oversized file is rejected without ever being loaded
    // into memory, and use the async fs API so this background job does not
    // block the event loop.
    let fileSizeBytes;
    try {
        ({ size: fileSizeBytes } = await fs.promises.stat(vaultPath));
    }
    catch (statErr) {
        if (statErr?.code === "ENOENT" || statErr?.code === "ENOTDIR") {
            debugLog(`[ImageCaptioner] Vault file not found: ${vaultPath}`);
            return;
        }
        throw statErr;
    }
    const mimeType = getMimeType(vaultPath);
    // Detect active text provider to apply the correct size cap.
    // We import getSettingSync lazily to avoid circular dependency issues.
    const { getSettingSync } = await import("../storage/configStorage.js");
    const textProvider = getSettingSync("text_provider", "gemini");
    const isAnthropic = textProvider === "anthropic";
    const maxBytes = isAnthropic ? ANTHROPIC_MAX_BYTES : DEFAULT_MAX_BYTES;
    // The Anthropic cap is documented in this file as a limit on the *base64*
    // payload, which is 4 bytes per 3 raw bytes — so compare the encoded size
    // for Anthropic. Other providers keep the raw-size soft cap.
    const payloadBytes = isAnthropic ? Math.ceil(fileSizeBytes / 3) * 4 : fileSizeBytes;
    if (payloadBytes > maxBytes) {
        const limitMB = (maxBytes / 1024 / 1024).toFixed(0);
        const actualMB = (payloadBytes / 1024 / 1024).toFixed(1);
        console.warn(`[ImageCaptioner] Image [${imageId}] is ${actualMB}MB` +
            (isAnthropic ? " (base64-encoded)" : "") +
            `, exceeding the ${textProvider} VLM limit (${limitMB}MB). Captioning skipped. ` +
            (isAnthropic
                // The cap is selected from text_provider, so that is the setting to change.
                ? "Switch Text Provider to Gemini/OpenAI to caption larger images."
                : "Consider resizing the image."));
        return;
    }
    const fileBuffer = await fs.promises.readFile(vaultPath);
    const imageBase64 = fileBuffer.toString("base64");
    // ── Step 3: Generate caption via VLM ────────────────────────────────────
    debugLog(`[ImageCaptioner] Captioning [${imageId}] via ${textProvider}…`);
    const caption = await llm.generateImageDescription(imageBase64, mimeType, userContext);
    if (!caption || caption.trim().length === 0) {
        debugLog(`[ImageCaptioner] Empty caption returned for [${imageId}] — skipping storage.`);
        return;
    }
    debugLog(`[ImageCaptioner] Caption generated for [${imageId}]: "${caption.slice(0, 80)}…"`);
    // ── Step 4: Patch handoff visual_memory entry ─────────────────────────
    await updateHandoffCaption(project, imageId, caption, vaultPath);
    // ── Step 5: Persist as ledger entry (makes caption semantically searchable)
    // NOTE: The ledger schema has no generic metadata column, so we embed the
    // image context directly in the summary string for LLM-readable references.
    const visualMemoryTag = `[Visual Memory: ${imageId}]`;
    // Avoid persisting the literal string "undefined" when no hint was given.
    const userDescription = userContext ?? "";
    const storage = await getStorage();
    await storage.saveLedger({
        project,
        conversation_id: "vlm-captioner",
        user_id: PRISM_USER_ID,
        event_type: "learning",
        summary: `${visualMemoryTag}\n` +
            `Path: ${vaultPath}\n` +
            `User description: ${userDescription}\n` +
            `VLM Caption: ${caption}`,
        keywords: [`image:${imageId}`, "visual_memory", "image_caption"],
    });
    // ── Step 6: Embed the caption inline ────────────────────────────────
    // We embed the caption directly here rather than calling backfillEmbeddingsHandler
    // to avoid a circular import (imageCaptioner ↔ sessionMemoryHandlers).
    try {
        const embedText = `${visualMemoryTag} Description: ${userDescription}. Caption: ${caption}`;
        const embedding = await llm.generateEmbedding(embedText);
        // Find the ledger entry we just saved and patch its embedding.
        const allEntries = await storage.getLedgerEntries({
            project,
            conversation_id: "vlm-captioner",
        });
        // Prefer entries whose summary carries this image's tag, so that two
        // captions running concurrently cannot patch each other's entry.
        // NOTE(review): assumes getLedgerEntries returns `summary`; if it does
        // not, we fall back to the previous "newest entry" behavior.
        const ownEntries = allEntries.filter((e) => typeof e?.summary === "string" && e.summary.startsWith(visualMemoryTag));
        const candidates = ownEntries.length > 0 ? ownEntries : allEntries;
        const createdAtMs = (e) => new Date(e.created_at ?? 0).getTime();
        // Copy before sorting so the storage layer's array is not reordered.
        const latest = [...candidates].sort((a, b) => createdAtMs(b) - createdAtMs(a))[0];
        if (latest?.id) {
            await storage.patchLedger(latest.id, { embedding: JSON.stringify(embedding) });
            debugLog(`[ImageCaptioner] Caption embedded for ledger entry [${latest.id}].`);
        }
    }
    catch (embedErr) {
        // Non-fatal: caption still persists in the ledger as plain text and
        // will be picked up by the next project-wide backfill sweep.
        debugLog(`[ImageCaptioner] Embedding failed (will surface in next backfill): ${embedErr}`);
    }
    debugLog(`[ImageCaptioner] Pipeline complete for [${imageId}].`);
}
190
+ // ─── Handoff Patch ────────────────────────────────────────────────────────────
191
/**
 * Adds `caption`, `caption_path` and `caption_at` to the matching
 * visual_memory entry inside the handoff metadata.
 *
 * Uses a read-modify-write because visual_memory is embedded in the handoff
 * metadata JSON blob — there's no dedicated column to patch atomically.
 *
 * On a version conflict (`saveHandoff` reports status "conflict") the
 * operation is retried once with a fresh read. If both attempts conflict, a
 * warning is logged and the function returns without throwing. It also
 * returns quietly when there is no handoff context or no entry with the given
 * image ID.
 *
 * @param project   Project identifier
 * @param imageId   ID of the visual_memory entry to patch
 * @param caption   Caption text to store on the entry
 * @param vaultPath Path stored as `caption_path`
 */
async function updateHandoffCaption(project, imageId, caption, vaultPath) {
    const MAX_ATTEMPTS = 2;
    const storage = await getStorage();
    for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
        const ctx = await storage.loadContext(project, "quick", PRISM_USER_ID);
        if (!ctx) {
            debugLog(`[ImageCaptioner] No handoff context for "${project}" — skipping caption patch.`);
            return;
        }
        const meta = ctx.metadata || {};
        // Tolerate a missing or malformed visual_memory value instead of throwing.
        const visualMemory = Array.isArray(meta.visual_memory) ? meta.visual_memory : [];
        const entry = visualMemory.find((e) => e?.id === imageId);
        if (!entry) {
            debugLog(`[ImageCaptioner] Image [${imageId}] not found in visual_memory — skipping patch.`);
            return;
        }
        // Mutate the entry in-memory (it lives inside `meta`), then save back.
        entry.caption = caption;
        entry.caption_path = vaultPath;
        entry.caption_at = new Date().toISOString();
        const handoffUpdate = {
            project,
            user_id: PRISM_USER_ID,
            metadata: meta,
            last_summary: ctx.last_summary ?? null,
            pending_todo: ctx.pending_todo ?? null,
            active_decisions: ctx.active_decisions ?? null,
            keywords: ctx.keywords ?? null,
            key_context: ctx.key_context ?? null,
            active_branch: ctx.active_branch ?? null,
        };
        // Pass the version we read so the store can detect concurrent writes.
        const result = await storage.saveHandoff(handoffUpdate, ctx.version);
        if (result.status !== "conflict") {
            debugLog(`[ImageCaptioner] Handoff patched with caption for [${imageId}] (attempt ${attempt}).`);
            return;
        }
        // OCC conflict — only announce a retry when one will actually happen.
        const willRetry = attempt < MAX_ATTEMPTS;
        debugLog(`[ImageCaptioner] OCC conflict patching [${imageId}], attempt ${attempt}.` +
            (willRetry ? " Retrying…" : ""));
    }
    console.warn(`[ImageCaptioner] Could not patch handoff for [${imageId}] after ${MAX_ATTEMPTS} attempts. ` +
        `Caption is still saved in the ledger and will surface via semantic search.`);
}
@@ -0,0 +1,128 @@
1
+ /**
2
+ * Anthropic Adapter (v4.5)
3
+ * ─────────────────────────────────────────────────────────────────────────────
4
+ * PURPOSE:
5
+ * Implements LLMProvider using Anthropic's official @anthropic-ai/sdk.
6
+ * Covers Claude 3.5 Sonnet, Claude 3 Haiku, Claude 3 Opus, etc.
7
+ *
8
+ * EMBEDDING LIMITATION:
9
+ * Anthropic does NOT offer a native text embedding API.
10
+ * Their official recommendation is Voyage AI (voyage-3, voyage-3-lite).
11
+ * `generateEmbedding()` throws an explicit, actionable error rather than
12
+ * silently returning garbage — the factory's auto-resolution logic means
13
+ * this should never be called in practice (see factory.ts).
14
+ *
15
+ * VLM CAPABILITY:
16
+ * Claude 3.5 Sonnet, Opus, and Haiku all support vision natively via the
17
+ * Messages API `image` content block. This adapter implements
18
+ * `generateImageDescription` (5MB base64 payload limit enforced in
19
+ * imageCaptioner.ts before the API call reaches this adapter).
20
+ *
21
+ * FACTORY RESOLUTION:
22
+ * When `text_provider = "anthropic"` and `embedding_provider = "auto"`,
23
+ * the factory automatically routes embeddings to the Gemini adapter.
24
+ * Users who want explicit control set `embedding_provider = "gemini"` or
25
+ * `embedding_provider = "openai"` in the Mind Palace dashboard.
26
+ *
27
+ * CONFIG KEYS (Prism dashboard "AI Providers" tab):
28
+ * anthropic_api_key — Required. Claude API key (sk-ant-...)
29
+ * anthropic_model — Chat model (default: claude-3-5-sonnet-20241022)
30
+ *
31
+ * MODEL SUGGESTIONS:
32
+ * claude-3-5-sonnet-20241022 — Best quality for compaction & summarization
33
+ * claude-3-haiku-20240307 — Fastest & cheapest; good for briefings
34
+ * claude-3-opus-20240229 — Most capable; use for complex fact merging
35
+ */
36
+ import Anthropic from "@anthropic-ai/sdk";
37
+ import { getSettingSync } from "../../../storage/configStorage.js";
38
+ import { debugLog } from "../../logger.js";
39
// ─── Constants ────────────────────────────────────────────────────────────────
// Default to Claude 3.5 Sonnet — best quality/cost ratio for the tasks
// Prism performs (compaction, briefing, fact merging, security scan).
const DEFAULT_MODEL = "claude-3-5-sonnet-20241022";
// Max output tokens for all Prism text-generation tasks.
// 4096 is sufficient for compaction summaries; raise if needed.
const MAX_TOKENS = 4096;
// Max output tokens for an image description.
const IMAGE_DESCRIPTION_MAX_TOKENS = 1024;
// Media types the Messages API accepts in a base64 `image` content block.
// Anything else (e.g. image/svg+xml) is rejected locally with a clear error
// instead of being sent to the API.
const SUPPORTED_IMAGE_MEDIA_TYPES = new Set([
    "image/jpeg",
    "image/png",
    "image/gif",
    "image/webp",
]);
/**
 * LLM provider backed by the Anthropic SDK. Supports text generation and
 * image description; text embeddings are explicitly unsupported.
 */
export class AnthropicAdapter {
    client;
    /**
     * Reads the API key from the `anthropic_api_key` setting, falling back to
     * the ANTHROPIC_API_KEY environment variable.
     *
     * @throws {Error} When no API key is configured.
     */
    constructor() {
        const apiKey = getSettingSync("anthropic_api_key", process.env.ANTHROPIC_API_KEY ?? "");
        if (!apiKey) {
            throw new Error("AnthropicAdapter requires an API key. " +
                "Set ANTHROPIC_API_KEY or configure it in the Mind Palace dashboard.");
        }
        this.client = new Anthropic({ apiKey });
        debugLog("[AnthropicAdapter] Initialized");
    }
    // ─── Text Generation ─────────────────────────────────────────────────────
    /**
     * Sends a single user prompt and returns the text of the first content block.
     *
     * @param prompt            User message content.
     * @param systemInstruction Optional system prompt (omitted when falsy).
     * @returns The generated text.
     * @throws {Error} When the first content block is missing or not text.
     */
    async generateText(prompt, systemInstruction) {
        const model = getSettingSync("anthropic_model", DEFAULT_MODEL);
        debugLog(`[AnthropicAdapter] generateText — model=${model}`);
        // Anthropic's Messages API uses system as a top-level field (not a message role).
        // This maps cleanly to LLMProvider's systemInstruction parameter.
        const response = await this.client.messages.create({
            model,
            max_tokens: MAX_TOKENS,
            ...(systemInstruction ? { system: systemInstruction } : {}),
            messages: [{ role: "user", content: prompt }],
        });
        // Anthropic returns an array of content blocks; we only use the first,
        // and only if it is a text block.
        const block = response.content[0];
        if (!block || block.type !== "text") {
            throw new Error(`[AnthropicAdapter] Unexpected response content type: ${block?.type ?? "empty"}`);
        }
        return block.text;
    }
    // ─── Embedding Generation (Not Supported) ────────────────────────────────
    /**
     * Always rejects: this adapter has no embedding implementation. The error
     * message tells the user which dashboard setting to change.
     *
     * @throws {Error} Always.
     */
    async generateEmbedding(_text) {
        throw new Error("AnthropicAdapter does not support text embeddings. " +
            "Anthropic has no native embedding API. " +
            "In the Mind Palace dashboard, set 'Embedding Provider' to Gemini or OpenAI/Ollama. " +
            "When using Ollama locally, 'nomic-embed-text' is a free, high-quality option.");
    }
    // ─── Image Description (VLM) ─────────────────────────────────────────────
    /**
     * Describe an image using the Anthropic Messages API vision capability
     * (an `image` content block with a base64 `source`).
     *
     * @param imageBase64 Base64-encoded image bytes.
     * @param mimeType    Image media type; must be one of
     *                    SUPPORTED_IMAGE_MEDIA_TYPES.
     * @param context     Optional user hint woven into the prompt.
     * @returns The description text, or "" when the first content block is not text.
     * @throws {Error} When `mimeType` is not a supported image media type.
     */
    async generateImageDescription(imageBase64, mimeType, context) {
        // Validate before touching settings or the network: callers may hand us
        // a media type the API cannot decode (imageCaptioner maps .svg to
        // image/svg+xml), and a local error is clearer than a remote 400.
        if (!SUPPORTED_IMAGE_MEDIA_TYPES.has(mimeType)) {
            throw new Error(`AnthropicAdapter does not support image media type "${mimeType}". ` +
                `Supported types: ${[...SUPPORTED_IMAGE_MEDIA_TYPES].join(", ")}.`);
        }
        const model = getSettingSync("anthropic_model", DEFAULT_MODEL);
        const prompt = context
            ? `Describe this image in rich detail for a developer knowledge base. User context: "${context}"`
            : "Describe this image in rich detail for a developer knowledge base. Include: UI elements, visible text, architectural components, and key observations.";
        debugLog(`[AnthropicAdapter] generateImageDescription — model=${model}`);
        const response = await this.client.messages.create({
            model,
            max_tokens: IMAGE_DESCRIPTION_MAX_TOKENS,
            messages: [{
                    role: "user",
                    content: [
                        {
                            type: "image",
                            source: {
                                type: "base64",
                                media_type: mimeType,
                                data: imageBase64,
                            },
                        },
                        { type: "text", text: prompt },
                    ],
                }],
        });
        const block = response.content[0];
        return block?.type === "text" ? block.text : "";
    }
}
@@ -0,0 +1,152 @@
1
+ /**
2
+ * Gemini Adapter (v4.5)
3
+ * ─────────────────────────────────────────────────────────────────────────────
4
+ * PURPOSE:
5
+ * Implements LLMProvider using Google's @google/generative-ai SDK.
6
+ * This is Prism's DEFAULT adapter and the result of consolidating LLM logic
7
+ * that was previously scattered across 6 different files into a single,
8
+ * well-guarded implementation.
9
+ *
10
+ * BEFORE v4.4 (scattered):
11
+ * - src/utils/embeddingApi.ts → generateEmbedding logic
12
+ * - src/utils/googleAi.ts → analyzePaperWithGemini text generation
13
+ * - compactionHandler.ts → direct new GoogleGenerativeAI() instantiation
14
+ * - factMerger.ts → direct new GoogleGenerativeAI() instantiation
15
+ * - briefing.ts → direct new GoogleGenerativeAI() instantiation
16
+ * - healthCheck.ts → direct new GoogleGenerativeAI() instantiation
17
+ *
18
+ * AFTER v4.4 (consolidated here):
19
+ * All embedding guards, model constants, and SDK calls live in one place.
20
+ * All consumers call getLLMProvider() instead of touching the SDK directly.
21
+ *
22
+ * MODELS:
23
+ * Text: gemini-2.0-flash — fast, matches all prior hardcoded usages
24
+ * Embedding: gemini-embedding-001 — replaced text-embedding-004 (deprecated 2026-01)
25
+ * Uses Matryoshka Representation Learning (MRL) at 768 dims.
26
+ * Requires v1beta API endpoint (NOT v1).
27
+ *
28
+ * SDK NOTE:
29
+ * Still using @google/generative-ai@^0.24.1 (NOT the newer @google/genai).
30
+ * This is intentional — upgrading the SDK at the same time as introducing
31
+ * the abstraction layer would conflate two sources of behavioral change.
32
+ * SDK upgrade is a separate, future task.
33
+ */
34
+ import { GoogleGenerativeAI, TaskType, } from "@google/generative-ai";
35
+ import { GOOGLE_API_KEY } from "../../../config.js";
36
+ import { debugLog } from "../../logger.js";
37
// ─── Model Constants ──────────────────────────────────────────────────────────
// Defined as constants (not hardcoded strings) so external reviewers can see
// all model choices at a glance, and future changes only need one edit.
const TEXT_MODEL = "gemini-2.0-flash"; // chat/instruction-following model
const EMBEDDING_MODEL = "gemini-embedding-001"; // vector embedding model (MRL-enabled)
const EMBEDDING_DIMS = 768; // fixed output dims — must match DB schema
// ─── Embedding Truncation Constants ──────────────────────────────────────────
// gemini-embedding-001 supports up to ~2048 tokens.
// We use a character-based limit (not token-based) because:
//   1. JS string.length returns UTF-16 code units, not tokens
//   2. Token counting would require an extra API call or tokenizer dependency
//   3. 8000 chars ≈ 1500-2000 tokens for typical prose — safely under the limit
const MAX_EMBEDDING_CHARS = 8000;
/**
 * LLM provider backed by Google's generative AI SDK: text generation,
 * fixed-dimension embeddings, and image description.
 */
export class GeminiAdapter {
    // The underlying Google SDK client — initialized once per adapter instance.
    ai;
    /**
     * @throws {Error} When GOOGLE_API_KEY is not set (fail fast at construction
     *                 time rather than at the first API call).
     */
    constructor() {
        if (!GOOGLE_API_KEY) {
            throw new Error("GeminiAdapter requires GOOGLE_API_KEY. " +
                "Set this environment variable to enable LLM features.");
        }
        this.ai = new GoogleGenerativeAI(GOOGLE_API_KEY);
    }
    /**
     * Truncates `text` to at most `maxChars` UTF-16 code units without
     * producing malformed or needlessly short output.
     *
     *  - Text already within the limit is returned unchanged.
     *  - After the hard cut, a trailing unpaired high surrogate is dropped so
     *    an astral character (e.g. an emoji) is never split in half.
     *  - The result is snapped back to the last space only when that space
     *    lies in the second half of the cut. For text with few or no spaces
     *    (minified code, CJK, long identifiers) snapping would throw away
     *    most of the input, so the hard cut is kept instead.
     *
     * @param text     Input string.
     * @param maxChars Maximum length in UTF-16 code units (default MAX_EMBEDDING_CHARS).
     * @returns The possibly-truncated string.
     */
    static truncateForEmbedding(text, maxChars = MAX_EMBEDDING_CHARS) {
        if (text.length <= maxChars) {
            return text;
        }
        let cut = text.substring(0, maxChars);
        const lastUnit = cut.charCodeAt(cut.length - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
            // High surrogate whose low half was cut off — drop it.
            cut = cut.substring(0, cut.length - 1);
        }
        const lastSpace = cut.lastIndexOf(" ");
        if (lastSpace > 0 && lastSpace >= maxChars / 2) {
            cut = cut.substring(0, lastSpace);
        }
        return cut;
    }
    // ─── Text Generation ─────────────────────────────────────────────────────
    /**
     * Generates text with TEXT_MODEL.
     *
     * @param prompt            Prompt passed to generateContent.
     * @param systemInstruction Optional system instruction (omitted when falsy).
     * @returns The response text.
     */
    async generateText(prompt, systemInstruction) {
        const model = this.ai.getGenerativeModel({
            model: TEXT_MODEL,
            // Only spread systemInstruction if provided — avoids sending an empty field.
            ...(systemInstruction ? { systemInstruction } : {}),
        });
        const result = await model.generateContent(prompt);
        return result.response.text();
    }
    // ─── Embedding Generation ────────────────────────────────────────────────
    /**
     * Generates an EMBEDDING_DIMS-dimensional embedding for `text`, truncating
     * over-long input first (see truncateForEmbedding).
     *
     * @param text Text to embed; must contain non-whitespace characters.
     * @returns Array of EMBEDDING_DIMS numbers.
     * @throws {Error} When `text` is empty/blank, or when the returned vector
     *                 does not have exactly EMBEDDING_DIMS values.
     */
    async generateEmbedding(text) {
        // Guard: empty string would produce a useless/degenerate embedding.
        // Better to fail loudly here than store a zero-vector in the DB.
        if (!text || !text.trim()) {
            throw new Error("Cannot generate embedding for empty text.");
        }
        // ── Truncation Guard ───────────────────────────────────────────────────
        // Long session summaries (esp. code-heavy ones) can exceed the model's
        // context window; truncate proactively rather than let the API reject it.
        const inputText = GeminiAdapter.truncateForEmbedding(text);
        if (inputText.length < text.length) {
            debugLog(`[GeminiAdapter] Embedding input truncated from ${text.length}` +
                ` to ~${MAX_EMBEDDING_CHARS} chars (word-safe)`);
        }
        // ── API Version Pin ────────────────────────────────────────────────────
        // gemini-embedding-001 is requested on the v1beta endpoint.
        const model = this.ai.getGenerativeModel({ model: EMBEDDING_MODEL }, { apiVersion: "v1beta" });
        debugLog(`[GeminiAdapter] Generating ${EMBEDDING_DIMS}-dim embedding` +
            ` for ${inputText.length} chars`);
        // ── Request Construction ───────────────────────────────────────────────
        const request = {
            content: {
                role: "user",
                parts: [{ text: inputText }],
            },
            taskType: TaskType.SEMANTIC_SIMILARITY,
            outputDimensionality: EMBEDDING_DIMS, // MRL truncation to 768 dims
        };
        const result = await model.embedContent(request);
        const values = result.embedding.values;
        // ── Dimension Enforcement ──────────────────────────────────────────────
        // Hard check: throwing here is better than silently writing a wrong-size
        // vector to the DB.
        if (!Array.isArray(values) || values.length !== EMBEDDING_DIMS) {
            throw new Error(`Embedding dimension mismatch: expected ${EMBEDDING_DIMS},` +
                ` got ${values?.length ?? "unknown"}`);
        }
        return values;
    }
    // ─── Image Description (VLM) ─────────────────────────────────────────────
    /**
     * Describe an image using TEXT_MODEL with an inline base64 image part.
     *
     * @param imageBase64 Base64-encoded image bytes.
     * @param mimeType    MIME type sent alongside the inline data.
     * @param context     Optional user hint woven into the prompt.
     * @returns The response text.
     */
    async generateImageDescription(imageBase64, mimeType, context) {
        const model = this.ai.getGenerativeModel({ model: TEXT_MODEL });
        const prompt = context
            ? `Describe this image in rich detail for a developer knowledge base. User context: "${context}"`
            : "Describe this image in rich detail for a developer knowledge base. Include: UI elements, visible text, architectural components, and key observations.";
        const result = await model.generateContent([
            { inlineData: { data: imageBase64, mimeType } },
            prompt,
        ]);
        return result.response.text();
    }
}