@toolbaux/guardian 0.1.23 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,32 +13,62 @@
13
13
  */
14
14
  import fs from "node:fs/promises";
15
15
  import path from "node:path";
16
+ import { randomUUID } from "node:crypto";
16
17
  import { DEFAULT_SPECS_DIR } from "../config.js";
17
18
  const DEFAULT_CONFIG = {
18
19
  docs: {
19
20
  mode: "full",
20
21
  },
21
22
  };
23
+ /**
24
+ * Hook script written to .claude/hooks/mcp-first.sh
25
+ *
26
+ * Blocks Read/Glob/Grep until a guardian MCP tool has been called in the
27
+ * current session. Session state lives in /tmp/guardian-used-<SESSION_ID>
28
+ * and is set by the PostToolUse hook (guardian-used.sh) below.
29
+ */
22
30
  const CLAUDE_CODE_HOOK_SCRIPT = `#!/bin/bash
23
- # Guardian MCP-first hook — ensures AI tools use Guardian MCP before reading source files.
24
- # Installed by: guardian init
31
+ # Guardian MCP-first hook — blocks Read/Glob/Grep until guardian tools are used.
32
+ # Installed by: guardian init (v3 — session-scoped flag, no time drift)
25
33
 
26
34
  INPUT=$(cat)
27
- TOOL_NAME=$(echo "$INPUT" | jq -r '.tool_name // empty')
35
+ SESSION_ID=$(echo "$INPUT" | jq -r '.session_id // "default"')
36
+ FLAG="/tmp/guardian-used-\${SESSION_ID}"
28
37
 
29
- cat >&2 <<BLOCK
30
- BLOCKED: Use Guardian MCP tools before reading source files.
38
+ # If guardian was called in this session, allow all file operations.
39
+ if [ -f "$FLAG" ]; then
40
+ exit 0
41
+ fi
42
+
43
+ cat >&2 <<'BLOCK'
44
+ BLOCKED: Use Guardian MCP tools before exploring source files.
31
45
 
32
- Use these MCP tools first:
33
- - guardian_orientget codebase overview
34
- - guardian_search find features by keyword
35
- - guardian_context deep dive into a specific area
46
+ Call one of these first:
47
+ guardian_search("your query")find files/symbols/endpoints by keyword
48
+ guardian_grep("pattern") semantic grep (replaces Grep tool)
49
+ guardian_glob("src/auth/**") semantic file discovery (replaces Glob tool)
50
+ guardian_orient() — get codebase overview
36
51
 
37
- Then you can read individual files as needed.
52
+ File reads are unblocked automatically for the rest of this session.
38
53
  BLOCK
39
54
 
40
55
  exit 2
41
56
  `;
57
+ /**
58
+ * Hook script written to .claude/hooks/guardian-used.sh
59
+ *
60
+ * Called by PostToolUse after any guardian_* tool. Sets the session flag
61
+ * that mcp-first.sh checks, unblocking subsequent Read/Glob/Grep calls.
62
+ */
63
+ const GUARDIAN_USED_SCRIPT = `#!/bin/bash
64
+ # Guardian PostToolUse hook — marks guardian as used for this session.
65
+ # Installed by: guardian init
66
+
67
+ INPUT=$(cat)
68
+ SESSION_ID=$(echo "$INPUT" | jq -r '.session_id // "default"')
69
+ touch "/tmp/guardian-used-\${SESSION_ID}"
70
+ exit 0
71
+ `;
42
72
  const HOOK_SCRIPT = `#!/bin/sh
43
73
  # guardian pre-commit hook — keeps architecture context fresh
44
74
  # Installed by: guardian init
@@ -80,6 +110,22 @@ export async function runInit(options) {
80
110
  else {
81
111
  console.log(" · guardian.config.json already exists");
82
112
  }
113
+ // 2b. Ensure project_id is present in guardian.config.json
114
+ try {
115
+ const raw = await fs.readFile(configPath, "utf8");
116
+ const cfg = JSON.parse(raw);
117
+ if (!cfg.project_id) {
118
+ cfg.project_id = randomUUID();
119
+ await fs.writeFile(configPath, JSON.stringify(cfg, null, 2) + "\n", "utf8");
120
+ console.log(" ✓ Added project_id to guardian.config.json");
121
+ }
122
+ else {
123
+ console.log(" · guardian.config.json already has project_id");
124
+ }
125
+ }
126
+ catch {
127
+ // Non-fatal — config may not be valid JSON yet
128
+ }
83
129
  // 3. Install pre-commit hook
84
130
  if (!options.skipHook) {
85
131
  const gitDir = path.join(root, ".git");
@@ -158,8 +204,6 @@ export async function runInit(options) {
158
204
  const { runExtract } = await import("./extract.js");
159
205
  await runExtract({
160
206
  projectRoot: root,
161
- backendRoot: options.backendRoot,
162
- frontendRoot: options.frontendRoot,
163
207
  output: specsDir,
164
208
  includeFileGraph: true,
165
209
  backend: options.backend,
@@ -167,8 +211,6 @@ export async function runInit(options) {
167
211
  const { runGenerate } = await import("./generate.js");
168
212
  await runGenerate({
169
213
  projectRoot: root,
170
- backendRoot: options.backendRoot,
171
- frontendRoot: options.frontendRoot,
172
214
  output: specsDir,
173
215
  aiContext: true,
174
216
  });
@@ -213,18 +255,13 @@ async function setupClaudeCodeHooks(root, specsDir) {
213
255
  try {
214
256
  mcpConfig = JSON.parse(await fs.readFile(mcpJsonPath, "utf8"));
215
257
  }
216
- catch {
217
- // Corrupted — overwrite
218
- }
258
+ catch { /* corrupted — overwrite */ }
219
259
  }
220
260
  if (!mcpConfig.mcpServers)
221
261
  mcpConfig.mcpServers = {};
222
262
  const servers = mcpConfig.mcpServers;
223
263
  if (!servers.guardian) {
224
- servers.guardian = {
225
- command: "guardian",
226
- args: ["mcp-serve", "--specs", specsDir],
227
- };
264
+ servers.guardian = { command: "guardian", args: ["mcp-serve", "--specs", specsDir] };
228
265
  await fs.writeFile(mcpJsonPath, JSON.stringify(mcpConfig, null, 2) + "\n", "utf8");
229
266
  console.log(" ✓ Created .mcp.json (MCP server config)");
230
267
  }
@@ -238,58 +275,49 @@ async function setupClaudeCodeHooks(root, specsDir) {
238
275
  const claudeDir = path.join(root, ".claude");
239
276
  const hooksDir = path.join(claudeDir, "hooks");
240
277
  const settingsPath = path.join(claudeDir, "settings.json");
241
- const hookScriptPath = path.join(hooksDir, "mcp-first.sh");
278
+ const mcpFirstPath = path.join(hooksDir, "mcp-first.sh");
279
+ const guardianUsedPath = path.join(hooksDir, "guardian-used.sh");
242
280
  await fs.mkdir(hooksDir, { recursive: true });
243
- // Write the hook script
244
- if (!(await fileExists(hookScriptPath))) {
245
- await fs.writeFile(hookScriptPath, CLAUDE_CODE_HOOK_SCRIPT, "utf8");
246
- await fs.chmod(hookScriptPath, 0o755);
247
- console.log(" ✓ Created Claude Code MCP-first hook (.claude/hooks/mcp-first.sh)");
248
- }
249
- else {
250
- console.log(" · Claude Code hook already exists");
251
- }
281
+ // Always overwrite hook scripts so they stay in sync with this version of guardian.
282
+ await fs.writeFile(mcpFirstPath, CLAUDE_CODE_HOOK_SCRIPT, "utf8");
283
+ await fs.chmod(mcpFirstPath, 0o755);
284
+ console.log(" ✓ Wrote .claude/hooks/mcp-first.sh (PreToolUse — blocks until guardian called)");
285
+ await fs.writeFile(guardianUsedPath, GUARDIAN_USED_SCRIPT, "utf8");
286
+ await fs.chmod(guardianUsedPath, 0o755);
287
+ console.log(" ✓ Wrote .claude/hooks/guardian-used.sh (PostToolUse — sets session flag)");
252
288
  // Write or merge .claude/settings.json
253
289
  let settings = {};
254
290
  if (await fileExists(settingsPath)) {
255
291
  try {
256
292
  settings = JSON.parse(await fs.readFile(settingsPath, "utf8"));
257
293
  }
258
- catch {
259
- // Corrupted file — overwrite
260
- }
294
+ catch { /* corrupted — overwrite */ }
261
295
  }
262
- // Add MCP server config
296
+ // MCP server registration
263
297
  if (!settings.mcpServers)
264
298
  settings.mcpServers = {};
265
- const mcpServers = settings.mcpServers;
266
- if (!mcpServers.guardian) {
267
- mcpServers.guardian = {
268
- command: "guardian",
269
- args: ["mcp-serve", "--specs", specsDir],
270
- };
271
- }
272
- // Add PreToolUse hook
273
- const hookEntry = {
274
- matcher: "Read|Glob|Grep",
275
- hooks: [
299
+ settings.mcpServers.guardian = {
300
+ command: "guardian",
301
+ args: ["mcp-serve", "--specs", specsDir],
302
+ };
303
+ // Hooks always overwrite to keep in sync with installed scripts.
304
+ settings.hooks = {
305
+ // PreToolUse: block Read/Glob/Grep until a guardian tool has been called.
306
+ // The script itself handles the session-flag check — no "if" filter needed here.
307
+ PreToolUse: [
308
+ {
309
+ matcher: "Read|Glob|Grep",
310
+ hooks: [{ type: "command", command: ".claude/hooks/mcp-first.sh" }],
311
+ },
312
+ ],
313
+ // PostToolUse: set the session flag after any guardian MCP tool call.
314
+ PostToolUse: [
276
315
  {
277
- type: "command",
278
- if: "Read(//*/src/*)|Glob(*src*)|Grep(*src*)",
279
- command: '"$CLAUDE_PROJECT_DIR"/.claude/hooks/mcp-first.sh',
316
+ matcher: "mcp__guardian__guardian_search|mcp__guardian__guardian_orient|mcp__guardian__guardian_context|mcp__guardian__guardian_impact|mcp__guardian__guardian_grep|mcp__guardian__guardian_glob",
317
+ hooks: [{ type: "command", command: ".claude/hooks/guardian-used.sh" }],
280
318
  },
281
319
  ],
282
320
  };
283
- if (!settings.hooks)
284
- settings.hooks = {};
285
- const hooks = settings.hooks;
286
- if (!hooks.PreToolUse) {
287
- hooks.PreToolUse = [hookEntry];
288
- console.log(" ✓ Configured Claude Code PreToolUse hook in .claude/settings.json");
289
- }
290
- else {
291
- console.log(" · Claude Code PreToolUse hook already configured");
292
- }
293
321
  await fs.writeFile(settingsPath, JSON.stringify(settings, null, 2) + "\n", "utf8");
294
- console.log(" ✓ Updated .claude/settings.json (MCP server + hooks)");
322
+ console.log(" ✓ Updated .claude/settings.json (MCP server + PreToolUse + PostToolUse hooks)");
295
323
  }
@@ -13,6 +13,7 @@ import { writeCodebaseIntelligenceViaStore } from "../extract/codebase-intel.js"
13
13
  import { getOutputLayout } from "../output-layout.js";
14
14
  import { SqliteSpecsStore } from "../db/sqlite-specs-store.js";
15
15
  import { populateFTSIndex } from "../db/fts-builder.js";
16
+ import { embedFunctions } from "../db/embeddings.js";
16
17
  export async function runIntel(options) {
17
18
  const specsDir = path.resolve(options.specs);
18
19
  const layout = getOutputLayout(specsDir);
@@ -49,6 +50,28 @@ export async function runIntel(options) {
49
50
  catch { /* not generated yet — skip */ }
50
51
  populateFTSIndex(store, intel, arch, funcIntel);
51
52
  console.log(`Built FTS5 search index (${Object.keys(intel.api_registry ?? {}).length} endpoints indexed)`);
53
+ // Populate module_metrics from structural-intelligence.json (if present).
54
+ try {
55
+ const siRaw = await (await import("node:fs/promises")).readFile((await import("node:path")).join(machineDir, "structural-intelligence.json"), "utf8");
56
+ const siReports = JSON.parse(siRaw);
57
+ if (Array.isArray(siReports) && siReports.length > 0) {
58
+ store.rebuildModuleMetrics(siReports);
59
+ console.log(`Indexed ${siReports.length} module metrics`);
60
+ }
61
+ }
62
+ catch { /* structural-intelligence.json not generated yet — skip */ }
63
+ // Embed functions for semantic (vector) search.
64
+ // Uses local on-device model by default (no API key needed).
65
+ // If OPENAI_API_KEY is set, uses OpenAI text-embedding-3-small (better quality).
66
+ if (funcIntel?.functions?.length) {
67
+ console.log(`[guardian embed] embedding ${funcIntel.functions.length} functions…`);
68
+ try {
69
+ await embedFunctions(store, funcIntel.functions, process.env.OPENAI_API_KEY);
70
+ }
71
+ catch (err) {
72
+ console.warn(`[guardian embed] skipped: ${err.message}`);
73
+ }
74
+ }
52
75
  }
53
76
  console.log(`Wrote guardian.db → ${layout.rootDir}`);
54
77
  }
@@ -111,6 +111,83 @@ async function search(args) {
111
111
  async function model(args) {
112
112
  return runCli(["search", "--model", args.name, "--input", specsInputDir]);
113
113
  }
114
/**
 * guardian_grep — grep-style front end over `guardian search`.
 *
 * Runs the search CLI with JSON output and renders the matching symbols
 * (`file:line: name`) and files as plain text. At most 25 symbols and 15 files
 * are listed. If the CLI output is not valid JSON (or has an unexpected shape)
 * the raw output is returned unchanged.
 *
 * @param {{query: string, path?: string}} args
 *   query — text passed to the search CLI as `--query`.
 *   path  — optional prefix; only results whose file path starts with it are
 *           listed. A leading "./" and backslash separators are normalized
 *           before comparing.
 * @returns {Promise<string>} formatted result text, or the raw CLI output.
 */
async function grep(args) {
    const raw = await runCli([
        "search", "--query", args.query, "--format", "json", "--backend", "auto", "--input", specsInputDir,
    ]);
    // Compare paths with forward slashes and without a leading "./".
    const normalize = (p) => String(p ?? "").replace(/\\/g, "/").replace(/^\.\//, "");
    const prefix = typeof args.path === "string" ? normalize(args.path.trim()) : "";
    const underPrefix = (p) => prefix === "" || normalize(p).startsWith(prefix);
    try {
        const data = JSON.parse(raw);
        const header = prefix === ""
            ? `guardian_grep("${args.query}")`
            : `guardian_grep("${args.query}", path="${prefix}")`;
        const lines = [header];
        // Filter before slicing so the display limits apply to in-scope results.
        const symbols = (data.symbols ?? []).filter(s => underPrefix(s.file));
        if (symbols.length) {
            lines.push("\nSymbols (file:line: name):");
            for (const s of symbols.slice(0, 25)) {
                lines.push(` ${s.file}:${s.line}: ${s.name}`);
            }
        }
        const files = (data.files ?? []).filter(f => underPrefix(f.file_path));
        if (files.length) {
            lines.push("\nFiles:");
            for (const f of files.slice(0, 15)) {
                lines.push(` ${f.file_path}`);
            }
        }
        if (lines.length === 1)
            lines.push(" (no matches — try a different query)");
        return lines.join("\n");
    }
    catch {
        return raw; // passthrough if search returns plain text
    }
}
148
/**
 * guardian_glob — file discovery via `guardian search`, driven by a glob pattern.
 *
 * The pattern is not evaluated as a glob. Instead, keywords are extracted from
 * it and passed to the search CLI as the query:
 *   "src/auth/**\/*.ts"            → "auth"
 *   "src/middleware/error*"        → "middleware error"
 *   "src/{auth,users}/**\/*.{ts,tsx}" → "auth users"
 * Segments of two characters or fewer and the generic names
 * src/lib/dist/app/index are dropped. When nothing is left (for example a pure
 * extension glob) no search is run and a hint to use guardian_search is
 * returned instead.
 *
 * @param {{pattern: string}} args
 * @returns {Promise<string>} up to 20 matching file paths as text, the hint
 *   message, or the raw CLI output when it is not valid JSON.
 */
async function glob(args) {
    const pattern = String(args?.pattern ?? "");
    const keywords = pattern
        .replace(/\.\{[^{}]*\}$/, "") // strip trailing extension group, e.g. ".{ts,tsx}"
        .replace(/\*\*?/g, " ")
        .replace(/\.\w+$/, "") // strip trailing extension
        .replace(/[[\]{}?!,]/g, " ") // remaining glob punctuation is not a keyword
        .split(/[/\\\s]+/)
        .map(s => s.replace(/^\.+|\.+$/g, "")) // ".test" → "test"
        .filter(s => s.length > 2 && !/^(src|lib|dist|app|index)$/.test(s))
        .join(" ")
        .trim();
    if (!keywords) {
        return [
            `guardian_glob("${pattern}"): pattern has no meaningful keywords.`,
            `Use guardian_search with a descriptive query instead, e.g.:`,
            ` guardian_search("TypeScript source files") — or describe what you're looking for.`,
        ].join("\n");
    }
    const raw = await runCli([
        "search", "--query", keywords, "--format", "json", "--backend", "auto", "--input", specsInputDir,
    ]);
    try {
        const data = JSON.parse(raw);
        const files = data.files ?? [];
        const lines = [
            `guardian_glob("${pattern}") — searched: "${keywords}"`,
            `\nMatching files:`,
            ...files.slice(0, 20).map(f => ` ${f.file_path}`),
        ];
        if (files.length === 0)
            lines.push(" (no matches)");
        return lines.join("\n");
    }
    catch {
        return raw; // passthrough if search returns plain text
    }
}
114
191
  // ── MCP protocol ──
115
192
  const TOOLS = [
116
193
  {
@@ -167,6 +244,39 @@ const TOOLS = [
167
244
  description: "MCP usage stats for this session. Call at end to evaluate guardian's usefulness.",
168
245
  inputSchema: { type: "object", properties: {} },
169
246
  },
247
+ {
248
+ name: "guardian_grep",
249
+ description: [
250
+ "Semantic grep — find symbols and files matching a keyword or pattern.",
251
+ "Use INSTEAD of the Grep tool. Returns matching function/class names with file:line locations.",
252
+ "Backed by BM25 + call-graph authority so relevant source definitions surface first.",
253
+ "Example: guardian_grep('validate token') → auth.py:42: validate_token, middleware.py:18: check_jwt",
254
+ ].join(" "),
255
+ inputSchema: {
256
+ type: "object",
257
+ properties: {
258
+ query: { type: "string", description: "Keyword or phrase to search for (natural language OK)" },
259
+ path: { type: "string", description: "Optional: restrict to files under this path prefix" },
260
+ },
261
+ required: ["query"],
262
+ },
263
+ },
264
+ {
265
+ name: "guardian_glob",
266
+ description: [
267
+ "Semantic file discovery — find files matching a path pattern.",
268
+ "Use INSTEAD of the Glob tool. Extracts keywords from the pattern and searches the guardian index.",
269
+ "Example: guardian_glob('src/auth/**/*.ts') → searches for 'auth typescript' files.",
270
+ "For pure extension globs with no path context, use guardian_search with a descriptive query.",
271
+ ].join(" "),
272
+ inputSchema: {
273
+ type: "object",
274
+ properties: {
275
+ pattern: { type: "string", description: "Glob pattern (e.g. 'src/auth/**/*.ts', '**/middleware*')" },
276
+ },
277
+ required: ["pattern"],
278
+ },
279
+ },
170
280
  ];
171
281
  const TOOL_HANDLERS = {
172
282
  guardian_orient: orient,
@@ -175,6 +285,8 @@ const TOOL_HANDLERS = {
175
285
  guardian_search: search,
176
286
  guardian_model: model,
177
287
  guardian_metrics: async () => JSON.stringify(metrics.summary()),
288
+ guardian_grep: grep,
289
+ guardian_glob: glob,
178
290
  };
179
291
  function respond(id, result) {
180
292
  const msg = JSON.stringify({ jsonrpc: "2.0", id, result });
@@ -17,6 +17,7 @@ export async function runSearch(options) {
17
17
  if (sqliteResult !== null) {
18
18
  const base = JSON.parse(await querySearch(inputDir, options.query));
19
19
  base.files = sqliteResult.files;
20
+ base.symbols = sqliteResult.symbols;
20
21
  base.search_signal = sqliteResult.signal;
21
22
  console.log(JSON.stringify(base));
22
23
  return;
@@ -138,12 +139,36 @@ async function runSearchSqlite(specsInput, query, limit, backend = "sqlite") {
138
139
  console.log(`No FTS results for "${query}"`);
139
140
  return true;
140
141
  }
142
+ let queryVec;
143
+ try {
144
+ const { embedQuery } = await import("../db/embeddings.js");
145
+ const vec = await embedQuery(cleaned || query, process.env.OPENAI_API_KEY);
146
+ if (vec)
147
+ queryVec = vec;
148
+ }
149
+ catch { /* graceful degradation */ }
150
+ const symbols = store.searchSymbols(cleaned || query, Math.ceil(limit / 2), queryVec);
141
151
  const lines = [`## FTS5 search: "${query}"\n`];
152
+ // Build a map of file → matching symbols for quick lookup
153
+ const symbolsByFile = new Map();
154
+ for (const s of symbols) {
155
+ if (!symbolsByFile.has(s.file_path))
156
+ symbolsByFile.set(s.file_path, []);
157
+ symbolsByFile.get(s.file_path).push({ name: s.name, line: s.line });
158
+ }
142
159
  for (const r of results) {
143
160
  const rank = Math.abs(r.rank).toFixed(3);
144
161
  lines.push(`### \`${r.file_path}\` (score: ${rank})`);
145
- if (r.symbol_name)
146
- lines.push(` symbols: ${r.symbol_name}`);
162
+ // Matching symbols from this file (snippet equivalent)
163
+ const fileSyms = symbolsByFile.get(r.file_path) ?? [];
164
+ const inlineSyms = r.matching_symbols.filter(s => !fileSyms.some(f => f.name === s));
165
+ if (fileSyms.length) {
166
+ for (const s of fileSyms)
167
+ lines.push(` → \`${s.name}\` :${s.line}`);
168
+ }
169
+ if (inlineSyms.length) {
170
+ lines.push(` symbols: ${inlineSyms.join(", ")}`);
171
+ }
147
172
  if (r.imports.length)
148
173
  lines.push(` imports: ${r.imports.join(", ")}`);
149
174
  if (r.used_by.length)
@@ -177,7 +202,22 @@ async function getSqliteFileList(specsInput, query, limit, backend = "auto") {
177
202
  if (results.length === 0)
178
203
  return null;
179
204
  const signal = store.querySignal(query);
180
- return { files: results.map((r) => r.file_path), signal };
205
+ // Hybrid symbol search: BM25 + call-graph authority + optional vector similarity.
206
+ // embedQuery uses local model (no API key) or OpenAI if OPENAI_API_KEY is set.
207
+ let queryVec;
208
+ try {
209
+ const { embedQuery } = await import("../db/embeddings.js");
210
+ const vec = await embedQuery(cleaned || query, process.env.OPENAI_API_KEY);
211
+ if (vec)
212
+ queryVec = vec;
213
+ }
214
+ catch { /* graceful degradation — vector unavailable */ }
215
+ const symbols = store.searchSymbols(cleaned || query, Math.ceil(limit / 2), queryVec);
216
+ return {
217
+ files: results.map((r) => r.file_path),
218
+ symbols: symbols.map((s) => ({ file: s.file_path, name: s.name, line: s.line })),
219
+ signal,
220
+ };
181
221
  }
182
222
  finally {
183
223
  await store.close();
package/dist/config.js CHANGED
@@ -273,6 +273,7 @@ function normalizeConfig(input, configDir) {
273
273
  }
274
274
  function mergeConfig(base, override) {
275
275
  return {
276
+ project_id: override.project_id ?? base.project_id,
276
277
  project: {
277
278
  root: override.project?.root ?? base.project?.root ?? "",
278
279
  backendRoot: override.project?.backendRoot ?? base.project?.backendRoot ?? "",
@@ -0,0 +1,113 @@
1
+ /**
2
+ * Embedding generation for function-level semantic search.
3
+ *
4
+ * Strategy (local-first, no API key required):
5
+ * Default — @xenova/transformers running Xenova/all-MiniLM-L6-v2 on-device.
6
+ * Model downloads once (~23 MB) and is cached in ~/.cache/xenova.
7
+ * dim=384, pure JS/ONNX, no external service needed.
8
+ *
9
+ * Upgrade — OpenAI text-embedding-3-small when OPENAI_API_KEY is set.
10
+ * dim=256, higher quality, costs ~$0.002 per 1M tokens.
11
+ *
12
+ * Text per function (concise — name carries most semantic signal):
13
+ * "{name} {filename}: {top calls} {short literals}"
14
+ */
15
+ const LOCAL_MODEL = "Xenova/all-MiniLM-L6-v2";
16
+ const LOCAL_DIM = 384;
17
+ const OPENAI_MODEL = "text-embedding-3-small";
18
+ const OPENAI_DIM = 256;
19
+ const BATCH = 64; // safe for both local and OpenAI
20
/**
 * Build the short text that represents one function when it is embedded.
 *
 * Layout: "<name> <filename>: <calls> <literals>", where <calls> is at most the
 * first 10 entries of `fn.calls` and <literals> is at most the first 5 entries
 * of `fn.stringLiterals` (joined, then cut to 100 characters). Empty parts are
 * omitted and the result is capped at 300 characters.
 *
 * @param {{name: string, file: string, calls?: string[], stringLiterals?: string[]}} fn
 * @returns {string}
 */
function fnToText(fn) {
    const filePath = String(fn.file ?? "");
    // Accept both "/" and "\" so only the base name is kept on any platform.
    const filename = filePath.split(/[\\/]/).pop() || filePath;
    const callStr = (fn.calls ?? []).slice(0, 10).join(" ");
    const litStr = (fn.stringLiterals ?? []).slice(0, 5).join(" ").slice(0, 100);
    // Skip empty parts so a function without calls does not yield a double space.
    return [`${fn.name} ${filename}:`, callStr, litStr]
        .filter(Boolean)
        .join(" ")
        .trim()
        .slice(0, 300);
}
26
// ── Local embedder (no API key) ───────────────────────────────────────────────
/**
 * Embed each text with the supplied feature-extraction pipeline.
 *
 * Texts are processed one after another; each call asks the pipeline for a
 * mean-pooled, normalized output and copies its `data` into a Float32Array.
 *
 * @param {string[]} texts
 * @param {(text: string, options: object) => Promise<{data: ArrayLike<number>}>} pipe
 * @returns {Promise<Float32Array[]>} one vector per input text, in input order.
 */
async function embedBatchLocal(texts, pipe) {
    const inputs = [...texts];
    const vectors = [];
    let cursor = 0;
    while (cursor < inputs.length) {
        const { data } = await pipe(inputs[cursor], { pooling: "mean", normalize: true });
        vectors.push(new Float32Array(data));
        cursor += 1;
    }
    return vectors;
}
35
// ── OpenAI embedder (OPENAI_API_KEY required) ─────────────────────────────────
/**
 * Embed a batch of texts with the OpenAI embeddings endpoint.
 *
 * Requests OPENAI_MODEL with OPENAI_DIM dimensions in float encoding. The
 * `openai` package is imported lazily so it is only needed on this path.
 *
 * @param {string[]} texts
 * @param {string} apiKey
 * @returns {Promise<Float32Array[]>} one vector per input text, in input order.
 */
async function embedBatchOpenAI(texts, apiKey) {
    // Nothing to embed — skip the import and the network round-trip.
    if (texts.length === 0)
        return [];
    const { default: OpenAI } = await import("openai");
    const client = new OpenAI({ apiKey });
    const response = await client.embeddings.create({
        model: OPENAI_MODEL,
        input: texts,
        dimensions: OPENAI_DIM,
        encoding_format: "float",
    });
    // Each item reports the `index` of the input it belongs to; order by it so
    // vectors line up with `texts`. Items without an index keep their position.
    return [...response.data]
        .sort((a, b) => (a.index ?? 0) - (b.index ?? 0))
        .map(d => new Float32Array(d.embedding));
}
47
+ // ── Public API ────────────────────────────────────────────────────────────────
48
/**
 * Embed all functions and hand the resulting rows to `store.rebuildEmbeddings`.
 *
 * With `apiKey` set, batches go through embedBatchOpenAI; otherwise the local
 * model (LOCAL_MODEL via @xenova/transformers) is loaded once and used for
 * every batch. Functions are processed BATCH at a time. A batch that fails is
 * logged with console.warn and skipped; the remaining batches still run.
 *
 * Each stored row is `{ file_path, name, line, vec }`, where `line` is the
 * first element of the function's `lines` array.
 *
 * @param {{rebuildEmbeddings: (rows: object[]) => void}} store
 * @param {Array<{file: string, name: string, lines: number[]}>} fns
 * @param {string} [apiKey] OpenAI API key; when falsy the local model is used.
 * @returns {Promise<void>}
 */
export async function embedFunctions(store, fns, apiKey) {
    if (!fns?.length)
        return;
    const PROGRESS_EVERY = 500;
    const useOpenAI = !!apiKey;
    let pipe;
    if (!useOpenAI) {
        // Lazy-load the local model only when it is actually needed.
        const { pipeline } = await import("@xenova/transformers");
        console.log(`[guardian embed] loading local model ${LOCAL_MODEL}…`);
        pipe = await pipeline("feature-extraction", LOCAL_MODEL);
    }
    const rows = [];
    let nextProgressMark = PROGRESS_EVERY;
    for (let i = 0; i < fns.length; i += BATCH) {
        const batch = fns.slice(i, i + BATCH);
        const texts = batch.map(fnToText);
        let vecs;
        try {
            vecs = useOpenAI
                ? await embedBatchOpenAI(texts, apiKey)
                : await embedBatchLocal(texts, pipe);
        }
        catch (err) {
            console.warn(`[guardian embed] batch ${i}–${i + batch.length - 1} failed: ${err.message}`);
            continue;
        }
        for (let j = 0; j < batch.length; j++) {
            if (!vecs[j])
                continue;
            rows.push({
                file_path: batch[j].file,
                name: batch[j].name,
                line: batch[j].lines[0],
                vec: vecs[j],
            });
        }
        // Report whenever another PROGRESS_EVERY functions have been processed.
        // (`i` advances in BATCH steps, so testing `i % 500` would almost never fire.)
        const processed = i + batch.length;
        if (processed >= nextProgressMark && processed < fns.length) {
            console.log(`[guardian embed] ${processed}/${fns.length} functions embedded`);
            nextProgressMark = (Math.floor(processed / PROGRESS_EVERY) + 1) * PROGRESS_EVERY;
        }
    }
    store.rebuildEmbeddings(rows);
    const source = useOpenAI ? `OpenAI ${OPENAI_MODEL} dim=${OPENAI_DIM}` : `local ${LOCAL_MODEL} dim=${LOCAL_DIM}`;
    console.log(`[guardian embed] stored ${rows.length} embeddings (${source})`);
}
95
// Cached promise for the local feature-extraction pipeline used by embedQuery,
// so repeated queries in one process do not rebuild the pipeline each time.
let localQueryPipePromise = null;
/** Return the local query pipeline, creating it on first use. */
function getLocalQueryPipe() {
    if (!localQueryPipePromise) {
        localQueryPipePromise = import("@xenova/transformers")
            .then(({ pipeline }) => pipeline("feature-extraction", LOCAL_MODEL))
            .catch((err) => {
                // Do not cache a failed load; a later call may succeed.
                localQueryPipePromise = null;
                throw err;
            });
    }
    return localQueryPipePromise;
}
/**
 * Embed a single query string for hybrid search.
 *
 * Only the first 300 characters of the query are embedded. With `apiKey` set
 * the query goes through embedBatchOpenAI; otherwise through the local model.
 *
 * @param {string} query
 * @param {string} [apiKey] OpenAI API key; when falsy the local model is used.
 * @returns {Promise<Float32Array|null>} the query vector, or null when the
 *   query is blank / not a string or embedding fails, so callers can fall back
 *   to non-vector search.
 */
export async function embedQuery(query, apiKey) {
    // A blank query has nothing to embed; avoid loading a model or calling the API.
    if (typeof query !== "string" || query.trim() === "")
        return null;
    const text = query.slice(0, 300);
    try {
        if (apiKey) {
            const [vec] = await embedBatchOpenAI([text], apiKey);
            return vec ?? null;
        }
        const pipe = await getLocalQueryPipe();
        const [vec] = await embedBatchLocal([text], pipe);
        return vec ?? null;
    }
    catch {
        // Deliberate best-effort: any failure degrades to "no vector".
        return null;
    }
}