@toolbaux/guardian 0.1.23 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -111,6 +111,83 @@ async function search(args) {
111
111
  async function model(args) {
112
112
  return runCli(["search", "--model", args.name, "--input", specsInputDir]);
113
113
  }
/**
 * guardian_grep — semantic grep via guardian search.
 *
 * Replaces raw Grep tool calls. Runs `search --format json --backend auto`
 * through runCli and prints the result grep-style: up to 25 symbols as
 * `file:line: name`, followed by up to 15 files.
 *
 * @param {{ query: string, path?: string }} args
 *   `query` is forwarded to the CLI as `--query`. `path`, when given, keeps
 *   only symbols and files whose path starts with that prefix.
 * @returns {Promise<string>} The formatted listing, or the raw CLI output
 *   unchanged when it is not a JSON object.
 */
async function grep(args) {
    const raw = await runCli([
        "search", "--query", args.query, "--format", "json", "--backend", "auto", "--input", specsInputDir,
    ]);
    let data;
    try {
        data = JSON.parse(raw);
    }
    catch {
        return raw; // passthrough if search returns plain text
    }
    if (data === null || typeof data !== "object")
        return raw;
    // `files` entries arrive either as plain path strings or as objects
    // carrying `file_path`, depending on which search path produced them.
    const filePathOf = (f) => (typeof f === "string" ? f : f?.file_path);
    // Compare paths without a leading "./" so "./src/auth" and "src/auth" agree.
    // NOTE(review): assumes indexed paths share a root with `path` — confirm.
    const stripDotSlash = (p) => String(p ?? "").replace(/^\.\//, "");
    const prefix = typeof args.path === "string" ? stripDotSlash(args.path.trim()) : "";
    const underPrefix = (p) => prefix === "" || stripDotSlash(p).startsWith(prefix);
    // Filter before truncating so the caps apply to matches under `path`.
    const symbols = (Array.isArray(data.symbols) ? data.symbols : [])
        .filter(s => s && underPrefix(s.file));
    const files = (Array.isArray(data.files) ? data.files : [])
        .map(filePathOf)
        .filter(p => typeof p === "string" && underPrefix(p));
    const lines = [`guardian_grep("${args.query}")`];
    if (symbols.length) {
        lines.push("\nSymbols (file:line: name):");
        for (const s of symbols.slice(0, 25)) {
            lines.push(` ${s.file}:${s.line ?? "?"}: ${s.name}`);
        }
    }
    if (files.length) {
        lines.push("\nFiles:");
        for (const p of files.slice(0, 15)) {
            lines.push(` ${p}`);
        }
    }
    // Only the header is present: nothing matched.
    if (lines.length === 1)
        lines.push(" (no matches — try a different query)");
    return lines.join("\n");
}
/**
 * guardian_glob — semantic file discovery via guardian search.
 *
 * Replaces raw Glob tool calls. Extracts meaningful keywords from the glob
 * pattern and runs them through `search --format json --backend auto`.
 * When the pattern yields no keywords (e.g. a pure extension glob) no search
 * is run and a hint pointing at guardian_search is returned instead.
 *
 * @param {{ pattern: string }} args Glob pattern, e.g. "src/auth/**\/*.ts".
 * @returns {Promise<string>} Up to 20 matching file paths, the no-keywords
 *   hint, or the raw CLI output when it is not a JSON object.
 */
async function glob(args) {
    const pattern = typeof args.pattern === "string" ? args.pattern : "";
    // Extract keywords: "src/auth/**/*.ts" → "auth", "src/middleware/error*" → "middleware error",
    // "src/{auth,users}/**/*.{ts,tsx}" → "auth users"
    const keywords = pattern
        .replace(/\.\{[^{}]*\}$/, "") // strip trailing extension alternation, e.g. ".{ts,tsx}"
        .replace(/\*\*?/g, " ")
        .replace(/\.\w+$/, "") // strip trailing extension
        .replace(/[[\]{}()?!,|]/g, " ") // remaining glob metacharacters and alternation commas
        .split(/[/\\\s]+/)
        .map(s => s.replace(/^\.+/, "")) // ".test" (from "*.test.ts") → "test"
        .filter(s => s.length > 2 && !/^(src|lib|dist|app|index)$/.test(s))
        .join(" ")
        .trim();
    if (!keywords) {
        return [
            `guardian_glob("${pattern}"): pattern has no meaningful keywords.`,
            `Use guardian_search with a descriptive query instead, e.g.:`,
            ` guardian_search("TypeScript source files") — or describe what you're looking for.`,
        ].join("\n");
    }
    const raw = await runCli([
        "search", "--query", keywords, "--format", "json", "--backend", "auto", "--input", specsInputDir,
    ]);
    let data;
    try {
        data = JSON.parse(raw);
    }
    catch {
        return raw; // passthrough if search returns plain text
    }
    if (data === null || typeof data !== "object")
        return raw;
    // `files` entries arrive either as plain path strings or as objects
    // carrying `file_path`, depending on which search path produced them.
    const files = (Array.isArray(data.files) ? data.files : [])
        .map(f => (typeof f === "string" ? f : f?.file_path))
        .filter(p => typeof p === "string");
    const lines = [
        `guardian_glob("${pattern}") — searched: "${keywords}"`,
        `\nMatching files:`,
        ...files.slice(0, 20).map(p => ` ${p}`),
    ];
    if (files.length === 0)
        lines.push(" (no matches)");
    return lines.join("\n");
}
114
191
  // ── MCP protocol ──
115
192
  const TOOLS = [
116
193
  {
@@ -167,6 +244,39 @@ const TOOLS = [
167
244
  description: "MCP usage stats for this session. Call at end to evaluate guardian's usefulness.",
168
245
  inputSchema: { type: "object", properties: {} },
169
246
  },
247
+ {
248
+ name: "guardian_grep",
249
+ description: [
250
+ "Semantic grep — find symbols and files matching a keyword or pattern.",
251
+ "Use INSTEAD of the Grep tool. Returns matching function/class names with file:line locations.",
252
+ "Backed by BM25 + call-graph authority so relevant source definitions surface first.",
253
+ "Example: guardian_grep('validate token') → auth.py:42: validate_token, middleware.py:18: check_jwt",
254
+ ].join(" "),
255
+ inputSchema: {
256
+ type: "object",
257
+ properties: {
258
+ query: { type: "string", description: "Keyword or phrase to search for (natural language OK)" },
259
+ path: { type: "string", description: "Optional: restrict to files under this path prefix" },
260
+ },
261
+ required: ["query"],
262
+ },
263
+ },
264
+ {
265
+ name: "guardian_glob",
266
+ description: [
267
+ "Semantic file discovery — find files matching a path pattern.",
268
+ "Use INSTEAD of the Glob tool. Extracts keywords from the pattern and searches the guardian index.",
269
+ "Example: guardian_glob('src/auth/**/*.ts') → searches for 'auth typescript' files.",
270
+ "For pure extension globs with no path context, use guardian_search with a descriptive query.",
271
+ ].join(" "),
272
+ inputSchema: {
273
+ type: "object",
274
+ properties: {
275
+ pattern: { type: "string", description: "Glob pattern (e.g. 'src/auth/**/*.ts', '**/middleware*')" },
276
+ },
277
+ required: ["pattern"],
278
+ },
279
+ },
170
280
  ];
171
281
  const TOOL_HANDLERS = {
172
282
  guardian_orient: orient,
@@ -175,6 +285,8 @@ const TOOL_HANDLERS = {
175
285
  guardian_search: search,
176
286
  guardian_model: model,
177
287
  guardian_metrics: async () => JSON.stringify(metrics.summary()),
288
+ guardian_grep: grep,
289
+ guardian_glob: glob,
178
290
  };
179
291
  function respond(id, result) {
180
292
  const msg = JSON.stringify({ jsonrpc: "2.0", id, result });
@@ -17,6 +17,7 @@ export async function runSearch(options) {
17
17
  if (sqliteResult !== null) {
18
18
  const base = JSON.parse(await querySearch(inputDir, options.query));
19
19
  base.files = sqliteResult.files;
20
+ base.symbols = sqliteResult.symbols;
20
21
  base.search_signal = sqliteResult.signal;
21
22
  console.log(JSON.stringify(base));
22
23
  return;
@@ -138,12 +139,36 @@ async function runSearchSqlite(specsInput, query, limit, backend = "sqlite") {
138
139
  console.log(`No FTS results for "${query}"`);
139
140
  return true;
140
141
  }
142
+ let queryVec;
143
+ try {
144
+ const { embedQuery } = await import("../db/embeddings.js");
145
+ const vec = await embedQuery(cleaned || query, process.env.OPENAI_API_KEY);
146
+ if (vec)
147
+ queryVec = vec;
148
+ }
149
+ catch { /* graceful degradation */ }
150
+ const symbols = store.searchSymbols(cleaned || query, Math.ceil(limit / 2), queryVec);
141
151
  const lines = [`## FTS5 search: "${query}"\n`];
152
+ // Build a map of file → matching symbols for quick lookup
153
+ const symbolsByFile = new Map();
154
+ for (const s of symbols) {
155
+ if (!symbolsByFile.has(s.file_path))
156
+ symbolsByFile.set(s.file_path, []);
157
+ symbolsByFile.get(s.file_path).push({ name: s.name, line: s.line });
158
+ }
142
159
  for (const r of results) {
143
160
  const rank = Math.abs(r.rank).toFixed(3);
144
161
  lines.push(`### \`${r.file_path}\` (score: ${rank})`);
145
- if (r.symbol_name)
146
- lines.push(` symbols: ${r.symbol_name}`);
162
+ // Matching symbols from this file (snippet equivalent)
163
+ const fileSyms = symbolsByFile.get(r.file_path) ?? [];
164
+ const inlineSyms = r.matching_symbols.filter(s => !fileSyms.some(f => f.name === s));
165
+ if (fileSyms.length) {
166
+ for (const s of fileSyms)
167
+ lines.push(` → \`${s.name}\` :${s.line}`);
168
+ }
169
+ if (inlineSyms.length) {
170
+ lines.push(` symbols: ${inlineSyms.join(", ")}`);
171
+ }
147
172
  if (r.imports.length)
148
173
  lines.push(` imports: ${r.imports.join(", ")}`);
149
174
  if (r.used_by.length)
@@ -177,7 +202,22 @@ async function getSqliteFileList(specsInput, query, limit, backend = "auto") {
177
202
  if (results.length === 0)
178
203
  return null;
179
204
  const signal = store.querySignal(query);
180
- return { files: results.map((r) => r.file_path), signal };
205
+ // Hybrid symbol search: BM25 + call-graph authority + optional vector similarity.
206
+ // embedQuery uses local model (no API key) or OpenAI if OPENAI_API_KEY is set.
207
+ let queryVec;
208
+ try {
209
+ const { embedQuery } = await import("../db/embeddings.js");
210
+ const vec = await embedQuery(cleaned || query, process.env.OPENAI_API_KEY);
211
+ if (vec)
212
+ queryVec = vec;
213
+ }
214
+ catch { /* graceful degradation — vector unavailable */ }
215
+ const symbols = store.searchSymbols(cleaned || query, Math.ceil(limit / 2), queryVec);
216
+ return {
217
+ files: results.map((r) => r.file_path),
218
+ symbols: symbols.map((s) => ({ file: s.file_path, name: s.name, line: s.line })),
219
+ signal,
220
+ };
181
221
  }
182
222
  finally {
183
223
  await store.close();
package/dist/config.js CHANGED
@@ -273,6 +273,7 @@ function normalizeConfig(input, configDir) {
273
273
  }
274
274
  function mergeConfig(base, override) {
275
275
  return {
276
+ project_id: override.project_id ?? base.project_id,
276
277
  project: {
277
278
  root: override.project?.root ?? base.project?.root ?? "",
278
279
  backendRoot: override.project?.backendRoot ?? base.project?.backendRoot ?? "",
@@ -0,0 +1,113 @@
1
+ /**
2
+ * Embedding generation for function-level semantic search.
3
+ *
4
+ * Strategy (local-first, no API key required):
5
+ * Default — @xenova/transformers running Xenova/all-MiniLM-L6-v2 on-device.
6
+ * Model downloads once (~23 MB) and is cached in ~/.cache/xenova.
7
+ * dim=384, pure JS/ONNX, no external service needed.
8
+ *
9
+ * Upgrade — OpenAI text-embedding-3-small when OPENAI_API_KEY is set.
10
+ * dim=256, higher quality, costs ~$0.02 per 1M tokens.
11
+ *
12
+ * Text per function (concise — name carries most semantic signal):
13
+ * "{name} {filename}: {top calls} {short literals}"
14
+ */
15
+ const LOCAL_MODEL = "Xenova/all-MiniLM-L6-v2"; // model id handed to the @xenova/transformers feature-extraction pipeline
16
+ const LOCAL_DIM = 384; // local vector size; in the code shown here it is only reported in the summary log line
17
+ const OPENAI_MODEL = "text-embedding-3-small"; // model requested from the OpenAI embeddings endpoint
18
+ const OPENAI_DIM = 256; // sent as the `dimensions` option of the OpenAI embeddings request
19
+ const BATCH = 64; // number of functions embedded per batch; safe for both local and OpenAI
/**
 * Build the short text that gets embedded for one function:
 * "{name} {filename}: {top calls} {short literals}".
 *
 * @param {{ name: string, file: string, calls?: string[], stringLiterals?: string[] }} fn
 * @returns {string} Trimmed text, capped at 300 characters.
 */
function fnToText(fn) {
    const filePath = String(fn.file ?? "");
    // Accept both "/" and "\" separators; fall back to the whole path when
    // the last segment is empty (e.g. a trailing separator).
    const filename = filePath.split(/[\\/]/).pop() || filePath;
    const callStr = (fn.calls ?? []).slice(0, 10).join(" "); // first 10 callees
    const litStr = (fn.stringLiterals ?? []).slice(0, 5).join(" ").slice(0, 100); // first 5 literals, max 100 chars
    return `${fn.name} ${filename}: ${callStr} ${litStr}`.trim().slice(0, 300);
}
26
+ // ── Local embedder (no API key) ───────────────────────────────────────────────
/**
 * Embed each text with the supplied local pipeline, one call at a time.
 *
 * @param {string[]} texts Texts to embed.
 * @param {Function} pipe Async callable invoked as pipe(text, options); its
 *   result must expose the vector values on `.data`.
 * @returns {Promise<Float32Array[]>} One vector per input text, in input order.
 */
async function embedBatchLocal(texts, pipe) {
    const vectors = [];
    for (let idx = 0; idx < texts.length; idx += 1) {
        // Mean-pooled, normalized output is requested for every text.
        const { data } = await pipe(texts[idx], { pooling: "mean", normalize: true });
        vectors[idx] = new Float32Array(data);
    }
    return vectors;
}
35
+ // ── OpenAI embedder (OPENAI_API_KEY required) ─────────────────────────────────
/**
 * Embed a batch of texts through the OpenAI embeddings endpoint.
 *
 * The `openai` package is imported lazily, so it is only loaded when this
 * function is actually called.
 *
 * @param {string[]} texts Texts sent as the request `input`.
 * @param {string} apiKey Key passed to the OpenAI client constructor.
 * @returns {Promise<Float32Array[]>} One vector per entry of the response `data`.
 */
async function embedBatchOpenAI(texts, apiKey) {
    const openaiModule = await import("openai");
    const OpenAIClient = openaiModule.default;
    const client = new OpenAIClient({ apiKey });
    const request = {
        model: OPENAI_MODEL,
        input: texts,
        dimensions: OPENAI_DIM,
        encoding_format: "float",
    };
    const { data } = await client.embeddings.create(request);
    return data.map(({ embedding }) => new Float32Array(embedding));
}
47
+ // ── Public API ────────────────────────────────────────────────────────────────
/**
 * Embed all functions and hand the resulting rows to store.rebuildEmbeddings.
 * Uses the local model by default; OpenAI when an API key is supplied.
 *
 * Functions are embedded in batches of BATCH. A batch whose embedding call
 * fails is logged with console.warn and skipped; the remaining batches are
 * still processed.
 *
 * @param {{ rebuildEmbeddings: Function }} store Receives the collected rows
 *   ({ file_path, name, line, vec }).
 * @param {Array<{ file: string, name: string, lines: number[] }>} fns
 *   Functions to embed; `lines[0]` is stored as the row's line.
 * @param {string} [apiKey] When truthy, OpenAI is used instead of the local model.
 * @returns {Promise<void>}
 */
export async function embedFunctions(store, fns, apiKey) {
    if (fns.length === 0)
        return;
    const useOpenAI = !!apiKey;
    let pipe;
    if (!useOpenAI) {
        // Lazy-load local model (downloads once, then cached)
        const { pipeline } = await import("@xenova/transformers");
        console.log(`[guardian embed] loading local model ${LOCAL_MODEL}…`);
        pipe = await pipeline("feature-extraction", LOCAL_MODEL);
    }
    // Progress is reported each time the processed count crosses a multiple
    // of PROGRESS_EVERY. A modulo test on the batch start index would almost
    // never fire, because that index advances in steps of BATCH.
    const PROGRESS_EVERY = 500;
    let nextProgress = PROGRESS_EVERY;
    const rows = [];
    for (let i = 0; i < fns.length; i += BATCH) {
        const batch = fns.slice(i, i + BATCH);
        const texts = batch.map(fnToText);
        const done = i + batch.length;
        let vecs;
        try {
            vecs = useOpenAI
                ? await embedBatchOpenAI(texts, apiKey)
                : await embedBatchLocal(texts, pipe);
        }
        catch (err) {
            console.warn(`[guardian embed] batch ${i}–${done - 1} failed: ${err?.message ?? err}`);
            vecs = null;
        }
        if (vecs) {
            for (let j = 0; j < batch.length; j++) {
                if (!vecs[j])
                    continue; // embedder returned fewer vectors than inputs
                rows.push({
                    file_path: batch[j].file,
                    name: batch[j].name,
                    line: batch[j].lines[0],
                    vec: vecs[j],
                });
            }
        }
        // The final batch is covered by the summary line below.
        if (done >= nextProgress && done < fns.length) {
            console.log(`[guardian embed] ${done}/${fns.length} functions embedded`);
            nextProgress = (Math.floor(done / PROGRESS_EVERY) + 1) * PROGRESS_EVERY;
        }
    }
    store.rebuildEmbeddings(rows);
    const source = useOpenAI ? `OpenAI ${OPENAI_MODEL} dim=${OPENAI_DIM}` : `local ${LOCAL_MODEL} dim=${LOCAL_DIM}`;
    console.log(`[guardian embed] stored ${rows.length} embeddings (${source})`);
}
/**
 * Embed a single query string for hybrid search.
 *
 * Uses OpenAI when `apiKey` is truthy, otherwise the local model. The query
 * is truncated to 300 characters before embedding.
 *
 * @param {string} query Search text.
 * @param {string} [apiKey] OpenAI API key; omit to use the local model.
 * @returns {Promise<Float32Array|null>} The query vector, or null when the
 *   query is blank/non-string or embedding fails for any reason, so callers
 *   can degrade gracefully to search without a vector.
 */
export async function embedQuery(query, apiKey) {
    // Nothing meaningful to embed: skip the model load / API call entirely.
    if (typeof query !== "string" || query.trim() === "")
        return null;
    const text = query.slice(0, 300);
    try {
        if (apiKey) {
            const [vec] = await embedBatchOpenAI([text], apiKey);
            return vec ?? null;
        }
        const { pipeline } = await import("@xenova/transformers");
        const pipe = await pipeline("feature-extraction", LOCAL_MODEL);
        const [vec] = await embedBatchLocal([text], pipe);
        return vec ?? null;
    }
    catch {
        // Deliberate best-effort: a missing package, model or network failure
        // must not break the search itself.
        return null;
    }
}
@@ -297,9 +297,94 @@ export function populateFTSIndex(store, intel, arch, funcIntel) {
297
297
  if (funcIntel)
298
298
  mergeFunctionIntelRows(rowMap, funcIntel);
299
299
  store.rebuildSearchIndex(Array.from(rowMap.values()));
300
+ // Per-function index — enables symbol-level search results with line numbers.
301
+ if (funcIntel?.functions?.length) {
302
+ store.rebuildFunctionIndex(funcIntel.functions);
303
+ }
300
304
  // Build dependency graph
301
305
  if (arch) {
302
306
  const edges = buildDepEdges(arch);
303
307
  store.rebuildDeps(edges);
304
308
  }
309
+ // ── Normalised fact tables ─────────────────────────────────────────────────
310
+ // Merge arch endpoints + intel api_registry into endpoints_raw.
311
+ // arch.endpoints is the richer source (has method + file); intel.api_registry adds
312
+ // request/response schemas and service_calls that arch may not have.
313
+ const endpointMap = new Map();
314
+ for (const ep of arch?.endpoints ?? []) {
315
+ const key = `${(ep.method ?? "").toUpperCase()}::${ep.path ?? ""}`;
316
+ if (!ep.path)
317
+ continue;
318
+ endpointMap.set(key, {
319
+ method: ep.method ?? "",
320
+ path: ep.path,
321
+ handler: ep.handler ?? "",
322
+ file_path: ep.file ?? ep.file_path ?? "",
323
+ module: ep.module ?? "",
324
+ service_calls: ep.service_calls ?? [],
325
+ request_schema: "",
326
+ response_schema: "",
327
+ });
328
+ }
329
+ for (const [route, entry] of Object.entries(intel?.api_registry ?? {})) {
330
+ // route is like "GET /users" or "/users"
331
+ const parts = route.trim().split(/\s+/);
332
+ const method = parts.length >= 2 ? parts[0].toUpperCase() : "";
333
+ const p = parts.length >= 2 ? parts[1] : parts[0];
334
+ const key = `${method}::${p}`;
335
+ const existing = endpointMap.get(key);
336
+ if (existing) {
337
+ if (entry.request_schema)
338
+ existing.request_schema = entry.request_schema;
339
+ if (entry.response_schema)
340
+ existing.response_schema = entry.response_schema;
341
+ if (entry.service_calls?.length)
342
+ existing.service_calls = entry.service_calls;
343
+ }
344
+ else {
345
+ endpointMap.set(key, {
346
+ method,
347
+ path: p,
348
+ handler: entry.handler ?? "",
349
+ file_path: entry.file ?? "",
350
+ module: entry.module ?? "",
351
+ service_calls: entry.service_calls ?? [],
352
+ request_schema: entry.request_schema ?? "",
353
+ response_schema: entry.response_schema ?? "",
354
+ });
355
+ }
356
+ }
357
+ store.rebuildEndpointsRaw(Array.from(endpointMap.values()));
358
+ // Merge arch data_models + intel model_registry into models_raw.
359
+ const modelMap = new Map();
360
+ for (const m of arch?.data_models ?? []) {
361
+ if (!m.name)
362
+ continue;
363
+ modelMap.set(m.name, {
364
+ name: m.name,
365
+ file_path: m.file ?? m.file_path ?? "",
366
+ module: m.module ?? "",
367
+ fields: m.fields ?? [],
368
+ relationships: m.relationships ?? [],
369
+ });
370
+ }
371
+ for (const [name, entry] of Object.entries(intel?.model_registry ?? {})) {
372
+ const existing = modelMap.get(name);
373
+ if (existing) {
374
+ if (entry.fields?.length)
375
+ existing.fields = entry.fields;
376
+ if (entry.relationships?.length)
377
+ existing.relationships = entry.relationships;
378
+ }
379
+ else {
380
+ modelMap.set(name, {
381
+ name,
382
+ file_path: entry.file ?? "",
383
+ module: entry.module ?? "",
384
+ fields: entry.fields ?? [],
385
+ relationships: entry.relationships ?? [],
386
+ });
387
+ }
388
+ }
389
+ store.rebuildModelsRaw(Array.from(modelMap.values()));
305
390
  }