@toolbaux/guardian 0.1.23 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +55 -20
- package/dist/cli.js +2 -6
- package/dist/commands/context.js +87 -29
- package/dist/commands/extract.js +4 -1
- package/dist/commands/generate.js +83 -10
- package/dist/commands/init.js +88 -60
- package/dist/commands/intel.js +23 -0
- package/dist/commands/mcp-serve.js +112 -0
- package/dist/commands/search.js +43 -3
- package/dist/config.js +1 -0
- package/dist/db/embeddings.js +113 -0
- package/dist/db/fts-builder.js +108 -0
- package/dist/db/sqlite-specs-store.js +496 -3
- package/package.json +2 -1
package/dist/commands/init.js
CHANGED
|
@@ -13,32 +13,62 @@
|
|
|
13
13
|
*/
|
|
14
14
|
import fs from "node:fs/promises";
|
|
15
15
|
import path from "node:path";
|
|
16
|
+
import { randomUUID } from "node:crypto";
|
|
16
17
|
import { DEFAULT_SPECS_DIR } from "../config.js";
|
|
17
18
|
const DEFAULT_CONFIG = {
|
|
18
19
|
docs: {
|
|
19
20
|
mode: "full",
|
|
20
21
|
},
|
|
21
22
|
};
|
|
23
|
+
/**
|
|
24
|
+
* Hook script written to .claude/hooks/mcp-first.sh
|
|
25
|
+
*
|
|
26
|
+
* Blocks Read/Glob/Grep until a guardian MCP tool has been called in the
|
|
27
|
+
* current session. Session state lives in /tmp/guardian-used-<SESSION_ID>
|
|
28
|
+
* and is set by the PostToolUse hook (guardian-used.sh) below.
|
|
29
|
+
*/
|
|
22
30
|
const CLAUDE_CODE_HOOK_SCRIPT = `#!/bin/bash
|
|
23
|
-
# Guardian MCP-first hook —
|
|
24
|
-
# Installed by: guardian init
|
|
31
|
+
# Guardian MCP-first hook — blocks Read/Glob/Grep until guardian tools are used.
|
|
32
|
+
# Installed by: guardian init (v3 — session-scoped flag, no time drift)
|
|
25
33
|
|
|
26
34
|
INPUT=$(cat)
|
|
27
|
-
|
|
35
|
+
SESSION_ID=$(echo "$INPUT" | jq -r '.session_id // "default"')
|
|
36
|
+
FLAG="/tmp/guardian-used-\${SESSION_ID}"
|
|
28
37
|
|
|
29
|
-
|
|
30
|
-
|
|
38
|
+
# If guardian was called in this session, allow all file operations.
|
|
39
|
+
if [ -f "$FLAG" ]; then
|
|
40
|
+
exit 0
|
|
41
|
+
fi
|
|
42
|
+
|
|
43
|
+
cat >&2 <<'BLOCK'
|
|
44
|
+
BLOCKED: Use Guardian MCP tools before exploring source files.
|
|
31
45
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
46
|
+
Call one of these first:
|
|
47
|
+
guardian_search("your query") — find files/symbols/endpoints by keyword
|
|
48
|
+
guardian_grep("pattern") — semantic grep (replaces Grep tool)
|
|
49
|
+
guardian_glob("src/auth/**") — semantic file discovery (replaces Glob tool)
|
|
50
|
+
guardian_orient() — get codebase overview
|
|
36
51
|
|
|
37
|
-
|
|
52
|
+
File reads are unblocked automatically for the rest of this session.
|
|
38
53
|
BLOCK
|
|
39
54
|
|
|
40
55
|
exit 2
|
|
41
56
|
`;
|
|
57
|
+
/**
|
|
58
|
+
* Hook script written to .claude/hooks/guardian-used.sh
|
|
59
|
+
*
|
|
60
|
+
* Called by PostToolUse after any guardian_* tool. Sets the session flag
|
|
61
|
+
* that mcp-first.sh checks, unblocking subsequent Read/Glob/Grep calls.
|
|
62
|
+
*/
|
|
63
|
+
const GUARDIAN_USED_SCRIPT = `#!/bin/bash
|
|
64
|
+
# Guardian PostToolUse hook — marks guardian as used for this session.
|
|
65
|
+
# Installed by: guardian init
|
|
66
|
+
|
|
67
|
+
INPUT=$(cat)
|
|
68
|
+
SESSION_ID=$(echo "$INPUT" | jq -r '.session_id // "default"')
|
|
69
|
+
touch "/tmp/guardian-used-\${SESSION_ID}"
|
|
70
|
+
exit 0
|
|
71
|
+
`;
|
|
42
72
|
const HOOK_SCRIPT = `#!/bin/sh
|
|
43
73
|
# guardian pre-commit hook — keeps architecture context fresh
|
|
44
74
|
# Installed by: guardian init
|
|
@@ -80,6 +110,22 @@ export async function runInit(options) {
|
|
|
80
110
|
else {
|
|
81
111
|
console.log(" · guardian.config.json already exists");
|
|
82
112
|
}
|
|
113
|
+
// 2b. Ensure project_id is present in guardian.config.json
|
|
114
|
+
try {
|
|
115
|
+
const raw = await fs.readFile(configPath, "utf8");
|
|
116
|
+
const cfg = JSON.parse(raw);
|
|
117
|
+
if (!cfg.project_id) {
|
|
118
|
+
cfg.project_id = randomUUID();
|
|
119
|
+
await fs.writeFile(configPath, JSON.stringify(cfg, null, 2) + "\n", "utf8");
|
|
120
|
+
console.log(" ✓ Added project_id to guardian.config.json");
|
|
121
|
+
}
|
|
122
|
+
else {
|
|
123
|
+
console.log(" · guardian.config.json already has project_id");
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
catch {
|
|
127
|
+
// Non-fatal — config may not be valid JSON yet
|
|
128
|
+
}
|
|
83
129
|
// 3. Install pre-commit hook
|
|
84
130
|
if (!options.skipHook) {
|
|
85
131
|
const gitDir = path.join(root, ".git");
|
|
@@ -158,8 +204,6 @@ export async function runInit(options) {
|
|
|
158
204
|
const { runExtract } = await import("./extract.js");
|
|
159
205
|
await runExtract({
|
|
160
206
|
projectRoot: root,
|
|
161
|
-
backendRoot: options.backendRoot,
|
|
162
|
-
frontendRoot: options.frontendRoot,
|
|
163
207
|
output: specsDir,
|
|
164
208
|
includeFileGraph: true,
|
|
165
209
|
backend: options.backend,
|
|
@@ -167,8 +211,6 @@ export async function runInit(options) {
|
|
|
167
211
|
const { runGenerate } = await import("./generate.js");
|
|
168
212
|
await runGenerate({
|
|
169
213
|
projectRoot: root,
|
|
170
|
-
backendRoot: options.backendRoot,
|
|
171
|
-
frontendRoot: options.frontendRoot,
|
|
172
214
|
output: specsDir,
|
|
173
215
|
aiContext: true,
|
|
174
216
|
});
|
|
@@ -213,18 +255,13 @@ async function setupClaudeCodeHooks(root, specsDir) {
|
|
|
213
255
|
try {
|
|
214
256
|
mcpConfig = JSON.parse(await fs.readFile(mcpJsonPath, "utf8"));
|
|
215
257
|
}
|
|
216
|
-
catch {
|
|
217
|
-
// Corrupted — overwrite
|
|
218
|
-
}
|
|
258
|
+
catch { /* corrupted — overwrite */ }
|
|
219
259
|
}
|
|
220
260
|
if (!mcpConfig.mcpServers)
|
|
221
261
|
mcpConfig.mcpServers = {};
|
|
222
262
|
const servers = mcpConfig.mcpServers;
|
|
223
263
|
if (!servers.guardian) {
|
|
224
|
-
servers.guardian = {
|
|
225
|
-
command: "guardian",
|
|
226
|
-
args: ["mcp-serve", "--specs", specsDir],
|
|
227
|
-
};
|
|
264
|
+
servers.guardian = { command: "guardian", args: ["mcp-serve", "--specs", specsDir] };
|
|
228
265
|
await fs.writeFile(mcpJsonPath, JSON.stringify(mcpConfig, null, 2) + "\n", "utf8");
|
|
229
266
|
console.log(" ✓ Created .mcp.json (MCP server config)");
|
|
230
267
|
}
|
|
@@ -238,58 +275,49 @@ async function setupClaudeCodeHooks(root, specsDir) {
|
|
|
238
275
|
const claudeDir = path.join(root, ".claude");
|
|
239
276
|
const hooksDir = path.join(claudeDir, "hooks");
|
|
240
277
|
const settingsPath = path.join(claudeDir, "settings.json");
|
|
241
|
-
const
|
|
278
|
+
const mcpFirstPath = path.join(hooksDir, "mcp-first.sh");
|
|
279
|
+
const guardianUsedPath = path.join(hooksDir, "guardian-used.sh");
|
|
242
280
|
await fs.mkdir(hooksDir, { recursive: true });
|
|
243
|
-
//
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
console.log(" · Claude Code hook already exists");
|
|
251
|
-
}
|
|
281
|
+
// Always overwrite hook scripts so they stay in sync with this version of guardian.
|
|
282
|
+
await fs.writeFile(mcpFirstPath, CLAUDE_CODE_HOOK_SCRIPT, "utf8");
|
|
283
|
+
await fs.chmod(mcpFirstPath, 0o755);
|
|
284
|
+
console.log(" ✓ Wrote .claude/hooks/mcp-first.sh (PreToolUse — blocks until guardian called)");
|
|
285
|
+
await fs.writeFile(guardianUsedPath, GUARDIAN_USED_SCRIPT, "utf8");
|
|
286
|
+
await fs.chmod(guardianUsedPath, 0o755);
|
|
287
|
+
console.log(" ✓ Wrote .claude/hooks/guardian-used.sh (PostToolUse — sets session flag)");
|
|
252
288
|
// Write or merge .claude/settings.json
|
|
253
289
|
let settings = {};
|
|
254
290
|
if (await fileExists(settingsPath)) {
|
|
255
291
|
try {
|
|
256
292
|
settings = JSON.parse(await fs.readFile(settingsPath, "utf8"));
|
|
257
293
|
}
|
|
258
|
-
catch {
|
|
259
|
-
// Corrupted file — overwrite
|
|
260
|
-
}
|
|
294
|
+
catch { /* corrupted — overwrite */ }
|
|
261
295
|
}
|
|
262
|
-
//
|
|
296
|
+
// MCP server registration
|
|
263
297
|
if (!settings.mcpServers)
|
|
264
298
|
settings.mcpServers = {};
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
299
|
+
settings.mcpServers.guardian = {
|
|
300
|
+
command: "guardian",
|
|
301
|
+
args: ["mcp-serve", "--specs", specsDir],
|
|
302
|
+
};
|
|
303
|
+
// Hooks — always overwrite to keep in sync with installed scripts.
|
|
304
|
+
settings.hooks = {
|
|
305
|
+
// PreToolUse: block Read/Glob/Grep until a guardian tool has been called.
|
|
306
|
+
// The script itself handles the session-flag check — no "if" filter needed here.
|
|
307
|
+
PreToolUse: [
|
|
308
|
+
{
|
|
309
|
+
matcher: "Read|Glob|Grep",
|
|
310
|
+
hooks: [{ type: "command", command: ".claude/hooks/mcp-first.sh" }],
|
|
311
|
+
},
|
|
312
|
+
],
|
|
313
|
+
// PostToolUse: set the session flag after any guardian MCP tool call.
|
|
314
|
+
PostToolUse: [
|
|
276
315
|
{
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
command: '"$CLAUDE_PROJECT_DIR"/.claude/hooks/mcp-first.sh',
|
|
316
|
+
matcher: "mcp__guardian__guardian_search|mcp__guardian__guardian_orient|mcp__guardian__guardian_context|mcp__guardian__guardian_impact|mcp__guardian__guardian_grep|mcp__guardian__guardian_glob",
|
|
317
|
+
hooks: [{ type: "command", command: ".claude/hooks/guardian-used.sh" }],
|
|
280
318
|
},
|
|
281
319
|
],
|
|
282
320
|
};
|
|
283
|
-
if (!settings.hooks)
|
|
284
|
-
settings.hooks = {};
|
|
285
|
-
const hooks = settings.hooks;
|
|
286
|
-
if (!hooks.PreToolUse) {
|
|
287
|
-
hooks.PreToolUse = [hookEntry];
|
|
288
|
-
console.log(" ✓ Configured Claude Code PreToolUse hook in .claude/settings.json");
|
|
289
|
-
}
|
|
290
|
-
else {
|
|
291
|
-
console.log(" · Claude Code PreToolUse hook already configured");
|
|
292
|
-
}
|
|
293
321
|
await fs.writeFile(settingsPath, JSON.stringify(settings, null, 2) + "\n", "utf8");
|
|
294
|
-
console.log(" ✓ Updated .claude/settings.json (MCP server + hooks)");
|
|
322
|
+
console.log(" ✓ Updated .claude/settings.json (MCP server + PreToolUse + PostToolUse hooks)");
|
|
295
323
|
}
|
package/dist/commands/intel.js
CHANGED
|
@@ -13,6 +13,7 @@ import { writeCodebaseIntelligenceViaStore } from "../extract/codebase-intel.js"
|
|
|
13
13
|
import { getOutputLayout } from "../output-layout.js";
|
|
14
14
|
import { SqliteSpecsStore } from "../db/sqlite-specs-store.js";
|
|
15
15
|
import { populateFTSIndex } from "../db/fts-builder.js";
|
|
16
|
+
import { embedFunctions } from "../db/embeddings.js";
|
|
16
17
|
export async function runIntel(options) {
|
|
17
18
|
const specsDir = path.resolve(options.specs);
|
|
18
19
|
const layout = getOutputLayout(specsDir);
|
|
@@ -49,6 +50,28 @@ export async function runIntel(options) {
|
|
|
49
50
|
catch { /* not generated yet — skip */ }
|
|
50
51
|
populateFTSIndex(store, intel, arch, funcIntel);
|
|
51
52
|
console.log(`Built FTS5 search index (${Object.keys(intel.api_registry ?? {}).length} endpoints indexed)`);
|
|
53
|
+
// Populate module_metrics from structural-intelligence.json (if present).
|
|
54
|
+
try {
|
|
55
|
+
const siRaw = await (await import("node:fs/promises")).readFile((await import("node:path")).join(machineDir, "structural-intelligence.json"), "utf8");
|
|
56
|
+
const siReports = JSON.parse(siRaw);
|
|
57
|
+
if (Array.isArray(siReports) && siReports.length > 0) {
|
|
58
|
+
store.rebuildModuleMetrics(siReports);
|
|
59
|
+
console.log(`Indexed ${siReports.length} module metrics`);
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
catch { /* structural-intelligence.json not generated yet — skip */ }
|
|
63
|
+
// Embed functions for semantic (vector) search.
|
|
64
|
+
// Uses local on-device model by default (no API key needed).
|
|
65
|
+
// If OPENAI_API_KEY is set, uses OpenAI text-embedding-3-small (better quality).
|
|
66
|
+
if (funcIntel?.functions?.length) {
|
|
67
|
+
console.log(`[guardian embed] embedding ${funcIntel.functions.length} functions…`);
|
|
68
|
+
try {
|
|
69
|
+
await embedFunctions(store, funcIntel.functions, process.env.OPENAI_API_KEY);
|
|
70
|
+
}
|
|
71
|
+
catch (err) {
|
|
72
|
+
console.warn(`[guardian embed] skipped: ${err.message}`);
|
|
73
|
+
}
|
|
74
|
+
}
|
|
52
75
|
}
|
|
53
76
|
console.log(`Wrote guardian.db → ${layout.rootDir}`);
|
|
54
77
|
}
|
|
@@ -111,6 +111,83 @@ async function search(args) {
|
|
|
111
111
|
async function model(args) {
|
|
112
112
|
return runCli(["search", "--model", args.name, "--input", specsInputDir]);
|
|
113
113
|
}
|
|
114
|
+
/**
 * guardian_grep — semantic grep via guardian search.
 *
 * Replaces raw Grep tool calls. Runs `search --format json --backend auto`
 * through runCli and renders the matching symbols (file:line: name) and files
 * in a grep-like text layout.
 *
 * @param {{ query: string, path?: string }} args
 *   query — keyword or phrase handed to the search command.
 *   path  — optional prefix; when given, only symbols/files whose path starts
 *           with it are listed (a leading "./" is ignored on both sides).
 * @returns {Promise<string>} formatted matches, or the raw CLI output when it
 *   is not the expected JSON.
 */
async function grep(args) {
    const raw = await runCli([
        "search", "--query", args.query, "--format", "json", "--backend", "auto", "--input", specsInputDir,
    ]);
    try {
        const data = JSON.parse(raw);
        const stripDotSlash = (p) => String(p).replace(/^\.\//, "");
        const prefix = typeof args.path === "string" ? stripDotSlash(args.path) : "";
        const underPrefix = (p) => prefix === "" || stripDotSlash(p).startsWith(prefix);
        // The search command may report files either as plain path strings or as
        // objects carrying `file_path`; accept both shapes.
        const filePathOf = (entry) => (typeof entry === "string" ? entry : entry?.file_path);
        const symbols = (data.symbols ?? [])
            .filter((s) => underPrefix(s.file))
            .slice(0, 25);
        const files = (data.files ?? [])
            .map(filePathOf)
            .filter((p) => p != null && underPrefix(p))
            .slice(0, 15);
        const lines = [`guardian_grep("${args.query}")`];
        if (symbols.length) {
            lines.push("\nSymbols (file:line: name):");
            for (const s of symbols) {
                lines.push(` ${s.file}:${s.line}: ${s.name}`);
            }
        }
        if (files.length) {
            lines.push("\nFiles:");
            for (const filePath of files) {
                lines.push(` ${filePath}`);
            }
        }
        // Only the header was emitted, so nothing matched.
        if (lines.length === 1)
            lines.push(" (no matches — try a different query)");
        return lines.join("\n");
    }
    catch {
        return raw; // passthrough if search returns plain text
    }
}
|
|
148
|
+
/**
 * guardian_glob — semantic file discovery via guardian search.
 *
 * Replaces raw Glob tool calls. Reduces the glob pattern to keywords, runs
 * `search --format json --backend auto` through runCli with them, and lists
 * the matching files. Patterns that yield no keywords (e.g. pure extension
 * globs) return a hint to use guardian_search instead, without calling the CLI.
 *
 * @param {{ pattern: string }} args glob pattern such as "src/auth/**\/*.ts".
 * @returns {Promise<string>} formatted file list, the hint text, or the raw CLI
 *   output when it is not the expected JSON.
 */
async function glob(args) {
    // Extract keywords: "src/auth/**/*.ts" → "auth", "src/middleware/error*" → "middleware error"
    const keywords = args.pattern
        .replace(/\*\*?/g, " ")
        .replace(/\.\w+$/, "") // strip trailing extension
        .replace(/[[\]{}]/g, " ")
        .split(/[/\s]+/)
        // Drop short fragments and generic directory names that carry no signal.
        .filter(s => s.length > 2 && !/^(src|lib|dist|app|index)$/.test(s))
        .join(" ")
        .trim();
    if (!keywords) {
        return [
            `guardian_glob("${args.pattern}"): pattern has no meaningful keywords.`,
            `Use guardian_search with a descriptive query instead, e.g.:`,
            ` guardian_search("TypeScript source files") — or describe what you're looking for.`,
        ].join("\n");
    }
    const raw = await runCli([
        "search", "--query", keywords, "--format", "json", "--backend", "auto", "--input", specsInputDir,
    ]);
    try {
        const data = JSON.parse(raw);
        // The search command may report files either as plain path strings or as
        // objects carrying `file_path`; accept both shapes.
        const files = (data.files ?? [])
            .map((entry) => (typeof entry === "string" ? entry : entry?.file_path))
            .filter((filePath) => filePath != null);
        const lines = [
            `guardian_glob("${args.pattern}") — searched: "${keywords}"`,
            `\nMatching files:`,
            ...files.slice(0, 20).map((filePath) => ` ${filePath}`),
        ];
        if (files.length === 0)
            lines.push(" (no matches)");
        return lines.join("\n");
    }
    catch {
        return raw; // passthrough if search returns plain text
    }
}
|
|
114
191
|
// ── MCP protocol ──
|
|
115
192
|
const TOOLS = [
|
|
116
193
|
{
|
|
@@ -167,6 +244,39 @@ const TOOLS = [
|
|
|
167
244
|
description: "MCP usage stats for this session. Call at end to evaluate guardian's usefulness.",
|
|
168
245
|
inputSchema: { type: "object", properties: {} },
|
|
169
246
|
},
|
|
247
|
+
{
|
|
248
|
+
name: "guardian_grep",
|
|
249
|
+
description: [
|
|
250
|
+
"Semantic grep — find symbols and files matching a keyword or pattern.",
|
|
251
|
+
"Use INSTEAD of the Grep tool. Returns matching function/class names with file:line locations.",
|
|
252
|
+
"Backed by BM25 + call-graph authority so relevant source definitions surface first.",
|
|
253
|
+
"Example: guardian_grep('validate token') → auth.py:42: validate_token, middleware.py:18: check_jwt",
|
|
254
|
+
].join(" "),
|
|
255
|
+
inputSchema: {
|
|
256
|
+
type: "object",
|
|
257
|
+
properties: {
|
|
258
|
+
query: { type: "string", description: "Keyword or phrase to search for (natural language OK)" },
|
|
259
|
+
path: { type: "string", description: "Optional: restrict to files under this path prefix" },
|
|
260
|
+
},
|
|
261
|
+
required: ["query"],
|
|
262
|
+
},
|
|
263
|
+
},
|
|
264
|
+
{
|
|
265
|
+
name: "guardian_glob",
|
|
266
|
+
description: [
|
|
267
|
+
"Semantic file discovery — find files matching a path pattern.",
|
|
268
|
+
"Use INSTEAD of the Glob tool. Extracts keywords from the pattern and searches the guardian index.",
|
|
269
|
+
"Example: guardian_glob('src/auth/**/*.ts') → searches for 'auth typescript' files.",
|
|
270
|
+
"For pure extension globs with no path context, use guardian_search with a descriptive query.",
|
|
271
|
+
].join(" "),
|
|
272
|
+
inputSchema: {
|
|
273
|
+
type: "object",
|
|
274
|
+
properties: {
|
|
275
|
+
pattern: { type: "string", description: "Glob pattern (e.g. 'src/auth/**/*.ts', '**/middleware*')" },
|
|
276
|
+
},
|
|
277
|
+
required: ["pattern"],
|
|
278
|
+
},
|
|
279
|
+
},
|
|
170
280
|
];
|
|
171
281
|
const TOOL_HANDLERS = {
|
|
172
282
|
guardian_orient: orient,
|
|
@@ -175,6 +285,8 @@ const TOOL_HANDLERS = {
|
|
|
175
285
|
guardian_search: search,
|
|
176
286
|
guardian_model: model,
|
|
177
287
|
guardian_metrics: async () => JSON.stringify(metrics.summary()),
|
|
288
|
+
guardian_grep: grep,
|
|
289
|
+
guardian_glob: glob,
|
|
178
290
|
};
|
|
179
291
|
function respond(id, result) {
|
|
180
292
|
const msg = JSON.stringify({ jsonrpc: "2.0", id, result });
|
package/dist/commands/search.js
CHANGED
|
@@ -17,6 +17,7 @@ export async function runSearch(options) {
|
|
|
17
17
|
if (sqliteResult !== null) {
|
|
18
18
|
const base = JSON.parse(await querySearch(inputDir, options.query));
|
|
19
19
|
base.files = sqliteResult.files;
|
|
20
|
+
base.symbols = sqliteResult.symbols;
|
|
20
21
|
base.search_signal = sqliteResult.signal;
|
|
21
22
|
console.log(JSON.stringify(base));
|
|
22
23
|
return;
|
|
@@ -138,12 +139,36 @@ async function runSearchSqlite(specsInput, query, limit, backend = "sqlite") {
|
|
|
138
139
|
console.log(`No FTS results for "${query}"`);
|
|
139
140
|
return true;
|
|
140
141
|
}
|
|
142
|
+
let queryVec;
|
|
143
|
+
try {
|
|
144
|
+
const { embedQuery } = await import("../db/embeddings.js");
|
|
145
|
+
const vec = await embedQuery(cleaned || query, process.env.OPENAI_API_KEY);
|
|
146
|
+
if (vec)
|
|
147
|
+
queryVec = vec;
|
|
148
|
+
}
|
|
149
|
+
catch { /* graceful degradation */ }
|
|
150
|
+
const symbols = store.searchSymbols(cleaned || query, Math.ceil(limit / 2), queryVec);
|
|
141
151
|
const lines = [`## FTS5 search: "${query}"\n`];
|
|
152
|
+
// Build a map of file → matching symbols for quick lookup
|
|
153
|
+
const symbolsByFile = new Map();
|
|
154
|
+
for (const s of symbols) {
|
|
155
|
+
if (!symbolsByFile.has(s.file_path))
|
|
156
|
+
symbolsByFile.set(s.file_path, []);
|
|
157
|
+
symbolsByFile.get(s.file_path).push({ name: s.name, line: s.line });
|
|
158
|
+
}
|
|
142
159
|
for (const r of results) {
|
|
143
160
|
const rank = Math.abs(r.rank).toFixed(3);
|
|
144
161
|
lines.push(`### \`${r.file_path}\` (score: ${rank})`);
|
|
145
|
-
|
|
146
|
-
|
|
162
|
+
// Matching symbols from this file (snippet equivalent)
|
|
163
|
+
const fileSyms = symbolsByFile.get(r.file_path) ?? [];
|
|
164
|
+
const inlineSyms = r.matching_symbols.filter(s => !fileSyms.some(f => f.name === s));
|
|
165
|
+
if (fileSyms.length) {
|
|
166
|
+
for (const s of fileSyms)
|
|
167
|
+
lines.push(` → \`${s.name}\` :${s.line}`);
|
|
168
|
+
}
|
|
169
|
+
if (inlineSyms.length) {
|
|
170
|
+
lines.push(` symbols: ${inlineSyms.join(", ")}`);
|
|
171
|
+
}
|
|
147
172
|
if (r.imports.length)
|
|
148
173
|
lines.push(` imports: ${r.imports.join(", ")}`);
|
|
149
174
|
if (r.used_by.length)
|
|
@@ -177,7 +202,22 @@ async function getSqliteFileList(specsInput, query, limit, backend = "auto") {
|
|
|
177
202
|
if (results.length === 0)
|
|
178
203
|
return null;
|
|
179
204
|
const signal = store.querySignal(query);
|
|
180
|
-
|
|
205
|
+
// Hybrid symbol search: BM25 + call-graph authority + optional vector similarity.
|
|
206
|
+
// embedQuery uses local model (no API key) or OpenAI if OPENAI_API_KEY is set.
|
|
207
|
+
let queryVec;
|
|
208
|
+
try {
|
|
209
|
+
const { embedQuery } = await import("../db/embeddings.js");
|
|
210
|
+
const vec = await embedQuery(cleaned || query, process.env.OPENAI_API_KEY);
|
|
211
|
+
if (vec)
|
|
212
|
+
queryVec = vec;
|
|
213
|
+
}
|
|
214
|
+
catch { /* graceful degradation — vector unavailable */ }
|
|
215
|
+
const symbols = store.searchSymbols(cleaned || query, Math.ceil(limit / 2), queryVec);
|
|
216
|
+
return {
|
|
217
|
+
files: results.map((r) => r.file_path),
|
|
218
|
+
symbols: symbols.map((s) => ({ file: s.file_path, name: s.name, line: s.line })),
|
|
219
|
+
signal,
|
|
220
|
+
};
|
|
181
221
|
}
|
|
182
222
|
finally {
|
|
183
223
|
await store.close();
|
package/dist/config.js
CHANGED
|
@@ -273,6 +273,7 @@ function normalizeConfig(input, configDir) {
|
|
|
273
273
|
}
|
|
274
274
|
function mergeConfig(base, override) {
|
|
275
275
|
return {
|
|
276
|
+
project_id: override.project_id ?? base.project_id,
|
|
276
277
|
project: {
|
|
277
278
|
root: override.project?.root ?? base.project?.root ?? "",
|
|
278
279
|
backendRoot: override.project?.backendRoot ?? base.project?.backendRoot ?? "",
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding generation for function-level semantic search.
|
|
3
|
+
*
|
|
4
|
+
* Strategy (local-first, no API key required):
|
|
5
|
+
* Default — @xenova/transformers running Xenova/all-MiniLM-L6-v2 on-device.
|
|
6
|
+
* Model downloads once (~23 MB) and is cached in ~/.cache/xenova.
|
|
7
|
+
* dim=384, pure JS/ONNX, no external service needed.
|
|
8
|
+
*
|
|
9
|
+
* Upgrade — OpenAI text-embedding-3-small when OPENAI_API_KEY is set.
|
|
10
|
+
* dim=256, higher quality, costs roughly $0.02 per 1M tokens (check current OpenAI pricing).
|
|
11
|
+
*
|
|
12
|
+
* Text per function (concise — name carries most semantic signal):
|
|
13
|
+
* "{name} {filename}: {top calls} {short literals}"
|
|
14
|
+
*/
|
|
15
|
+
const LOCAL_MODEL = "Xenova/all-MiniLM-L6-v2";
|
|
16
|
+
const LOCAL_DIM = 384;
|
|
17
|
+
const OPENAI_MODEL = "text-embedding-3-small";
|
|
18
|
+
const OPENAI_DIM = 256;
|
|
19
|
+
const BATCH = 64; // safe for both local and OpenAI
|
|
20
|
+
/**
 * Build the short text that gets embedded for one function.
 *
 * Layout: "{name} {filename}: {calls} {literals}". Only the first 10 entries of
 * `fn.calls` and the first 5 entries of `fn.stringLiterals` are used; the
 * literal section is capped at 100 characters and the whole text at 300.
 *
 * @param {{ name: string, file: string, calls?: string[], stringLiterals?: string[] }} fn
 * @returns {string} trimmed text, at most 300 characters long.
 */
function fnToText(fn) {
    // Keep only the last path segment. Both "/" and "\" count as separators so
    // backslash-separated paths also reduce to the bare file name.
    const filename = fn.file.split(/[\\/]/).pop();
    const callStr = (fn.calls ?? []).slice(0, 10).join(" ");
    const litStr = (fn.stringLiterals ?? []).slice(0, 5).join(" ").slice(0, 100);
    return `${fn.name} ${filename}: ${callStr} ${litStr}`.trim().slice(0, 300);
}
|
|
26
|
+
// ── Local embedder (no API key) ───────────────────────────────────────────────
|
|
27
|
+
/**
 * Embed texts with a local feature-extraction pipeline, one text per call.
 *
 * Each text is passed to `pipe` with mean pooling and normalization enabled,
 * and the `data` of the result is copied into a fresh Float32Array.
 *
 * @param {string[]} texts texts to embed, processed sequentially in order.
 * @param {(text: string, options: object) => Promise<{ data: ArrayLike<number> }>} pipe
 * @returns {Promise<Float32Array[]>} one vector per input text, same order.
 */
async function embedBatchLocal(texts, pipe) {
    const pipeOptions = () => ({ pooling: "mean", normalize: true });
    const vectors = [];
    for (let idx = 0; idx < texts.length; idx += 1) {
        const output = await pipe(texts[idx], pipeOptions());
        const vector = new Float32Array(output.data);
        vectors.push(vector);
    }
    return vectors;
}
|
|
35
|
+
// ── OpenAI embedder (OPENAI_API_KEY required) ─────────────────────────────────
|
|
36
|
+
/**
 * Embed a batch of texts through the OpenAI embeddings endpoint.
 *
 * The `openai` package is imported lazily so it is only loaded when this path
 * is actually used.
 *
 * @param {string[]} texts texts to embed in a single request.
 * @param {string} apiKey OpenAI API key passed to the client.
 * @returns {Promise<Float32Array[]>} one vector per input text, in input order;
 *   an empty array when `texts` is empty (no request is made).
 */
async function embedBatchOpenAI(texts, apiKey) {
    // Nothing to embed: skip loading the SDK and issuing a request for an empty input list.
    if (texts.length === 0)
        return [];
    const { default: OpenAI } = await import("openai");
    const client = new OpenAI({ apiKey });
    const response = await client.embeddings.create({
        model: OPENAI_MODEL,
        input: texts,
        dimensions: OPENAI_DIM,
        encoding_format: "float",
    });
    // Callers pair vectors with inputs by position, so order by the `index`
    // each item reports instead of relying on the response array order.
    const ordered = [...response.data].sort((a, b) => (a.index ?? 0) - (b.index ?? 0));
    return ordered.map(d => new Float32Array(d.embedding));
}
|
|
47
|
+
// ── Public API ────────────────────────────────────────────────────────────────
|
|
48
|
+
/**
 * Embed all functions and hand the vectors to `store.rebuildEmbeddings`.
 * Uses the local model by default; OpenAI when `apiKey` is set.
 *
 * Functions are processed in batches of BATCH. A batch whose embedding call
 * throws is reported with console.warn and skipped; the remaining batches
 * still run. Progress is logged roughly every 500 functions.
 *
 * @param {{ rebuildEmbeddings(rows: object[]): void }} store receives rows of
 *   `{ file_path, name, line, vec }`.
 * @param {Array<{ file: string, name: string, lines: number[] }>} fns functions
 *   to embed; `lines[0]` is stored as the row's `line`.
 * @param {string} [apiKey] OpenAI API key; when falsy the local model is used.
 * @returns {Promise<void>}
 */
export async function embedFunctions(store, fns, apiKey) {
    if (fns.length === 0)
        return;
    const useOpenAI = !!apiKey;
    let pipe;
    if (!useOpenAI) {
        // Lazy-load local model (downloads once, then cached)
        const { pipeline } = await import("@xenova/transformers");
        console.log(`[guardian embed] loading local model ${LOCAL_MODEL}…`);
        pipe = await pipeline("feature-extraction", LOCAL_MODEL);
    }
    // Progress is tracked against a moving threshold because the batch size
    // does not divide 500 evenly: a plain `i % 500 === 0` check would almost
    // never fire.
    const PROGRESS_EVERY = 500;
    let nextProgressAt = PROGRESS_EVERY;
    const rows = [];
    for (let i = 0; i < fns.length; i += BATCH) {
        const batch = fns.slice(i, i + BATCH);
        const texts = batch.map(fnToText);
        let vecs;
        try {
            vecs = useOpenAI
                ? await embedBatchOpenAI(texts, apiKey)
                : await embedBatchLocal(texts, pipe);
        }
        catch (err) {
            console.warn(`[guardian embed] batch ${i}–${i + batch.length - 1} failed: ${err.message}`);
            continue;
        }
        for (let j = 0; j < batch.length; j++) {
            // Skip functions for which the embedder returned no vector.
            if (!vecs[j])
                continue;
            rows.push({
                file_path: batch[j].file,
                name: batch[j].name,
                line: batch[j].lines[0],
                vec: vecs[j],
            });
        }
        const processed = i + batch.length;
        // The final total is reported by the summary line below, so the last
        // batch does not emit a progress line of its own.
        if (processed >= nextProgressAt && processed < fns.length) {
            console.log(`[guardian embed] ${processed}/${fns.length} functions embedded`);
            nextProgressAt = (Math.floor(processed / PROGRESS_EVERY) + 1) * PROGRESS_EVERY;
        }
    }
    store.rebuildEmbeddings(rows);
    const source = useOpenAI ? `OpenAI ${OPENAI_MODEL} dim=${OPENAI_DIM}` : `local ${LOCAL_MODEL} dim=${LOCAL_DIM}`;
    console.log(`[guardian embed] stored ${rows.length} embeddings (${source})`);
}
|
|
95
|
+
/**
 * Turn one query string into an embedding vector for hybrid search.
 *
 * The query is cut to its first 300 characters. With an `apiKey` the OpenAI
 * embedder is used; otherwise the local pipeline is loaded and used. Any error
 * on either path is swallowed and reported as `null`, so callers can fall back
 * to search without a vector.
 *
 * @param {string} query text to embed.
 * @param {string} [apiKey] OpenAI API key; when falsy the local model is used.
 * @returns {Promise<Float32Array|null>} the vector, or null when none could be produced.
 */
export async function embedQuery(query, apiKey) {
    // Evaluated lazily so truncation happens at the same point on each path.
    const clipped = () => query.slice(0, 300);
    try {
        let vectors;
        if (apiKey) {
            vectors = await embedBatchOpenAI([clipped()], apiKey);
        }
        else {
            const { pipeline } = await import("@xenova/transformers");
            const extractor = await pipeline("feature-extraction", LOCAL_MODEL);
            vectors = await embedBatchLocal([clipped()], extractor);
        }
        const [first] = vectors;
        return first ?? null;
    }
    catch {
        return null;
    }
}
|