grepmax 0.17.16 → 0.17.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/mcp.js +24 -1
- package/dist/commands/search.js +17 -1
- package/dist/eval-seed.js +284 -0
- package/dist/lib/daemon/daemon.js +5 -1
- package/dist/lib/daemon/ipc-handler.js +3 -0
- package/dist/lib/search/searcher.js +86 -18
- package/dist/lib/search/seed-weight.js +125 -0
- package/dist/lib/workers/orchestrator.js +9 -1
- package/package.json +1 -1
- package/plugins/grepmax/.claude-plugin/plugin.json +1 -1
package/dist/commands/mcp.js
CHANGED
|
@@ -158,6 +158,14 @@ const TOOLS = [
|
|
|
158
158
|
type: "string",
|
|
159
159
|
description: "Project names to exclude (comma-separated)",
|
|
160
160
|
},
|
|
161
|
+
seed_files: {
|
|
162
|
+
type: "string",
|
|
163
|
+
description: "Bias results toward your working context: comma-separated paths you have open (e.g. 'src/lib/llm/server.ts'). On-topic chunks in these files get lifted; off-topic ones are not.",
|
|
164
|
+
},
|
|
165
|
+
seed_symbols: {
|
|
166
|
+
type: "string",
|
|
167
|
+
description: "Bias results toward identifiers you're working with: comma-separated symbol names. Chunks defining a seeded symbol are preferred over mere callers.",
|
|
168
|
+
},
|
|
161
169
|
},
|
|
162
170
|
required: ["query"],
|
|
163
171
|
},
|
|
@@ -784,7 +792,22 @@ exports.mcp = new commander_1.Command("mcp")
|
|
|
784
792
|
}
|
|
785
793
|
}
|
|
786
794
|
}
|
|
787
|
-
|
|
795
|
+
// Aider-style seeding: the agent passes its open files / discussed
|
|
796
|
+
// symbols; the searcher biases candidate generation toward them.
|
|
797
|
+
/**
 * Normalizes a seed argument into a list of trimmed, non-empty names.
 *
 * Accepts either a comma-separated string or an array. Array entries are
 * stringified and then comma-split exactly like the string form, so
 * `["a.ts, b.ts"]` and `"a.ts, b.ts"` yield the same seeds. Null/undefined
 * array entries are skipped rather than becoming the literal seeds
 * "null"/"undefined". Any other input type produces no seeds.
 *
 * @param {unknown} v Raw tool argument (string, array, or anything else).
 * @returns {string[] | undefined} The cleaned names, or undefined when none remain.
 */
const parseSeedList = (v) => {
    let raw;
    if (Array.isArray(v)) {
        raw = v
            .filter((x) => x !== null && x !== undefined)
            .map((x) => String(x));
    }
    else if (typeof v === "string") {
        raw = [v];
    }
    else {
        raw = [];
    }
    const items = raw
        .flatMap((s) => s.split(","))
        .map((s) => s.trim())
        .filter((s) => s.length > 0);
    // undefined (not []) signals "no seeds" so the caller can leave seeding inert.
    return items.length > 0 ? items : undefined;
};
|
|
807
|
+
const seedFiles = parseSeedList(args.seed_files);
|
|
808
|
+
const seedSymbols = parseSeedList(args.seed_symbols);
|
|
809
|
+
const seeds = seedFiles || seedSymbols ? { files: seedFiles, symbols: seedSymbols } : undefined;
|
|
810
|
+
const result = yield searcher.search(query, limit, { rerank: process.env.GMAX_RERANK === "1", seeds }, Object.keys(filters).length > 0 ? filters : undefined, pathPrefix);
|
|
788
811
|
if (!result.data || result.data.length === 0) {
|
|
789
812
|
return ok("No matches found. Try broadening your query, using fewer keywords, or check `gmax status` to verify the project is indexed.");
|
|
790
813
|
}
|
package/dist/commands/search.js
CHANGED
|
@@ -377,6 +377,8 @@ exports.search = new commander_1.Command("search")
|
|
|
377
377
|
.option("--name <regex>", "Filter results by symbol name regex")
|
|
378
378
|
.option("-C, --context <n>", "Include N lines before/after each result")
|
|
379
379
|
.option("--agent", "Ultra-compact output for AI agents (one line per result)", false)
|
|
380
|
+
.option("--seed-file <path>", "Bias results toward your working context (repeatable; comma-separated also accepted)", (value, prev) => (prev ? [...prev, value] : [value]))
|
|
381
|
+
.option("--seed-symbol <name>", "Bias results toward an identifier you're working with (repeatable; comma-separated also accepted)", (value, prev) => (prev ? [...prev, value] : [value]))
|
|
380
382
|
.argument("<pattern>", 'Natural language query (e.g. "where do we handle auth?")')
|
|
381
383
|
.argument("[path]", "Restrict search to this path prefix")
|
|
382
384
|
.addHelpText("after", `
|
|
@@ -569,6 +571,19 @@ Examples:
|
|
|
569
571
|
searchFilters.inPrefixes = scope.inPrefixes;
|
|
570
572
|
if (scope.excludePrefixes.length > 0)
|
|
571
573
|
searchFilters.excludePrefixes = scope.excludePrefixes;
|
|
574
|
+
// Aider-style seeding: --seed-file / --seed-symbol (repeatable, also
|
|
575
|
+
// comma-separated) bias candidate generation toward the caller's working
|
|
576
|
+
// context. Absent → undefined → inert.
|
|
577
|
+
/**
 * Flattens repeated / comma-separated option values into one list of names.
 *
 * Each entry is split on commas, every piece is trimmed, and empty pieces
 * are dropped.
 *
 * @param {string[] | null | undefined} vals Collected option values.
 * @returns {string[] | undefined} Cleaned names, or undefined when nothing is left.
 */
const splitSeeds = (vals) => {
    if (vals === null || vals === undefined) {
        return undefined;
    }
    const cleaned = [];
    for (const piece of vals.flatMap((entry) => entry.split(","))) {
        const name = piece.trim();
        if (name.length > 0) {
            cleaned.push(name);
        }
    }
    return cleaned.length === 0 ? undefined : cleaned;
};
|
|
584
|
+
const seedFiles = splitSeeds(options.seedFile);
|
|
585
|
+
const seedSymbols = splitSeeds(options.seedSymbol);
|
|
586
|
+
const seeds = seedFiles || seedSymbols ? { files: seedFiles, symbols: seedSymbols } : undefined;
|
|
572
587
|
// Daemon-mediated search: ships query+args over IPC, daemon runs the
|
|
573
588
|
// hybrid+rerank against its already-warm VectorDB and worker pool.
|
|
574
589
|
// Drops cold-start cost (~17s wall, 6GB RAM in the CLI) to <1s. Falls
|
|
@@ -592,6 +607,7 @@ Examples:
|
|
|
592
607
|
pathPrefix: pathFilter,
|
|
593
608
|
rerank: process.env.GMAX_RERANK === "1",
|
|
594
609
|
explain: options.explain,
|
|
610
|
+
seeds,
|
|
595
611
|
includeSkeletons: options.skeleton,
|
|
596
612
|
includeGraph: options.symbol,
|
|
597
613
|
}, { timeoutMs: 60000 });
|
|
@@ -695,7 +711,7 @@ Examples:
|
|
|
695
711
|
}
|
|
696
712
|
}
|
|
697
713
|
const searcher = new searcher_1.Searcher(vectorDb);
|
|
698
|
-
searchResult = yield searcher.search(pattern, parseInt(options.m, 10), { rerank: process.env.GMAX_RERANK === "1", explain: options.explain }, Object.keys(searchFilters).length > 0
|
|
714
|
+
searchResult = yield searcher.search(pattern, parseInt(options.m, 10), { rerank: process.env.GMAX_RERANK === "1", explain: options.explain, seeds }, Object.keys(searchFilters).length > 0
|
|
699
715
|
? searchFilters
|
|
700
716
|
: undefined, pathFilter);
|
|
701
717
|
} // end if (!searchResult) — in-process fallback
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Seed-eval harness — measures Aider-style chat/file seeding (Phase 4).
|
|
4
|
+
*
|
|
5
|
+
* WHY A SEPARATE HARNESS. `bench:oss` (src/eval-oss.ts) uses bare-symbol P1
|
|
6
|
+
* lookups with NO seed context, so it can only serve as a *no-seed regression
|
|
7
|
+
* guard* for seeding (seeded path absent → results must be unchanged). It
|
|
8
|
+
* cannot demonstrate that seeding *helps*: attaching a seed equal to the answer
|
|
9
|
+
* file would be circular, and the fixtures carry no realistic "open files"
|
|
10
|
+
* annotation.
|
|
11
|
+
*
|
|
12
|
+
* THE HONEST DESIGN. Every case here uses an *ambiguous* natural-language query
|
|
13
|
+
* that legitimately matches several subsystems, plus a realistic seed (a file
|
|
14
|
+
* an agent would have open, or a symbol they're discussing). The metric is the
|
|
15
|
+
* rank of the *contextually-correct* answer file, measured twice: baseline (no
|
|
16
|
+
* seed) vs seeded. Three case kinds:
|
|
17
|
+
*
|
|
18
|
+
* - route: same query, seed points at subsystem A → answer should be A's
|
|
19
|
+
* file (which a no-seed search ranks below a different subsystem).
|
|
20
|
+
* Non-circular because the SAME query under a DIFFERENT seed must
|
|
21
|
+
* route to a DIFFERENT, independently-valid answer — something no
|
|
22
|
+
* static ranking can do.
|
|
23
|
+
* - recover: the contextually-correct file is OUT of the no-seed top-K
|
|
24
|
+
* entirely; seeding must pull it back via candidate-generation
|
|
25
|
+
* weight (a rerank-only seed could never recover an out-of-pool
|
|
26
|
+
* item — this case is the load-bearing proof of "weight in
|
|
27
|
+
* candidate generation, not rerank").
|
|
28
|
+
* - guard: the seed is IRRELEVANT to the query; the no-seed rank-1 file must
|
|
29
|
+
* stay rank 1. Catches seeding doing harm.
|
|
30
|
+
*
|
|
31
|
+
* Baselines below were measured live against the gmax index on 2026-06-02
|
|
32
|
+
* (granite-small, gpu) and are quoted per case. They are documentation, not
|
|
33
|
+
* assertions — the harness recomputes them every run.
|
|
34
|
+
*
|
|
35
|
+
* Usage:
|
|
36
|
+
* npx tsx src/eval-seed.ts # table output
|
|
37
|
+
* npx tsx src/eval-seed.ts --json # machine-readable
|
|
38
|
+
*/
|
|
39
|
+
// Module-interop helper (presumably emitted by the TypeScript compiler; an
// existing this.__createBinding is reused when present).
// Exposes property `k` of module `m` on object `o` under the name `k2`
// (`k2` defaults to `k`).
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
40
|
+
if (k2 === undefined) k2 = k;
|
|
41
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
42
|
+
// Replace the descriptor with a forwarding getter when there is none, when it
// is an accessor on a non-__esModule object, or when it is writable/configurable.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
43
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
44
|
+
}
|
|
45
|
+
Object.defineProperty(o, k2, desc);
|
|
46
|
+
// Fallback when Object.create is unavailable: plain value copy (not live).
}) : (function(o, m, k, k2) {
|
|
47
|
+
if (k2 === undefined) k2 = k;
|
|
48
|
+
o[k2] = m[k];
|
|
49
|
+
}));
|
|
50
|
+
// Module-interop helper: stores `v` as the "default" property of `o`.
// Uses an enumerable defineProperty when Object.create exists, otherwise a
// plain assignment.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
51
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
52
|
+
}) : function(o, v) {
|
|
53
|
+
o["default"] = v;
|
|
54
|
+
});
|
|
55
|
+
// Module-interop helper behind `__importStar(require(...))`: returns `mod`
// unchanged when it is flagged __esModule, otherwise builds a namespace-like
// object whose own keys (except "default") are bound to `mod` and whose
// "default" is `mod` itself.
var __importStar = (this && this.__importStar) || (function () {
|
|
56
|
+
var ownKeys = function(o) {
|
|
57
|
+
// Self-replacing: the first call swaps `ownKeys` for
// Object.getOwnPropertyNames (or a for-in fallback), then delegates to it.
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
58
|
+
var ar = [];
|
|
59
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
60
|
+
return ar;
|
|
61
|
+
};
|
|
62
|
+
return ownKeys(o);
|
|
63
|
+
};
|
|
64
|
+
return function (mod) {
|
|
65
|
+
if (mod && mod.__esModule) return mod;
|
|
66
|
+
var result = {};
|
|
67
|
+
// Bind every own key except "default"; null/undefined modules get no bindings.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
68
|
+
__setModuleDefault(result, mod);
|
|
69
|
+
return result;
|
|
70
|
+
};
|
|
71
|
+
})();
|
|
72
|
+
// Async-function shim: drives `generator` to completion and returns a promise
// (constructor `P`, defaulting to Promise). Each yielded value is wrapped in a
// `P` and its settlement is fed back into the generator via next()/throw().
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
73
|
+
// Wrap non-`P` values so `.then` is always available.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
74
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
75
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
76
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
77
|
+
// Resolve on completion, otherwise chain onto the yielded value.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
78
|
+
// Instantiate the generator with the given this/arguments and start it.
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
79
|
+
});
|
|
80
|
+
};
|
|
81
|
+
var _a, _b, _c;
|
|
82
|
+
var _d;
|
|
83
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
84
|
+
(_a = (_d = process.env).GMAX_WORKER_COUNT) !== null && _a !== void 0 ? _a : (_d.GMAX_WORKER_COUNT = "1");
|
|
85
|
+
const path = __importStar(require("node:path"));
|
|
86
|
+
const searcher_1 = require("./lib/search/searcher");
|
|
87
|
+
const vector_db_1 = require("./lib/store/vector-db");
|
|
88
|
+
const exit_1 = require("./lib/utils/exit");
|
|
89
|
+
const config_1 = require("./config");
|
|
90
|
+
// Route/recover cases target the gmax repo itself — the corpus whose graph the
|
|
91
|
+
// author can verify by hand. The "idle timeout" concept lives in three
|
|
92
|
+
// subsystems (worker reap / LLM server / daemon); "health check" in three more.
|
|
93
|
+
// That natural polysemy is what makes the routing test honest.
|
|
94
|
+
//
|
|
95
|
+
// The no-harm guards instead target the immutable express fixture: querying
|
|
96
|
+
// gmax for "rank fusion" is contaminated by this harness's own source (which is
|
|
97
|
+
// full of "fusion" prose and gets live-indexed), so a stable external corpus is
|
|
98
|
+
// the honest place to assert "an irrelevant seed must not displace the winner".
|
|
99
|
+
// Corpus roots keyed by repo name. Both are built by joining process.env.HOME
// with a fixed sub-path; when HOME is null/undefined the empty string is joined
// instead, so the result is then no longer anchored at a home directory.
const REPO_ROOTS = {
|
|
100
|
+
gmax: path.join((_b = process.env.HOME) !== null && _b !== void 0 ? _b : "", "Development/beyond/tools/gmax"),
|
|
101
|
+
express: path.join((_c = process.env.HOME) !== null && _c !== void 0 ? _c : "", "Development/sandbox/bench-fixtures/express"),
|
|
102
|
+
};
|
|
103
|
+
const GMAX_CASES = [
|
|
104
|
+
// ── Triple A: "idle timeout shutdown" routes to worker / LLM / daemon ──────
|
|
105
|
+
{
|
|
106
|
+
id: "idle-pool",
|
|
107
|
+
query: "idle timeout shutdown",
|
|
108
|
+
seedFiles: ["src/lib/workers/pool.ts"],
|
|
109
|
+
expectedFile: "src/lib/workers/pool.ts",
|
|
110
|
+
kind: "guard", // already rank 1 without seeds — seeding must not demote it
|
|
111
|
+
baselineRankNote: 1,
|
|
112
|
+
note: "worker-reap is the no-seed winner; seeding its own file keeps it #1",
|
|
113
|
+
},
|
|
114
|
+
{
|
|
115
|
+
id: "idle-llm",
|
|
116
|
+
query: "idle timeout shutdown",
|
|
117
|
+
seedFiles: ["src/lib/llm/server.ts"],
|
|
118
|
+
expectedFile: "src/lib/llm/server.ts",
|
|
119
|
+
kind: "route",
|
|
120
|
+
baselineRankNote: 5,
|
|
121
|
+
note: "LLM idle watchdog at #5 behind worker-reap chunks; seed should lift it to #1",
|
|
122
|
+
},
|
|
123
|
+
{
|
|
124
|
+
id: "idle-daemon",
|
|
125
|
+
query: "idle timeout shutdown",
|
|
126
|
+
seedFiles: ["src/lib/daemon/daemon.ts"],
|
|
127
|
+
expectedFile: "src/lib/daemon/daemon.ts",
|
|
128
|
+
kind: "recover",
|
|
129
|
+
baselineRankNote: 0,
|
|
130
|
+
note: "daemon idle checker is OUT of the no-seed top-25; candidate-gen weight must recover it",
|
|
131
|
+
},
|
|
132
|
+
// ── Triple B: "health check probe" routes to doctor / mlx / llm ────────────
|
|
133
|
+
{
|
|
134
|
+
id: "health-doctor",
|
|
135
|
+
query: "health check probe",
|
|
136
|
+
seedFiles: ["src/commands/doctor.ts"],
|
|
137
|
+
expectedFile: "src/commands/doctor.ts",
|
|
138
|
+
kind: "guard",
|
|
139
|
+
baselineRankNote: 1,
|
|
140
|
+
note: "doctor is the no-seed winner; seeding its own file keeps it #1",
|
|
141
|
+
},
|
|
142
|
+
{
|
|
143
|
+
id: "health-mlx",
|
|
144
|
+
query: "health check probe",
|
|
145
|
+
seedFiles: ["src/lib/workers/embeddings/mlx-client.ts"],
|
|
146
|
+
expectedFile: "src/lib/workers/embeddings/mlx-client.ts",
|
|
147
|
+
kind: "route",
|
|
148
|
+
baselineRankNote: 3,
|
|
149
|
+
note: "mlx checkHealth at #3; seed should lift the embed-server probe to #1",
|
|
150
|
+
},
|
|
151
|
+
{
|
|
152
|
+
id: "health-llm",
|
|
153
|
+
query: "health check probe",
|
|
154
|
+
seedFiles: ["src/lib/llm/server.ts"],
|
|
155
|
+
expectedFile: "src/lib/llm/server.ts",
|
|
156
|
+
kind: "route",
|
|
157
|
+
baselineRankNote: 5,
|
|
158
|
+
note: "llm-server healthy() at #5; seed should lift it to #1",
|
|
159
|
+
},
|
|
160
|
+
// ── Symbol seeding: discussed identifier instead of open file ──────────────
|
|
161
|
+
{
|
|
162
|
+
id: "idle-llm-sym",
|
|
163
|
+
query: "idle timeout shutdown",
|
|
164
|
+
seedSymbols: ["LlmServer"],
|
|
165
|
+
expectedFile: "src/lib/llm/server.ts",
|
|
166
|
+
kind: "route",
|
|
167
|
+
baselineRankNote: 5,
|
|
168
|
+
note: "symbol-seed analog of idle-llm: discussing LlmServer biases toward its file",
|
|
169
|
+
},
|
|
170
|
+
// ── Guards: irrelevant seed must not perturb a strong no-seed winner.
|
|
171
|
+
// On the immutable express fixture so the assertion can't be polluted by
|
|
172
|
+
// live-indexing this harness's own source. ────────────────────────────────
|
|
173
|
+
{
|
|
174
|
+
id: "guard-express-file",
|
|
175
|
+
repo: "express",
|
|
176
|
+
query: "create the application factory",
|
|
177
|
+
seedFiles: ["lib/view.js"],
|
|
178
|
+
expectedFile: "lib/express.js",
|
|
179
|
+
kind: "guard",
|
|
180
|
+
baselineRankNote: 1,
|
|
181
|
+
note: "view.js (rank ~150 for this query) is off-topic; express.js must stay #1",
|
|
182
|
+
},
|
|
183
|
+
{
|
|
184
|
+
id: "guard-express-sym",
|
|
185
|
+
repo: "express",
|
|
186
|
+
query: "create the application factory",
|
|
187
|
+
seedSymbols: ["View"],
|
|
188
|
+
expectedFile: "lib/express.js",
|
|
189
|
+
kind: "guard",
|
|
190
|
+
baselineRankNote: 1,
|
|
191
|
+
note: "View is defined in the off-topic view.js; express.js must stay #1",
|
|
192
|
+
},
|
|
193
|
+
];
|
|
194
|
+
/**
 * Rank (1-indexed) of the first result whose path is `expectedFile`; 0 = miss.
 *
 * Matching is case-insensitive and anchored on a path-component boundary:
 * a chunk matches when its path equals `expectedFile` or ends with
 * `/expectedFile`. A bare suffix test would also accept unrelated files such
 * as "glib/express.js" for "lib/express.js" and inflate the reported rank.
 *
 * @param {{ data?: Array<{ metadata?: { path?: unknown } }> }} response Search response.
 * @param {string} expectedFile Repo-relative path of the wanted file.
 * @returns {number} 1-indexed position of the first matching chunk, or 0 when
 *   none matches (including a response without a `data` array).
 */
function rankOf(response, expectedFile) {
    const chunks = response && Array.isArray(response.data) ? response.data : [];
    const want = expectedFile.toLowerCase();
    const boundedSuffix = `/${want}`;
    const idx = chunks.findIndex((chunk) => {
        const meta = chunk === null || chunk === undefined ? undefined : chunk.metadata;
        const p = String((meta && meta.path) || "").toLowerCase();
        return p === want || p.endsWith(boundedSuffix);
    });
    // findIndex yields -1 on a miss, which maps onto the 0 = "not found" convention.
    return idx + 1;
}
|
|
204
|
+
/**
 * Decides whether one case passes, given the expected file's rank without
 * seeds (`baseline`) and with seeds (`seeded`). Ranks are 1-indexed; 0 means
 * "not found in top-K".
 *
 * - "route":   the seeded rank must be exactly 1.
 * - "recover": the file must be absent from the baseline and land at rank 1.
 * - "guard":   the file must be present in the seeded results and must not
 *              rank worse than in the baseline. A file missing from BOTH runs
 *              fails: nothing was guarded, so a pass would be vacuous (for
 *              example when the fixture corpus is not indexed).
 *
 * @param {string} kind "route" | "recover" | "guard".
 * @param {number} baseline Rank without seeds (0 = miss).
 * @param {number} seeded Rank with seeds (0 = miss).
 * @returns {boolean} true when the case passes; false for unknown kinds.
 */
function judge(kind, baseline, seeded) {
    // 0 means "not found in top-K"; treat as worse than any found rank.
    const b = baseline === 0 ? Infinity : baseline;
    const s = seeded === 0 ? Infinity : seeded;
    switch (kind) {
        case "route":
            // Rank 1 can never be worse than the baseline, so landing at the
            // top is the whole condition.
            return s === 1;
        case "recover":
            // Out-of-pool baseline must be pulled into the results and to the top.
            return baseline === 0 && s === 1;
        case "guard":
            // No harm: the file must still be found and must not lose rank.
            return seeded !== 0 && s <= b;
        default:
            return false;
    }
}
|
|
221
|
+
/**
 * Runs every case in GMAX_CASES twice (once without seeds, once with the
 * case's seed files/symbols), judges each baseline/seeded rank pair, and prints
 * the outcome as JSON (`--json` argument or GMAX_EVAL_JSON=1) or as a table.
 * Reranking is enabled for both runs only when GMAX_EVAL_RERANK=1.
 *
 * The vector DB is closed in a `finally`, so a failing search does not leave
 * it open. The function finishes by calling gracefulExit(0).
 *
 * @returns {Promise<void>}
 */
function run() {
    return __awaiter(this, void 0, void 0, function* () {
        const jsonMode = process.argv.includes("--json") || process.env.GMAX_EVAL_JSON === "1";
        const topK = 25;
        const rerank = process.env.GMAX_EVAL_RERANK === "1";
        const vectorDb = new vector_db_1.VectorDB(config_1.PATHS.lancedbDir);
        const results = [];
        try {
            const searcher = new searcher_1.Searcher(vectorDb);
            for (const c of GMAX_CASES) {
                // Cases without an explicit repo target the gmax corpus.
                const repo = c.repo !== null && c.repo !== undefined ? c.repo : "gmax";
                const pathPrefix = `${REPO_ROOTS[repo]}/`;
                const baseRes = yield searcher.search(c.query, topK, { rerank }, undefined, pathPrefix);
                const seededRes = yield searcher.search(c.query, topK, { rerank, seeds: { files: c.seedFiles, symbols: c.seedSymbols } }, undefined, pathPrefix);
                const baselineRank = rankOf(baseRes, c.expectedFile);
                const seededRank = rankOf(seededRes, c.expectedFile);
                results.push({
                    id: c.id,
                    kind: c.kind,
                    query: c.query,
                    expectedFile: c.expectedFile,
                    baselineRank,
                    seededRank,
                    pass: judge(c.kind, baselineRank, seededRank),
                    note: c.note,
                });
            }
        }
        finally {
            yield vectorDb.close();
        }
        const passes = results.filter((r) => r.pass).length;
        // Per-kind totals, computed with one filter pass per kind.
        const tally = (k) => {
            const ofKind = results.filter((r) => r.kind === k);
            return { total: ofKind.length, pass: ofKind.filter((r) => r.pass).length };
        };
        const summary = {
            cases: results.length,
            passes,
            route: tally("route"),
            recover: tally("recover"),
            guard: tally("guard"),
        };
        if (jsonMode) {
            process.stdout.write(`${JSON.stringify({ rerank, summary, results }, null, 2)}\n`);
        }
        else {
            console.log(`Seed eval (rerank=${rerank ? "on" : "off"})\n`);
            const fmtRank = (r) => (r === 0 ? "—" : `#${r}`);
            for (const r of results) {
                const arrow = `${fmtRank(r.baselineRank)} → ${fmtRank(r.seededRank)}`;
                const mark = r.pass ? "✓" : "✗";
                const seed = `[${r.kind}]`;
                console.log(` ${mark} ${r.id.padEnd(18)} ${seed.padEnd(10)} ${arrow.padEnd(12)} ${r.expectedFile}`);
                if (r.note)
                    console.log(` ${r.note}`);
            }
            console.log(`\n → ${passes}/${results.length} pass ` +
                `(route ${summary.route.pass}/${summary.route.total}, ` +
                `recover ${summary.recover.pass}/${summary.recover.total}, ` +
                `guard ${summary.guard.pass}/${summary.guard.total})`);
        }
        yield (0, exit_1.gracefulExit)(0);
    });
}
|
|
279
|
+
// Entry point: start the harness only when this file is the main module and
// autorun has not been switched off with GMAX_EVAL_AUTORUN=0. Any failure is
// logged and turned into exit code 1.
if (!(require.main !== module || process.env.GMAX_EVAL_AUTORUN === "0")) {
    const failHard = (err) => {
        console.error(err);
        process.exit(1);
    };
    run().catch(failHard);
}
|
|
@@ -867,7 +867,11 @@ class Daemon {
|
|
|
867
867
|
this.lastActivity = Date.now();
|
|
868
868
|
let result;
|
|
869
869
|
try {
|
|
870
|
-
result = yield searcher.search(payload.query, payload.limit, {
|
|
870
|
+
result = yield searcher.search(payload.query, payload.limit, {
|
|
871
|
+
rerank: payload.rerank === true,
|
|
872
|
+
explain: payload.explain === true,
|
|
873
|
+
seeds: payload.seeds,
|
|
874
|
+
}, payload.filters, payload.pathPrefix, undefined, signal);
|
|
871
875
|
}
|
|
872
876
|
catch (err) {
|
|
873
877
|
if ((err === null || err === void 0 ? void 0 : err.name) === "AbortError") {
|
|
@@ -171,6 +171,9 @@ function handleCommand(daemon, cmd, conn) {
|
|
|
171
171
|
pathPrefix: typeof cmd.pathPrefix === "string" ? cmd.pathPrefix : undefined,
|
|
172
172
|
rerank: cmd.rerank === true,
|
|
173
173
|
explain: cmd.explain === true,
|
|
174
|
+
seeds: cmd.seeds && typeof cmd.seeds === "object" && !Array.isArray(cmd.seeds)
|
|
175
|
+
? cmd.seeds
|
|
176
|
+
: undefined,
|
|
174
177
|
includeSkeletons: cmd.includeSkeletons === true,
|
|
175
178
|
skeletonLimit: skeletonLimitRaw,
|
|
176
179
|
includeGraph: cmd.includeGraph === true,
|
|
@@ -17,6 +17,7 @@ const filter_builder_1 = require("../utils/filter-builder");
|
|
|
17
17
|
const pool_1 = require("../workers/pool");
|
|
18
18
|
const intent_1 = require("./intent");
|
|
19
19
|
const pagerank_1 = require("./pagerank");
|
|
20
|
+
const seed_weight_1 = require("./seed-weight");
|
|
20
21
|
// Reads a defined_symbols / referenced_symbols column that may arrive as a plain
|
|
21
22
|
// array or a LanceDB Arrow proxy (.toArray()).
|
|
22
23
|
function readSymbolArray(val) {
|
|
@@ -367,7 +368,7 @@ class Searcher {
|
|
|
367
368
|
}
|
|
368
369
|
search(query, top_k, _search_options, _filters, pathPrefix, intent, signal) {
|
|
369
370
|
return __awaiter(this, void 0, void 0, function* () {
|
|
370
|
-
var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m;
|
|
371
|
+
var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k, _l, _m, _o, _p;
|
|
371
372
|
const finalLimit = top_k !== null && top_k !== void 0 ? top_k : 10;
|
|
372
373
|
// ColBERT rerank is opt-in as of v0.17.1. On the 97-case eval it
|
|
373
374
|
// regresses MRR@10 by ~3% and doubles query latency; sweep across
|
|
@@ -375,6 +376,10 @@ class Searcher {
|
|
|
375
376
|
// fused scores ~30:1 so blend tuning can't recover the loss.
|
|
376
377
|
let doRerank = (_a = _search_options === null || _search_options === void 0 ? void 0 : _search_options.rerank) !== null && _a !== void 0 ? _a : false;
|
|
377
378
|
const explain = (_b = _search_options === null || _search_options === void 0 ? void 0 : _search_options.explain) !== null && _b !== void 0 ? _b : false;
|
|
379
|
+
// Aider-style seeding (Phase 4): bias candidate generation toward the
|
|
380
|
+
// agent's working context. Inert unless the caller supplied seed files or
|
|
381
|
+
// symbols, so the default search path is unchanged.
|
|
382
|
+
const seedCtx = (0, seed_weight_1.buildSeedContext)(_search_options === null || _search_options === void 0 ? void 0 : _search_options.seeds);
|
|
378
383
|
const searchIntent = intent || (0, intent_1.detectIntent)(query);
|
|
379
384
|
// Bare-identifier queries get symbol-definition promotion (see below).
|
|
380
385
|
const symbolQuery = asSymbolQuery(query);
|
|
@@ -402,7 +407,7 @@ class Searcher {
|
|
|
402
407
|
try {
|
|
403
408
|
table = yield this.db.ensureTable();
|
|
404
409
|
}
|
|
405
|
-
catch (
|
|
410
|
+
catch (_q) {
|
|
406
411
|
return { data: [] };
|
|
407
412
|
}
|
|
408
413
|
// Ensure FTS index exists (lazy init, retry periodically on failure)
|
|
@@ -428,7 +433,9 @@ class Searcher {
|
|
|
428
433
|
const pagerankEnabled = process.env.GMAX_PAGERANK === "1" && !!pathPrefix;
|
|
429
434
|
// Symbol-definition promotion needs defined_symbols on every candidate, not
|
|
430
435
|
// just the final display set — load it for bare-symbol queries too.
|
|
431
|
-
|
|
436
|
+
// Seed-symbol matching reads defined_symbols (referenced_symbols is always
|
|
437
|
+
// loaded), so pull it into the lightweight path when symbols were seeded.
|
|
438
|
+
const needDefinedSymbols = pagerankEnabled || symbolQuery !== null || seedCtx.symbols.size > 0;
|
|
432
439
|
const LIGHTWEIGHT_COLUMNS = [
|
|
433
440
|
"id", "path", "hash", "chunk_index", "start_line", "end_line",
|
|
434
441
|
"is_anchor", "chunk_type", "role", "complexity", "is_exported",
|
|
@@ -471,7 +478,7 @@ class Searcher {
|
|
|
471
478
|
this.ftsAvailable = true;
|
|
472
479
|
console.warn("[Searcher] Rebuilt FTS index with position support — retry search");
|
|
473
480
|
}
|
|
474
|
-
catch (
|
|
481
|
+
catch (_r) { }
|
|
475
482
|
}
|
|
476
483
|
else {
|
|
477
484
|
console.warn(`[Searcher] FTS search failed (will retry later): ${msg}`);
|
|
@@ -487,11 +494,23 @@ class Searcher {
|
|
|
487
494
|
const RRF_K = 60;
|
|
488
495
|
const candidateScores = new Map();
|
|
489
496
|
const docMap = new Map();
|
|
497
|
+
// Best (lowest) 1-indexed rank each candidate reached in any retriever —
|
|
498
|
+
// the relevance gate for seeding (see the seed block below). Only tracked
|
|
499
|
+
// when seeding is active; otherwise it stays empty and costs nothing.
|
|
500
|
+
const bestRank = new Map();
|
|
501
|
+
const noteRank = seedCtx.active
|
|
502
|
+
? (key, rank) => {
|
|
503
|
+
const prev = bestRank.get(key);
|
|
504
|
+
if (prev === undefined || rank + 1 < prev)
|
|
505
|
+
bestRank.set(key, rank + 1);
|
|
506
|
+
}
|
|
507
|
+
: () => { };
|
|
490
508
|
vectorResults.forEach((doc, rank) => {
|
|
491
509
|
const key = doc.id || `${doc.path}:${doc.chunk_index}`;
|
|
492
510
|
docMap.set(key, doc);
|
|
493
511
|
const score = 1.0 / (RRF_K + rank + 1);
|
|
494
512
|
candidateScores.set(key, (candidateScores.get(key) || 0) + score);
|
|
513
|
+
noteRank(key, rank);
|
|
495
514
|
});
|
|
496
515
|
ftsResults.forEach((doc, rank) => {
|
|
497
516
|
const key = doc.id || `${doc.path}:${doc.chunk_index}`;
|
|
@@ -499,6 +518,7 @@ class Searcher {
|
|
|
499
518
|
docMap.set(key, doc);
|
|
500
519
|
const score = 1.0 / (RRF_K + rank + 1);
|
|
501
520
|
candidateScores.set(key, (candidateScores.get(key) || 0) + score);
|
|
521
|
+
noteRank(key, rank);
|
|
502
522
|
});
|
|
503
523
|
const fused = Array.from(candidateScores.entries())
|
|
504
524
|
.sort((a, b) => b[1] - a[1])
|
|
@@ -507,6 +527,45 @@ class Searcher {
|
|
|
507
527
|
// Free raw search results — docMap holds the only needed references
|
|
508
528
|
vectorResults.length = 0;
|
|
509
529
|
ftsResults.length = 0;
|
|
530
|
+
// Aider-style seeding (Phase 4): bump the RRF score of candidates matching
|
|
531
|
+
// the agent's working context, gated by each candidate's own relevance so
|
|
532
|
+
// off-topic seed files are never injected (the safety invariant). Because
|
|
533
|
+
// the final ordering also reads candidateScores, this one bump propagates
|
|
534
|
+
// through the stage-1 cosine cut, the stage-2 window, the rerank set, AND
|
|
535
|
+
// the final score — and can recover a candidate that fusion buried below the
|
|
536
|
+
// display cut, which a rerank-only seed could not. See ./seed-weight.ts.
|
|
537
|
+
if (seedCtx.active) {
|
|
538
|
+
// Bound the scan to the relevant head of the pool. The gate is each
|
|
539
|
+
// candidate's best retriever rank (bestRank), so off-topic seed chunks
|
|
540
|
+
// that only appear deep in the pool are never lifted.
|
|
541
|
+
const SEED_WINDOW = 200;
|
|
542
|
+
const seedParams = (0, seed_weight_1.seedParamsFromEnv)();
|
|
543
|
+
let boosted = false;
|
|
544
|
+
for (const doc of fused.slice(0, SEED_WINDOW)) {
|
|
545
|
+
const sym = seedCtx.symbols.size > 0
|
|
546
|
+
? (0, seed_weight_1.matchesSeedSymbol)(seedCtx, readSymbolArray(doc.defined_symbols), readSymbolArray(doc.referenced_symbols))
|
|
547
|
+
: { def: false, ref: false };
|
|
548
|
+
const match = {
|
|
549
|
+
file: (0, seed_weight_1.matchesSeedFile)(seedCtx, doc.path),
|
|
550
|
+
symbolDef: sym.def,
|
|
551
|
+
symbolRef: sym.ref && !sym.def,
|
|
552
|
+
};
|
|
553
|
+
const key = doc.id || `${doc.path}:${doc.chunk_index}`;
|
|
554
|
+
const bonus = (0, seed_weight_1.seedBoost)(match, (_d = bestRank.get(key)) !== null && _d !== void 0 ? _d : Infinity, seedParams);
|
|
555
|
+
if (bonus > 0) {
|
|
556
|
+
candidateScores.set(key, ((_e = candidateScores.get(key)) !== null && _e !== void 0 ? _e : 0) + bonus);
|
|
557
|
+
boosted = true;
|
|
558
|
+
}
|
|
559
|
+
}
|
|
560
|
+
if (boosted) {
|
|
561
|
+
fused.sort((a, b) => {
|
|
562
|
+
var _a, _b;
|
|
563
|
+
const ka = a.id || `${a.path}:${a.chunk_index}`;
|
|
564
|
+
const kb = b.id || `${b.path}:${b.chunk_index}`;
|
|
565
|
+
return ((_a = candidateScores.get(kb)) !== null && _a !== void 0 ? _a : 0) - ((_b = candidateScores.get(ka)) !== null && _b !== void 0 ? _b : 0);
|
|
566
|
+
});
|
|
567
|
+
}
|
|
568
|
+
}
|
|
510
569
|
// Candidate-concentration gate (Bundle B, v0.17.2 OSS-fixture finding):
|
|
511
570
|
// ColBERT rerank is shape-sensitive. When the post-fusion pool clusters
|
|
512
571
|
// into one file (single-file-repo / concentrated shape, e.g. lodash) rerank
|
|
@@ -515,7 +574,7 @@ class Searcher {
|
|
|
515
574
|
// here and *add* rerank-on for it. This only ever flips doRerank false→true:
|
|
516
575
|
// an explicit GMAX_RERANK=1 (doRerank already true) is never overridden off.
|
|
517
576
|
if (!doRerank) {
|
|
518
|
-
const envConcThreshold = Number.parseFloat((
|
|
577
|
+
const envConcThreshold = Number.parseFloat((_f = process.env.GMAX_CONCENTRATION_THRESHOLD) !== null && _f !== void 0 ? _f : "");
|
|
519
578
|
// <= 0 (or NaN with the default) keeps the gate active at 0.7; a value > 1
|
|
520
579
|
// disables it (no possible share reaches it), giving a rerank-fully-off
|
|
521
580
|
// baseline for sweeps without touching the doRerank default. 0.7 is the
|
|
@@ -531,7 +590,7 @@ class Searcher {
|
|
|
531
590
|
if (window.length > 0 && CONCENTRATION_THRESHOLD <= 1) {
|
|
532
591
|
const buckets = new Map();
|
|
533
592
|
for (const doc of window) {
|
|
534
|
-
buckets.set(doc.path, ((
|
|
593
|
+
buckets.set(doc.path, ((_g = buckets.get(doc.path)) !== null && _g !== void 0 ? _g : 0) + 1);
|
|
535
594
|
}
|
|
536
595
|
let maxBucket = 0;
|
|
537
596
|
for (const count of buckets.values()) {
|
|
@@ -546,7 +605,7 @@ class Searcher {
|
|
|
546
605
|
}
|
|
547
606
|
// Item 8: Widen PRE_RERANK_K
|
|
548
607
|
// Retrieve a wide set for Stage 1 filtering
|
|
549
|
-
const envStage1 = Number.parseInt((
|
|
608
|
+
const envStage1 = Number.parseInt((_h = process.env.GMAX_STAGE1_K) !== null && _h !== void 0 ? _h : "", 10);
|
|
550
609
|
const STAGE1_K = Number.isFinite(envStage1) && envStage1 > 0 ? envStage1 : 200;
|
|
551
610
|
const topCandidates = fused.slice(0, STAGE1_K);
|
|
552
611
|
// Free docMap — topCandidates already holds record references
|
|
@@ -554,24 +613,33 @@ class Searcher {
|
|
|
554
613
|
// Item 9: Two-stage rerank
|
|
555
614
|
// Stage 1: Cheap pooled cosine filter
|
|
556
615
|
let stage2Candidates = topCandidates;
|
|
557
|
-
const envStage2K = Number.parseInt((
|
|
616
|
+
const envStage2K = Number.parseInt((_j = process.env.GMAX_STAGE2_K) !== null && _j !== void 0 ? _j : "", 10);
|
|
558
617
|
const STAGE2_K = Number.isFinite(envStage2K) && envStage2K > 0 ? envStage2K : 40;
|
|
559
|
-
const envRerankTop = Number.parseInt((
|
|
618
|
+
const envRerankTop = Number.parseInt((_k = process.env.GMAX_RERANK_TOP) !== null && _k !== void 0 ? _k : "", 10);
|
|
560
619
|
const RERANK_TOP = Number.isFinite(envRerankTop) && envRerankTop > 0 ? envRerankTop : 20;
|
|
561
|
-
const envBlend = Number.parseFloat((
|
|
620
|
+
const envBlend = Number.parseFloat((_l = process.env.GMAX_RERANK_BLEND) !== null && _l !== void 0 ? _l : "");
|
|
562
621
|
const FUSED_WEIGHT = Number.isFinite(envBlend) && envBlend >= 0 ? envBlend : 0.5;
|
|
563
622
|
if (queryPooled && topCandidates.length > STAGE2_K) {
|
|
564
623
|
const cosineScores = topCandidates.map((doc) => {
|
|
565
|
-
|
|
624
|
+
const docVec = doc.pooled_colbert_48d;
|
|
625
|
+
// Reject missing or short vectors. Also treat an all-zero vector as
|
|
626
|
+
// "no pooled signal" rather than a genuine cosine of 0 — chunks indexed
|
|
627
|
+
// before the pooled-IPC fix (orchestrator.ts) stored all-zero padding,
|
|
628
|
+
// and on a mixed index those must sort below chunks that carry real
|
|
629
|
+
// pooled vectors, not tie with orthogonal ones.
|
|
630
|
+
if (!docVec || docVec.length < queryPooled.length)
|
|
566
631
|
return -1;
|
|
567
632
|
// Manual cosine sim since we don't have helper here easily
|
|
568
633
|
// Assuming vectors are normalized (which they should be from orchestrator)
|
|
569
634
|
let dot = 0;
|
|
570
|
-
|
|
635
|
+
let nonZero = false;
|
|
571
636
|
for (let i = 0; i < queryPooled.length; i++) {
|
|
572
|
-
|
|
637
|
+
const c = docVec[i] || 0;
|
|
638
|
+
if (c !== 0)
|
|
639
|
+
nonZero = true;
|
|
640
|
+
dot += queryPooled[i] * c;
|
|
573
641
|
}
|
|
574
|
-
return dot;
|
|
642
|
+
return nonZero ? dot : -1;
|
|
575
643
|
});
|
|
576
644
|
// Sort by cosine score and keep top N
|
|
577
645
|
const withScore = topCandidates.map((doc, i) => ({
|
|
@@ -662,7 +730,7 @@ class Searcher {
|
|
|
662
730
|
// constructor/toJSON, which otherwise score higher on the literal and evict
|
|
663
731
|
// the parent in overlap dedup). Multiplicative keeps it scale-invariant
|
|
664
732
|
// across the rerank-on (ColBERT maxsim) and rerank-off (fusion) score ranges.
|
|
665
|
-
const envDefBoost = Number.parseFloat((
|
|
733
|
+
const envDefBoost = Number.parseFloat((_m = process.env.GMAX_DEF_BOOST) !== null && _m !== void 0 ? _m : "");
|
|
666
734
|
const DEF_MATCH_BOOST = Number.isFinite(envDefBoost) && envDefBoost >= 1 ? envDefBoost : 5;
|
|
667
735
|
const scored = rerankCandidates.map((doc, idx) => {
|
|
668
736
|
var _a, _b;
|
|
@@ -691,7 +759,7 @@ class Searcher {
|
|
|
691
759
|
try {
|
|
692
760
|
const { scores: prScores, max: prMax } = yield (0, pagerank_1.loadOrComputePageRank)(this.db, pathPrefix);
|
|
693
761
|
if (prMax > 0) {
|
|
694
|
-
const envWeight = Number.parseFloat((
|
|
762
|
+
const envWeight = Number.parseFloat((_o = process.env.GMAX_PR_WEIGHT) !== null && _o !== void 0 ? _o : "");
|
|
695
763
|
const PR_WEIGHT = Number.isFinite(envWeight) && envWeight >= 0 ? envWeight : 0.05;
|
|
696
764
|
for (const item of scored) {
|
|
697
765
|
const raw = item.record.defined_symbols;
|
|
@@ -706,7 +774,7 @@ class Searcher {
|
|
|
706
774
|
defs = arr.filter((v) => typeof v === "string");
|
|
707
775
|
}
|
|
708
776
|
}
|
|
709
|
-
catch (
|
|
777
|
+
catch (_s) { }
|
|
710
778
|
}
|
|
711
779
|
const norm = (0, pagerank_1.pageRankBoostForSymbols)(defs, prScores, prMax);
|
|
712
780
|
item.score += PR_WEIGHT * norm;
|
|
@@ -724,7 +792,7 @@ class Searcher {
|
|
|
724
792
|
// Item 10: Per-file diversification
|
|
725
793
|
const seenFiles = new Map();
|
|
726
794
|
const diversified = [];
|
|
727
|
-
const envMaxPerFile = Number.parseInt((
|
|
795
|
+
const envMaxPerFile = Number.parseInt((_p = process.env.GMAX_MAX_PER_FILE) !== null && _p !== void 0 ? _p : "", 10);
|
|
728
796
|
const MAX_PER_FILE = Number.isFinite(envMaxPerFile) && envMaxPerFile > 0 ? envMaxPerFile : 3;
|
|
729
797
|
for (const item of uniqueScored) {
|
|
730
798
|
const path = item.record.path || "";
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Aider-style chat/file seeding (Phase 4) — pure scoring helpers.
|
|
4
|
+
*
|
|
5
|
+
* Seeding biases search toward the agent's *working context*: files it has open
|
|
6
|
+
* ("chat files", weighted heavily in Aider's repo-map) and identifiers it is
|
|
7
|
+
* discussing. gmax applies the bias in **candidate generation** — it bumps the
|
|
8
|
+
* Reciprocal Rank Fusion score of seed-matching candidates — NOT as a post-hoc
|
|
9
|
+
* rerank tiebreaker. Bundle B (see docs/plans/2026-05-25-semantic-search-
|
|
10
|
+
* landscape.md) showed a tiebreaker over a saturated rerank pool is a no-op;
|
|
11
|
+
* lifting the fusion score instead lets a seeded candidate climb through the
|
|
12
|
+
* stage-1 cosine cut, the stage-2 window, and the final ordering in one move,
|
|
13
|
+
* and can even *recover* a candidate that fusion alone buried below the display
|
|
14
|
+
* cut (something a rerank-only seed could never do).
|
|
15
|
+
*
|
|
16
|
+
* THE SAFETY INVARIANT. Seeding must never inject *off-topic* context: an agent
|
|
17
|
+
* working in `pool.ts` who searches for "rank fusion scoring" should still get
|
|
18
|
+
* `searcher.ts`, because `pool.ts` has nothing relevant to say. So the bonus is
|
|
19
|
+
* **relevance-gated** — a seed match is only boosted when the candidate already
|
|
20
|
+
* ranked highly in at least one retriever (vector OR full-text). A genuinely
|
|
21
|
+
* on-topic seed chunk surfaces near the top of some retriever; an off-topic one
|
|
22
|
+
* sits deep in every retriever and is left exactly where the query put it. We
|
|
23
|
+
* gate on retriever rank (always available, even for an FTS-only hit) rather
|
|
24
|
+
* than pooled-ColBERT cosine, which is not reliably populated on every index.
|
|
25
|
+
*
|
|
26
|
+
* All functions here are pure so the gating/weighting math can be unit-tested
|
|
27
|
+
* (tests/seed-weight.test.ts) independently of the LanceDB-backed searcher.
|
|
28
|
+
*/
|
|
29
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
30
|
+
exports.DEFAULT_SEED_PARAMS = void 0;
|
|
31
|
+
exports.seedParamsFromEnv = seedParamsFromEnv;
|
|
32
|
+
exports.buildSeedContext = buildSeedContext;
|
|
33
|
+
exports.matchesSeedFile = matchesSeedFile;
|
|
34
|
+
exports.matchesSeedSymbol = matchesSeedSymbol;
|
|
35
|
+
exports.seedBoost = seedBoost;
|
|
36
|
+
exports.DEFAULT_SEED_PARAMS = {
|
|
37
|
+
// RRF scores live around 1/(60+rank) ≈ 0.008–0.016, so a ~0.02 bonus is
|
|
38
|
+
// strong enough to lift a genuinely-relevant seed match several ranks while
|
|
39
|
+
// staying in the same order of magnitude as the fusion signal it augments.
|
|
40
|
+
fileWeight: 0.02,
|
|
41
|
+
symbolDefWeight: 0.02,
|
|
42
|
+
symbolRefWeight: 0.006,
|
|
43
|
+
// A genuinely on-topic seed chunk reaches the top handful of some retriever
|
|
44
|
+
// (the route/recover fixtures land at ranks 1–7); an off-topic one sits mid-
|
|
45
|
+
// pool or deeper (an irrelevant express seed file is rank ~150 for an
|
|
46
|
+
// unrelated query). 8 is the eligibility ceiling separating the two without
|
|
47
|
+
// boosting mid-pool noise (see tests/seed-weight.test.ts and eval-seed.ts).
|
|
48
|
+
maxRank: 8,
|
|
49
|
+
};
|
|
50
|
+
/**
 * Build the seed-scoring parameters from environment variables.
 *
 * Each field is read from its own variable (GMAX_SEED_FILE_W,
 * GMAX_SEED_SYMBOL_DEF_W, GMAX_SEED_SYMBOL_REF_W, GMAX_SEED_MAX_RANK). A value
 * that is unset, not parseable as a finite number, or below the field's
 * minimum (0 for the weights, 1 for maxRank) is ignored and the matching
 * DEFAULT_SEED_PARAMS entry is used instead.
 *
 * @param {Object} [env=process.env] - Map of variable name to string value.
 * @returns {{fileWeight: number, symbolDefWeight: number, symbolRefWeight: number, maxRank: number}}
 */
function seedParamsFromEnv(env = process.env) {
    const defaults = exports.DEFAULT_SEED_PARAMS;
    // Parse a single override; reject anything non-finite or under `floor`.
    const readOverride = (text, floor, otherwise) => {
        const parsed = Number.parseFloat(text === null || text === undefined ? "" : text);
        if (!Number.isFinite(parsed)) {
            return otherwise;
        }
        return parsed >= floor ? parsed : otherwise;
    };
    const fileWeight = readOverride(env.GMAX_SEED_FILE_W, 0, defaults.fileWeight);
    const symbolDefWeight = readOverride(env.GMAX_SEED_SYMBOL_DEF_W, 0, defaults.symbolDefWeight);
    const symbolRefWeight = readOverride(env.GMAX_SEED_SYMBOL_REF_W, 0, defaults.symbolRefWeight);
    const maxRank = readOverride(env.GMAX_SEED_MAX_RANK, 1, defaults.maxRank);
    return { fileWeight, symbolDefWeight, symbolRefWeight, maxRank };
}
|
|
63
|
+
/**
 * Turn a raw seed spec into the normalized form the matchers consume.
 *
 * File entries are trimmed, lower-cased, and stripped of one leading "./" or
 * "/" so they can be compared as path suffixes; symbol entries are trimmed
 * only (case is preserved) and de-duplicated into a Set. Entries that are
 * empty after normalization are dropped.
 *
 * @param {{files?: string[], symbols?: string[]}|null|undefined} spec
 * @returns {{fileSuffixes: string[], symbols: Set<string>, active: boolean}}
 *   `active` is true when at least one file suffix or symbol survived.
 */
function buildSeedContext(spec) {
    const toSuffix = (rawPath) => rawPath.trim().toLowerCase().replace(/^\.?\//, "");
    const isNonEmpty = (text) => text.length > 0;
    const hasSpec = spec !== null && spec !== undefined;

    let rawFiles = hasSpec ? spec.files : undefined;
    if (rawFiles === null || rawFiles === undefined) {
        rawFiles = [];
    }
    const fileSuffixes = rawFiles.map(toSuffix).filter(isNonEmpty);

    let rawSymbols = hasSpec ? spec.symbols : undefined;
    if (rawSymbols === null || rawSymbols === undefined) {
        rawSymbols = [];
    }
    const trimmedSymbols = rawSymbols.map((name) => name.trim());
    const symbols = new Set(trimmedSymbols.filter(isNonEmpty));

    const active = fileSuffixes.length > 0 || symbols.size > 0;
    return { fileSuffixes, symbols, active };
}
|
|
76
|
+
/**
 * Does a candidate path match any seed file?
 *
 * A seed matches when it equals the candidate path or is a trailing run of
 * whole path segments of it (the candidate ends with "/" + seed). Comparison
 * is case-insensitive on the candidate side; seed suffixes are used as given.
 *
 * The match is deliberately segment-aligned: a bare `endsWith(seed)` would let
 * the seed "server.ts" match "src/webserver.ts" and hand a seed bonus to a
 * file that was never seeded.
 *
 * @param {{fileSuffixes: string[]}} ctx - Seed context holding the suffixes.
 * @param {string} candidatePath - Path of the candidate chunk.
 * @returns {boolean} True when some seed file matches; false when there are no
 *   seed files or the candidate path is not a string.
 */
function matchesSeedFile(ctx, candidatePath) {
    if (ctx.fileSuffixes.length === 0)
        return false;
    // A record without a usable path cannot match a file seed; don't throw.
    if (typeof candidatePath !== "string")
        return false;
    const p = candidatePath.toLowerCase();
    return ctx.fileSuffixes.some((suffix) => p === suffix || p.endsWith(`/${suffix}`));
}
|
|
83
|
+
/**
 * Report how a candidate relates to the seeded symbols.
 *
 * `def` is true when any of the candidate's defined symbols is seeded, `ref`
 * when any of its referenced symbols is. Both flags are computed
 * independently; callers that want "definition wins" decide that themselves.
 * With no seeded symbols both flags are false and the inputs are not read.
 *
 * @param {{symbols: Set<string>}} ctx - Seed context holding the symbol set.
 * @param {Iterable<string>} definedSymbols - Symbols the candidate defines.
 * @param {Iterable<string>} referencedSymbols - Symbols the candidate references.
 * @returns {{def: boolean, ref: boolean}}
 */
function matchesSeedSymbol(ctx, definedSymbols, referencedSymbols) {
    if (ctx.symbols.size === 0) {
        return { def: false, ref: false };
    }
    // True as soon as one name is seeded; stops iterating at the first hit.
    const hitsSeed = (names) => {
        for (const name of names) {
            if (ctx.symbols.has(name)) {
                return true;
            }
        }
        return false;
    };
    const def = hitsSeed(definedSymbols);
    const ref = hitsSeed(referencedSymbols);
    return { def, ref };
}
|
|
106
|
+
/**
 * Compute the additive RRF-score bonus a candidate earns from seeding.
 *
 * The result is 0 when the candidate matches no seed, and also when it matches
 * but is not eligible: `bestRank` (the candidate's best 1-indexed position
 * across retrievers) must be at least 1 and no deeper than `params.maxRank`.
 * Values such as 0, NaN, undefined or Infinity therefore yield 0. That gate is
 * the safety invariant keeping off-topic seed matches where the query put them.
 *
 * For an eligible candidate the file bonus and one symbol bonus are summed;
 * a definition match takes the place of a reference match rather than
 * stacking with it.
 *
 * @param {{file: boolean, symbolDef: boolean, symbolRef: boolean}} match
 * @param {number} bestRank - Best 1-indexed retriever rank of the candidate.
 * @param {{fileWeight: number, symbolDefWeight: number, symbolRefWeight: number, maxRank: number}} params
 * @returns {number} The bonus to add to the candidate's fusion score.
 */
function seedBoost(match, bestRank, params) {
    const matchesAnySeed = match.file || match.symbolDef || match.symbolRef;
    if (!matchesAnySeed) {
        return 0;
    }
    // Written with negations so NaN/undefined ranks fail the gate.
    const withinGate = bestRank >= 1 && !(bestRank > params.maxRank);
    if (!withinGate) {
        return 0;
    }
    let total = match.file ? params.fileWeight : 0;
    if (match.symbolDef || match.symbolRef) {
        // Definition supersedes reference when both are present.
        total += match.symbolDef ? params.symbolDefWeight : params.symbolRefWeight;
    }
    return total;
}
|
|
@@ -287,7 +287,15 @@ class WorkerOrchestrator {
|
|
|
287
287
|
colbert: new Int8Array(),
|
|
288
288
|
scale: 1,
|
|
289
289
|
};
|
|
290
|
-
return Object.assign(Object.assign({}, chunk), { vector: hybrid.dense, colbert: Buffer.from(hybrid.colbert), colbert_scale: hybrid.scale,
|
|
290
|
+
return Object.assign(Object.assign({}, chunk), { vector: hybrid.dense, colbert: Buffer.from(hybrid.colbert), colbert_scale: hybrid.scale,
|
|
291
|
+
// Convert the pooled Float32Array to a plain number[] so it survives
|
|
292
|
+
// the JSON IPC hop to the parent (process-child.ts → pool.ts). A typed
|
|
293
|
+
// array JSON-serializes to a length-less {"0":..} object, which then
|
|
294
|
+
// Array.from()s to [] on insert and pads to 48 zeros — silently making
|
|
295
|
+
// the stage-1 cosine prefilter a no-op (searcher.ts:732).
|
|
296
|
+
pooled_colbert_48d: hybrid.pooled_colbert_48d
|
|
297
|
+
? Array.from(hybrid.pooled_colbert_48d)
|
|
298
|
+
: undefined, doc_token_ids: hybrid.token_ids });
|
|
291
299
|
});
|
|
292
300
|
onProgress === null || onProgress === void 0 ? void 0 : onProgress();
|
|
293
301
|
(0, logger_1.debug)("orch", `processFile done: ${input.path} ${vectors.length} vectors ${(performance.now() - fileStart).toFixed(0)}ms`);
|
package/package.json
CHANGED