codetrap 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +151 -52
- package/docs/installation.md +113 -29
- package/package.json +4 -3
- package/plugins/codetrap-agent/.codex-plugin/plugin.json +1 -2
- package/plugins/codetrap-agent/hooks/post-flight-capture.example.md +19 -17
- package/plugins/codetrap-agent/hooks.json +2 -2
- package/{skills → plugins/codetrap-agent/skills}/codetrap-add/SKILL.md +10 -4
- package/plugins/codetrap-agent/skills/codetrap-capture/SKILL.md +14 -3
- package/plugins/codetrap-agent/skills/codetrap-capture-external/SKILL.md +52 -9
- package/plugins/codetrap-agent/skills/codetrap-check/SKILL.md +74 -6
- package/{skills → plugins/codetrap-agent/skills}/codetrap-search/SKILL.md +6 -5
- package/plugins/codetrap-agent/templates/AGENTS.codetrap.md +31 -5
- package/scripts/search-policy-sweep.ts +131 -0
- package/src/commands/workflow.ts +144 -68
- package/src/db/embedding-queries.ts +230 -48
- package/src/db/queries.ts +0 -25
- package/src/db/repository.ts +32 -21
- package/src/db/schema.ts +80 -0
- package/src/index.ts +28 -3
- package/src/lib/command-requests.ts +112 -1
- package/src/lib/config.ts +57 -7
- package/src/lib/constants.ts +1 -1
- package/src/lib/doctor.ts +42 -12
- package/src/lib/embedder.ts +118 -3
- package/src/lib/embedding-health.ts +3 -1
- package/src/lib/embedding-job.ts +3 -0
- package/src/lib/embedding-management.ts +65 -0
- package/src/lib/embedding-runtime.ts +177 -0
- package/src/lib/output-json.ts +0 -2
- package/src/lib/scope-context.ts +12 -6
- package/src/lib/scope-migration.ts +2 -1
- package/src/lib/scope.ts +0 -2
- package/src/lib/search-eval.ts +38 -18
- package/src/lib/search-policy-sweep.ts +563 -0
- package/src/lib/search-policy.ts +0 -4
- package/src/lib/search-service.ts +14 -15
- package/src/lib/session-candidate-document.ts +175 -0
- package/src/lib/session-candidate-scope.ts +6 -0
- package/src/lib/session-capture.ts +298 -32
- package/src/lib/session-codec.ts +1 -8
- package/src/lib/session-operations.ts +83 -60
- package/src/lib/session-review.ts +327 -0
- package/src/lib/session-store.ts +87 -73
- package/src/lib/store.ts +74 -10
- package/src/lib/string-list.ts +3 -0
- package/src/lib/text-lines.ts +7 -0
- package/src/lib/trap-search-document.ts +2 -1
- package/src/lib/value-types.ts +3 -0
- package/src/web/client-review.ts +171 -0
- package/src/web/client-script.ts +426 -51
- package/src/web/client-shell.ts +414 -0
- package/src/web/client-text.ts +112 -0
- package/src/web/project-registry.ts +3 -5
- package/src/web/server.ts +117 -103
- package/src/web/static.ts +364 -19
- package/skills/codetrap-capture-external/SKILL.md +0 -62
- package/skills/codetrap-check/SKILL.md +0 -69
- package/src/lib/embedding-index.ts +0 -53
package/src/lib/search-eval.ts
CHANGED
|
@@ -3,13 +3,19 @@ import { openDatabase } from "../db/connection";
|
|
|
3
3
|
import { TrapRepository } from "../db/repository";
|
|
4
4
|
import type { TrapInput, TrapSearchResult } from "../domain/trap";
|
|
5
5
|
import { SEARCH_MODES, type SearchMode } from "./constants";
|
|
6
|
+
import { isRecord } from "./value-types";
|
|
6
7
|
import {
|
|
7
|
-
createDefaultEmbeddingProvider,
|
|
8
|
-
embeddingConfig,
|
|
9
8
|
type EmbeddingConfig,
|
|
10
9
|
type EmbeddingProvider,
|
|
11
10
|
type EmbeddingTask,
|
|
12
11
|
} from "./embedder";
|
|
12
|
+
import {
|
|
13
|
+
EmbeddingRuntime,
|
|
14
|
+
defaultEmbeddingRuntime,
|
|
15
|
+
embeddingRuntimeFrom,
|
|
16
|
+
type EmbeddingRuntimeInput,
|
|
17
|
+
} from "./embedding-runtime";
|
|
18
|
+
import type { RankingConfig } from "./search-policy";
|
|
13
19
|
|
|
14
20
|
export type PhaseGate = "phase0" | "phase1" | "phase4" | "dogfood";
|
|
15
21
|
export const DOGFOOD_JUDGMENTS = ["useful_hit", "miss", "noisy_hit", "no_relevant_trap"] as const;
|
|
@@ -79,6 +85,10 @@ export type SearchEvalReport = {
|
|
|
79
85
|
next_actions: SearchEvalNextAction[];
|
|
80
86
|
};
|
|
81
87
|
|
|
88
|
+
export type SearchEvalDetailedReport = Omit<SearchEvalReport, "mode" | "fixture" | "next_actions"> & {
|
|
89
|
+
cases: EvalCaseReport[];
|
|
90
|
+
};
|
|
91
|
+
|
|
82
92
|
export type RecordDogfoodResult = {
|
|
83
93
|
success: true;
|
|
84
94
|
fixture: string;
|
|
@@ -115,8 +125,8 @@ export function recordDogfoodCase(fixturePath: string, jsonInput: string | undef
|
|
|
115
125
|
|
|
116
126
|
export async function reportDogfood(fixturePath: string, live: boolean): Promise<SearchEvalReport> {
|
|
117
127
|
const fixture = readEvalFixture(fixturePath);
|
|
118
|
-
const
|
|
119
|
-
const evaluated = await evaluateSearchFixture(fixture,
|
|
128
|
+
const runtime = live ? defaultEmbeddingRuntime() : new EmbeddingRuntime(new EvalEmbedder());
|
|
129
|
+
const evaluated = await evaluateSearchFixture(fixture, runtime);
|
|
120
130
|
const mode: SearchEvalReport["mode"] = live ? "live" : "deterministic";
|
|
121
131
|
const report: Omit<SearchEvalReport, "next_actions"> = {
|
|
122
132
|
mode,
|
|
@@ -131,19 +141,30 @@ export async function reportDogfood(fixturePath: string, live: boolean): Promise
|
|
|
131
141
|
|
|
132
142
|
export async function evaluateSearchFixture(
|
|
133
143
|
fixture: EvalFixture,
|
|
134
|
-
|
|
144
|
+
embeddings: EmbeddingRuntimeInput,
|
|
145
|
+
ranking?: RankingConfig
|
|
135
146
|
): Promise<Omit<SearchEvalReport, "mode" | "fixture" | "next_actions">> {
|
|
136
|
-
const
|
|
147
|
+
const { cases: _cases, ...report } = await evaluateSearchFixtureCases(fixture, embeddings, ranking);
|
|
148
|
+
return report;
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
export async function evaluateSearchFixtureCases(
|
|
152
|
+
fixture: EvalFixture,
|
|
153
|
+
embeddings: EmbeddingRuntimeInput,
|
|
154
|
+
ranking?: RankingConfig
|
|
155
|
+
): Promise<SearchEvalDetailedReport> {
|
|
156
|
+
const runtime = embeddingRuntimeFrom(embeddings);
|
|
157
|
+
const repo = fixtureRepository(fixture, runtime, ranking);
|
|
137
158
|
|
|
138
159
|
let providerError: string | null = null;
|
|
139
|
-
if (
|
|
160
|
+
if (runtime.available()) {
|
|
140
161
|
try {
|
|
141
162
|
await repo.ensureEmbeddings();
|
|
142
163
|
} catch (error) {
|
|
143
164
|
providerError = errorMessage(error);
|
|
144
165
|
}
|
|
145
166
|
}
|
|
146
|
-
const searchRepo = providerError ? fixtureRepository(fixture, undefined) : repo;
|
|
167
|
+
const searchRepo = providerError ? fixtureRepository(fixture, undefined, ranking) : repo;
|
|
147
168
|
|
|
148
169
|
const cases: EvalCaseReport[] = [];
|
|
149
170
|
let hybridFallbackCount = 0;
|
|
@@ -154,7 +175,7 @@ export async function evaluateSearchFixture(
|
|
|
154
175
|
const results = await searchRepo.search(item.query, { mode: item.mode, limit: 5 });
|
|
155
176
|
const report = caseReport(item, fixture, results);
|
|
156
177
|
cases.push(report);
|
|
157
|
-
if (item.mode === "hybrid" && (!
|
|
178
|
+
if (item.mode === "hybrid" && (!runtime.available() || hasSemanticFallback(results))) {
|
|
158
179
|
hybridFallbackCount++;
|
|
159
180
|
}
|
|
160
181
|
} catch (error) {
|
|
@@ -169,8 +190,8 @@ export async function evaluateSearchFixture(
|
|
|
169
190
|
const noisyHits = cases.filter((item) => item.judgment === "noisy_hit");
|
|
170
191
|
const metrics = aggregateMetrics(cases);
|
|
171
192
|
return {
|
|
172
|
-
provider:
|
|
173
|
-
semantic_available:
|
|
193
|
+
provider: runtime.config(),
|
|
194
|
+
semantic_available: runtime.available() && providerError === null,
|
|
174
195
|
provider_error: providerError,
|
|
175
196
|
total_cases: cases.length,
|
|
176
197
|
metrics: {
|
|
@@ -190,6 +211,7 @@ export async function evaluateSearchFixture(
|
|
|
190
211
|
failures,
|
|
191
212
|
misses,
|
|
192
213
|
noisy_hits: noisyHits,
|
|
214
|
+
cases,
|
|
193
215
|
};
|
|
194
216
|
}
|
|
195
217
|
|
|
@@ -240,8 +262,9 @@ function buildSearchEvalNextActions(
|
|
|
240
262
|
): SearchEvalNextAction[] {
|
|
241
263
|
const actions: SearchEvalNextAction[] = [];
|
|
242
264
|
if (report.mode === "live" && !report.semantic_available) {
|
|
243
|
-
|
|
244
|
-
|
|
265
|
+
const action = defaultEmbeddingRuntime().setupAction();
|
|
266
|
+
if (action) actions.push({
|
|
267
|
+
command: action.command,
|
|
245
268
|
reason: "Enable live semantic checks, then rerun bun run eval:dogfood -- report --live.",
|
|
246
269
|
});
|
|
247
270
|
}
|
|
@@ -309,8 +332,8 @@ function formatNextActions(actions: SearchEvalNextAction[]): string[] {
|
|
|
309
332
|
return actions.map((action) => ` - ${action.command} # ${action.reason}`);
|
|
310
333
|
}
|
|
311
334
|
|
|
312
|
-
function fixtureRepository(fixture: EvalFixture,
|
|
313
|
-
const repo = new TrapRepository(openDatabase(":memory:"),
|
|
335
|
+
function fixtureRepository(fixture: EvalFixture, embeddings: EmbeddingRuntimeInput, ranking?: RankingConfig): TrapRepository {
|
|
336
|
+
const repo = new TrapRepository(openDatabase(":memory:"), embeddings, ranking);
|
|
314
337
|
for (const trap of fixture.traps) repo.add(trap);
|
|
315
338
|
return repo;
|
|
316
339
|
}
|
|
@@ -471,9 +494,6 @@ function providerLabel(provider: EmbeddingConfig | null): string {
|
|
|
471
494
|
return `${provider.provider}/${provider.model}`;
|
|
472
495
|
}
|
|
473
496
|
|
|
474
|
-
function isRecord(value: unknown): value is Record<string, unknown> {
|
|
475
|
-
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
476
|
-
}
|
|
477
497
|
|
|
478
498
|
function round(value: number): number {
|
|
479
499
|
return Math.round(value * 10000) / 10000;
|
|
@@ -0,0 +1,563 @@
|
|
|
1
|
+
import { readFileSync } from "node:fs";
|
|
2
|
+
import type { TrapSearchResult } from "../domain/trap";
|
|
3
|
+
import { SCOPES, SEARCH_MODES, type Scope, type SearchMode } from "./constants";
|
|
4
|
+
import { isRecord } from "./value-types";
|
|
5
|
+
import {
|
|
6
|
+
defaultEmbeddingRuntime,
|
|
7
|
+
embeddingRuntimeFrom,
|
|
8
|
+
type EmbeddingRuntimeInput,
|
|
9
|
+
} from "./embedding-runtime";
|
|
10
|
+
import {
|
|
11
|
+
DEFAULT_SEARCH_EVAL_FIXTURE,
|
|
12
|
+
EvalEmbedder,
|
|
13
|
+
evaluateSearchFixtureCases,
|
|
14
|
+
readEvalFixture,
|
|
15
|
+
type EvalCaseReport,
|
|
16
|
+
type SearchEvalMetrics,
|
|
17
|
+
} from "./search-eval";
|
|
18
|
+
import { ScopedRepositoryContext } from "./scope-context";
|
|
19
|
+
import {
|
|
20
|
+
DEFAULT_RANKING_CONFIG,
|
|
21
|
+
type RankingConfig,
|
|
22
|
+
} from "./search-policy";
|
|
23
|
+
|
|
24
|
+
/** A named ranking configuration that a policy sweep evaluates. */
export type RankingCandidate = {
  /** Label identifying the candidate in sweep reports. */
  name: string;
  /** Ranking configuration applied while this candidate is evaluated. */
  config: RankingConfig;
};
|
|
28
|
+
|
|
29
|
+
/**
 * An expected hit for a live eval case.
 *
 * Matching first tries `id`, then falls back to `title`; both lookups are
 * restricted to `scope` (or the case scope when `scope` is omitted).
 */
export type GoldTarget = {
  /** Scope the expected trap lives in. */
  scope?: Scope;
  /** Expected trap id. */
  id?: number;
  /** Expected trap title, compared with strict equality. */
  title?: string;
};
|
|
34
|
+
|
|
35
|
+
/** One query evaluated against the live project store during a live sweep. */
export type LiveEvalCase = {
  /** Search text sent to the repository. */
  query: string;
  /** Search mode; the live evaluator uses "hybrid" when omitted. */
  mode?: SearchMode;
  /** Scope to search; the sweep's default scope is used when omitted. */
  scope?: Scope;
  /** Maximum number of results requested; 5 when omitted. */
  limit?: number;
  /** Expected hits. A case without gold targets is reported as unscored. */
  gold?: GoldTarget[];
  /** Minimum recall@3 required to pass; defaults to 1 with gold, 0 without. */
  minRecallAt3?: number;
  /** Minimum recall@5 required to pass; defaults to 1 with gold, 0 without. */
  minRecallAt5?: number;
};
|
|
44
|
+
|
|
45
|
+
/**
 * Per-case outcome in a shape shared by fixture and live sweeps, so results
 * from different ranking candidates can be compared by `key`.
 */
export type ComparableSearchCase = {
  /** Identity built from scope, mode and query (see `caseKey`). */
  key: string;
  query: string;
  mode: SearchMode;
  /** Left unset for cases converted from fixture reports. */
  scope?: Scope;
  /** True when the case has gold targets and therefore feeds the metrics. */
  scored: boolean;
  recallAt3: number;
  recallAt5: number;
  reciprocalRank: number;
  passed: boolean;
  /** Returned results, with diagnostics reduced to their codes. */
  topResults: {
    id: number;
    scope?: Scope;
    title: string;
    sources: string[];
    diagnostics: string[];
  }[];
  /** Gold-matching warnings such as title mismatches or id drift. */
  warnings: string[];
  /** Message of the error raised while evaluating the case, if any. */
  error?: string;
};
|
|
59
|
+
|
|
60
|
+
/** A change in reciprocal rank for one case between baseline and candidate. */
export type CaseDelta = {
  query: string;
  mode: SearchMode;
  scope?: Scope;
  /** Reciprocal rank under the baseline candidate. */
  before: number;
  /** Reciprocal rank under the compared candidate. */
  after: number;
  /** Leading baseline results, formatted as "#id title". */
  beforeTop: string[];
  /** Leading candidate results, formatted as "#id title". */
  afterTop: string[];
};
|
|
69
|
+
|
|
70
|
+
/** Evaluation summary for a single ranking candidate. */
export type SweepCandidateReport = {
  name: string;
  config: RankingConfig;
  /** Number of evaluated cases, scored or not. */
  total_cases: number;
  /** Number of cases that have gold targets. */
  scored_cases: number;
  metrics: SearchEvalMetrics;
  cases: ComparableSearchCase[];
  /** Cases whose `passed` flag is false. */
  failures: ComparableSearchCase[];
  /** Scored cases whose reciprocal rank dropped relative to the baseline. */
  regressions: CaseDelta[];
  /** Scored cases whose reciprocal rank rose relative to the baseline. */
  improvements: CaseDelta[];
};
|
|
81
|
+
|
|
82
|
+
/** Result of sweeping every ranking candidate over one set of cases. */
export type PolicySweepReport = {
  /** "fixture" for the in-memory eval fixture, "live" for a project store. */
  mode: "fixture" | "live";
  /** Human-readable origin of the cases (fixture path or a fixed label). */
  source: string;
  /** Working directory of a live sweep. */
  cwd?: string;
  candidate_count: number;
  /** Report of the first candidate; all deltas are measured against it. */
  baseline: SweepCandidateReport;
  /** Reports in the order the candidates were supplied. */
  candidates: SweepCandidateReport[];
  /** Top candidate according to `compareCandidates`. */
  best: SweepCandidateReport;
  /** One-sentence guidance derived from baseline, best and all candidates. */
  recommendation: string;
};
|
|
92
|
+
|
|
93
|
+
/** Options accepted by `runFixturePolicySweep`. */
type FixtureSweepOptions = {
  /** Fixture file to read; the default search eval fixture when omitted. */
  fixturePath?: string;
  /** Candidates to evaluate; the built-in candidate list when omitted. */
  candidates?: RankingCandidate[];
};
|
|
97
|
+
|
|
98
|
+
/** Options accepted by `runLivePolicySweep`. */
type LiveSweepOptions = {
  /** Project directory handed to the scoped repository context. */
  cwd: string;
  /** Query cases to evaluate; the live sweep rejects an empty list. */
  cases: LiveEvalCase[];
  /** Candidates to evaluate; the built-in candidate list when omitted. */
  candidates?: RankingCandidate[];
  /** Embedding runtime input; the default runtime is used when omitted. */
  embeddings?: EmbeddingRuntimeInput;
  /** Scope applied to cases without their own; "project" when omitted. */
  defaultScope?: Scope;
  /** Forwarded unchanged to the scoped repository context. */
  home?: string;
};
|
|
106
|
+
|
|
107
|
+
/**
 * Built-in ranking candidates for policy sweeps.
 *
 * Each entry overrides a few fields of the default ranking config. The
 * unmodified "default" entry comes first, and the sweeps treat the first
 * candidate as the baseline.
 */
export const DEFAULT_POLICY_SWEEP_CANDIDATES: RankingCandidate[] = [
  candidate("default", {}),
  candidate("title-tag-heavy", { titleTokenBoost: 0.24, tagTokenBoost: 0.28, maxBoost: 0.55 }),
  candidate("identifier-heavy", { identifierBoost: 0.3, maxBoost: 0.55 }),
  candidate("scope-heavy", {
    pathMatchBoost: 0.2,
    moduleMatchBoost: 0.14,
    ownerMatchBoost: 0.08,
    maxBoost: 0.55,
  }),
  candidate("severity-light", {
    severityBoost: { warning: 0, error: 0.02, critical: 0.03 },
  }),
  candidate("semantic-loose", { semanticMinScore: 0.2 }),
  candidate("semantic-strict", { semanticMinScore: 0.4 }),
];
|
|
138
|
+
|
|
139
|
+
/**
 * Evaluates every ranking candidate against the eval fixture.
 *
 * Each candidate is evaluated with a fresh `EvalEmbedder`. The first
 * candidate's cases become the baseline for regression and improvement
 * deltas.
 *
 * @param options Optional fixture path and candidate list.
 * @returns The assembled sweep report in "fixture" mode.
 */
export async function runFixturePolicySweep(options: FixtureSweepOptions = {}): Promise<PolicySweepReport> {
  const sourcePath = options.fixturePath ?? DEFAULT_SEARCH_EVAL_FIXTURE;
  const fixture = readEvalFixture(sourcePath);
  const rankingCandidates = options.candidates ?? DEFAULT_POLICY_SWEEP_CANDIDATES;

  const collected: SweepCandidateReport[] = [];
  let baseline: ComparableSearchCase[] | undefined;
  for (const entry of rankingCandidates) {
    const evaluation = await evaluateSearchFixtureCases(fixture, new EvalEmbedder(), entry.config);
    const comparable = evaluation.cases.map(fromEvalCaseReport);
    // The first evaluated candidate defines the baseline for everyone, itself included.
    if (!baseline) baseline = comparable;
    collected.push(candidateReport(entry, comparable, baseline));
  }

  return buildSweepReport("fixture", sourcePath, collected);
}
|
|
155
|
+
|
|
156
|
+
/**
 * Evaluates every ranking candidate against the live store rooted at
 * `options.cwd`.
 *
 * The same embedding runtime input is used for all candidates. The first
 * candidate's cases become the baseline for deltas.
 *
 * @param options Live sweep options; `cases` must not be empty.
 * @returns The assembled sweep report in "live" mode.
 * @throws Error when no query cases are supplied.
 */
export async function runLivePolicySweep(options: LiveSweepOptions): Promise<PolicySweepReport> {
  if (options.cases.length === 0) throw new Error("Live sweep requires at least one query case.");

  const rankingCandidates = options.candidates ?? DEFAULT_POLICY_SWEEP_CANDIDATES;
  const embeddings = options.embeddings ?? defaultEmbeddingRuntime();
  // Everything except the ranking config is identical for each candidate.
  const shared = {
    ...options,
    embeddings,
    defaultScope: options.defaultScope ?? "project",
  };

  const collected: SweepCandidateReport[] = [];
  let baseline: ComparableSearchCase[] | undefined;
  for (const entry of rankingCandidates) {
    const comparable = await evaluateLiveCases({ ...shared, ranking: entry.config });
    if (!baseline) baseline = comparable;
    collected.push(candidateReport(entry, comparable, baseline));
  }

  return buildSweepReport("live", "live project", collected, options.cwd);
}
|
|
177
|
+
|
|
178
|
+
/**
 * Reads live eval cases from a JSON file.
 *
 * The file holds either a top-level array of cases or an object whose
 * `queries` property is such an array. Every entry is validated and
 * normalized.
 *
 * @param path Path of the JSON file, read as UTF-8.
 * @returns The normalized cases in file order.
 * @throws Error when the JSON has neither accepted shape, or when an entry
 *   fails validation. Read and JSON parse errors propagate unchanged.
 */
export function readLiveEvalCases(path: string): LiveEvalCase[] {
  const payload = JSON.parse(readFileSync(path, "utf-8")) as unknown;

  let entries: unknown[] | null = null;
  if (Array.isArray(payload)) {
    entries = payload;
  } else if (isRecord(payload) && Array.isArray(payload.queries)) {
    entries = payload.queries;
  }

  if (!entries) throw new Error("Live queries file must be an array or an object with a queries array.");
  return entries.map(normalizeLiveEvalCase);
}
|
|
188
|
+
|
|
189
|
+
/**
 * Renders a sweep report as newline-separated plain text.
 *
 * The output is a header block followed by one summary line per candidate.
 * Regressions, improvements and failures each get a line only when they are
 * non-empty, and each of those lines shows at most three entries.
 *
 * @param report Report to render.
 * @returns The text, without a trailing newline.
 */
export function formatPolicySweepReport(report: PolicySweepReport): string {
  const header = [`Search policy sweep (${report.mode})`, `Source: ${report.source}`];
  const cwdLine = report.cwd ? [`cwd: ${report.cwd}`] : [];
  const overview = [
    `Candidates: ${report.candidate_count}`,
    `Baseline: ${summaryLine(report.baseline)}`,
    `Best: ${summaryLine(report.best)}`,
    `Recommendation: ${report.recommendation}`,
    "Results:",
  ];

  const details = report.candidates.flatMap((entry) => {
    const block = [` - ${summaryLine(entry)}`];
    if (entry.regressions.length > 0) {
      block.push(` regressions: ${formatDeltas(entry.regressions)}`);
    }
    if (entry.improvements.length > 0) {
      block.push(` improvements: ${formatDeltas(entry.improvements)}`);
    }
    if (entry.failures.length > 0) {
      const failedQueries = entry.failures.slice(0, 3).map((item) => item.query);
      block.push(` failures: ${failedQueries.join("; ")}`);
    }
    return block;
  });

  return [...header, ...cwdLine, ...overview, ...details].join("\n");
}
|
|
218
|
+
|
|
219
|
+
/**
 * Builds a ranking candidate by layering `patch` over the default config.
 *
 * `severityBoost` is merged key by key, so a patch may override a single
 * severity level without dropping the others.
 *
 * @param name Label for the candidate.
 * @param patch Fields that differ from `DEFAULT_RANKING_CONFIG`.
 */
function candidate(name: string, patch: Partial<RankingConfig>): RankingCandidate {
  const severityBoost = {
    ...DEFAULT_RANKING_CONFIG.severityBoost,
    ...(patch.severityBoost ?? {}),
  };
  const config: RankingConfig = { ...DEFAULT_RANKING_CONFIG, ...patch, severityBoost };
  return { name, config };
}
|
|
232
|
+
|
|
233
|
+
/**
 * Runs every live case through the scoped repositories using one ranking
 * config.
 *
 * Cases are searched one after another. A case that throws is recorded as a
 * failed case with zeroed metrics and the error message; it does not abort
 * the sweep.
 *
 * NOTE(review): the `ScopedRepositoryContext` created here is not closed or
 * disposed in this function. Confirm whether it holds resources that need
 * explicit teardown.
 *
 * @param options Live sweep options plus the ranking config under test.
 * @returns One comparable case per input case, in input order.
 */
async function evaluateLiveCases(options: LiveSweepOptions & { ranking: RankingConfig }): Promise<ComparableSearchCase[]> {
  const fallbackScope = options.defaultScope ?? "project";
  const context = new ScopedRepositoryContext(
    options.cwd,
    embeddingRuntimeFrom(options.embeddings),
    options.home,
    options.ranking
  );

  const collected: ComparableSearchCase[] = [];
  for (const entry of options.cases) {
    const scope = entry.scope ?? fallbackScope;
    const mode = entry.mode ?? "hybrid";
    let outcome: ComparableSearchCase;
    try {
      const repository = context.repositoryFor(scope);
      const hits = await repository.search(entry.query, { mode, scope, limit: entry.limit ?? 5 });
      outcome = liveCaseReport(entry, mode, scope, hits);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      outcome = {
        key: caseKey(entry.query, mode, scope),
        query: entry.query,
        mode,
        scope,
        scored: (entry.gold ?? []).length > 0,
        recallAt3: 0,
        recallAt5: 0,
        reciprocalRank: 0,
        passed: false,
        topResults: [],
        warnings: [],
        error: message,
      };
    }
    collected.push(outcome);
  }
  return collected;
}
|
|
272
|
+
|
|
273
|
+
/**
 * Scores the results of one live case against its gold targets.
 *
 * A case without gold gets recall 1, reciprocal rank 0 and thresholds of 0,
 * so it always passes and is flagged as unscored.
 *
 * @param input The evaluated case.
 * @param mode Search mode that was actually used.
 * @param scope Scope that was actually searched. It is also the fallback
 *   scope for gold targets and for results whose trap scope is neither
 *   "project" nor "global".
 * @param results Results returned by the search, best first.
 */
function liveCaseReport(
  input: LiveEvalCase,
  mode: SearchMode,
  scope: Scope,
  results: TrapSearchResult[]
): ComparableSearchCase {
  const gold = input.gold ?? [];
  const hasGold = gold.length > 0;
  const warnings = new Set<string>();

  let recallAt3 = 1;
  let recallAt5 = 1;
  let firstRank = -1;
  if (hasGold) {
    // Warnings accumulate in call order: recall@3, then recall@5, then rank.
    recallAt3 = recall(gold, results.slice(0, 3), warnings, scope);
    recallAt5 = recall(gold, results.slice(0, 5), warnings, scope);
    firstRank = firstMatchRank(gold, results, warnings, scope);
  }
  const reciprocalRank = firstRank >= 0 ? 1 / (firstRank + 1) : 0;

  const defaultThreshold = hasGold ? 1 : 0;
  const minRecallAt3 = input.minRecallAt3 ?? defaultThreshold;
  const minRecallAt5 = input.minRecallAt5 ?? defaultThreshold;

  const topResults = results.map((result) => {
    const trapScope = result.trap.scope;
    return {
      id: result.trap.id,
      scope: trapScope === "project" || trapScope === "global" ? trapScope : scope,
      title: result.trap.title,
      sources: result.sources ?? [],
      diagnostics: (result.diagnostics ?? []).map((diagnostic) => diagnostic.code),
    };
  });

  return {
    key: caseKey(input.query, mode, scope),
    query: input.query,
    mode,
    scope,
    scored: hasGold,
    recallAt3,
    recallAt5,
    reciprocalRank,
    passed: recallAt3 >= minRecallAt3 && recallAt5 >= minRecallAt5,
    topResults,
    warnings: [...warnings],
  };
}
|
|
308
|
+
|
|
309
|
+
/**
 * Fraction of gold targets that are matched by at least one result.
 *
 * @param gold Expected targets.
 * @param results Result window to search (already sliced by the caller).
 * @param warnings Collector for gold-matching warnings raised during matching.
 * @param defaultScope Scope used for targets that do not name one.
 * @returns A value in [0, 1]. An empty gold list yields 1, the same value
 *   `liveCaseReport` uses for cases without gold, instead of the NaN that
 *   0 / 0 would produce.
 */
function recall(gold: GoldTarget[], results: TrapSearchResult[], warnings: Set<string>, defaultScope: Scope): number {
  if (gold.length === 0) return 1;
  const matched = gold.filter((target) => targetMatches(target, results, warnings, defaultScope)).length;
  return matched / gold.length;
}
|
|
312
|
+
|
|
313
|
+
/**
 * Zero-based position of the first result that matches any gold target.
 *
 * @param gold Expected targets.
 * @param results Results, best first.
 * @param warnings Collector for gold-matching warnings raised during matching.
 * @param defaultScope Scope used for targets that do not name one.
 * @returns The index of the first matching result, or -1 when none matches.
 */
function firstMatchRank(
  gold: GoldTarget[],
  results: TrapSearchResult[],
  warnings: Set<string>,
  defaultScope: Scope
): number {
  for (const [position, hit] of results.entries()) {
    for (const target of gold) {
      // Each result is tested on its own so the rank reflects that result only.
      if (targetMatches(target, [hit], warnings, defaultScope)) return position;
    }
  }
  return -1;
}
|
|
323
|
+
|
|
324
|
+
/**
 * Decides whether a gold target is present among `results`.
 *
 * Only results whose trap scope equals the target scope (or `defaultScope`)
 * are considered. An id match wins. Otherwise an exact title match counts.
 * Suspicious matches are recorded in `warnings`:
 * - `gold_title_mismatch:<id>`: the id matched but the title differs.
 * - `gold_title_ambiguous:<title>`: several results share the title.
 * - `gold_id_drift:<id>-><ids>`: the title matched under other ids.
 *
 * @returns True when the target was found by id or by title.
 */
function targetMatches(
  target: GoldTarget,
  results: TrapSearchResult[],
  warnings: Set<string>,
  defaultScope: Scope
): boolean {
  const wantedScope = target.scope ?? defaultScope;
  const inScope = results.filter((hit) => hit.trap.scope === wantedScope);
  const hasId = target.id !== undefined;

  if (hasId) {
    const byId = inScope.find((hit) => hit.trap.id === target.id);
    if (byId) {
      if (target.title && byId.trap.title !== target.title) {
        warnings.add(`gold_title_mismatch:${target.id}`);
      }
      return true;
    }
  }

  if (!target.title) return false;

  const byTitle = inScope.filter((hit) => hit.trap.title === target.title);
  if (byTitle.length === 0) return false;

  if (byTitle.length > 1) warnings.add(`gold_title_ambiguous:${target.title}`);
  if (hasId && !byTitle.some((hit) => hit.trap.id === target.id)) {
    warnings.add(`gold_id_drift:${target.id}->${byTitle.map((hit) => hit.trap.id).join(",")}`);
  }
  return true;
}
|
|
353
|
+
|
|
354
|
+
/**
 * Converts a fixture eval case report into the comparable case shape.
 *
 * Fixture cases carry no scope, so the key is built without one and
 * `warnings` starts empty.
 */
function fromEvalCaseReport(item: EvalCaseReport): ComparableSearchCase {
  const { query, mode } = item;
  const topResults = item.topResults.map(({ id, title, sources, diagnostics }) => ({
    id,
    title,
    sources,
    diagnostics,
  }));

  return {
    key: caseKey(query, mode),
    query,
    mode,
    scored: item.goldTrapIds.length > 0,
    recallAt3: item.recallAt3,
    recallAt5: item.recallAt5,
    reciprocalRank: item.reciprocalRank,
    passed: item.passed,
    topResults,
    warnings: [],
    error: item.error,
  };
}
|
|
374
|
+
|
|
375
|
+
/**
 * Summarizes one candidate's cases and compares them with the baseline.
 *
 * @param candidate The evaluated candidate.
 * @param cases Comparable cases produced with the candidate's config.
 * @param baselineCases Cases of the baseline candidate, used for deltas.
 */
function candidateReport(
  candidate: RankingCandidate,
  cases: ComparableSearchCase[],
  baselineCases: ComparableSearchCase[]
): SweepCandidateReport {
  const scoredCount = cases.filter((item) => item.scored).length;
  const failed = cases.filter((item) => !item.passed);

  return {
    name: candidate.name,
    config: candidate.config,
    total_cases: cases.length,
    scored_cases: scoredCount,
    metrics: aggregateMetrics(cases),
    cases,
    failures: failed,
    regressions: deltas(cases, baselineCases, "regression"),
    improvements: deltas(cases, baselineCases, "improvement"),
  };
}
|
|
392
|
+
|
|
393
|
+
/**
 * Aggregates per-case numbers into sweep metrics.
 *
 * Recall and MRR are averaged over scored cases only. With no scored cases
 * the divisor falls back to 1, so the averages are 0 rather than NaN. The
 * fallback and error counts cover all cases.
 */
function aggregateMetrics(cases: ComparableSearchCase[]): SearchEvalMetrics {
  const scored = cases.filter((item) => item.scored);
  const divisor = scored.length || 1;
  const average = (pick: (item: ComparableSearchCase) => number): number =>
    round(scored.reduce((sum, item) => sum + pick(item), 0) / divisor);

  const fallbackCodes = ["semantic_unavailable", "semantic_no_candidates", "semantic_failed"];
  const usedFallback = (item: ComparableSearchCase): boolean =>
    item.topResults.some((result) => result.diagnostics.some((code) => fallbackCodes.includes(code)));

  return {
    recall_at_3: average((item) => item.recallAt3),
    recall_at_5: average((item) => item.recallAt5),
    mrr: average((item) => item.reciprocalRank),
    hybrid_fallback_count: cases.filter(usedFallback).length,
    semantic_error_count: cases.filter((item) => item.error).length,
  };
}
|
|
410
|
+
|
|
411
|
+
/**
 * Lists scored cases whose reciprocal rank moved relative to the baseline.
 *
 * Cases are paired with baseline cases by `key`. A move must exceed 0.0001
 * in the requested direction to be reported.
 *
 * @param cases Cases of the candidate being inspected.
 * @param baselineCases Cases of the baseline candidate.
 * @param direction "regression" for drops, "improvement" for gains.
 */
function deltas(
  cases: ComparableSearchCase[],
  baselineCases: ComparableSearchCase[],
  direction: "regression" | "improvement"
): CaseDelta[] {
  const baselineByKey = new Map(baselineCases.map((item) => [item.key, item]));
  const wantDrop = direction === "regression";

  const found: CaseDelta[] = [];
  for (const current of cases) {
    if (!current.scored) continue;
    const previous = baselineByKey.get(current.key);
    if (!previous) continue;

    const change = current.reciprocalRank - previous.reciprocalRank;
    const qualifies = wantDrop ? change < -0.0001 : change > 0.0001;
    if (qualifies) {
      found.push({
        query: current.query,
        mode: current.mode,
        scope: current.scope,
        before: previous.reciprocalRank,
        after: current.reciprocalRank,
        beforeTop: topTitles(previous),
        afterTop: topTitles(current),
      });
    }
  }
  return found;
}
|
|
436
|
+
|
|
437
|
+
/**
 * Assembles the final sweep report from per-candidate reports.
 *
 * The first report is the baseline. The best report is the first element of
 * a sorted copy, so the input order is preserved in `candidates`.
 *
 * @param mode Sweep mode recorded in the report.
 * @param source Label describing where the cases came from.
 * @param candidates Per-candidate reports in evaluation order.
 * @param cwd Working directory of a live sweep, if any.
 * @throws Error when `candidates` is empty.
 */
function buildSweepReport(
  mode: PolicySweepReport["mode"],
  source: string,
  candidates: SweepCandidateReport[],
  cwd?: string
): PolicySweepReport {
  if (candidates.length === 0) throw new Error("At least one ranking candidate is required.");

  const baseline = candidates[0]!;
  const ranked = candidates.slice().sort(compareCandidates);
  const best = ranked[0]!;

  return {
    mode,
    source,
    cwd,
    candidate_count: candidates.length,
    baseline,
    candidates,
    best,
    recommendation: recommendation(baseline, best, candidates),
  };
}
|
|
457
|
+
|
|
458
|
+
/**
 * Sort comparator that places the stronger candidate first.
 *
 * Precedence: higher recall@3, then higher MRR, then higher recall@5, then
 * fewer failures, then fewer regressions. Recall@5 is included because
 * `recommendation` already treats it as a distinguishing metric. Without it
 * a candidate that only improves recall@5 could never outrank the baseline.
 *
 * @returns Negative when `a` ranks before `b`, positive when after, and 0
 *   when they tie on every criterion.
 */
function compareCandidates(a: SweepCandidateReport, b: SweepCandidateReport): number {
  return (
    b.metrics.recall_at_3 - a.metrics.recall_at_3 ||
    b.metrics.mrr - a.metrics.mrr ||
    b.metrics.recall_at_5 - a.metrics.recall_at_5 ||
    a.failures.length - b.failures.length ||
    a.regressions.length - b.regressions.length
  );
}
|
|
466
|
+
|
|
467
|
+
/**
 * Produces the one-sentence guidance shown in the sweep report.
 *
 * Checks run in order: no scored baseline cases, all candidates tied on
 * recall@3, recall@5 and MRR, baseline still best, best has regressions,
 * and finally best is a clean winner.
 */
function recommendation(baseline: SweepCandidateReport, best: SweepCandidateReport, candidates: SweepCandidateReport[]): string {
  if (baseline.scored_cases === 0) {
    return "No scored live cases yet; add gold targets before using this as an optimization signal.";
  }

  const reference = baseline.metrics;
  const anyDiffers = candidates.some(
    ({ metrics }) =>
      metrics.recall_at_3 !== reference.recall_at_3 ||
      metrics.recall_at_5 !== reference.recall_at_5 ||
      metrics.mrr !== reference.mrr
  );
  if (!anyDiffers) {
    return "All candidates tie; add harder miss/noisy_hit eval cases before changing ranking config.";
  }

  if (best.name === baseline.name) return "The default config is still best on this fixture.";

  return best.regressions.length > 0
    ? `${best.name} improves aggregate metrics but has regressions; inspect before adopting.`
    : `${best.name} is the strongest candidate on these cases; inspect changed rankings before editing defaults.`;
}
|
|
479
|
+
|
|
480
|
+
/**
 * Validates one raw JSON entry and converts it to a `LiveEvalCase`.
 *
 * Fields are validated in declaration order, so the first invalid field
 * determines the error that is thrown.
 *
 * @throws Error when `value` is not an object or a field is invalid.
 */
function normalizeLiveEvalCase(value: unknown): LiveEvalCase {
  if (!isRecord(value)) throw new Error("Each live query case must be an object.");

  return {
    query: stringField(value, "query"),
    mode: optionalEnum(value, "mode", SEARCH_MODES),
    scope: optionalEnum(value, "scope", SCOPES),
    limit: optionalPositiveInt(value, "limit"),
    gold: normalizeGoldTargets(value.gold),
    minRecallAt3: optionalScore(value, "minRecallAt3"),
    minRecallAt5: optionalScore(value, "minRecallAt5"),
  };
}
|
|
495
|
+
|
|
496
|
+
/**
 * Validates the raw `gold` field of a live case.
 *
 * @param value Raw JSON value; `undefined` means "no gold targets".
 * @returns `undefined` when absent, otherwise the normalized targets.
 *   Titles are trimmed, and a blank title is treated as missing.
 * @throws Error when `value` is not an array, an entry is not an object, an
 *   id is not a positive integer, a scope is unknown, or an entry has
 *   neither id nor title.
 */
function normalizeGoldTargets(value: unknown): GoldTarget[] | undefined {
  if (value === undefined) return undefined;
  if (!Array.isArray(value)) throw new Error("gold must be an array.");

  const targets: GoldTarget[] = [];
  for (const entry of value) {
    if (!isRecord(entry)) throw new Error("gold entries must be objects.");

    const id = entry.id === undefined ? undefined : Number(entry.id);
    if (id !== undefined && (!Number.isInteger(id) || id <= 0)) {
      throw new Error("gold.id must be a positive integer.");
    }

    const scope = optionalEnum(entry, "scope", SCOPES);

    let title: string | undefined;
    if (typeof entry.title === "string" && entry.title.trim()) title = entry.title.trim();

    if (id === undefined && !title) throw new Error("gold entries require id or title.");
    targets.push({ id, title, scope });
  }
  return targets;
}
|
|
509
|
+
|
|
510
|
+
/**
 * Builds the identity used to pair a case with its baseline counterpart.
 *
 * Scope, mode and query are joined with NUL characters. A missing scope is
 * written as "fixture".
 */
function caseKey(query: string, mode: SearchMode, scope?: Scope): string {
  const scopePart = scope ?? "fixture";
  return [scopePart, mode, query].join("\0");
}
|
|
513
|
+
|
|
514
|
+
/** Formats the first three results of a case as "#id title" strings. */
function topTitles(item: ComparableSearchCase): string[] {
  const leading = item.topResults.slice(0, 3);
  return leading.map(({ id, title }) => `#${id} ${title}`);
}
|
|
517
|
+
|
|
518
|
+
/** One-line summary of a candidate: name, R@3, R@5, MRR, failure and regression counts. */
function summaryLine(item: SweepCandidateReport): string {
  const { recall_at_3, recall_at_5, mrr } = item.metrics;
  const parts = [
    item.name,
    `R@3=${recall_at_3}`,
    `R@5=${recall_at_5}`,
    `MRR=${mrr}`,
    `failures=${item.failures.length}`,
    `regressions=${item.regressions.length}`,
  ];
  return parts.join(" ");
}
|
|
521
|
+
|
|
522
|
+
/**
 * Renders up to three deltas as "query before->after", separated by "; ".
 * Both numbers are rounded with `round`.
 */
function formatDeltas(items: CaseDelta[]): string {
  const rendered: string[] = [];
  for (const delta of items.slice(0, 3)) {
    rendered.push(`${delta.query} ${round(delta.before)}->${round(delta.after)}`);
  }
  return rendered.join("; ");
}
|
|
528
|
+
|
|
529
|
+
/**
 * Reads an optional string field that must be one of `choices`.
 *
 * @param value Record to read from.
 * @param key Field name, also used in the error message.
 * @param choices Allowed values.
 * @returns The field value, or `undefined` when the field is absent.
 * @throws Error when the field is present but not one of `choices`.
 */
function optionalEnum<T extends readonly string[]>(value: Record<string, unknown>, key: string, choices: T): T[number] | undefined {
  const raw = value[key];
  if (raw === undefined) return undefined;

  const allowed: readonly string[] = choices;
  if (typeof raw === "string" && allowed.includes(raw)) return raw as T[number];

  throw new Error(`${key} must be one of: ${choices.join(", ")}`);
}
|
|
537
|
+
|
|
538
|
+
/**
 * Reads a required, non-blank string field and returns it trimmed.
 *
 * @throws Error when the field is missing, not a string, or only whitespace.
 */
function stringField(value: Record<string, unknown>, key: string): string {
  const raw = value[key];
  const trimmed = typeof raw === "string" ? raw.trim() : "";
  if (trimmed === "") throw new Error(`${key} is required.`);
  return trimmed;
}
|
|
543
|
+
|
|
544
|
+
/**
 * Reads an optional positive-integer field.
 *
 * Only numbers and non-blank numeric strings are accepted. Other JSON
 * values are rejected rather than passed through `Number()`, which would
 * turn `true` into 1 and `[3]` into 3.
 *
 * @param value Record to read from.
 * @param key Field name, also used in the error message.
 * @returns The integer, or `undefined` when the field is absent.
 * @throws Error when the field is present but not a positive integer.
 */
function optionalPositiveInt(value: Record<string, unknown>, key: string): number | undefined {
  const field = value[key];
  if (field === undefined) return undefined;

  let parsed = Number.NaN;
  if (typeof field === "number") {
    parsed = field;
  } else if (typeof field === "string" && field.trim() !== "") {
    parsed = Number(field);
  }

  if (!Number.isInteger(parsed) || parsed <= 0) throw new Error(`${key} must be a positive integer.`);
  return parsed;
}
|
|
551
|
+
|
|
552
|
+
/**
 * Reads an optional score field in the inclusive range [0, 1].
 *
 * Only numbers and non-blank numeric strings are accepted. `Number()` maps
 * `null`, `""`, `false` and `[]` to 0, which would silently set a recall
 * threshold to 0, so those values are rejected.
 *
 * @param value Record to read from.
 * @param key Field name, also used in the error message.
 * @returns The score, or `undefined` when the field is absent.
 * @throws Error when the field is present but not a number in [0, 1].
 */
function optionalScore(value: Record<string, unknown>, key: string): number | undefined {
  const field = value[key];
  if (field === undefined) return undefined;

  let parsed = Number.NaN;
  if (typeof field === "number") {
    parsed = field;
  } else if (typeof field === "string" && field.trim() !== "") {
    parsed = Number(field);
  }

  if (!Number.isFinite(parsed) || parsed < 0 || parsed > 1) throw new Error(`${key} must be between 0 and 1.`);
  return parsed;
}
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
/** Rounds a number to four decimal places. */
function round(value: number): number {
  const scaled = Math.round(value * 10000);
  return scaled / 10000;
}
|