codetrap 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. package/README.md +151 -52
  2. package/docs/installation.md +113 -29
  3. package/package.json +4 -3
  4. package/plugins/codetrap-agent/.codex-plugin/plugin.json +1 -2
  5. package/plugins/codetrap-agent/hooks/post-flight-capture.example.md +19 -17
  6. package/plugins/codetrap-agent/hooks.json +2 -2
  7. package/{skills → plugins/codetrap-agent/skills}/codetrap-add/SKILL.md +10 -4
  8. package/plugins/codetrap-agent/skills/codetrap-capture/SKILL.md +14 -3
  9. package/plugins/codetrap-agent/skills/codetrap-capture-external/SKILL.md +52 -9
  10. package/plugins/codetrap-agent/skills/codetrap-check/SKILL.md +74 -6
  11. package/{skills → plugins/codetrap-agent/skills}/codetrap-search/SKILL.md +6 -5
  12. package/plugins/codetrap-agent/templates/AGENTS.codetrap.md +31 -5
  13. package/scripts/search-policy-sweep.ts +131 -0
  14. package/src/commands/workflow.ts +144 -68
  15. package/src/db/embedding-queries.ts +230 -48
  16. package/src/db/queries.ts +0 -25
  17. package/src/db/repository.ts +32 -21
  18. package/src/db/schema.ts +80 -0
  19. package/src/index.ts +28 -3
  20. package/src/lib/command-requests.ts +112 -1
  21. package/src/lib/config.ts +57 -7
  22. package/src/lib/constants.ts +1 -1
  23. package/src/lib/doctor.ts +42 -12
  24. package/src/lib/embedder.ts +118 -3
  25. package/src/lib/embedding-health.ts +3 -1
  26. package/src/lib/embedding-job.ts +3 -0
  27. package/src/lib/embedding-management.ts +65 -0
  28. package/src/lib/embedding-runtime.ts +177 -0
  29. package/src/lib/output-json.ts +0 -2
  30. package/src/lib/scope-context.ts +12 -6
  31. package/src/lib/scope-migration.ts +2 -1
  32. package/src/lib/scope.ts +0 -2
  33. package/src/lib/search-eval.ts +38 -18
  34. package/src/lib/search-policy-sweep.ts +563 -0
  35. package/src/lib/search-policy.ts +0 -4
  36. package/src/lib/search-service.ts +14 -15
  37. package/src/lib/session-candidate-document.ts +175 -0
  38. package/src/lib/session-candidate-scope.ts +6 -0
  39. package/src/lib/session-capture.ts +298 -32
  40. package/src/lib/session-codec.ts +1 -8
  41. package/src/lib/session-operations.ts +83 -60
  42. package/src/lib/session-review.ts +327 -0
  43. package/src/lib/session-store.ts +87 -73
  44. package/src/lib/store.ts +74 -10
  45. package/src/lib/string-list.ts +3 -0
  46. package/src/lib/text-lines.ts +7 -0
  47. package/src/lib/trap-search-document.ts +2 -1
  48. package/src/lib/value-types.ts +3 -0
  49. package/src/web/client-review.ts +171 -0
  50. package/src/web/client-script.ts +426 -51
  51. package/src/web/client-shell.ts +414 -0
  52. package/src/web/client-text.ts +112 -0
  53. package/src/web/project-registry.ts +3 -5
  54. package/src/web/server.ts +117 -103
  55. package/src/web/static.ts +364 -19
  56. package/skills/codetrap-capture-external/SKILL.md +0 -62
  57. package/skills/codetrap-check/SKILL.md +0 -69
  58. package/src/lib/embedding-index.ts +0 -53
@@ -3,13 +3,19 @@ import { openDatabase } from "../db/connection";
3
3
  import { TrapRepository } from "../db/repository";
4
4
  import type { TrapInput, TrapSearchResult } from "../domain/trap";
5
5
  import { SEARCH_MODES, type SearchMode } from "./constants";
6
+ import { isRecord } from "./value-types";
6
7
  import {
7
- createDefaultEmbeddingProvider,
8
- embeddingConfig,
9
8
  type EmbeddingConfig,
10
9
  type EmbeddingProvider,
11
10
  type EmbeddingTask,
12
11
  } from "./embedder";
12
+ import {
13
+ EmbeddingRuntime,
14
+ defaultEmbeddingRuntime,
15
+ embeddingRuntimeFrom,
16
+ type EmbeddingRuntimeInput,
17
+ } from "./embedding-runtime";
18
+ import type { RankingConfig } from "./search-policy";
13
19
 
14
20
  export type PhaseGate = "phase0" | "phase1" | "phase4" | "dogfood";
15
21
  export const DOGFOOD_JUDGMENTS = ["useful_hit", "miss", "noisy_hit", "no_relevant_trap"] as const;
@@ -79,6 +85,10 @@ export type SearchEvalReport = {
79
85
  next_actions: SearchEvalNextAction[];
80
86
  };
81
87
 
88
+ export type SearchEvalDetailedReport = Omit<SearchEvalReport, "mode" | "fixture" | "next_actions"> & {
89
+ cases: EvalCaseReport[];
90
+ };
91
+
82
92
  export type RecordDogfoodResult = {
83
93
  success: true;
84
94
  fixture: string;
@@ -115,8 +125,8 @@ export function recordDogfoodCase(fixturePath: string, jsonInput: string | undef
115
125
 
116
126
  export async function reportDogfood(fixturePath: string, live: boolean): Promise<SearchEvalReport> {
117
127
  const fixture = readEvalFixture(fixturePath);
118
- const provider = live ? createDefaultEmbeddingProvider() : new EvalEmbedder();
119
- const evaluated = await evaluateSearchFixture(fixture, provider);
128
+ const runtime = live ? defaultEmbeddingRuntime() : new EmbeddingRuntime(new EvalEmbedder());
129
+ const evaluated = await evaluateSearchFixture(fixture, runtime);
120
130
  const mode: SearchEvalReport["mode"] = live ? "live" : "deterministic";
121
131
  const report: Omit<SearchEvalReport, "next_actions"> = {
122
132
  mode,
@@ -131,19 +141,30 @@ export async function reportDogfood(fixturePath: string, live: boolean): Promise
131
141
 
132
142
  export async function evaluateSearchFixture(
133
143
  fixture: EvalFixture,
134
- provider: EmbeddingProvider | undefined
144
+ embeddings: EmbeddingRuntimeInput,
145
+ ranking?: RankingConfig
135
146
  ): Promise<Omit<SearchEvalReport, "mode" | "fixture" | "next_actions">> {
136
- const repo = fixtureRepository(fixture, provider);
147
+ const { cases: _cases, ...report } = await evaluateSearchFixtureCases(fixture, embeddings, ranking);
148
+ return report;
149
+ }
150
+
151
+ export async function evaluateSearchFixtureCases(
152
+ fixture: EvalFixture,
153
+ embeddings: EmbeddingRuntimeInput,
154
+ ranking?: RankingConfig
155
+ ): Promise<SearchEvalDetailedReport> {
156
+ const runtime = embeddingRuntimeFrom(embeddings);
157
+ const repo = fixtureRepository(fixture, runtime, ranking);
137
158
 
138
159
  let providerError: string | null = null;
139
- if (provider) {
160
+ if (runtime.available()) {
140
161
  try {
141
162
  await repo.ensureEmbeddings();
142
163
  } catch (error) {
143
164
  providerError = errorMessage(error);
144
165
  }
145
166
  }
146
- const searchRepo = providerError ? fixtureRepository(fixture, undefined) : repo;
167
+ const searchRepo = providerError ? fixtureRepository(fixture, undefined, ranking) : repo;
147
168
 
148
169
  const cases: EvalCaseReport[] = [];
149
170
  let hybridFallbackCount = 0;
@@ -154,7 +175,7 @@ export async function evaluateSearchFixture(
154
175
  const results = await searchRepo.search(item.query, { mode: item.mode, limit: 5 });
155
176
  const report = caseReport(item, fixture, results);
156
177
  cases.push(report);
157
- if (item.mode === "hybrid" && (!provider || hasSemanticFallback(results))) {
178
+ if (item.mode === "hybrid" && (!runtime.available() || hasSemanticFallback(results))) {
158
179
  hybridFallbackCount++;
159
180
  }
160
181
  } catch (error) {
@@ -169,8 +190,8 @@ export async function evaluateSearchFixture(
169
190
  const noisyHits = cases.filter((item) => item.judgment === "noisy_hit");
170
191
  const metrics = aggregateMetrics(cases);
171
192
  return {
172
- provider: provider ? embeddingConfig(provider) : null,
173
- semantic_available: Boolean(provider && providerError === null),
193
+ provider: runtime.config(),
194
+ semantic_available: runtime.available() && providerError === null,
174
195
  provider_error: providerError,
175
196
  total_cases: cases.length,
176
197
  metrics: {
@@ -190,6 +211,7 @@ export async function evaluateSearchFixture(
190
211
  failures,
191
212
  misses,
192
213
  noisy_hits: noisyHits,
214
+ cases,
193
215
  };
194
216
  }
195
217
 
@@ -240,8 +262,9 @@ function buildSearchEvalNextActions(
240
262
  ): SearchEvalNextAction[] {
241
263
  const actions: SearchEvalNextAction[] = [];
242
264
  if (report.mode === "live" && !report.semantic_available) {
243
- actions.push({
244
- command: "export JINA_API_KEY=<your-jina-api-key>",
265
+ const action = defaultEmbeddingRuntime().setupAction();
266
+ if (action) actions.push({
267
+ command: action.command,
245
268
  reason: "Enable live semantic checks, then rerun bun run eval:dogfood -- report --live.",
246
269
  });
247
270
  }
@@ -309,8 +332,8 @@ function formatNextActions(actions: SearchEvalNextAction[]): string[] {
309
332
  return actions.map((action) => ` - ${action.command} # ${action.reason}`);
310
333
  }
311
334
 
312
- function fixtureRepository(fixture: EvalFixture, provider: EmbeddingProvider | undefined): TrapRepository {
313
- const repo = new TrapRepository(openDatabase(":memory:"), provider);
335
+ function fixtureRepository(fixture: EvalFixture, embeddings: EmbeddingRuntimeInput, ranking?: RankingConfig): TrapRepository {
336
+ const repo = new TrapRepository(openDatabase(":memory:"), embeddings, ranking);
314
337
  for (const trap of fixture.traps) repo.add(trap);
315
338
  return repo;
316
339
  }
@@ -471,9 +494,6 @@ function providerLabel(provider: EmbeddingConfig | null): string {
471
494
  return `${provider.provider}/${provider.model}`;
472
495
  }
473
496
 
474
- function isRecord(value: unknown): value is Record<string, unknown> {
475
- return typeof value === "object" && value !== null && !Array.isArray(value);
476
- }
477
497
 
478
498
  function round(value: number): number {
479
499
  return Math.round(value * 10000) / 10000;
@@ -0,0 +1,563 @@
1
+ import { readFileSync } from "node:fs";
2
+ import type { TrapSearchResult } from "../domain/trap";
3
+ import { SCOPES, SEARCH_MODES, type Scope, type SearchMode } from "./constants";
4
+ import { isRecord } from "./value-types";
5
+ import {
6
+ defaultEmbeddingRuntime,
7
+ embeddingRuntimeFrom,
8
+ type EmbeddingRuntimeInput,
9
+ } from "./embedding-runtime";
10
+ import {
11
+ DEFAULT_SEARCH_EVAL_FIXTURE,
12
+ EvalEmbedder,
13
+ evaluateSearchFixtureCases,
14
+ readEvalFixture,
15
+ type EvalCaseReport,
16
+ type SearchEvalMetrics,
17
+ } from "./search-eval";
18
+ import { ScopedRepositoryContext } from "./scope-context";
19
+ import {
20
+ DEFAULT_RANKING_CONFIG,
21
+ type RankingConfig,
22
+ } from "./search-policy";
23
+
/** A named ranking configuration that a policy sweep evaluates. */
export type RankingCandidate = {
  name: string;
  config: RankingConfig;
};

/**
 * An expected search result for a live case. Entries produced by the file
 * reader carry an `id`, a `title`, or both; `scope` overrides the case scope
 * when the target is matched against results.
 */
export type GoldTarget = {
  scope?: Scope;
  id?: number;
  title?: string;
};

/**
 * One live query to evaluate. When evaluated, an omitted `mode` becomes
 * "hybrid", an omitted `limit` becomes 5, and an omitted `scope` falls back to
 * the sweep's default scope. Omitted recall thresholds default to 1 when gold
 * targets are present and 0 otherwise.
 */
export type LiveEvalCase = {
  query: string;
  mode?: SearchMode;
  scope?: Scope;
  limit?: number;
  gold?: GoldTarget[];
  minRecallAt3?: number;
  minRecallAt5?: number;
};

/**
 * Per-query outcome in the shape shared by fixture and live sweeps. `key`
 * identifies the same case across candidates so results can be compared.
 */
export type ComparableSearchCase = {
  key: string;
  query: string;
  mode: SearchMode;
  scope?: Scope;
  scored: boolean;
  recallAt3: number;
  recallAt5: number;
  reciprocalRank: number;
  passed: boolean;
  topResults: { id: number; scope?: Scope; title: string; sources: string[]; diagnostics: string[] }[];
  warnings: string[];
  error?: string;
};

/**
 * A change in reciprocal rank for one case relative to the baseline.
 * `beforeTop` / `afterTop` hold labels for the leading results on each side.
 */
export type CaseDelta = {
  query: string;
  mode: SearchMode;
  scope?: Scope;
  before: number;
  after: number;
  beforeTop: string[];
  afterTop: string[];
};

/** Everything measured for a single candidate, including its deltas against the baseline cases. */
export type SweepCandidateReport = {
  name: string;
  config: RankingConfig;
  total_cases: number;
  scored_cases: number;
  metrics: SearchEvalMetrics;
  cases: ComparableSearchCase[];
  failures: ComparableSearchCase[];
  regressions: CaseDelta[];
  improvements: CaseDelta[];
};

/** The full sweep result: every candidate report plus the baseline, the best candidate and a recommendation. */
export type PolicySweepReport = {
  mode: "fixture" | "live";
  source: string;
  cwd?: string;
  candidate_count: number;
  baseline: SweepCandidateReport;
  candidates: SweepCandidateReport[];
  best: SweepCandidateReport;
  recommendation: string;
};

/** Options for a fixture sweep; both fields have defaults. */
type FixtureSweepOptions = {
  fixturePath?: string;
  candidates?: RankingCandidate[];
};

/** Options for a live sweep; `cwd` and a non-empty `cases` list are required. */
type LiveSweepOptions = {
  cwd: string;
  cases: LiveEvalCase[];
  candidates?: RankingCandidate[];
  embeddings?: EmbeddingRuntimeInput;
  defaultScope?: Scope;
  home?: string;
};
106
+
/** Name/patch pairs for the built-in candidates; each patch is layered over the default ranking config. */
const POLICY_SWEEP_PATCHES: ReadonlyArray<readonly [string, Partial<RankingConfig>]> = [
  ["default", {}],
  ["title-tag-heavy", { titleTokenBoost: 0.24, tagTokenBoost: 0.28, maxBoost: 0.55 }],
  ["identifier-heavy", { identifierBoost: 0.3, maxBoost: 0.55 }],
  ["scope-heavy", { pathMatchBoost: 0.2, moduleMatchBoost: 0.14, ownerMatchBoost: 0.08, maxBoost: 0.55 }],
  ["severity-light", { severityBoost: { warning: 0, error: 0.02, critical: 0.03 } }],
  ["semantic-loose", { semanticMinScore: 0.2 }],
  ["semantic-strict", { semanticMinScore: 0.4 }],
];

/**
 * Candidates used when a sweep is not given its own list. The sweeps treat
 * the first entry as the baseline, so "default" must stay first.
 */
export const DEFAULT_POLICY_SWEEP_CANDIDATES: RankingCandidate[] = POLICY_SWEEP_PATCHES.map(
  ([name, patch]) => candidate(name, patch)
);
138
+
/**
 * Evaluates every ranking candidate against an eval fixture, one after another.
 *
 * Each candidate is evaluated with a fresh `EvalEmbedder`. The first candidate's
 * cases become the baseline that later candidates are compared against.
 *
 * @param options Optional fixture path and candidate list; both have defaults.
 * @returns The assembled sweep report in "fixture" mode.
 */
export async function runFixturePolicySweep(options: FixtureSweepOptions = {}): Promise<PolicySweepReport> {
  const fixturePath = options.fixturePath ?? DEFAULT_SEARCH_EVAL_FIXTURE;
  const rankingCandidates = options.candidates ?? DEFAULT_POLICY_SWEEP_CANDIDATES;
  const fixture = readEvalFixture(fixturePath);

  const reports: SweepCandidateReport[] = [];
  let baseline: ComparableSearchCase[] | undefined;
  for (const entry of rankingCandidates) {
    const { cases: evalCases } = await evaluateSearchFixtureCases(fixture, new EvalEmbedder(), entry.config);
    const comparable = evalCases.map(fromEvalCaseReport);
    if (baseline === undefined) {
      baseline = comparable;
    }
    reports.push(candidateReport(entry, comparable, baseline));
  }

  return buildSweepReport("fixture", fixturePath, reports);
}
155
+
/**
 * Evaluates every ranking candidate against the live repositories under `options.cwd`.
 *
 * The embedding runtime is resolved once and shared by all candidates. The
 * first candidate's cases become the baseline for later comparisons.
 *
 * @param options Live sweep options; `cases` must not be empty.
 * @returns The assembled sweep report in "live" mode.
 * @throws Error when `options.cases` is empty.
 */
export async function runLivePolicySweep(options: LiveSweepOptions): Promise<PolicySweepReport> {
  if (options.cases.length === 0) {
    throw new Error("Live sweep requires at least one query case.");
  }

  const rankingCandidates = options.candidates ?? DEFAULT_POLICY_SWEEP_CANDIDATES;
  const shared = {
    ...options,
    embeddings: options.embeddings ?? defaultEmbeddingRuntime(),
    defaultScope: options.defaultScope ?? "project",
  };

  const reports: SweepCandidateReport[] = [];
  let baseline: ComparableSearchCase[] | undefined;
  for (const entry of rankingCandidates) {
    const comparable = await evaluateLiveCases({ ...shared, ranking: entry.config });
    if (baseline === undefined) {
      baseline = comparable;
    }
    reports.push(candidateReport(entry, comparable, baseline));
  }

  return buildSweepReport("live", "live project", reports, options.cwd);
}
177
+
/**
 * Reads live query cases from a JSON file.
 *
 * The file may be a bare array of cases or an object with a `queries` array.
 *
 * @param path Path of the JSON file to read.
 * @returns The validated, normalized cases.
 * @throws Error when the top-level shape is neither of the accepted forms, or
 *   when a case fails validation.
 */
export function readLiveEvalCases(path: string): LiveEvalCase[] {
  const parsed: unknown = JSON.parse(readFileSync(path, "utf-8"));

  let records: unknown[] | null = null;
  if (Array.isArray(parsed)) {
    records = parsed;
  } else if (isRecord(parsed) && Array.isArray(parsed.queries)) {
    records = parsed.queries;
  }

  if (records === null) {
    throw new Error("Live queries file must be an array or an object with a queries array.");
  }
  return records.map(normalizeLiveEvalCase);
}
188
+
/**
 * Renders a sweep report as plain text, one line per entry.
 *
 * Each candidate gets a summary line, followed by regression, improvement and
 * failure lines only when it has any. Failures list at most three queries.
 *
 * @param report The sweep report to render.
 * @returns The lines joined with newlines.
 */
export function formatPolicySweepReport(report: PolicySweepReport): string {
  const header = [`Search policy sweep (${report.mode})`, `Source: ${report.source}`];
  const cwdLine = report.cwd ? [`cwd: ${report.cwd}`] : [];
  const overview = [
    `Candidates: ${report.candidate_count}`,
    `Baseline: ${summaryLine(report.baseline)}`,
    `Best: ${summaryLine(report.best)}`,
    `Recommendation: ${report.recommendation}`,
    "Results:",
  ];

  const details = report.candidates.flatMap((entry) => {
    const block = [` - ${summaryLine(entry)}`];
    if (entry.regressions.length > 0) {
      block.push(` regressions: ${formatDeltas(entry.regressions)}`);
    }
    if (entry.improvements.length > 0) {
      block.push(` improvements: ${formatDeltas(entry.improvements)}`);
    }
    if (entry.failures.length > 0) {
      const queries = entry.failures.slice(0, 3).map((failure) => failure.query);
      block.push(` failures: ${queries.join("; ")}`);
    }
    return block;
  });

  return [...header, ...cwdLine, ...overview, ...details].join("\n");
}
218
+
/**
 * Builds a named candidate by layering `patch` over the default ranking config.
 *
 * `severityBoost` is merged key by key, so a patch may override a single
 * severity level without restating the others.
 */
function candidate(name: string, patch: Partial<RankingConfig>): RankingCandidate {
  const severityBoost = {
    ...DEFAULT_RANKING_CONFIG.severityBoost,
    ...(patch.severityBoost ?? {}),
  };
  const config: RankingConfig = { ...DEFAULT_RANKING_CONFIG, ...patch, severityBoost };
  return { name, config };
}
232
+
/**
 * Runs every live case once under a single ranking config.
 *
 * A case that throws is not fatal: it is recorded as a failed case with zero
 * scores and the error message, and evaluation continues with the next case.
 *
 * @param options Live sweep options plus the ranking config to evaluate.
 * @returns One comparable case per input case, in input order.
 */
async function evaluateLiveCases(options: LiveSweepOptions & { ranking: RankingConfig }): Promise<ComparableSearchCase[]> {
  const fallbackScope = options.defaultScope ?? "project";
  const scopes = new ScopedRepositoryContext(
    options.cwd,
    embeddingRuntimeFrom(options.embeddings),
    options.home,
    options.ranking
  );

  const collected: ComparableSearchCase[] = [];
  for (const input of options.cases) {
    const scope = input.scope ?? fallbackScope;
    const mode = input.mode ?? "hybrid";

    let outcome: ComparableSearchCase;
    try {
      const repository = scopes.repositoryFor(scope);
      const results = await repository.search(input.query, { mode, scope, limit: input.limit ?? 5 });
      outcome = liveCaseReport(input, mode, scope, results);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      outcome = {
        key: caseKey(input.query, mode, scope),
        query: input.query,
        mode,
        scope,
        scored: (input.gold ?? []).length > 0,
        recallAt3: 0,
        recallAt5: 0,
        reciprocalRank: 0,
        passed: false,
        topResults: [],
        warnings: [],
        error: message,
      };
    }
    collected.push(outcome);
  }
  return collected;
}
272
+
/**
 * Scores the results of one live query against its gold targets.
 *
 * Without gold targets the case is unscored: both recalls are 1, the
 * reciprocal rank is 0 and the default thresholds are 0, so it passes.
 *
 * @param input The case definition, including gold targets and thresholds.
 * @param mode The search mode that was actually used.
 * @param scope The scope that was actually searched.
 * @param results Search results in ranked order.
 */
function liveCaseReport(
  input: LiveEvalCase,
  mode: SearchMode,
  scope: Scope,
  results: TrapSearchResult[]
): ComparableSearchCase {
  const gold = input.gold ?? [];
  const scored = gold.length > 0;
  const warnings = new Set<string>();

  let recallAt3 = 1;
  let recallAt5 = 1;
  let firstRank = -1;
  if (scored) {
    // Order matters only for the order in which warnings are collected.
    recallAt3 = recall(gold, results.slice(0, 3), warnings, scope);
    recallAt5 = recall(gold, results.slice(0, 5), warnings, scope);
    firstRank = firstMatchRank(gold, results, warnings, scope);
  }
  const reciprocalRank = firstRank < 0 ? 0 : 1 / (firstRank + 1);

  const fallbackThreshold = scored ? 1 : 0;
  const minRecallAt3 = input.minRecallAt3 ?? fallbackThreshold;
  const minRecallAt5 = input.minRecallAt5 ?? fallbackThreshold;

  const topResults = results.map(({ trap, sources, diagnostics }) => {
    // Results without a recognised scope are reported under the searched scope.
    const trapScope = trap.scope === "project" || trap.scope === "global" ? trap.scope : scope;
    return {
      id: trap.id,
      scope: trapScope,
      title: trap.title,
      sources: sources ?? [],
      diagnostics: (diagnostics ?? []).map((entry) => entry.code),
    };
  });

  return {
    key: caseKey(input.query, mode, scope),
    query: input.query,
    mode,
    scope,
    scored,
    recallAt3,
    recallAt5,
    reciprocalRank,
    passed: recallAt3 >= minRecallAt3 && recallAt5 >= minRecallAt5,
    topResults,
    warnings: Array.from(warnings),
  };
}
308
+
/**
 * Fraction of gold targets found among `results`.
 *
 * Callers are expected to pass a non-empty `gold` list; an empty list yields NaN (0 / 0).
 */
function recall(gold: GoldTarget[], results: TrapSearchResult[], warnings: Set<string>, defaultScope: Scope): number {
  let hits = 0;
  for (const target of gold) {
    if (targetMatches(target, results, warnings, defaultScope)) {
      hits += 1;
    }
  }
  return hits / gold.length;
}
312
+
/**
 * Zero-based position of the first result that satisfies any gold target, or -1 when none does.
 *
 * Each result is tested on its own, and testing stops at the first matching
 * target, so warnings are only collected for the checks actually performed.
 */
function firstMatchRank(
  gold: GoldTarget[],
  results: TrapSearchResult[],
  warnings: Set<string>,
  defaultScope: Scope
): number {
  for (const [rank, result] of results.entries()) {
    for (const target of gold) {
      if (targetMatches(target, [result], warnings, defaultScope)) {
        return rank;
      }
    }
  }
  return -1;
}
323
+
/**
 * Decides whether a gold target is present among `results`.
 *
 * Only results in the target's scope are considered (the target's own scope,
 * or `defaultScope` when it has none). A result whose trap carries no scope is
 * treated as belonging to `defaultScope`, the scope that was searched; this is
 * the same fallback `liveCaseReport` applies when it reports top results.
 * Previously such results were filtered out and could never match.
 *
 * Matching tries the id first and then the exact title. Along the way it
 * records advisory warnings:
 * - `gold_title_mismatch:<id>` when the id matches but the title differs,
 * - `gold_title_ambiguous:<title>` when several results share the title,
 * - `gold_id_drift:<id>-><ids>` when the title matches but no match has the id.
 *
 * @param target The expected result.
 * @param results Candidate results to look through.
 * @param warnings Set that receives advisory warnings (mutated).
 * @param defaultScope Scope used when the target or a result has none.
 * @returns True when the target is found by id or by title.
 */
function targetMatches(
  target: GoldTarget,
  results: TrapSearchResult[],
  warnings: Set<string>,
  defaultScope: Scope
): boolean {
  const scope = target.scope ?? defaultScope;
  const scoped = results.filter((result) => (result.trap.scope ?? defaultScope) === scope);

  if (target.id !== undefined) {
    const idMatch = scoped.find((result) => result.trap.id === target.id);
    if (idMatch) {
      if (target.title && idMatch.trap.title !== target.title) {
        warnings.add(`gold_title_mismatch:${target.id}`);
      }
      return true;
    }
  }
  if (!target.title) return false;

  const titleMatches = scoped.filter((result) => result.trap.title === target.title);
  if (titleMatches.length === 0) return false;

  if (titleMatches.length > 1) warnings.add(`gold_title_ambiguous:${target.title}`);
  if (target.id !== undefined && titleMatches.every((result) => result.trap.id !== target.id)) {
    warnings.add(`gold_id_drift:${target.id}->${titleMatches.map((result) => result.trap.id).join(",")}`);
  }
  return true;
}
353
+
/**
 * Converts a fixture eval case into the comparable shape used by sweeps.
 *
 * Fixture cases have no scope, so the key is built without one and no `scope`
 * field is set. A case counts as scored when it lists at least one gold trap id.
 */
function fromEvalCaseReport(item: EvalCaseReport): ComparableSearchCase {
  const { query, mode, recallAt3, recallAt5, reciprocalRank, passed, error } = item;
  const topResults = item.topResults.map(({ id, title, sources, diagnostics }) => ({
    id,
    title,
    sources,
    diagnostics,
  }));

  return {
    key: caseKey(query, mode),
    query,
    mode,
    scored: item.goldTrapIds.length > 0,
    recallAt3,
    recallAt5,
    reciprocalRank,
    passed,
    topResults,
    warnings: [],
    error,
  };
}
374
+
/**
 * Assembles the report for one candidate from its evaluated cases.
 *
 * @param candidate The candidate that produced `cases`.
 * @param cases The candidate's evaluated cases.
 * @param baselineCases Cases of the baseline candidate, used for regressions and improvements.
 */
function candidateReport(
  candidate: RankingCandidate,
  cases: ComparableSearchCase[],
  baselineCases: ComparableSearchCase[]
): SweepCandidateReport {
  const scoredCount = cases.filter((entry) => entry.scored).length;
  const metrics = aggregateMetrics(cases);
  const failures = cases.filter((entry) => !entry.passed);
  const regressions = deltas(cases, baselineCases, "regression");
  const improvements = deltas(cases, baselineCases, "improvement");

  return {
    name: candidate.name,
    config: candidate.config,
    total_cases: cases.length,
    scored_cases: scoredCount,
    metrics,
    cases,
    failures,
    regressions,
    improvements,
  };
}
392
+
/**
 * Averages recall and reciprocal rank over scored cases and counts fallbacks and errors.
 *
 * Averages use a divisor of 1 when nothing is scored, so they come out as 0
 * instead of NaN. The fallback and error counts are taken over all cases.
 */
function aggregateMetrics(cases: ComparableSearchCase[]): SearchEvalMetrics {
  const fallbackCodes = ["semantic_unavailable", "semantic_no_candidates", "semantic_failed"];
  const scored = cases.filter((entry) => entry.scored);
  const divisor = scored.length || 1;

  let recall3Sum = 0;
  let recall5Sum = 0;
  let reciprocalSum = 0;
  for (const entry of scored) {
    recall3Sum += entry.recallAt3;
    recall5Sum += entry.recallAt5;
    reciprocalSum += entry.reciprocalRank;
  }

  let fallbackCount = 0;
  let errorCount = 0;
  for (const entry of cases) {
    const fellBack = entry.topResults.some((result) =>
      result.diagnostics.some((code) => fallbackCodes.includes(code))
    );
    if (fellBack) fallbackCount += 1;
    if (entry.error) errorCount += 1;
  }

  return {
    recall_at_3: round(recall3Sum / divisor),
    recall_at_5: round(recall5Sum / divisor),
    mrr: round(reciprocalSum / divisor),
    hybrid_fallback_count: fallbackCount,
    semantic_error_count: errorCount,
  };
}
410
+
/**
 * Lists scored cases whose reciprocal rank moved in the given direction relative to the baseline.
 *
 * Cases are paired by `key`. Cases without a baseline counterpart, unscored
 * cases, and changes no larger than 0.0001 are ignored.
 *
 * @param cases The candidate's cases.
 * @param baselineCases The baseline candidate's cases.
 * @param direction Which kind of change to report.
 */
function deltas(
  cases: ComparableSearchCase[],
  baselineCases: ComparableSearchCase[],
  direction: "regression" | "improvement"
): CaseDelta[] {
  const baselineByKey = new Map<string, ComparableSearchCase>();
  for (const entry of baselineCases) {
    baselineByKey.set(entry.key, entry);
  }

  return cases.flatMap((item): CaseDelta[] => {
    const baseline = baselineByKey.get(item.key);
    if (!baseline || !item.scored) return [];

    const diff = item.reciprocalRank - baseline.reciprocalRank;
    const moved = direction === "regression" ? diff < -0.0001 : diff > 0.0001;
    if (!moved) return [];

    return [
      {
        query: item.query,
        mode: item.mode,
        scope: item.scope,
        before: baseline.reciprocalRank,
        after: item.reciprocalRank,
        beforeTop: topTitles(baseline),
        afterTop: topTitles(item),
      },
    ];
  });
}
436
+
/**
 * Wraps the candidate reports into the final sweep report.
 *
 * The first candidate is the baseline. The best candidate is the first entry
 * of a sorted copy, so the caller's `candidates` order is left untouched.
 *
 * @throws Error when `candidates` is empty.
 */
function buildSweepReport(
  mode: PolicySweepReport["mode"],
  source: string,
  candidates: SweepCandidateReport[],
  cwd?: string
): PolicySweepReport {
  const [baseline] = candidates;
  if (baseline === undefined) {
    throw new Error("At least one ranking candidate is required.");
  }

  const ranked = candidates.slice().sort(compareCandidates);
  const best = ranked[0] ?? baseline;

  return {
    mode,
    source,
    cwd,
    candidate_count: candidates.length,
    baseline,
    candidates,
    best,
    recommendation: recommendation(baseline, best, candidates),
  };
}
457
+
/**
 * Sort comparator that puts the stronger candidate first.
 *
 * Precedence: higher recall@3, then higher MRR, then fewer failures, then fewer regressions.
 */
function compareCandidates(a: SweepCandidateReport, b: SweepCandidateReport): number {
  const byRecall = b.metrics.recall_at_3 - a.metrics.recall_at_3;
  if (byRecall) return byRecall;

  const byMrr = b.metrics.mrr - a.metrics.mrr;
  if (byMrr) return byMrr;

  const byFailures = a.failures.length - b.failures.length;
  if (byFailures) return byFailures;

  return a.regressions.length - b.regressions.length;
}
466
+
/**
 * Produces the one-line advice shown with a sweep report.
 *
 * Checks run in order: nothing scored, every candidate tied with the baseline
 * on recall@3, recall@5 and MRR, the baseline is itself the best, the best
 * candidate has regressions, and finally a clean winner.
 */
function recommendation(baseline: SweepCandidateReport, best: SweepCandidateReport, candidates: SweepCandidateReport[]): string {
  if (baseline.scored_cases === 0) {
    return "No scored live cases yet; add gold targets before using this as an optimization signal.";
  }

  const reference = baseline.metrics;
  const differsFromBaseline = (entry: SweepCandidateReport): boolean =>
    entry.metrics.recall_at_3 !== reference.recall_at_3 ||
    entry.metrics.recall_at_5 !== reference.recall_at_5 ||
    entry.metrics.mrr !== reference.mrr;
  if (!candidates.some(differsFromBaseline)) {
    return "All candidates tie; add harder miss/noisy_hit eval cases before changing ranking config.";
  }

  if (best.name === baseline.name) {
    return "The default config is still best on this fixture.";
  }
  return best.regressions.length > 0
    ? `${best.name} improves aggregate metrics but has regressions; inspect before adopting.`
    : `${best.name} is the strongest candidate on these cases; inspect changed rankings before editing defaults.`;
}
479
+
/**
 * Validates one raw entry from a live queries file and converts it to a `LiveEvalCase`.
 *
 * Fields are validated in a fixed order (query, mode, scope, limit, gold,
 * minRecallAt3, minRecallAt5), which decides which error surfaces first.
 *
 * @throws Error when the entry is not an object or any field is invalid.
 */
function normalizeLiveEvalCase(value: unknown): LiveEvalCase {
  if (!isRecord(value)) {
    throw new Error("Each live query case must be an object.");
  }

  const query = stringField(value, "query");
  const mode = optionalEnum(value, "mode", SEARCH_MODES);
  const scope = optionalEnum(value, "scope", SCOPES);
  const limit = optionalPositiveInt(value, "limit");
  const gold = normalizeGoldTargets(value.gold);
  const minRecallAt3 = optionalScore(value, "minRecallAt3");
  const minRecallAt5 = optionalScore(value, "minRecallAt5");

  return { query, mode, scope, limit, gold, minRecallAt3, minRecallAt5 };
}
495
+
/**
 * Validates the optional `gold` list of a live query case.
 *
 * Each entry must be an object with a positive integer `id`, a non-blank
 * `title`, or both, plus an optional `scope`. Titles are trimmed.
 *
 * The id must be a number or a numeric string. Other JSON values are rejected
 * rather than passed through `Number()`, which would turn `true` into 1 and
 * `[5]` into 5.
 *
 * @param value The raw `gold` field.
 * @returns The normalized targets, or undefined when `gold` is absent.
 * @throws Error when `gold` is not an array or an entry is malformed.
 */
function normalizeGoldTargets(value: unknown): GoldTarget[] | undefined {
  if (value === undefined) return undefined;
  if (!Array.isArray(value)) throw new Error("gold must be an array.");
  return value.map((item) => {
    if (!isRecord(item)) throw new Error("gold entries must be objects.");

    const rawId = item.id;
    let id: number | undefined;
    if (rawId !== undefined) {
      // Only genuine numbers and non-blank numeric strings are eligible.
      id =
        typeof rawId === "number"
          ? rawId
          : typeof rawId === "string" && rawId.trim() !== ""
            ? Number(rawId)
            : Number.NaN;
      if (!Number.isInteger(id) || id <= 0) throw new Error("gold.id must be a positive integer.");
    }

    const scope = optionalEnum(item, "scope", SCOPES);
    const title = typeof item.title === "string" && item.title.trim() ? item.title.trim() : undefined;
    if (id === undefined && !title) throw new Error("gold entries require id or title.");
    return { id, title, scope };
  });
}
509
+
/**
 * Builds the key that pairs the same case across candidates.
 *
 * Scope, mode and query are joined with NUL characters; cases without a scope use "fixture".
 */
function caseKey(query: string, mode: SearchMode, scope?: Scope): string {
  const parts: string[] = [scope ?? "fixture", mode, query];
  return parts.join("\0");
}
513
+
/** Labels ("#id title") for up to the first three results of a case. */
function topTitles(item: ComparableSearchCase): string[] {
  const labels: string[] = [];
  for (const result of item.topResults.slice(0, 3)) {
    labels.push(`#${result.id} ${result.title}`);
  }
  return labels;
}
517
+
/** One-line summary of a candidate: name, recall@3, recall@5, MRR, and failure and regression counts. */
function summaryLine(item: SweepCandidateReport): string {
  const { recall_at_3, recall_at_5, mrr } = item.metrics;
  const fields = [
    item.name,
    `R@3=${recall_at_3}`,
    `R@5=${recall_at_5}`,
    `MRR=${mrr}`,
    `failures=${item.failures.length}`,
    `regressions=${item.regressions.length}`,
  ];
  return fields.join(" ");
}
521
+
/** Renders up to three deltas as "query before->after", separated by "; ", with rounded ranks. */
function formatDeltas(items: CaseDelta[]): string {
  const shown = items.slice(0, 3);
  const rendered = shown.map((delta) => {
    const before = round(delta.before);
    const after = round(delta.after);
    return `${delta.query} ${before}->${after}`;
  });
  return rendered.join("; ");
}
528
+
/**
 * Reads an optional string field that must be one of `choices`.
 *
 * @returns The field value, or undefined when the key is absent.
 * @throws Error when the field is present but is not one of the allowed strings.
 */
function optionalEnum<T extends readonly string[]>(value: Record<string, unknown>, key: string, choices: T): T[number] | undefined {
  const field = value[key];
  if (field === undefined) return undefined;

  const allowed: readonly string[] = choices;
  if (typeof field === "string" && allowed.includes(field)) {
    return field as T[number];
  }
  throw new Error(`${key} must be one of: ${choices.join(", ")}`);
}
537
+
/**
 * Reads a required, non-blank string field and returns it trimmed.
 *
 * @throws Error when the field is missing, is not a string, or is only whitespace.
 */
function stringField(value: Record<string, unknown>, key: string): string {
  const field = value[key];
  const trimmed = typeof field === "string" ? field.trim() : "";
  if (trimmed === "") {
    throw new Error(`${key} is required.`);
  }
  return trimmed;
}
543
+
/**
 * Reads an optional positive integer field.
 *
 * Accepts a number or a numeric string. Any other JSON value is rejected
 * rather than coerced with `Number()`, which would silently accept `true`
 * (as 1) or a single-element array such as `[3]` (as 3).
 *
 * @returns The integer, or undefined when the key is absent.
 * @throws Error when the field is present but is not a positive integer.
 */
function optionalPositiveInt(value: Record<string, unknown>, key: string): number | undefined {
  const field = value[key];
  if (field === undefined) return undefined;

  const parsed =
    typeof field === "number"
      ? field
      : typeof field === "string" && field.trim() !== ""
        ? Number(field)
        : Number.NaN;
  if (!Number.isInteger(parsed) || parsed <= 0) throw new Error(`${key} must be a positive integer.`);
  return parsed;
}
551
+
/**
 * Reads an optional score field in the inclusive range 0..1.
 *
 * Accepts a number or a numeric string. Any other JSON value is rejected
 * rather than coerced with `Number()`, which maps `null`, `""`, `false` and
 * `[]` to 0 and would silently turn a malformed threshold into "no threshold".
 *
 * @returns The score, or undefined when the key is absent.
 * @throws Error when the field is present but is not a finite number in 0..1.
 */
function optionalScore(value: Record<string, unknown>, key: string): number | undefined {
  const field = value[key];
  if (field === undefined) return undefined;

  const parsed =
    typeof field === "number"
      ? field
      : typeof field === "string" && field.trim() !== ""
        ? Number(field)
        : Number.NaN;
  if (!Number.isFinite(parsed) || parsed < 0 || parsed > 1) throw new Error(`${key} must be between 0 and 1.`);
  return parsed;
}
559
+
560
+
/** Rounds to four decimal places. */
function round(value: number): number {
  const scale = 1e4;
  const scaled = Math.round(value * scale);
  return scaled / scale;
}