@wooojin/forgen 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +20 -0
- package/CHANGELOG.md +353 -0
- package/CONTRIBUTING.md +98 -0
- package/LICENSE +21 -0
- package/README.ja.md +469 -0
- package/README.ko.md +469 -0
- package/README.md +483 -0
- package/README.zh.md +469 -0
- package/agents/analyst.md +98 -0
- package/agents/architect.md +62 -0
- package/agents/code-reviewer.md +120 -0
- package/agents/code-simplifier.md +197 -0
- package/agents/critic.md +70 -0
- package/agents/debugger.md +117 -0
- package/agents/designer.md +131 -0
- package/agents/executor.md +54 -0
- package/agents/explore.md +145 -0
- package/agents/git-master.md +212 -0
- package/agents/performance-reviewer.md +172 -0
- package/agents/planner.md +29 -0
- package/agents/qa-tester.md +158 -0
- package/agents/refactoring-expert.md +168 -0
- package/agents/scientist.md +144 -0
- package/agents/security-reviewer.md +137 -0
- package/agents/test-engineer.md +153 -0
- package/agents/verifier.md +133 -0
- package/agents/writer.md +184 -0
- package/commands/api-design.md +268 -0
- package/commands/architecture-decision.md +314 -0
- package/commands/ci-cd.md +270 -0
- package/commands/code-review.md +233 -0
- package/commands/compound.md +117 -0
- package/commands/database.md +263 -0
- package/commands/debug-detective.md +99 -0
- package/commands/docker.md +274 -0
- package/commands/documentation.md +276 -0
- package/commands/ecomode.md +51 -0
- package/commands/frontend.md +271 -0
- package/commands/git-master.md +90 -0
- package/commands/incident-response.md +292 -0
- package/commands/migrate.md +101 -0
- package/commands/performance.md +288 -0
- package/commands/refactor.md +105 -0
- package/commands/security-review.md +288 -0
- package/commands/tdd.md +183 -0
- package/commands/testing-strategy.md +265 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +295 -0
- package/dist/core/auto-compound-runner.d.ts +12 -0
- package/dist/core/auto-compound-runner.js +460 -0
- package/dist/core/config-hooks.d.ts +10 -0
- package/dist/core/config-hooks.js +112 -0
- package/dist/core/config-injector.d.ts +50 -0
- package/dist/core/config-injector.js +455 -0
- package/dist/core/doctor.d.ts +1 -0
- package/dist/core/doctor.js +163 -0
- package/dist/core/errors.d.ts +81 -0
- package/dist/core/errors.js +133 -0
- package/dist/core/global-config.d.ts +43 -0
- package/dist/core/global-config.js +25 -0
- package/dist/core/harness.d.ts +24 -0
- package/dist/core/harness.js +621 -0
- package/dist/core/init.d.ts +7 -0
- package/dist/core/init.js +37 -0
- package/dist/core/inspect-cli.d.ts +7 -0
- package/dist/core/inspect-cli.js +47 -0
- package/dist/core/legacy-detector.d.ts +33 -0
- package/dist/core/legacy-detector.js +66 -0
- package/dist/core/logger.d.ts +34 -0
- package/dist/core/logger.js +121 -0
- package/dist/core/mcp-config.d.ts +44 -0
- package/dist/core/mcp-config.js +177 -0
- package/dist/core/notepad.d.ts +31 -0
- package/dist/core/notepad.js +88 -0
- package/dist/core/paths.d.ts +85 -0
- package/dist/core/paths.js +101 -0
- package/dist/core/plugin-detector.d.ts +44 -0
- package/dist/core/plugin-detector.js +226 -0
- package/dist/core/runtime-detector.d.ts +8 -0
- package/dist/core/runtime-detector.js +49 -0
- package/dist/core/scope-resolver.d.ts +8 -0
- package/dist/core/scope-resolver.js +45 -0
- package/dist/core/session-logger.d.ts +6 -0
- package/dist/core/session-logger.js +111 -0
- package/dist/core/session-store.d.ts +28 -0
- package/dist/core/session-store.js +218 -0
- package/dist/core/settings-lock.d.ts +18 -0
- package/dist/core/settings-lock.js +125 -0
- package/dist/core/spawn.d.ts +3 -0
- package/dist/core/spawn.js +135 -0
- package/dist/core/types.d.ts +108 -0
- package/dist/core/types.js +1 -0
- package/dist/core/uninstall.d.ts +4 -0
- package/dist/core/uninstall.js +307 -0
- package/dist/core/v1-bootstrap.d.ts +26 -0
- package/dist/core/v1-bootstrap.js +155 -0
- package/dist/engine/compound-cli.d.ts +24 -0
- package/dist/engine/compound-cli.js +250 -0
- package/dist/engine/compound-extractor.d.ts +68 -0
- package/dist/engine/compound-extractor.js +860 -0
- package/dist/engine/compound-lifecycle.d.ts +32 -0
- package/dist/engine/compound-lifecycle.js +305 -0
- package/dist/engine/compound-loop.d.ts +32 -0
- package/dist/engine/compound-loop.js +511 -0
- package/dist/engine/match-eval-log.d.ts +139 -0
- package/dist/engine/match-eval-log.js +270 -0
- package/dist/engine/phrase-blocklist.d.ts +119 -0
- package/dist/engine/phrase-blocklist.js +208 -0
- package/dist/engine/skill-promoter.d.ts +20 -0
- package/dist/engine/skill-promoter.js +115 -0
- package/dist/engine/solution-format.d.ts +160 -0
- package/dist/engine/solution-format.js +432 -0
- package/dist/engine/solution-index.d.ts +13 -0
- package/dist/engine/solution-index.js +252 -0
- package/dist/engine/solution-matcher.d.ts +364 -0
- package/dist/engine/solution-matcher.js +656 -0
- package/dist/engine/solution-writer.d.ts +76 -0
- package/dist/engine/solution-writer.js +157 -0
- package/dist/engine/term-matcher.d.ts +81 -0
- package/dist/engine/term-matcher.js +268 -0
- package/dist/engine/term-normalizer.d.ts +116 -0
- package/dist/engine/term-normalizer.js +171 -0
- package/dist/fgx.d.ts +6 -0
- package/dist/fgx.js +42 -0
- package/dist/forge/cli.d.ts +11 -0
- package/dist/forge/cli.js +100 -0
- package/dist/forge/evidence-processor.d.ts +21 -0
- package/dist/forge/evidence-processor.js +87 -0
- package/dist/forge/mismatch-detector.d.ts +44 -0
- package/dist/forge/mismatch-detector.js +83 -0
- package/dist/forge/onboarding-cli.d.ts +6 -0
- package/dist/forge/onboarding-cli.js +89 -0
- package/dist/forge/onboarding.d.ts +25 -0
- package/dist/forge/onboarding.js +122 -0
- package/dist/hooks/compound-reflection.d.ts +45 -0
- package/dist/hooks/compound-reflection.js +82 -0
- package/dist/hooks/context-guard.d.ts +24 -0
- package/dist/hooks/context-guard.js +156 -0
- package/dist/hooks/dangerous-patterns.json +18 -0
- package/dist/hooks/db-guard.d.ts +17 -0
- package/dist/hooks/db-guard.js +105 -0
- package/dist/hooks/hook-config.d.ts +29 -0
- package/dist/hooks/hook-config.js +92 -0
- package/dist/hooks/hook-registry.d.ts +43 -0
- package/dist/hooks/hook-registry.js +31 -0
- package/dist/hooks/hooks-generator.d.ts +49 -0
- package/dist/hooks/hooks-generator.js +99 -0
- package/dist/hooks/intent-classifier.d.ts +12 -0
- package/dist/hooks/intent-classifier.js +62 -0
- package/dist/hooks/keyword-detector.d.ts +25 -0
- package/dist/hooks/keyword-detector.js +389 -0
- package/dist/hooks/notepad-injector.d.ts +18 -0
- package/dist/hooks/notepad-injector.js +51 -0
- package/dist/hooks/permission-handler.d.ts +14 -0
- package/dist/hooks/permission-handler.js +114 -0
- package/dist/hooks/post-tool-failure.d.ts +11 -0
- package/dist/hooks/post-tool-failure.js +118 -0
- package/dist/hooks/post-tool-handlers.d.ts +17 -0
- package/dist/hooks/post-tool-handlers.js +115 -0
- package/dist/hooks/post-tool-use.d.ts +29 -0
- package/dist/hooks/post-tool-use.js +151 -0
- package/dist/hooks/pre-compact.d.ts +10 -0
- package/dist/hooks/pre-compact.js +165 -0
- package/dist/hooks/pre-tool-use.d.ts +31 -0
- package/dist/hooks/pre-tool-use.js +325 -0
- package/dist/hooks/prompt-injection-filter.d.ts +56 -0
- package/dist/hooks/prompt-injection-filter.js +287 -0
- package/dist/hooks/rate-limiter.d.ts +21 -0
- package/dist/hooks/rate-limiter.js +86 -0
- package/dist/hooks/secret-filter.d.ts +14 -0
- package/dist/hooks/secret-filter.js +65 -0
- package/dist/hooks/session-recovery.d.ts +27 -0
- package/dist/hooks/session-recovery.js +406 -0
- package/dist/hooks/shared/atomic-write.d.ts +41 -0
- package/dist/hooks/shared/atomic-write.js +148 -0
- package/dist/hooks/shared/context-budget.d.ts +37 -0
- package/dist/hooks/shared/context-budget.js +45 -0
- package/dist/hooks/shared/file-lock.d.ts +56 -0
- package/dist/hooks/shared/file-lock.js +253 -0
- package/dist/hooks/shared/hook-response.d.ts +33 -0
- package/dist/hooks/shared/hook-response.js +62 -0
- package/dist/hooks/shared/injection-caps.d.ts +39 -0
- package/dist/hooks/shared/injection-caps.js +52 -0
- package/dist/hooks/shared/plugin-signal.d.ts +23 -0
- package/dist/hooks/shared/plugin-signal.js +104 -0
- package/dist/hooks/shared/read-stdin.d.ts +8 -0
- package/dist/hooks/shared/read-stdin.js +63 -0
- package/dist/hooks/shared/sanitize-id.d.ts +7 -0
- package/dist/hooks/shared/sanitize-id.js +9 -0
- package/dist/hooks/shared/sanitize.d.ts +7 -0
- package/dist/hooks/shared/sanitize.js +22 -0
- package/dist/hooks/skill-injector.d.ts +38 -0
- package/dist/hooks/skill-injector.js +285 -0
- package/dist/hooks/slop-detector.d.ts +18 -0
- package/dist/hooks/slop-detector.js +93 -0
- package/dist/hooks/solution-injector.d.ts +58 -0
- package/dist/hooks/solution-injector.js +436 -0
- package/dist/hooks/subagent-tracker.d.ts +10 -0
- package/dist/hooks/subagent-tracker.js +90 -0
- package/dist/i18n/index.d.ts +43 -0
- package/dist/i18n/index.js +224 -0
- package/dist/lib.d.ts +14 -0
- package/dist/lib.js +14 -0
- package/dist/mcp/server.d.ts +8 -0
- package/dist/mcp/server.js +40 -0
- package/dist/mcp/solution-reader.d.ts +90 -0
- package/dist/mcp/solution-reader.js +273 -0
- package/dist/mcp/tools.d.ts +16 -0
- package/dist/mcp/tools.js +302 -0
- package/dist/preset/facet-catalog.d.ts +17 -0
- package/dist/preset/facet-catalog.js +46 -0
- package/dist/preset/preset-manager.d.ts +31 -0
- package/dist/preset/preset-manager.js +111 -0
- package/dist/renderer/inspect-renderer.d.ts +11 -0
- package/dist/renderer/inspect-renderer.js +123 -0
- package/dist/renderer/rule-renderer.d.ts +18 -0
- package/dist/renderer/rule-renderer.js +159 -0
- package/dist/store/evidence-store.d.ts +23 -0
- package/dist/store/evidence-store.js +58 -0
- package/dist/store/profile-store.d.ts +12 -0
- package/dist/store/profile-store.js +53 -0
- package/dist/store/recommendation-store.d.ts +22 -0
- package/dist/store/recommendation-store.js +64 -0
- package/dist/store/rule-store.d.ts +22 -0
- package/dist/store/rule-store.js +62 -0
- package/dist/store/session-state-store.d.ts +11 -0
- package/dist/store/session-state-store.js +44 -0
- package/dist/store/types.d.ts +159 -0
- package/dist/store/types.js +7 -0
- package/hooks/hook-registry.json +21 -0
- package/hooks/hooks.json +185 -0
- package/package.json +89 -0
- package/plugin.json +20 -0
- package/scripts/postinstall.js +826 -0
- package/skills/api-design/SKILL.md +262 -0
- package/skills/architecture-decision/SKILL.md +309 -0
- package/skills/ci-cd/SKILL.md +264 -0
- package/skills/code-review/SKILL.md +228 -0
- package/skills/compound/SKILL.md +101 -0
- package/skills/database/SKILL.md +257 -0
- package/skills/debug-detective/SKILL.md +95 -0
- package/skills/docker/SKILL.md +268 -0
- package/skills/documentation/SKILL.md +270 -0
- package/skills/ecomode/SKILL.md +46 -0
- package/skills/frontend/SKILL.md +265 -0
- package/skills/git-master/SKILL.md +86 -0
- package/skills/incident-response/SKILL.md +286 -0
- package/skills/migrate/SKILL.md +96 -0
- package/skills/performance/SKILL.md +282 -0
- package/skills/refactor/SKILL.md +100 -0
- package/skills/security-review/SKILL.md +282 -0
- package/skills/tdd/SKILL.md +178 -0
- package/skills/testing-strategy/SKILL.md +260 -0
- package/starter-pack/solutions/starter-api-error-responses.md +37 -0
- package/starter-pack/solutions/starter-async-patterns.md +40 -0
- package/starter-pack/solutions/starter-caching-strategy.md +40 -0
- package/starter-pack/solutions/starter-code-review-checklist.md +39 -0
- package/starter-pack/solutions/starter-debugging-systematic.md +40 -0
- package/starter-pack/solutions/starter-dependency-injection.md +40 -0
- package/starter-pack/solutions/starter-error-handling-patterns.md +38 -0
- package/starter-pack/solutions/starter-git-atomic-commits.md +36 -0
- package/starter-pack/solutions/starter-input-validation.md +40 -0
- package/starter-pack/solutions/starter-n-plus-one-queries.md +37 -0
- package/starter-pack/solutions/starter-refactor-safely.md +38 -0
- package/starter-pack/solutions/starter-secret-management.md +37 -0
- package/starter-pack/solutions/starter-separation-of-concerns.md +36 -0
- package/starter-pack/solutions/starter-tdd-red-green-refactor.md +40 -0
- package/starter-pack/solutions/starter-typescript-strict-types.md +39 -0
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
import * as fs from 'node:fs';
|
|
2
|
+
import * as path from 'node:path';
|
|
3
|
+
import { parseFrontmatterOnly, isV1Format, migrateV1toV3 } from './solution-format.js';
|
|
4
|
+
import { defaultNormalizer } from './term-normalizer.js';
|
|
5
|
+
import { withFileLockSync } from '../hooks/shared/file-lock.js';
|
|
6
|
+
import { atomicWriteText } from '../hooks/shared/atomic-write.js';
|
|
7
|
+
import { createLogger } from '../core/logger.js';
|
|
8
|
+
const log = createLogger('solution-index');
|
|
9
|
+
/**
 * Per-`dirs` index cache, keyed by an order-preserving signature of the
 * directory list.
 *
 * Rationale:
 * - `buildIndex` appends entries in `dirs` order and `solution-reader` takes
 *   the first match, so the order of `dirs` is the precedence chain
 *   (me > team > project, by convention).
 * - An earlier design kept a single global `cachedIndex` that was reused no
 *   matter which `dirs` were requested, so different cwd contexts received
 *   stale results whenever the cached directories' mtimes had not changed.
 * - The signature must NOT be sorted: `[me,project]` and `[project,me]` are
 *   legitimately different precedence chains and need separate cache slots.
 *
 * PR2c-2: LRU eviction with insertion-order touch.
 * A long-running MCP server that handles many cwds could otherwise let this
 * cache grow without bound. A `Map` iterates in insertion order, so touching
 * an entry with delete + set on every get/set keeps the most recently used
 * entry last; once the cache exceeds 32 entries the oldest one is evicted.
 */
const MAX_CACHE_ENTRIES = 32;
const cachedIndexes = new Map();
|
|
29
|
+
/**
 * SOFT_CAP: upper bound on the number of entries indexed per directory
 * (applied by slicing after parsing). Raised from 100 to 500 because 100 was
 * too low for an accumulated knowledge base.
 *
 * HARD_CAP: upper bound on the number of files that are read and parsed per
 * directory. With SOFT_CAP alone, readFileSync + YAML parse still run once
 * per file, which can block a hook for tens of seconds. When a directory
 * exceeds HARD_CAP, files are ordered by a cheap statSync mtime and only the
 * most recent ones are processed.
 */
const SOFT_CAP = 500;
const HARD_CAP = 5000;
|
|
39
|
+
/**
 * Produce the cache key for a list of directory descriptors.
 *
 * The key keeps the caller's ordering and is serialized with JSON.stringify,
 * so paths containing characters such as `|` or `:` cannot collide the way a
 * hand-joined, delimiter-separated key could.
 *
 * @param dirs Ordered list of objects exposing `scope` and `dir`.
 * @returns JSON text of the `[scope, dir]` pairs, in input order.
 */
function dirsSignature(dirs) {
    const pairs = [];
    for (const { scope, dir } of dirs) {
        pairs.push([scope, dir]);
    }
    return JSON.stringify(pairs);
}
|
|
46
|
+
/**
 * Report whether an index no longer matches the directories it was built from.
 *
 * Every directory recorded in `index.directoryMtimes` is re-stat'ed; the index
 * is stale as soon as one of them has a different `mtimeMs` or can no longer
 * be stat'ed (for example because it was removed).
 *
 * @param index Index object carrying a `directoryMtimes` map (path -> mtimeMs).
 * @returns `true` when a rebuild is needed, `false` otherwise.
 */
export function isIndexStale(index) {
    const recorded = Object.entries(index.directoryMtimes);
    return recorded.some(([dirPath, builtMtime]) => {
        let currentMtime;
        try {
            currentMtime = fs.statSync(dirPath).mtimeMs;
        }
        catch {
            // The directory cannot be stat'ed anymore.
            return true;
        }
        return currentMtime !== builtMtime;
    });
}
|
|
60
|
+
/**
 * Scan the given solution directories and build a fresh in-memory index.
 *
 * Directories are processed in the order given and their entries are appended
 * in that same order. Within one directory, `.md` files are read, symlinks and
 * files under 64 bytes are skipped, V1-format files are migrated to V3 on disk
 * (under a file lock), retired solutions are filtered out, and the remaining
 * entries are ordered newest-first by file mtime and truncated to SOFT_CAP.
 *
 * @param dirs Ordered list of descriptors exposing `dir` (path) and `scope`.
 * @returns An object with `entries` (index entries), `directoryMtimes`
 *   (path -> mtimeMs for every directory that could be stat'ed) and `builtAt`
 *   (`Date.now()` at completion).
 */
function buildIndex(dirs) {
    const entries = [];
    const directoryMtimes = {};
    for (const dirConfig of dirs) {
        const { dir } = dirConfig;
        let dirStat;
        try {
            dirStat = fs.statSync(dir);
        }
        catch {
            continue; // skip non-existent dirs
        }
        directoryMtimes[dir] = dirStat.mtimeMs;
        let files;
        try {
            files = fs.readdirSync(dir).filter(f => f.endsWith('.md'));
        }
        catch {
            continue;
        }
        // HARD_CAP: bound the read+parse cost. When exceeded, order the files
        // by a cheap statSync mtime and keep only the most recent ones.
        if (files.length > HARD_CAP) {
            console.warn(`[forgen] Warning: ${dir} contains ${files.length} files; pre-filtering to the ${HARD_CAP} most recent before parsing.`);
            const stats = [];
            for (const f of files) {
                try {
                    const m = fs.statSync(path.join(dir, f)).mtimeMs;
                    stats.push({ f, m });
                }
                catch {
                    // skip unreadable
                }
            }
            stats.sort((a, b) => b.m - a.m);
            files = stats.slice(0, HARD_CAP).map(s => s.f);
        }
        const fileEntries = [];
        // C2: diagnostic counters for solutions dropped during index build.
        // Pre-C2 these were silent `continue` statements — users had no way
        // to know a file existed on disk but was missing from the index
        // (observed cause: auto-compound writing frontmatter with the wrong
        // evidence schema, which made the whole file disappear from searches
        // without any user-visible feedback). Logging them at debug level
        // makes `forgen doctor` / log inspection surface the gap while
        // keeping the normal output quiet.
        let droppedMalformed = 0;
        let droppedRetired = 0;
        let droppedIoError = 0;
        let droppedSymlink = 0;
        for (const file of files) {
            try {
                const filePath = path.join(dir, file);
                // Security: never follow symlinks, so a link cannot be used to
                // read an arbitrary file (applies to every format).
                const lst = fs.lstatSync(filePath);
                if (lst.isSymbolicLink()) {
                    droppedSymlink++;
                    continue;
                }
                // A1 performance fix (2026-04-09): short-circuit tiny files
                // before doing YAML parse on the hot path. A valid v3 solution
                // needs at minimum a `---` fence + `name:` + `version:` +
                // `status:` + `confidence:` + `type:` + `scope:` + `tags:` +
                // `identifiers:` + `evidence:` block + closing `---` + a body,
                // which is ~200 bytes at absolute minimum. Files smaller than
                // 64 bytes cannot possibly contain valid frontmatter, so we
                // skip the readFileSync + YAML parse on them. Observed in
                // production: a test that planted 6000 empty .md files was
                // spending the entire 3s hook budget parsing YAML on files
                // that were 0 bytes. The optimization cuts that path from
                // ~7s to ~100ms.
                if (lst.size < 64) {
                    droppedMalformed++;
                    log.debug(`dropped (file too small: ${lst.size} bytes): ${filePath}`);
                    continue;
                }
                let content = fs.readFileSync(filePath, 'utf-8');
                const fileMtime = lst.mtimeMs;
                if (!content.trimStart().startsWith('---') && isV1Format(content)) {
                    // PR2b: the V1→V3 migration is also protected by a lock.
                    // Concurrent hooks migrating the same V1 file could corrupt
                    // it through last-writer-wins. parseSolutionV3 cannot be
                    // used for this case, so instead of the mutateSolutionFile
                    // API this uses an explicit lock + atomic write.
                    try {
                        withFileLockSync(filePath, () => {
                            const fresh = fs.readFileSync(filePath, 'utf-8');
                            // Always continue with what is on disk now. If
                            // another mutator migrated the file while we were
                            // waiting for the lock, parsing the stale V1 text
                            // read above would wrongly drop a valid file as
                            // malformed.
                            content = fresh;
                            if (fresh.trimStart().startsWith('---'))
                                return; // another mutator already migrated it
                            if (!isV1Format(fresh))
                                return;
                            const migrated = migrateV1toV3(fresh, filePath);
                            atomicWriteText(filePath, migrated);
                            content = migrated;
                        });
                    }
                    catch { /* a lock failure is non-fatal */ }
                }
                const fm = parseFrontmatterOnly(content);
                if (!fm) {
                    droppedMalformed++;
                    log.debug(`dropped (malformed frontmatter): ${filePath}`);
                    continue;
                }
                if (fm.status === 'retired') {
                    droppedRetired++;
                    continue;
                }
                fileEntries.push({
                    entry: {
                        name: fm.name,
                        status: fm.status,
                        confidence: fm.confidence,
                        type: fm.type,
                        scope: dirConfig.scope,
                        tags: fm.tags,
                        // T2: pre-expand via the shared term normalizer. Once per solution
                        // per index build, not once per solution per query. Safe to
                        // recompute on rebuild (cheap: O(N_tags) Map lookups).
                        normalizedTags: defaultNormalizer.normalizeTerms(fm.tags),
                        identifiers: fm.identifiers,
                        filePath,
                    },
                    mtime: fileMtime,
                });
            }
            catch (e) {
                droppedIoError++;
                log.debug(`dropped (i/o or parse error) ${file}: ${e instanceof Error ? e.message : String(e)}`);
            }
        }
        // Summary log for silently dropped files.
        //
        // Design: the index is rebuilt on every hook invocation when the
        // directory mtime is stale, so a warn-on-every-drop policy would
        // spam stderr on every matching prompt. Instead:
        //   - debug-level always: the per-file log calls above already
        //     capture each drop path for log inspection / `forgen doctor`
        //   - warn-level only when the drop rate is materially high
        //     (>10% of files OR >10 files in absolute terms), which
        //     indicates a structural problem — e.g. auto-compound writing
        //     malformed frontmatter in bulk, not a single one-off file
        //   - retired drops are always debug (expected filter semantics)
        //
        // Pre-H-1 (first pass of C2): every non-zero drop warn'd and
        // leaked into test stderr. H-1 downgrades to debug for small counts.
        const totalBad = droppedMalformed + droppedIoError + droppedSymlink;
        const totalScanned = files.length;
        const badRatio = totalScanned > 0 ? totalBad / totalScanned : 0;
        if (totalBad > 0) {
            const summary = `${dir}: ${droppedMalformed} malformed, ${droppedIoError} i/o errors, ${droppedSymlink} symlinks skipped (${totalBad}/${totalScanned} files)`;
            if (totalBad >= 10 || badRatio >= 0.1) {
                log.warn(summary);
            }
            else {
                log.debug(summary);
            }
        }
        if (droppedRetired > 0) {
            log.debug(`${dir}: ${droppedRetired} retired solutions filtered (expected)`);
        }
        // Newest first, then keep at most SOFT_CAP entries for this directory.
        fileEntries.sort((a, b) => b.mtime - a.mtime);
        if (fileEntries.length > SOFT_CAP) {
            console.warn(`[forgen] Warning: ${dir} has ${fileEntries.length} solutions, only the ${SOFT_CAP} most recent are indexed.`);
        }
        const limited = fileEntries.slice(0, SOFT_CAP);
        for (const { entry } of limited) {
            entries.push(entry);
        }
    }
    return { entries, directoryMtimes, builtAt: Date.now() };
}
|
|
226
|
+
/**
 * Return the solution index for `dirs`, reusing a cached one when it is still
 * valid and rebuilding it otherwise.
 *
 * A cached index is reused only when (a) none of the directories it recorded
 * has changed mtime or disappeared, and (b) no requested directory that was
 * missing when the index was built has come into existence since. Without
 * (b), a directory created after the first build would stay invisible until
 * some other directory happened to change.
 *
 * The cache behaves as an LRU bounded by MAX_CACHE_ENTRIES: hits and rebuilds
 * both move the entry to the most-recent position, and the oldest entries are
 * evicted once the bound is exceeded.
 *
 * @param dirs Ordered list of descriptors exposing `dir` and `scope`.
 * @returns The cached or freshly built index.
 */
export function getOrBuildIndex(dirs) {
    const sig = dirsSignature(dirs);
    const cached = cachedIndexes.get(sig);
    if (cached && !isIndexStale(cached)) {
        // `buildIndex` records an mtime only for directories it could stat,
        // so a requested dir without a recorded mtime was absent at build
        // time. If it exists now, the cached index is missing its contents.
        const missingDirAppeared = dirs.some(({ dir }) => !Object.prototype.hasOwnProperty.call(cached.directoryMtimes, dir)
            && fs.existsSync(dir));
        if (!missingDirAppeared) {
            // LRU touch: re-insert to mark this entry as most recently used.
            cachedIndexes.delete(sig);
            cachedIndexes.set(sig, cached);
            return cached;
        }
    }
    // The rebuild path also needs an LRU touch: Map.set on an existing key
    // does not update its insertion order, so a hot cwd that is invalidated
    // often would otherwise stay "oldest" forever. delete + set forces the
    // reorder.
    cachedIndexes.delete(sig);
    const fresh = buildIndex(dirs);
    cachedIndexes.set(sig, fresh);
    // Evict oldest until size within cap
    while (cachedIndexes.size > MAX_CACHE_ENTRIES) {
        const oldestKey = cachedIndexes.keys().next().value;
        if (oldestKey === undefined)
            break;
        cachedIndexes.delete(oldestKey);
    }
    return fresh;
}
|
|
250
|
+
/**
 * Empty the index cache, so the next `getOrBuildIndex` call for any `dirs`
 * finds no cached entry and rebuilds.
 */
export function resetIndexCache() {
    cachedIndexes.clear();
}
|
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
import type { ScopeInfo } from '../core/types.js';
|
|
2
|
+
import type { SolutionStatus, SolutionType } from './solution-format.js';
|
|
3
|
+
/**
 * Thin wrapper retained for the existing `synonym-tfidf.test.ts` and for any
 * external consumers.
 *
 * @deprecated Call `defaultNormalizer.normalizeTerms` from
 * `./term-normalizer.js` directly instead.
 */
export declare function expandTagsWithSynonyms(tags: string[]): string[];
|
|
9
|
+
/** IDF-like weighting for a single tag: common tags receive a reduced weight. */
export declare function tagWeight(tag: string): number;
|
|
11
|
+
/** One matched solution together with its match score and matched tags. */
export interface SolutionMatch {
    // Identity and location
    name: string;
    path: string;
    scope: 'me' | 'team' | 'project';
    // Solution metadata
    status: SolutionStatus;
    type: SolutionType;
    confidence: number;
    tags: string[];
    identifiers: string[];
    summary: string;
    // Match outcome
    relevance: number;
    matchedTags: string[];
}
|
|
24
|
+
/**
 * Optional hints for the v3 `calculateRelevance` path. Hot-path callers
 * (matchSolutions, searchSolutions) use them so the same query tags are not
 * re-normalized for every solution.
 */
export interface CalculateRelevanceOptions {
    /**
     * Prompt tags already normalized with `defaultNormalizer.normalizeTerms`.
     * When present, the per-call expansion is skipped. Callers that loop over
     * many solutions should compute this once outside the loop and pass it in.
     */
    normalizedPromptTags?: string[];
    /**
     * R4-T1: the solution's tags expanded with compound-split alternatives
     * (`expandCompoundTags`). When supplied, the intersection/partial-match
     * step uses this set INSTEAD of `solutionTags`, while the Jaccard union
     * denominator keeps using the raw `solutionTags` so that score
     * normalization stays semantically stable. The caller is responsible for
     * passing a matching pair: `solutionTagsExpanded` MUST be a superset of
     * `solutionTags`.
     */
    solutionTagsExpanded?: string[];
}
|
|
46
|
+
/**
 * Tag-based overload: takes prompt tags, solution tags and a confidence value,
 * plus optional pre-computed hints, and returns the relevance score together
 * with the tags that matched.
 */
export declare function calculateRelevance(
    promptTags: string[],
    solutionTags: string[],
    confidence: number,
    options?: CalculateRelevanceOptions,
): {
    relevance: number;
    matchedTags: string[];
};
/** @deprecated */
export declare function calculateRelevance(prompt: string, keywords: string[]): number;
|
|
52
|
+
/**
 * Decide from the prompt tags and the matched tags whether a match should be
 * rejected. NOTE(review): the actual R4-T3 rules live in the implementation,
 * which is not visible from this declaration.
 */
export declare function shouldRejectByR4T3Rules(promptTags: readonly string[], matchedTags: readonly string[]): boolean;
|
|
53
|
+
/**
 * In-memory solution shape used by the bootstrap evaluator. It mirrors the
 * index-entry fields that `matchSolutions` consumes (tags, identifiers,
 * confidence) but has no filesystem dependency: the evaluator is pure, so CI
 * can run it without mounting a starter pack.
 */
export interface EvalSolution {
    name: string;
    tags: string[];
    identifiers?: string[];
    confidence: number;
}
|
|
65
|
+
/** A single evaluation query and the solution names it is expected to surface. */
export interface EvalQuery {
    query: string;
    /**
     * Names that should appear in the top 5. An empty array means no match is
     * expected (negative case).
     */
    expectAnyOf: string[];
}
|
|
70
|
+
/** Evaluation fixture: the candidate solutions plus three buckets of queries. */
export interface EvalFixture {
    solutions: EvalSolution[];
    positive: EvalQuery[];
    /** Bilingual or compound-word variants that exercise synonym expansion. */
    paraphrase: EvalQuery[];
    /** Unrelated queries that should not return a top-1 hit. */
    negative: EvalQuery[];
}
|
|
78
|
+
/**
 * Metrics for a single query bucket. Paraphrase and positive buckets are
 * reported separately so that a bilingual regression (T2 synonym change)
 * cannot hide inside the aggregate.
 */
export interface BucketMetrics {
    /** |{q : ∃i≤5, ranked[i] ∈ q.expectAnyOf}| / |q| */
    recallAt5: number;
    /** Σ (1 / firstMatchRank) / |q|; a rank above 5 contributes 0. */
    mrrAt5: number;
    /** |{q : ranked is empty}| / |q| */
    noResultRate: number;
    /** Number of queries in this bucket. */
    total: number;
}
|
|
90
|
+
export interface EvalResult {
|
|
91
|
+
/** Combined (positive ∪ paraphrase) metrics — backwards-compatible headline numbers. */
|
|
92
|
+
recallAt5: number;
|
|
93
|
+
mrrAt5: number;
|
|
94
|
+
noResultRate: number;
|
|
95
|
+
/**
|
|
96
|
+
* Fraction of negative queries where the matcher returned ≥ 1 candidate
|
|
97
|
+
* (regardless of rank). Name is honest: this is the "any result" rate on
|
|
98
|
+
* the negative bucket, not a rank-1 precision metric. It's the correct
|
|
99
|
+
* baseline for "did synonym/stemming leak into unrelated queries?".
|
|
100
|
+
*/
|
|
101
|
+
negativeAnyResultRate: number;
|
|
102
|
+
/** Per-bucket breakdown — use these to catch paraphrase-only regressions. */
|
|
103
|
+
byBucket: {
|
|
104
|
+
positive: BucketMetrics;
|
|
105
|
+
paraphrase: BucketMetrics;
|
|
106
|
+
};
|
|
107
|
+
total: {
|
|
108
|
+
positive: number;
|
|
109
|
+
paraphrase: number;
|
|
110
|
+
negative: number;
|
|
111
|
+
};
|
|
112
|
+
}
|
|
113
|
+
/**
|
|
114
|
+
* Round 3 baseline metrics, recorded against the current `term-normalizer`
|
|
115
|
+
* + `calculateRelevance` + fixture `solution-match-bootstrap.json`. Used as
|
|
116
|
+
* a relative regression guard in `tests/solution-matcher-eval.test.ts` —
|
|
117
|
+
* downstream PRs must not regress any field by more than `BASELINE_TOLERANCE`.
|
|
118
|
+
*
|
|
119
|
+
* History (chronological ascending — v1 at top, latest at bottom; headline numbers are recallAt5 / mrrAt5 / noResultRate / negativeAnyResultRate):
|
|
120
|
+
* - v1 (2026-04-08, fixture v1, 41+10+10 queries): 1.0 / 1.0 / 0.0 / 0.1
|
|
121
|
+
* Recorded against the original 61-query fixture, all positive queries
|
|
122
|
+
* PASS@1. Indicated a measurement plateau but masked the matcher's true
|
|
123
|
+
* ranking and false-positive weaknesses because the fixture queries were
|
|
124
|
+
* too tag-aligned.
|
|
125
|
+
*
|
|
126
|
+
* - v2 (2026-04-08, fixture v2, 53+16+14 queries): 1.0 / 0.969 / 0.0 / 0.357
|
|
127
|
+
* Expanded with 12 hard positive (multi-canonical / compound-tag tug-of-
|
|
128
|
+
* war), 6 Korean subtle paraphrase, and 4 tricky negative queries. The
|
|
129
|
+
* drops are intentional and represent genuine matcher behaviour:
|
|
130
|
+
* * positive mrrAt5 1.0 → 0.959: 4 of 12 added positives rank #2-3:
|
|
131
|
+
* (1) "managing api keys and credentials safely" → secret @3 vs
|
|
132
|
+
* api-error-responses @1 — the `api` canonical in
|
|
133
|
+
* DEFAULT_MATCH_TERMS expands to {api, rest, graphql, endpoint,
|
|
134
|
+
* route}, so query `api` hits BOTH `api` AND `rest` on
|
|
135
|
+
* starter-api-error-responses (matched=['api','rest']) — a
|
|
136
|
+
* double-count numerator. starter-secret-management only scores
|
|
137
|
+
* a single weak partial match on `credential`. The compound
|
|
138
|
+
* `api-key` tag on secret-management is never reached because
|
|
139
|
+
* extractTags strips the query-side hyphen and yields
|
|
140
|
+
* ['api','keys'] (the solution-side tag remains hyphenated in
|
|
141
|
+
* the index but has no query token to intersect with). T4 IDF
|
|
142
|
+
* would down-weight both `api` and `rest`, neutralising the
|
|
143
|
+
* double-count and letting `credential` outscore the noise.
|
|
144
|
+
* (2) "avoiding hardcoded credentials in source code" → secret @2
|
|
145
|
+
* vs code-review @1 — `code` partial-matches `code-review`
|
|
146
|
+
* (len>3, code-review.includes('code')=true) at half weight.
|
|
147
|
+
* secret-management's `credential` matches by partial too but
|
|
148
|
+
* the union size differs.
|
|
149
|
+
* (3) "red green refactor cycle for new features" → tdd @2 vs
|
|
150
|
+
* refactor-safely @1 — `refactor` is a full-weight intersection
|
|
151
|
+
* with both refactor-safely's `refactor` and `리팩토링` (via
|
|
152
|
+
* the refactor canonical), giving 2 hits at 1.0 each. tdd-red-
|
|
153
|
+
* green-refactor only matches the literal compound tag
|
|
154
|
+
* `red-green-refactor` (one weighted hit) — the full-weight
|
|
155
|
+
* generic `refactor` term overpowers the compound-tag specificity.
|
|
156
|
+
* (4) "writing unit tests for a function with side effects" → tdd
|
|
157
|
+
* @2 vs separation-of-concerns @1 — both solutions have a
|
|
158
|
+
* SINGLE matching tag with weighted score 0.5: separation gets
|
|
159
|
+
* `function` (COMMON_TAG, exact intersection, weight 0.5);
|
|
160
|
+
* tdd-red-green-refactor gets `tests` partial-matching `test`
|
|
161
|
+
* (len>3, partial weight 1.0 × 0.5 = 0.5). Both numerators are
|
|
162
|
+
* identical. Separation wins because the `function` tag co-occurs
|
|
163
|
+
* in both promptTags and solution.tags, shrinking its Jaccard
|
|
164
|
+
* union by one element vs tdd's — a 1-element union-size
|
|
165
|
+
* advantage drives the entire ranking. starter-dependency-
|
|
166
|
+
* injection is *not* in top-5 despite having `testing`/`mock`/
|
|
167
|
+
* `dependency` tags (`tests` does not partial-match `testing`
|
|
168
|
+
* — neither is a substring of the other), so listing `di` in
|
|
169
|
+
* expectAnyOf is purely defensive recall, not a live candidate.
|
|
170
|
+
* T4 BM25 with proper length normalization would attack the
|
|
171
|
+
* union-size tie-breaker more rigorously than current Jaccard.
|
|
172
|
+
* * paraphrase mrrAt5 stays at 1.0: all 6 added Korean paraphrases
|
|
173
|
+
* rank @1 (the originally hard "테스트 먼저 작성하고 리팩토링" is
|
|
174
|
+
* documented in the fixture as legitimately matching either tdd
|
|
175
|
+
* OR refactor-safely, since starter-refactor-safely's README also
|
|
176
|
+
* covers test-first workflows — both are defensible answers).
|
|
177
|
+
* * negativeAnyResultRate 0.1 → 0.357: 4 added tricky negatives all
|
|
178
|
+
* trigger false positives via single common dev-adjacent words —
|
|
179
|
+
* "performance review meeting notes" → caching (matches
|
|
180
|
+
* `performance`), "system architecture overview document" →
|
|
181
|
+
* separation-of-concerns (matches `architecture`), "database backup
|
|
182
|
+
* recovery procedure" → n-plus-one-queries (matches `database`,
|
|
183
|
+
* `query`, `데이터베이스`), "validation of insurance claims" →
|
|
184
|
+
* error-handling (matches `validation`).
|
|
185
|
+
* The original Round 3 plan staged these for T4 (BM25 + IDF). T4 was
|
|
186
|
+
* EMPIRICALLY SKIPPED on 2026-04-08 — see
|
|
187
|
+
* `docs/plans/2026-04-08-t4-bm25-skip-adr.md` for the full decision
|
|
188
|
+
* record. Summary: BM25 prototypes (naive, hybrid Jaccard×IDF,
|
|
189
|
+
* precision filter, soft penalty) all matched or underperformed the
|
|
190
|
+
* current scorer on every metric. The starter corpus (N=15) is too
|
|
191
|
+
* small for IDF to be informative, and the false positives are
|
|
192
|
+
* semantic ("performance" is both a dev tag and an English noun) — not
|
|
193
|
+
* statistical, so no frequency-based weighting can fix them. The real
|
|
194
|
+
* follow-up candidates are tokenizer fix for compound tags, an n-gram
|
|
195
|
+
* phrase matcher, and corpus growth — all deferred to Round 4 per the
|
|
196
|
+
* ADR.
|
|
197
|
+
*
|
|
198
|
+
* - v3 (2026-04-08, fixture v2 + R4-T1 compound-tag fix): 1.0 / 0.986 / 0.0 / 0.357
|
|
199
|
+
* R4-T1 added `expandCompoundTags` (solution-side) and
|
|
200
|
+
* `expandQueryBigrams` (query-side) so hyphenated solution tags like
|
|
201
|
+
* `api-key`, `code-review`, `red-green-refactor` participate in direct
|
|
202
|
+
* intersection rather than relying on the half-weight partialMatches
|
|
203
|
+
* fallback. positive `mrrAt5` improved 0.959 → 0.981 (+0.022). 2 of
|
|
204
|
+
* the 4 v2 hard positive cases were resolved (`managing api keys and
|
|
205
|
+
* credentials safely` and `red green refactor cycle for new features`
|
|
206
|
+
* now rank @1). The remaining 2 (`avoiding hardcoded credentials …`
|
|
207
|
+
* and `writing unit tests for a function with side effects`) require
|
|
208
|
+
* R4-T2 (phrase matcher) or R4-T3 (specificity classifier) — they're
|
|
209
|
+
* about query-side English semantics, not compound-tag tokenization.
|
|
210
|
+
* `negativeAnyResultRate` is unchanged at 0.357 because R4-T1 is a
|
|
211
|
+
* ranking-quality fix, not a false-positive filter.
|
|
212
|
+
*
|
|
213
|
+
* - v4 (2026-04-08, fixture v2 + R4-T1 + R4-T2 phrase blocklist):
|
|
214
|
+
* 1.0 / 0.986 / 0.0 / 0.143
|
|
215
|
+
* R4-T2 added `phrase-blocklist.ts` with 17 curated 2-word English
|
|
216
|
+
* non-dev compounds ("performance review", "system architecture",
|
|
217
|
+
* "database backup", etc.) and a `maskBlockedTokens` step at the
|
|
218
|
+
* top of `rankCandidates` and `searchSolutions`. When a query
|
|
219
|
+
* contains a blocked phrase, the constituent tokens are removed
|
|
220
|
+
* from the prompt tag list before bigram expansion / canonical
|
|
221
|
+
* normalization runs — so the false-positive evidence is removed
|
|
222
|
+
* at the source rather than demoted in scoring.
|
|
223
|
+
*
|
|
224
|
+
* `negativeAnyResultRate` dropped 0.357 → 0.143 (3 of 5 v2 trigger
|
|
225
|
+
* negatives fully blocked):
|
|
226
|
+
* * "performance review meeting notes" — blocked via
|
|
227
|
+
* `performance review` + `meeting notes`
|
|
228
|
+
* * "system architecture overview document" — blocked via
|
|
229
|
+
* `system architecture` + `overview document`
|
|
230
|
+
* * "solar system planets astronomy" — blocked via `solar system`
|
|
231
|
+
*
|
|
232
|
+
* 2 false positives remain (both deferred to R4-T3 query-side
|
|
233
|
+
* specificity classifier — the residuals share a common shape:
|
|
234
|
+
* a single dev-tag homograph survives whatever masking is applied,
|
|
235
|
+
* and the term-normalizer expansion still surfaces a false match):
|
|
236
|
+
*
|
|
237
|
+
* * "database backup recovery procedure" → error-handling-patterns:
|
|
238
|
+
* `database backup` is blocked, but the residual tokens
|
|
239
|
+
* {`recovery`, `procedure`} survive. `recovery` is in the
|
|
240
|
+
* `handling` canonical's matchTerms (intentional, for legitimate
|
|
241
|
+
* "error recovery handler" queries), so the masked query still
|
|
242
|
+
* hits `starter-error-handling-patterns` via the handling
|
|
243
|
+
* family. A 2-word `recovery procedure` blocklist entry was
|
|
244
|
+
* considered and rejected — it would silently mask legitimate
|
|
245
|
+
* dev SRE queries like "disaster recovery procedure" or
|
|
246
|
+
* "rollback recovery procedure" without a fixture-driven
|
|
247
|
+
* signal. The right fix is at the query-specificity layer
|
|
248
|
+
* (R4-T3): require ≥ 2 distinct dev-context signals before any
|
|
249
|
+
* match is returned, not at the phrase-blocklist layer.
|
|
250
|
+
*
|
|
251
|
+
* * "validation of insurance claims" → error-handling-patterns:
|
|
252
|
+
* `insurance claim` is blocked, but the residual `validation`
|
|
253
|
+
* token IS a legitimate dev tag (input-validation,
|
|
254
|
+
* error-handling-patterns both have it). Same R4-T3 target.
|
|
255
|
+
*
|
|
256
|
+
* positive/paraphrase mrrAt5 are unchanged from v3 because no
|
|
257
|
+
* legitimate dev query in the fixture contains a blocked phrase.
|
|
258
|
+
*
|
|
259
|
+
* - v5 (2026-04-08, fixture v2 + R4-T1 + R4-T2 + R4-T3 specificity guards):
|
|
260
|
+
* 1.0 / 0.986 / 0.0 / 0.000
|
|
261
|
+
* R4-T3 added two narrow precision rules at the ORCHESTRATION LAYER —
|
|
262
|
+
* NOT inside `calculateRelevance` (which remains a pure scoring
|
|
263
|
+
* function for test symmetry). The rules are implemented as the
|
|
264
|
+
* exported helper `shouldRejectByR4T3Rules(promptTags, matchedTags)`
|
|
265
|
+
* and called from both `rankCandidates` (hook path) and
|
|
266
|
+
* `searchSolutions` (MCP path) right after the per-solution
|
|
267
|
+
* `calculateRelevance` call:
|
|
268
|
+
* (Rule A) single-token query AND single-tag match → reject;
|
|
269
|
+
* (Rule B) single-tag match with no literal hit in the prompt
|
|
270
|
+
* (verbatim match, or substring partial length > 3, or
|
|
271
|
+
* shared prefix ≥ 4 for morphological stems) → reject.
|
|
272
|
+
* Both rules are scoped narrowly enough to fix exactly the 2 R4-T2
|
|
273
|
+
* residuals without recall regression — every fixture positive and
|
|
274
|
+
* paraphrase still ranks identically:
|
|
275
|
+
* * "validation of insurance claims" → masked to `[validation]`
|
|
276
|
+
* (length 1) with single-tag match `validation` → Rule A reject.
|
|
277
|
+
* * "database backup recovery procedure" → masked to
|
|
278
|
+
* `[recovery, procedure]` with single-tag match `handling`
|
|
279
|
+
* (zero literal hit; `handling` is reached via the `recovery`
|
|
280
|
+
* canonical-family expansion in term-normalizer) → Rule B reject.
|
|
281
|
+
* `negativeAnyResultRate` is now 0.000 — every fixture v2 negative
|
|
282
|
+
* produces zero candidates. positive/paraphrase metrics unchanged
|
|
283
|
+
* from v4 because no fixture positive matches the (single-token AND
|
|
284
|
+
* single-tag) or (all-expansion AND single-tag) shape.
|
|
285
|
+
*
|
|
286
|
+
* Escape hatch: identifier-boost evidence (hook path) or name-match
|
|
287
|
+
* evidence (MCP path) BYPASSES the R4-T3 rules. A candidate with
|
|
288
|
+
* even a single weak tag match plus an identifier hit still
|
|
289
|
+
* surfaces — the precision rules only fire when the candidate's
|
|
290
|
+
* entire evidence pool is a single ambiguous tag.
|
|
291
|
+
*
|
|
292
|
+
* Defensive precision note: Rule B's "shared prefix ≥ 4"
|
|
293
|
+
* morphological check is currently NOT fixture-driven (no fixture
|
|
294
|
+
* query masks down to the `caching/cache`-style morphological gap).
|
|
295
|
+
* It exists as a pre-emptive fix against silently rejecting
|
|
296
|
+
* legitimate future queries where the term-normalizer synonym
|
|
297
|
+
* expansion is the only bridge between the query token and the
|
|
298
|
+
* solution tag. If a production query surfaces a case the prefix
|
|
299
|
+
* check misses, extend it (e.g. by lowering the threshold or
|
|
300
|
+
* adding a Levenshtein-1 check) rather than removing it.
|
|
301
|
+
*
|
|
302
|
+
* Known matcher quirks (separate from the T4 BM25 investigation):
|
|
303
|
+
* - `term-normalizer.ts` `error` canonical contains `debug` as a matchTerm
|
|
304
|
+
* (intentional for `bug → error` recall), which causes any prompt
|
|
305
|
+
* containing `error` to expand to `debug` and over-rank
|
|
306
|
+
* `starter-debugging-systematic` on otherwise unrelated queries. This
|
|
307
|
+
* is why `async await error propagation` could not be added as a hard
|
|
308
|
+
* case — the matcher returns debugging-systematic at #1, which is
|
|
309
|
+
* defensible-but-noisy. The fix is at the normalizer level (split
|
|
310
|
+
* `debug` out of the `error` family or remove the `error → debug`
|
|
311
|
+
* edge entirely) and is queued as a Round 4 follow-up. T4 BM25 was
|
|
312
|
+
* considered as a partial mitigation but the T4 skip ADR (referenced
|
|
313
|
+
* in the Round 3 outcome paragraph above) shows it does not help.
|
|
314
|
+
*
|
|
315
|
+
* Long-tail caveat:
|
|
316
|
+
* - `"trying to handle authentication errors gracefully when our backend
|
|
317
|
+
* api returns inconsistent response formats from different
|
|
318
|
+
* microservices"` is a 17-word query intentionally added to exercise
|
|
319
|
+
* long-tail behaviour. Currently PASS@1. Originally flagged as BM25
|
|
320
|
+
* length-normalization sensitive, but since T4 BM25 was skipped this
|
|
321
|
+
* caveat is now informational only — no length-norm code path is
|
|
322
|
+
* planned in Round 3.
|
|
323
|
+
*
|
|
324
|
+
* If a PR legitimately improves a metric, update this constant in the same
|
|
325
|
+
* commit so future PRs guard against the new floor.
|
|
326
|
+
*/
|
|
327
|
+
export declare const ROUND3_BASELINE: EvalResult;
|
|
328
|
+
/** Maximum allowed absolute regression per metric. 5% is tight enough to catch
|
|
329
|
+
* ~3-4 query regressions in a 69-query combined bucket (positive+paraphrase)
|
|
330
|
+
* but lenient enough that a single fixture edit won't spuriously fail the
|
|
331
|
+
* guard. */
|
|
332
|
+
export declare const BASELINE_TOLERANCE = 0.05;
|
|
333
|
+
/**
|
|
334
|
+
* Test/diagnostic helper: evaluate one query against a fixture solution set
|
|
335
|
+
* and return the top-5 ranked candidates with their relevance + matched tags.
|
|
336
|
+
*
|
|
337
|
+
* Exists so per-query regression tests (e.g. the R4-T1 hard-positive guards
|
|
338
|
+
* in `tests/solution-matcher-eval.test.ts`) can assert specific ranking
|
|
339
|
+
* outcomes without scraping aggregate metrics. Wraps `rankCandidates` so
|
|
340
|
+
* the test path stays in sync with the production ranker.
|
|
341
|
+
*
|
|
342
|
+
* Returns the same shape as `rankCandidates` minus the generic carrier:
|
|
343
|
+
* `{name, relevance, matchedTags}`. Use the names to assert "expected
|
|
344
|
+
* solution at rank 1".
|
|
345
|
+
*/
|
|
346
|
+
export declare function evaluateQuery(query: string, solutions: readonly EvalSolution[]): Array<{
|
|
347
|
+
name: string;
|
|
348
|
+
relevance: number;
|
|
349
|
+
matchedTags: string[];
|
|
350
|
+
}>;
|
|
351
|
+
/**
|
|
352
|
+
* Evaluate the current matcher against a labeled fixture and return IR
|
|
353
|
+
* metrics. This is the Round 3 baseline — each downstream PR (T2/T3/T4) must
|
|
354
|
+
* not regress any of the thresholds asserted in `solution-matcher-eval.test.ts`.
|
|
355
|
+
*
|
|
356
|
+
* Uses `rankCandidates` (shared with `matchSolutions`) so the evaluator can't
|
|
357
|
+
* silently drift from production ranking behaviour.
|
|
358
|
+
*
|
|
359
|
+
* Metrics are reported both aggregated (positive ∪ paraphrase) and per-bucket,
|
|
360
|
+
* so paraphrase-only regressions surface in `byBucket.paraphrase` even if the
|
|
361
|
+
* aggregate looks fine.
|
|
362
|
+
*/
|
|
363
|
+
export declare function evaluateSolutionMatcher(fixture: EvalFixture): EvalResult;
|
|
364
|
+
export declare function matchSolutions(prompt: string, scope: ScopeInfo, cwd: string): SolutionMatch[];
|