@wooojin/forgen 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +20 -0
- package/CHANGELOG.md +353 -0
- package/CONTRIBUTING.md +98 -0
- package/LICENSE +21 -0
- package/README.ja.md +469 -0
- package/README.ko.md +469 -0
- package/README.md +483 -0
- package/README.zh.md +469 -0
- package/agents/analyst.md +98 -0
- package/agents/architect.md +62 -0
- package/agents/code-reviewer.md +120 -0
- package/agents/code-simplifier.md +197 -0
- package/agents/critic.md +70 -0
- package/agents/debugger.md +117 -0
- package/agents/designer.md +131 -0
- package/agents/executor.md +54 -0
- package/agents/explore.md +145 -0
- package/agents/git-master.md +212 -0
- package/agents/performance-reviewer.md +172 -0
- package/agents/planner.md +29 -0
- package/agents/qa-tester.md +158 -0
- package/agents/refactoring-expert.md +168 -0
- package/agents/scientist.md +144 -0
- package/agents/security-reviewer.md +137 -0
- package/agents/test-engineer.md +153 -0
- package/agents/verifier.md +133 -0
- package/agents/writer.md +184 -0
- package/commands/api-design.md +268 -0
- package/commands/architecture-decision.md +314 -0
- package/commands/ci-cd.md +270 -0
- package/commands/code-review.md +233 -0
- package/commands/compound.md +117 -0
- package/commands/database.md +263 -0
- package/commands/debug-detective.md +99 -0
- package/commands/docker.md +274 -0
- package/commands/documentation.md +276 -0
- package/commands/ecomode.md +51 -0
- package/commands/frontend.md +271 -0
- package/commands/git-master.md +90 -0
- package/commands/incident-response.md +292 -0
- package/commands/migrate.md +101 -0
- package/commands/performance.md +288 -0
- package/commands/refactor.md +105 -0
- package/commands/security-review.md +288 -0
- package/commands/tdd.md +183 -0
- package/commands/testing-strategy.md +265 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +295 -0
- package/dist/core/auto-compound-runner.d.ts +12 -0
- package/dist/core/auto-compound-runner.js +460 -0
- package/dist/core/config-hooks.d.ts +10 -0
- package/dist/core/config-hooks.js +112 -0
- package/dist/core/config-injector.d.ts +50 -0
- package/dist/core/config-injector.js +455 -0
- package/dist/core/doctor.d.ts +1 -0
- package/dist/core/doctor.js +163 -0
- package/dist/core/errors.d.ts +81 -0
- package/dist/core/errors.js +133 -0
- package/dist/core/global-config.d.ts +43 -0
- package/dist/core/global-config.js +25 -0
- package/dist/core/harness.d.ts +24 -0
- package/dist/core/harness.js +621 -0
- package/dist/core/init.d.ts +7 -0
- package/dist/core/init.js +37 -0
- package/dist/core/inspect-cli.d.ts +7 -0
- package/dist/core/inspect-cli.js +47 -0
- package/dist/core/legacy-detector.d.ts +33 -0
- package/dist/core/legacy-detector.js +66 -0
- package/dist/core/logger.d.ts +34 -0
- package/dist/core/logger.js +121 -0
- package/dist/core/mcp-config.d.ts +44 -0
- package/dist/core/mcp-config.js +177 -0
- package/dist/core/notepad.d.ts +31 -0
- package/dist/core/notepad.js +88 -0
- package/dist/core/paths.d.ts +85 -0
- package/dist/core/paths.js +101 -0
- package/dist/core/plugin-detector.d.ts +44 -0
- package/dist/core/plugin-detector.js +226 -0
- package/dist/core/runtime-detector.d.ts +8 -0
- package/dist/core/runtime-detector.js +49 -0
- package/dist/core/scope-resolver.d.ts +8 -0
- package/dist/core/scope-resolver.js +45 -0
- package/dist/core/session-logger.d.ts +6 -0
- package/dist/core/session-logger.js +111 -0
- package/dist/core/session-store.d.ts +28 -0
- package/dist/core/session-store.js +218 -0
- package/dist/core/settings-lock.d.ts +18 -0
- package/dist/core/settings-lock.js +125 -0
- package/dist/core/spawn.d.ts +3 -0
- package/dist/core/spawn.js +135 -0
- package/dist/core/types.d.ts +108 -0
- package/dist/core/types.js +1 -0
- package/dist/core/uninstall.d.ts +4 -0
- package/dist/core/uninstall.js +307 -0
- package/dist/core/v1-bootstrap.d.ts +26 -0
- package/dist/core/v1-bootstrap.js +155 -0
- package/dist/engine/compound-cli.d.ts +24 -0
- package/dist/engine/compound-cli.js +250 -0
- package/dist/engine/compound-extractor.d.ts +68 -0
- package/dist/engine/compound-extractor.js +860 -0
- package/dist/engine/compound-lifecycle.d.ts +32 -0
- package/dist/engine/compound-lifecycle.js +305 -0
- package/dist/engine/compound-loop.d.ts +32 -0
- package/dist/engine/compound-loop.js +511 -0
- package/dist/engine/match-eval-log.d.ts +139 -0
- package/dist/engine/match-eval-log.js +270 -0
- package/dist/engine/phrase-blocklist.d.ts +119 -0
- package/dist/engine/phrase-blocklist.js +208 -0
- package/dist/engine/skill-promoter.d.ts +20 -0
- package/dist/engine/skill-promoter.js +115 -0
- package/dist/engine/solution-format.d.ts +160 -0
- package/dist/engine/solution-format.js +432 -0
- package/dist/engine/solution-index.d.ts +13 -0
- package/dist/engine/solution-index.js +252 -0
- package/dist/engine/solution-matcher.d.ts +364 -0
- package/dist/engine/solution-matcher.js +656 -0
- package/dist/engine/solution-writer.d.ts +76 -0
- package/dist/engine/solution-writer.js +157 -0
- package/dist/engine/term-matcher.d.ts +81 -0
- package/dist/engine/term-matcher.js +268 -0
- package/dist/engine/term-normalizer.d.ts +116 -0
- package/dist/engine/term-normalizer.js +171 -0
- package/dist/fgx.d.ts +6 -0
- package/dist/fgx.js +42 -0
- package/dist/forge/cli.d.ts +11 -0
- package/dist/forge/cli.js +100 -0
- package/dist/forge/evidence-processor.d.ts +21 -0
- package/dist/forge/evidence-processor.js +87 -0
- package/dist/forge/mismatch-detector.d.ts +44 -0
- package/dist/forge/mismatch-detector.js +83 -0
- package/dist/forge/onboarding-cli.d.ts +6 -0
- package/dist/forge/onboarding-cli.js +89 -0
- package/dist/forge/onboarding.d.ts +25 -0
- package/dist/forge/onboarding.js +122 -0
- package/dist/hooks/compound-reflection.d.ts +45 -0
- package/dist/hooks/compound-reflection.js +82 -0
- package/dist/hooks/context-guard.d.ts +24 -0
- package/dist/hooks/context-guard.js +156 -0
- package/dist/hooks/dangerous-patterns.json +18 -0
- package/dist/hooks/db-guard.d.ts +17 -0
- package/dist/hooks/db-guard.js +105 -0
- package/dist/hooks/hook-config.d.ts +29 -0
- package/dist/hooks/hook-config.js +92 -0
- package/dist/hooks/hook-registry.d.ts +43 -0
- package/dist/hooks/hook-registry.js +31 -0
- package/dist/hooks/hooks-generator.d.ts +49 -0
- package/dist/hooks/hooks-generator.js +99 -0
- package/dist/hooks/intent-classifier.d.ts +12 -0
- package/dist/hooks/intent-classifier.js +62 -0
- package/dist/hooks/keyword-detector.d.ts +25 -0
- package/dist/hooks/keyword-detector.js +389 -0
- package/dist/hooks/notepad-injector.d.ts +18 -0
- package/dist/hooks/notepad-injector.js +51 -0
- package/dist/hooks/permission-handler.d.ts +14 -0
- package/dist/hooks/permission-handler.js +114 -0
- package/dist/hooks/post-tool-failure.d.ts +11 -0
- package/dist/hooks/post-tool-failure.js +118 -0
- package/dist/hooks/post-tool-handlers.d.ts +17 -0
- package/dist/hooks/post-tool-handlers.js +115 -0
- package/dist/hooks/post-tool-use.d.ts +29 -0
- package/dist/hooks/post-tool-use.js +151 -0
- package/dist/hooks/pre-compact.d.ts +10 -0
- package/dist/hooks/pre-compact.js +165 -0
- package/dist/hooks/pre-tool-use.d.ts +31 -0
- package/dist/hooks/pre-tool-use.js +325 -0
- package/dist/hooks/prompt-injection-filter.d.ts +56 -0
- package/dist/hooks/prompt-injection-filter.js +287 -0
- package/dist/hooks/rate-limiter.d.ts +21 -0
- package/dist/hooks/rate-limiter.js +86 -0
- package/dist/hooks/secret-filter.d.ts +14 -0
- package/dist/hooks/secret-filter.js +65 -0
- package/dist/hooks/session-recovery.d.ts +27 -0
- package/dist/hooks/session-recovery.js +406 -0
- package/dist/hooks/shared/atomic-write.d.ts +41 -0
- package/dist/hooks/shared/atomic-write.js +148 -0
- package/dist/hooks/shared/context-budget.d.ts +37 -0
- package/dist/hooks/shared/context-budget.js +45 -0
- package/dist/hooks/shared/file-lock.d.ts +56 -0
- package/dist/hooks/shared/file-lock.js +253 -0
- package/dist/hooks/shared/hook-response.d.ts +33 -0
- package/dist/hooks/shared/hook-response.js +62 -0
- package/dist/hooks/shared/injection-caps.d.ts +39 -0
- package/dist/hooks/shared/injection-caps.js +52 -0
- package/dist/hooks/shared/plugin-signal.d.ts +23 -0
- package/dist/hooks/shared/plugin-signal.js +104 -0
- package/dist/hooks/shared/read-stdin.d.ts +8 -0
- package/dist/hooks/shared/read-stdin.js +63 -0
- package/dist/hooks/shared/sanitize-id.d.ts +7 -0
- package/dist/hooks/shared/sanitize-id.js +9 -0
- package/dist/hooks/shared/sanitize.d.ts +7 -0
- package/dist/hooks/shared/sanitize.js +22 -0
- package/dist/hooks/skill-injector.d.ts +38 -0
- package/dist/hooks/skill-injector.js +285 -0
- package/dist/hooks/slop-detector.d.ts +18 -0
- package/dist/hooks/slop-detector.js +93 -0
- package/dist/hooks/solution-injector.d.ts +58 -0
- package/dist/hooks/solution-injector.js +436 -0
- package/dist/hooks/subagent-tracker.d.ts +10 -0
- package/dist/hooks/subagent-tracker.js +90 -0
- package/dist/i18n/index.d.ts +43 -0
- package/dist/i18n/index.js +224 -0
- package/dist/lib.d.ts +14 -0
- package/dist/lib.js +14 -0
- package/dist/mcp/server.d.ts +8 -0
- package/dist/mcp/server.js +40 -0
- package/dist/mcp/solution-reader.d.ts +90 -0
- package/dist/mcp/solution-reader.js +273 -0
- package/dist/mcp/tools.d.ts +16 -0
- package/dist/mcp/tools.js +302 -0
- package/dist/preset/facet-catalog.d.ts +17 -0
- package/dist/preset/facet-catalog.js +46 -0
- package/dist/preset/preset-manager.d.ts +31 -0
- package/dist/preset/preset-manager.js +111 -0
- package/dist/renderer/inspect-renderer.d.ts +11 -0
- package/dist/renderer/inspect-renderer.js +123 -0
- package/dist/renderer/rule-renderer.d.ts +18 -0
- package/dist/renderer/rule-renderer.js +159 -0
- package/dist/store/evidence-store.d.ts +23 -0
- package/dist/store/evidence-store.js +58 -0
- package/dist/store/profile-store.d.ts +12 -0
- package/dist/store/profile-store.js +53 -0
- package/dist/store/recommendation-store.d.ts +22 -0
- package/dist/store/recommendation-store.js +64 -0
- package/dist/store/rule-store.d.ts +22 -0
- package/dist/store/rule-store.js +62 -0
- package/dist/store/session-state-store.d.ts +11 -0
- package/dist/store/session-state-store.js +44 -0
- package/dist/store/types.d.ts +159 -0
- package/dist/store/types.js +7 -0
- package/hooks/hook-registry.json +21 -0
- package/hooks/hooks.json +185 -0
- package/package.json +89 -0
- package/plugin.json +20 -0
- package/scripts/postinstall.js +826 -0
- package/skills/api-design/SKILL.md +262 -0
- package/skills/architecture-decision/SKILL.md +309 -0
- package/skills/ci-cd/SKILL.md +264 -0
- package/skills/code-review/SKILL.md +228 -0
- package/skills/compound/SKILL.md +101 -0
- package/skills/database/SKILL.md +257 -0
- package/skills/debug-detective/SKILL.md +95 -0
- package/skills/docker/SKILL.md +268 -0
- package/skills/documentation/SKILL.md +270 -0
- package/skills/ecomode/SKILL.md +46 -0
- package/skills/frontend/SKILL.md +265 -0
- package/skills/git-master/SKILL.md +86 -0
- package/skills/incident-response/SKILL.md +286 -0
- package/skills/migrate/SKILL.md +96 -0
- package/skills/performance/SKILL.md +282 -0
- package/skills/refactor/SKILL.md +100 -0
- package/skills/security-review/SKILL.md +282 -0
- package/skills/tdd/SKILL.md +178 -0
- package/skills/testing-strategy/SKILL.md +260 -0
- package/starter-pack/solutions/starter-api-error-responses.md +37 -0
- package/starter-pack/solutions/starter-async-patterns.md +40 -0
- package/starter-pack/solutions/starter-caching-strategy.md +40 -0
- package/starter-pack/solutions/starter-code-review-checklist.md +39 -0
- package/starter-pack/solutions/starter-debugging-systematic.md +40 -0
- package/starter-pack/solutions/starter-dependency-injection.md +40 -0
- package/starter-pack/solutions/starter-error-handling-patterns.md +38 -0
- package/starter-pack/solutions/starter-git-atomic-commits.md +36 -0
- package/starter-pack/solutions/starter-input-validation.md +40 -0
- package/starter-pack/solutions/starter-n-plus-one-queries.md +37 -0
- package/starter-pack/solutions/starter-refactor-safely.md +38 -0
- package/starter-pack/solutions/starter-secret-management.md +37 -0
- package/starter-pack/solutions/starter-separation-of-concerns.md +36 -0
- package/starter-pack/solutions/starter-tdd-red-green-refactor.md +40 -0
- package/starter-pack/solutions/starter-typescript-strict-types.md +39 -0
|
@@ -0,0 +1,656 @@
|
|
|
1
|
+
import * as path from 'node:path';
|
|
2
|
+
import { ME_SOLUTIONS, PACKS_DIR } from '../core/paths.js';
|
|
3
|
+
import { extractTags, expandCompoundTags, expandQueryBigrams } from './solution-format.js';
|
|
4
|
+
import { getOrBuildIndex } from './solution-index.js';
|
|
5
|
+
import { defaultNormalizer } from './term-normalizer.js';
|
|
6
|
+
import { maskBlockedTokens } from './phrase-blocklist.js';
|
|
7
|
+
// ── Synonym expansion (delegates to term-normalizer) ──
|
|
8
|
+
//
|
|
9
|
+
// The old `SYNONYM_MAP` + `expandTagsWithSynonyms` pair had two problems:
|
|
10
|
+
// 1. The reverse-lookup `Object.entries(SYNONYM_MAP).filter(v => v.includes(tag))`
|
|
11
|
+
// was O(N) per term and ran once per (query, solution) pair — quadratic
|
|
12
|
+
// on the solution count.
|
|
13
|
+
// 2. Korean↔English cross-mapping was maintained as two separate map entries
|
|
14
|
+
// that drifted (fixed in 5.1.2 but fragile).
|
|
15
|
+
//
|
|
16
|
+
// Both are now handled by `src/engine/term-normalizer.ts`. See that file for
|
|
17
|
+
// the canonical registry (`DEFAULT_MATCH_TERMS`) and the `buildTermNormalizer`
|
|
18
|
+
// implementation. Reverse lookup is an O(1) `Map<term, canonicals>` fetch.
|
|
19
|
+
//
|
|
20
|
+
// The export below is kept as a thin backwards-compatible wrapper so
|
|
21
|
+
// downstream callers (and the existing `synonym-tfidf.test.ts` spot-checks)
|
|
22
|
+
// continue to work — but the hot path in this module now passes
|
|
23
|
+
// pre-normalized query tags via the new `calculateRelevance` options arg
|
|
24
|
+
// and skips the wrapper entirely.
|
|
25
|
+
/**
 * Expand a tag list with canonical synonyms.
 *
 * @deprecated Use `defaultNormalizer.normalizeTerms` from
 * `./term-normalizer.js` directly. Kept as a thin wrapper for the existing
 * `synonym-tfidf.test.ts` and any external consumers.
 * @param {string[]} tags - raw tags to normalize/expand.
 * @returns {string[]} the canonical-expanded tag list.
 */
export function expandTagsWithSynonyms(tags) {
    // Delegate straight to the shared normalizer; no extra logic lives here.
    const expanded = defaultNormalizer.normalizeTerms(tags);
    return expanded;
}
|
|
33
|
+
// ── TF-IDF weighting for common tags ──
|
|
34
|
+
/** High-frequency tags that should be weighted lower */
|
|
35
|
+
const COMMON_TAGS = new Set([
|
|
36
|
+
'typescript', 'ts', 'javascript', 'js', 'fix', 'update', 'add', 'change',
|
|
37
|
+
'file', 'code', 'function', 'import', 'export', 'error', 'type', 'string',
|
|
38
|
+
'number', 'object', 'array', 'return', 'const', 'class', 'module',
|
|
39
|
+
'코드', '파일', '함수', '수정', '추가', '변경', '에러', '타입',
|
|
40
|
+
]);
|
|
41
|
+
/** Apply IDF-like weight: common tags get reduced weight */
|
|
42
|
+
export function tagWeight(tag) {
|
|
43
|
+
return COMMON_TAGS.has(tag) ? 0.5 : 1.0;
|
|
44
|
+
}
|
|
45
|
+
export function calculateRelevance(promptOrTags, keywordsOrTags, confidence, options) {
|
|
46
|
+
if (typeof promptOrTags === 'string') {
|
|
47
|
+
// Legacy mode: substring matching for backwards compatibility.
|
|
48
|
+
// Not a hot path — only hit by the (old) solution-matcher.test.ts cases.
|
|
49
|
+
const promptTags = extractTags(promptOrTags);
|
|
50
|
+
const intersection = keywordsOrTags.filter(kw => promptTags.some(pt => pt === kw || (pt.length > 3 && kw.length > 3 && (pt.startsWith(kw) || kw.startsWith(pt)))));
|
|
51
|
+
return Math.min(1, intersection.length / Math.max(promptTags.length * 0.5, 1));
|
|
52
|
+
}
|
|
53
|
+
// v3 mode: tag matching with synonym expansion + TF-IDF weighting.
|
|
54
|
+
//
|
|
55
|
+
// T2: the synonym expansion is now a hash-indexed lookup via
|
|
56
|
+
// `defaultNormalizer.normalizeTerms` (see term-normalizer.ts). Callers in
|
|
57
|
+
// the hot path pre-compute the expansion once per query and pass it via
|
|
58
|
+
// `options.normalizedPromptTags`, so this function no longer repeats the
|
|
59
|
+
// work per solution.
|
|
60
|
+
const expandedPromptTags = options?.normalizedPromptTags
|
|
61
|
+
?? defaultNormalizer.normalizeTerms(promptOrTags);
|
|
62
|
+
// R4-T1: when the caller supplies a compound-expanded solution tag set,
|
|
63
|
+
// intersection and partial matching run against the expanded set (so
|
|
64
|
+
// `api-key` matches `api`/`key` queries via the split parts), but the
|
|
65
|
+
// Jaccard union denominator below still uses the RAW `keywordsOrTags`
|
|
66
|
+
// for normalization stability.
|
|
67
|
+
const matchTags = options?.solutionTagsExpanded ?? keywordsOrTags;
|
|
68
|
+
const intersection = matchTags.filter(t => expandedPromptTags.includes(t));
|
|
69
|
+
// partial/substring matches for longer tags (>3 chars)
|
|
70
|
+
const partialMatches = matchTags.filter(t => t.length > 3 && !intersection.includes(t)
|
|
71
|
+
&& expandedPromptTags.some(pt => pt.length > 3 && (pt.includes(t) || t.includes(pt))));
|
|
72
|
+
// Apply TF-IDF weighting: common tags count less
|
|
73
|
+
const weightedMatched = intersection.reduce((sum, t) => sum + tagWeight(t), 0)
|
|
74
|
+
+ partialMatches.reduce((sum, t) => sum + tagWeight(t) * 0.5, 0);
|
|
75
|
+
// 완화된 임계값: 가중 점수 0.5 이상이면 후보
|
|
76
|
+
if (weightedMatched < 0.5)
|
|
77
|
+
return { relevance: 0, matchedTags: [] };
|
|
78
|
+
// Jaccard-like: weighted matched / union.
|
|
79
|
+
// Union uses RAW promptTags and RAW solutionTags — not the expanded set —
|
|
80
|
+
// so that the denominator semantics are unchanged from pre-T2 behaviour.
|
|
81
|
+
// This is intentional: expanding both sides of the Jaccard would
|
|
82
|
+
// asymmetrically inflate recall and silently shift all baseline metrics.
|
|
83
|
+
// R4-T1 explicitly preserves this: `keywordsOrTags` is the raw solution
|
|
84
|
+
// tag list, not the compound-expanded `matchTags` used above.
|
|
85
|
+
const union = new Set([...promptOrTags, ...keywordsOrTags]).size;
|
|
86
|
+
const tagScore = weightedMatched / Math.max(union, 1);
|
|
87
|
+
return {
|
|
88
|
+
relevance: tagScore * (confidence ?? 1),
|
|
89
|
+
matchedTags: [...intersection, ...partialMatches],
|
|
90
|
+
};
|
|
91
|
+
}
|
|
92
|
+
// ── R4-T3: query-side specificity guards (orchestration layer) ──
|
|
93
|
+
//
|
|
94
|
+
// Two narrow precision rules applied AFTER `calculateRelevance` returns,
|
|
95
|
+
// at the orchestration layer (`rankCandidates`, `searchSolutions`).
|
|
96
|
+
// These rules fix the 2 surviving false positives from R4-T2 — the
|
|
97
|
+
// "validation of insurance claims" and "database backup recovery
|
|
98
|
+
// procedure" residuals — WITHOUT regressing any legitimate fixture
|
|
99
|
+
// positive or paraphrase.
|
|
100
|
+
//
|
|
101
|
+
// Why orchestration-level (not inside calculateRelevance):
|
|
102
|
+
// `calculateRelevance` is a pure scoring function with a stable
|
|
103
|
+
// contract: given (promptTags, solutionTags, confidence), return the
|
|
104
|
+
// relevance and the matched tag set. Several internal tests
|
|
105
|
+
// (synonym-tfidf.test.ts) call it directly with single-token inputs
|
|
106
|
+
// to verify synonym expansion in isolation. Embedding precision
|
|
107
|
+
// filters in the scoring path would break those tests AND break the
|
|
108
|
+
// semantic of "scoring is a pure function". The two rules below are
|
|
109
|
+
// policy-layer decisions about which scored candidates to surface,
|
|
110
|
+
// so they belong at the caller — not at the scorer.
|
|
111
|
+
//
|
|
112
|
+
// Rule A — single-token query AND single-tag match → reject.
|
|
113
|
+
// Rationale: a query that's been reduced to a single dev token (after
|
|
114
|
+
// R4-T2 phrase masking) is unlikely to be a real dev question. Combined
|
|
115
|
+
// with a single-tag match, this is the "validation of insurance
|
|
116
|
+
// claims" shape: masked to `[validation]`, matched a single ambiguous
|
|
117
|
+
// tag `validation` on error-handling-patterns. No legitimate fixture
|
|
118
|
+
// positive or paraphrase has both promptTags.length === 1 AND
|
|
119
|
+
// matchedTags.length === 1.
|
|
120
|
+
//
|
|
121
|
+
// Rule B — all matched tags came via SYNONYM EXPANSION (none appear
|
|
122
|
+
// literally in the prompt tokens) AND match is single-tag → reject.
|
|
123
|
+
// Rationale: the "database backup recovery procedure" shape. After
|
|
124
|
+
// R4-T2 masks `database`/`backup`, the residual tokens are `[recovery,
|
|
125
|
+
// procedure]`. The matched tag is `handling` — which appears nowhere
|
|
126
|
+
// in the query. It only matches because the term-normalizer's
|
|
127
|
+
// `handling` canonical includes `recovery` as a matchTerm (legitimate
|
|
128
|
+
// for "error recovery handler" queries). The rule rejects this
|
|
129
|
+
// expansion-only single-tag match because the query carries no
|
|
130
|
+
// LITERAL signal that the matched solution is relevant. Multi-tag
|
|
131
|
+
// expansion matches are NOT rejected — those indicate the canonical
|
|
132
|
+
// family is being hit from multiple angles ("버그 재현 시스템적으로"
|
|
133
|
+
// hits debugging-systematic via both `debug` and `debugging` — two
|
|
134
|
+
// distinct matches survive).
|
|
135
|
+
//
|
|
136
|
+
// Literal hit: a matched tag is "literal" with respect to the query if
|
|
137
|
+
// any of the following holds for some prompt token `pt`:
|
|
138
|
+
// 1. `pt === tag` (exact verbatim match in the query)
|
|
139
|
+
// 2. `pt` is a substring of `tag` or vice versa, with both length > 3
|
|
140
|
+
// (mirrors the partialMatches discovery rule in calculateRelevance —
|
|
141
|
+
// e.g., `code` (query) ↔ `code-review` (matched tag))
|
|
142
|
+
// 3. `pt` and `tag` share a common prefix of length ≥ 4 (catches
|
|
143
|
+
// morphological variants like `caching` ↔ `cache`, `cached` ↔
|
|
144
|
+
// `cache`, `documents` ↔ `document` where neither is a substring
|
|
145
|
+
// of the other but both clearly come from the same stem)
|
|
146
|
+
//
|
|
147
|
+
// Rule (3) is the defensive precision fix: without it, a query like
|
|
148
|
+
// "caching strategy" (which the term-normalizer expands `caching → cache`
|
|
149
|
+
// via the cache canonical) would have its single-tag `cache` match
|
|
150
|
+
// rejected by Rule B, even though `caching` is morphologically the same
|
|
151
|
+
// concept. The 4-char threshold is the same as the partialMatches rule
|
|
152
|
+
// to keep the literal-hit semantics consistent across the matcher.
|
|
153
|
+
//
|
|
154
|
+
// Returns true if the candidate should be rejected (caller filters
|
|
155
|
+
// it out), false if the candidate passes both rules.
|
|
156
|
+
/**
 * R4-T3 query-side specificity guards (see rationale block above).
 *
 * @param {string[]} promptTags - post-mask query tokens.
 * @param {string[]} matchedTags - tags `calculateRelevance` matched.
 * @returns {boolean} true when the candidate must be rejected by the caller.
 */
export function shouldRejectByR4T3Rules(promptTags, matchedTags) {
    const singleTagMatch = matchedTags.length === 1;
    // Rule A: a single-token query with a single-tag match carries no
    // corroborating signal — always reject.
    if (singleTagMatch && promptTags.length === 1) {
        return true;
    }
    // Multi-tag (or zero-tag) matches pass both rules unconditionally.
    if (!singleTagMatch) {
        return false;
    }
    // Rule B: a lone matched tag must be backed by a LITERAL hit among the
    // prompt tokens; expansion-only single-tag matches are rejected.
    const lone = matchedTags[0];
    if (promptTags.includes(lone)) {
        // Exact verbatim appearance in the query.
        return false;
    }
    for (const token of promptTags) {
        if (token.length <= 3 || lone.length <= 3) {
            continue;
        }
        // Substring in either direction (mirrors the partialMatches rule).
        if (token.includes(lone) || lone.includes(token)) {
            return false;
        }
        // Morphological stem: shared prefix of length ≥ 4 (caching ↔ cache).
        const bound = Math.min(token.length, lone.length);
        let shared = 0;
        while (shared < bound && token[shared] === lone[shared]) {
            shared += 1;
        }
        if (shared >= 4) {
            return false;
        }
    }
    // No literal evidence found — reject the expansion-only match.
    return true;
}
|
|
182
|
+
/**
 * Shared ranking core: tag-based relevance + identifier boost + top-5 sort.
 *
 * Single source of truth for the matcher's ranking behaviour. Both
 * `matchSolutions` (production, reads from the index) and
 * `evaluateSolutionMatcher` (bootstrap eval, reads from an in-memory fixture)
 * call through here so the eval metrics track reality — any future
 * ranking-logic change only needs to happen in one place.
 *
 * Contract:
 * - identifier boost requires `id.length >= 4` (STRONG_ID_MIN_LENGTH mirror)
 *   and substring presence in the prompt (case-insensitive).
 * - candidates with zero matched tags AND zero matched identifiers are dropped.
 * - top-5 by `relevance` descending.
 * - duplicate names are NOT deduplicated — that matches the pre-refactor
 *   `matchSolutions` behaviour (both scopes could rank). Callers that want
 *   first-wins scope precedence must dedupe on their side.
 *
 * @param {string[]} promptTags - raw query tags BEFORE phrase masking (the
 *   mask is applied here as the first pipeline step).
 * @param {string} promptLower - lowercased full prompt text; used for both
 *   phrase masking and the identifier substring check.
 * @param {Array<{tags: string[], confidence?: number, identifiers?: string[]}>} solutions
 *   - candidates to score; only `tags`, `confidence`, `identifiers` are read.
 * @returns {Array<{solution: object, relevance: number, matchedTags: string[],
 *   matchedIdentifiers: string[]}>} at most 5 candidates, relevance-descending.
 */
function rankCandidates(promptTags, promptLower, solutions) {
    // T2: normalize prompt tags ONCE per query (not once per solution).
    // Pre-T2 this expansion happened inside calculateRelevance and was
    // repeated N times for N solutions — the plan's primary hot-path win.
    //
    // R4-T2: BEFORE any expansion or normalization, mask out tokens that
    // belong to blocked English phrases ("performance review", "system
    // architecture", etc.). This is a precision filter for non-dev-context
    // false positives. The mask runs first so neither bigram expansion nor
    // canonical normalization can re-introduce a masked token via synonyms
    // or compound recovery — the masked tokens are simply removed from the
    // matching pipeline. See `phrase-blocklist.ts` for the full rationale
    // and the `maskBlockedTokens` contract.
    const maskedPromptTags = maskBlockedTokens(promptLower, promptTags);
    if (maskedPromptTags.length === 0)
        return [];
    //
    // R4-T1: also expand the prompt tags with adjacent-token bigrams BEFORE
    // running the canonical normalizer. `expandQueryBigrams` produces compound
    // forms like `api-key`, `apikey`, `api-keys`, `apikeys` from the raw
    // ['api', 'keys'] token pair, so a query "api keys" can hit a solution
    // tag `api-key` via direct intersection — without depending on the
    // partialMatches half-weight fallback. The bigram expansion is layered
    // BEFORE normalization so that `apikey → api` (via the api canonical
    // family) still works.
    //
    // Note: we intentionally do NOT use `sol.normalizedTags` (if present) for
    // the intersection. Using normalized on BOTH sides is bidirectional
    // expansion that inflates Jaccard intersection 5-10× and silently shifts
    // every baseline metric. `entry.normalizedTags` is populated by the
    // index but reserved for log explainability. If a future change uses it
    // in scoring, it must update ROUND3_BASELINE in the same PR.
    const promptTagsWithBigrams = expandQueryBigrams(maskedPromptTags);
    const normalizedPromptTags = defaultNormalizer.normalizeTerms(promptTagsWithBigrams);
    return solutions
        .map(sol => {
        // R4-T1: solution-side compound-tag expansion. `api-key` becomes
        // {api-key, api, key} so a query token `api` (from "api keys") hits
        // it directly. Computed per solution because each sol.tags is
        // independent — caching across the rank loop is not worth the
        // bookkeeping for the corpus sizes Forgen targets (N ≤ 200).
        const solTagsExpanded = expandCompoundTags(sol.tags);
        // R4-T2: pass `maskedPromptTags` (not the original `promptTags`) as
        // the first arg so the Jaccard union denominator inside
        // calculateRelevance reflects the post-mask tag set. The matching
        // step (intersection/partialMatches) already uses the masked set
        // via `normalizedPromptTags` — the union must match for score
        // semantics to stay consistent.
        const result = calculateRelevance(maskedPromptTags, sol.tags, sol.confidence, { normalizedPromptTags, solutionTagsExpanded: solTagsExpanded });
        // Compute identifier boost FIRST — independent of tag scoring so
        // R4-T3's tag-evidence precision rules below cannot silently drop
        // a candidate that has strong identifier-level evidence.
        let identifierBoost = 0;
        const matchedIdentifiers = [];
        for (const id of sol.identifiers ?? []) {
            // Each matched identifier adds a flat +0.15 to the relevance.
            if (id.length >= 4 && promptLower.includes(id.toLowerCase())) {
                identifierBoost += 0.15;
                matchedIdentifiers.push(id);
            }
        }
        // R4-T3: orchestration-layer specificity guards. Reject single-tag
        // matches that lack a corroborating signal (single-token query OR
        // all-via-expansion match). See `shouldRejectByR4T3Rules` for the
        // full rule rationale.
        //
        // Identifier evidence is the escape hatch: if the query literally
        // mentioned one of the solution's identifiers (e.g. a function or
        // file name), the R4-T3 tag-precision rules are bypassed because
        // the identifier hit is itself a strong-specificity signal. Only
        // the tag evidence is zeroed out when R4-T3 fires; the identifier
        // boost and matched identifiers are preserved, so a candidate with
        // a single weak tag match but a valid identifier still survives
        // the `matchedTags.length + matchedIdentifiers.length >= 1` filter.
        let tagRelevance = result.relevance;
        let tagMatches = result.matchedTags;
        if (matchedIdentifiers.length === 0
            && tagMatches.length > 0
            && shouldRejectByR4T3Rules(maskedPromptTags, tagMatches)) {
            tagRelevance = 0;
            tagMatches = [];
        }
        return {
            solution: sol,
            relevance: tagRelevance + identifierBoost,
            matchedTags: tagMatches,
            matchedIdentifiers,
        };
    })
        .filter(c => c.matchedTags.length + c.matchedIdentifiers.length >= 1)
        .sort((a, b) => b.relevance - a.relevance)
        .slice(0, 5);
}
|
|
292
|
+
/**
|
|
293
|
+
* Round 3 baseline metrics, recorded against the current `term-normalizer`
|
|
294
|
+
* + `calculateRelevance` + fixture `solution-match-bootstrap.json`. Used as
|
|
295
|
+
* a relative regression guard in `tests/solution-matcher-eval.test.ts` —
|
|
296
|
+
* downstream PRs must not regress any field by more than `BASELINE_TOLERANCE`.
|
|
297
|
+
*
|
|
298
|
+
* History (chronological ascending — v1 at top, latest at bottom):
|
|
299
|
+
* - v1 (2026-04-08, fixture v1, 41+10+10 queries): 1.0 / 1.0 / 0.0 / 0.1
|
|
300
|
+
* Recorded against the original 61-query fixture, all positive queries
|
|
301
|
+
* PASS@1. Indicated a measurement plateau but masked the matcher's true
|
|
302
|
+
* ranking and false-positive weaknesses because the fixture queries were
|
|
303
|
+
* too tag-aligned.
|
|
304
|
+
*
|
|
305
|
+
* - v2 (2026-04-08, fixture v2, 53+16+14 queries): 1.0 / 0.969 / 0.0 / 0.357
|
|
306
|
+
* Expanded with 12 hard positive (multi-canonical / compound-tag tug-of-
|
|
307
|
+
* war), 6 Korean subtle paraphrase, and 4 tricky negative queries. The
|
|
308
|
+
* drops are intentional and represent genuine matcher behaviour:
|
|
309
|
+
* * positive mrrAt5 1.0 → 0.959: 4 of 12 added positives rank #2-3:
|
|
310
|
+
* (1) "managing api keys and credentials safely" → secret @3 vs
|
|
311
|
+
* api-error-responses @1 — the `api` canonical in
|
|
312
|
+
* DEFAULT_MATCH_TERMS expands to {api, rest, graphql, endpoint,
|
|
313
|
+
* route}, so query `api` hits BOTH `api` AND `rest` on
|
|
314
|
+
* starter-api-error-responses (matched=['api','rest']) — a
|
|
315
|
+
* double-count numerator. starter-secret-management only scores
|
|
316
|
+
* a single weak partial match on `credential`. The compound
|
|
317
|
+
* `api-key` tag on secret-management is never reached because
|
|
318
|
+
* extractTags strips the query-side hyphen and yields
|
|
319
|
+
* ['api','keys'] (the solution-side tag remains hyphenated in
|
|
320
|
+
* the index but has no query token to intersect with). T4 IDF
|
|
321
|
+
* would down-weight both `api` and `rest`, neutralising the
|
|
322
|
+
* double-count and letting `credential` outscore the noise.
|
|
323
|
+
* (2) "avoiding hardcoded credentials in source code" → secret @2
|
|
324
|
+
* vs code-review @1 — `code` partial-matches `code-review`
|
|
325
|
+
* (len>3, code-review.includes('code')=true) at half weight.
|
|
326
|
+
* secret-management's `credential` matches by partial too but
|
|
327
|
+
* the union size differs.
|
|
328
|
+
* (3) "red green refactor cycle for new features" → tdd @2 vs
|
|
329
|
+
* refactor-safely @1 — `refactor` is a full-weight intersection
|
|
330
|
+
* with both refactor-safely's `refactor` and `리팩토링` (via
|
|
331
|
+
* the refactor canonical), giving 2 hits at 1.0 each. tdd-red-
|
|
332
|
+
* green-refactor only matches the literal compound tag
|
|
333
|
+
* `red-green-refactor` (one weighted hit) — the full-weight
|
|
334
|
+
* generic `refactor` term overpowers the compound-tag specificity.
|
|
335
|
+
* (4) "writing unit tests for a function with side effects" → tdd
|
|
336
|
+
* @2 vs separation-of-concerns @1 — both solutions have a
|
|
337
|
+
* SINGLE matching tag with weighted score 0.5: separation gets
|
|
338
|
+
* `function` (COMMON_TAG, exact intersection, weight 0.5);
|
|
339
|
+
* tdd-red-green-refactor gets `tests` partial-matching `test`
|
|
340
|
+
* (len>3, partial weight 1.0 × 0.5 = 0.5). Both numerators are
|
|
341
|
+
* identical. Separation wins because the `function` co-occurs
|
|
342
|
+
* in both promptTags and solution.tags, shrinking its Jaccard
|
|
343
|
+
* union by one element vs tdd's — a 1-element union-size
|
|
344
|
+
* advantage drives the entire ranking. starter-dependency-
|
|
345
|
+
* injection is *not* in top-5 despite having `testing`/`mock`/
|
|
346
|
+
* `dependency` tags (`tests` does not partial-match `testing`
|
|
347
|
+
* — neither is a substring of the other), so listing `di` in
|
|
348
|
+
* expectAnyOf is purely defensive recall, not a live candidate.
|
|
349
|
+
* T4 BM25 with proper length normalization would attack the
|
|
350
|
+
* union-size tie-breaker more rigorously than current Jaccard.
|
|
351
|
+
* * paraphrase mrrAt5 stays at 1.0: all 6 added Korean paraphrases
|
|
352
|
+
* rank @1 (the originally hard "테스트 먼저 작성하고 리팩토링" is
|
|
353
|
+
* documented in the fixture as legitimately matching either tdd
|
|
354
|
+
* OR refactor-safely, since starter-refactor-safely's README also
|
|
355
|
+
* covers test-first workflows — both are defensible answers).
|
|
356
|
+
* * negativeAnyResultRate 0.1 → 0.357: 4 added tricky negatives all
|
|
357
|
+
* trigger false positives via single common dev-adjacent words —
|
|
358
|
+
* "performance review meeting notes" → caching (matches
|
|
359
|
+
* `performance`), "system architecture overview document" →
|
|
360
|
+
* separation-of-concerns (matches `architecture`), "database backup
|
|
361
|
+
* recovery procedure" → n-plus-one-queries (matches `database`,
|
|
362
|
+
* `query`, `데이터베이스`), "validation of insurance claims" →
|
|
363
|
+
* error-handling (matches `validation`).
|
|
364
|
+
* The original Round 3 plan staged these for T4 (BM25 + IDF). T4 was
|
|
365
|
+
* EMPIRICALLY SKIPPED on 2026-04-08 — see
|
|
366
|
+
* `docs/plans/2026-04-08-t4-bm25-skip-adr.md` for the full decision
|
|
367
|
+
* record. Summary: BM25 prototypes (naive, hybrid Jaccard×IDF,
|
|
368
|
+
* precision filter, soft penalty) all matched or underperformed the
|
|
369
|
+
* current scorer on every metric. The starter corpus (N=15) is too
|
|
370
|
+
* small for IDF to be informative, and the false positives are
|
|
371
|
+
* semantic ("performance" is both a dev tag and an English noun) — not
|
|
372
|
+
* statistical, so no frequency-based weighting can fix them. The real
|
|
373
|
+
* follow-up candidates are tokenizer fix for compound tags, an n-gram
|
|
374
|
+
* phrase matcher, and corpus growth — all deferred to Round 4 per the
|
|
375
|
+
* ADR.
|
|
376
|
+
*
|
|
377
|
+
* - v3 (2026-04-08, fixture v2 + R4-T1 compound-tag fix): 1.0 / 0.986 / 0.0 / 0.357
|
|
378
|
+
* R4-T1 added `expandCompoundTags` (solution-side) and
|
|
379
|
+
* `expandQueryBigrams` (query-side) so hyphenated solution tags like
|
|
380
|
+
* `api-key`, `code-review`, `red-green-refactor` participate in direct
|
|
381
|
+
* intersection rather than relying on the half-weight partialMatches
|
|
382
|
+
* fallback. positive `mrrAt5` improved 0.959 → 0.981 (+0.022). 2 of
|
|
383
|
+
* the 4 v2 hard positive cases were resolved (`managing api keys and
|
|
384
|
+
* credentials safely` and `red green refactor cycle for new features`
|
|
385
|
+
* now rank @1). The remaining 2 (`avoiding hardcoded credentials …`
|
|
386
|
+
* and `writing unit tests for a function with side effects`) require
|
|
387
|
+
* R4-T2 (phrase matcher) or R4-T3 (specificity classifier) — they're
|
|
388
|
+
* about query-side English semantics, not compound-tag tokenization.
|
|
389
|
+
* `negativeAnyResultRate` is unchanged at 0.357 because R4-T1 is a
|
|
390
|
+
* ranking-quality fix, not a false-positive filter.
|
|
391
|
+
*
|
|
392
|
+
* - v4 (2026-04-08, fixture v2 + R4-T1 + R4-T2 phrase blocklist):
|
|
393
|
+
* 1.0 / 0.986 / 0.0 / 0.143
|
|
394
|
+
* R4-T2 added `phrase-blocklist.ts` with 17 curated 2-word English
|
|
395
|
+
* non-dev compounds ("performance review", "system architecture",
|
|
396
|
+
* "database backup", etc.) and a `maskBlockedTokens` step at the
|
|
397
|
+
* top of `rankCandidates` and `searchSolutions`. When a query
|
|
398
|
+
* contains a blocked phrase, the constituent tokens are removed
|
|
399
|
+
* from the prompt tag list before bigram expansion / canonical
|
|
400
|
+
* normalization runs — so the false-positive evidence is removed
|
|
401
|
+
* at the source rather than demoted in scoring.
|
|
402
|
+
*
|
|
403
|
+
* `negativeAnyResultRate` dropped 0.357 → 0.143 (3 of 5 v2 trigger
|
|
404
|
+
* negatives fully blocked):
|
|
405
|
+
* * "performance review meeting notes" — blocked via
|
|
406
|
+
* `performance review` + `meeting notes`
|
|
407
|
+
* * "system architecture overview document" — blocked via
|
|
408
|
+
* `system architecture` + `overview document`
|
|
409
|
+
* * "solar system planets astronomy" — blocked via `solar system`
|
|
410
|
+
*
|
|
411
|
+
* 2 false positives remain (both deferred to R4-T3 query-side
|
|
412
|
+
* specificity classifier — the residuals share a common shape:
|
|
413
|
+
* a single dev-tag homograph survives whatever masking is applied,
|
|
414
|
+
* and the term-normalizer expansion still surfaces a false match):
|
|
415
|
+
*
|
|
416
|
+
* * "database backup recovery procedure" → error-handling-patterns:
|
|
417
|
+
* `database backup` is blocked, but the residual tokens
|
|
418
|
+
* {`recovery`, `procedure`} survive. `recovery` is in the
|
|
419
|
+
* `handling` canonical's matchTerms (intentional, for legitimate
|
|
420
|
+
* "error recovery handler" queries), so the masked query still
|
|
421
|
+
* hits `starter-error-handling-patterns` via the handling
|
|
422
|
+
* family. A 3-word `recovery procedure` blocklist entry was
|
|
423
|
+
* considered and rejected — it would silently mask legitimate
|
|
424
|
+
* dev SRE queries like "disaster recovery procedure" or
|
|
425
|
+
* "rollback recovery procedure" without a fixture-driven
|
|
426
|
+
* signal. The right fix is at the query-specificity layer
|
|
427
|
+
* (R4-T3): require ≥ 2 distinct dev-context signals before any
|
|
428
|
+
* match is returned, not at the phrase-blocklist layer.
|
|
429
|
+
*
|
|
430
|
+
* * "validation of insurance claims" → error-handling-patterns:
|
|
431
|
+
* `insurance claim` is blocked, but the residual `validation`
|
|
432
|
+
* token IS a legitimate dev tag (input-validation,
|
|
433
|
+
* error-handling-patterns both have it). Same R4-T3 target.
|
|
434
|
+
*
|
|
435
|
+
* positive/paraphrase mrrAt5 are unchanged from v3 because no
|
|
436
|
+
* legitimate dev query in the fixture contains a blocked phrase.
|
|
437
|
+
*
|
|
438
|
+
* - v5 (2026-04-08, fixture v2 + R4-T1 + R4-T2 + R4-T3 specificity guards):
|
|
439
|
+
* 1.0 / 0.986 / 0.0 / 0.000
|
|
440
|
+
* R4-T3 added two narrow precision rules at the ORCHESTRATION LAYER —
|
|
441
|
+
* NOT inside `calculateRelevance` (which remains a pure scoring
|
|
442
|
+
* function for test symmetry). The rules are implemented as the
|
|
443
|
+
* exported helper `shouldRejectByR4T3Rules(promptTags, matchedTags)`
|
|
444
|
+
* and called from both `rankCandidates` (hook path) and
|
|
445
|
+
* `searchSolutions` (MCP path) right after the per-solution
|
|
446
|
+
* `calculateRelevance` call:
|
|
447
|
+
* (Rule A) single-token query AND single-tag match → reject;
|
|
448
|
+
* (Rule B) single-tag match with no literal hit in the prompt
|
|
449
|
+
* (verbatim match, or substring partial length > 3, or
|
|
450
|
+
* shared prefix ≥ 4 for morphological stems) → reject.
|
|
451
|
+
* Both rules are scoped narrowly enough to fix exactly the 2 R4-T2
|
|
452
|
+
* residuals without recall regression — every fixture positive and
|
|
453
|
+
* paraphrase still ranks identically:
|
|
454
|
+
* * "validation of insurance claims" → masked to `[validation]`
|
|
455
|
+
* (length 1) with single-tag match `validation` → Rule A reject.
|
|
456
|
+
* * "database backup recovery procedure" → masked to
|
|
457
|
+
* `[recovery, procedure]` with single-tag match `handling`
|
|
458
|
+
* (zero literal hit; `handling` is reached via the `recovery`
|
|
459
|
+
* canonical-family expansion in term-normalizer) → Rule B reject.
|
|
460
|
+
* `negativeAnyResultRate` is now 0.000 — every fixture v2 negative
|
|
461
|
+
* produces zero candidates. positive/paraphrase metrics unchanged
|
|
462
|
+
* from v4 because no fixture positive matches the (single-token AND
|
|
463
|
+
* single-tag) or (all-expansion AND single-tag) shape.
|
|
464
|
+
*
|
|
465
|
+
* Escape hatch: identifier-boost evidence (hook path) or name-match
|
|
466
|
+
* evidence (MCP path) BYPASSES the R4-T3 rules. A candidate with
|
|
467
|
+
* even a single weak tag match plus an identifier hit still
|
|
468
|
+
* surfaces — the precision rules only fire when the candidate's
|
|
469
|
+
* entire evidence pool is a single ambiguous tag.
|
|
470
|
+
*
|
|
471
|
+
* Defensive precision note: Rule B's "shared prefix ≥ 4"
|
|
472
|
+
* morphological check is currently NOT fixture-driven (no fixture
|
|
473
|
+
* query masks down to the `caching/cache`-style morphological gap).
|
|
474
|
+
* It exists as a pre-emptive fix against silently rejecting
|
|
475
|
+
* legitimate future queries where the term-normalizer synonym
|
|
476
|
+
* expansion is the only bridge between the query token and the
|
|
477
|
+
* solution tag. If a production query surfaces a case the prefix
|
|
478
|
+
* check misses, extend it (e.g. by lowering the threshold or
|
|
479
|
+
* adding a Levenshtein-1 check) rather than removing it.
|
|
480
|
+
*
|
|
481
|
+
* Known matcher quirks (separate from the T4 BM25 investigation):
|
|
482
|
+
* - `term-normalizer.ts` `error` canonical contains `debug` as a matchTerm
|
|
483
|
+
* (intentional for `bug → error` recall), which causes any prompt
|
|
484
|
+
* containing `error` to expand to `debug` and over-rank
|
|
485
|
+
* `starter-debugging-systematic` on otherwise unrelated queries. This
|
|
486
|
+
* is why `async await error propagation` could not be added as a hard
|
|
487
|
+
* case — the matcher returns debugging-systematic at #1, which is
|
|
488
|
+
* defensible-but-noisy. The fix is at the normalizer level (split
|
|
489
|
+
* `debug` out of the `error` family or remove the `error → debug`
|
|
490
|
+
* edge entirely) and is queued as a Round 4 follow-up. T4 BM25 was
|
|
491
|
+
* considered as a partial mitigation but the T4 skip ADR (referenced
|
|
492
|
+
* in the Round 3 outcome paragraph above) shows it does not help.
|
|
493
|
+
*
|
|
494
|
+
* Long-tail caveat:
|
|
495
|
+
* - `"trying to handle authentication errors gracefully when our backend
|
|
496
|
+
* api returns inconsistent response formats from different
|
|
497
|
+
* microservices"` is a 17-word query intentionally added to exercise
|
|
498
|
+
* long-tail behaviour. Currently PASS@1. Originally flagged as BM25
|
|
499
|
+
* length-normalization sensitive, but since T4 BM25 was skipped this
|
|
500
|
+
* caveat is now informational only — no length-norm code path is
|
|
501
|
+
* planned in Round 3.
|
|
502
|
+
*
|
|
503
|
+
* If a PR legitimately improves a metric, update this constant in the same
|
|
504
|
+
* commit so future PRs guard against the new floor.
|
|
505
|
+
*/
|
|
506
|
+
// Snapshot of evaluateSolutionMatcher() output at baseline v5 (full history in
// the comment above): aggregate recall/MRR over the positive ∪ paraphrase
// buckets, per-bucket breakdowns, and the fixture bucket sizes.
export const ROUND3_BASELINE = {
  recallAt5: 1.0,             // aggregate: expected solution in top-5 for every query
  mrrAt5: 0.986,              // aggregate mean reciprocal rank, weighted by bucket size
  noResultRate: 0.0,          // positive/paraphrase queries returning zero candidates
  negativeAnyResultRate: 0.0, // negative queries returning ANY candidate (lower is better)
  byBucket: {
    positive: { recallAt5: 1.0, mrrAt5: 0.981, noResultRate: 0.0, total: 53 },
    paraphrase: { recallAt5: 1.0, mrrAt5: 1.0, noResultRate: 0.0, total: 16 },
  },
  // Fixture v2 bucket sizes — kept here so the regression test can sanity-check
  // that the fixture itself has not shrunk.
  total: { positive: 53, paraphrase: 16, negative: 14 },
};
/** Maximum allowed absolute regression per metric. 5% is tight enough to catch
 * ~3-4 query regressions in a 69-query combined bucket (positive+paraphrase)
 * but lenient enough that a single fixture edit won't spuriously fail the
 * guard. */
export const BASELINE_TOLERANCE = 0.05;
|
|
522
|
+
/**
 * Run every query in one fixture bucket through the production ranking
 * pipeline and aggregate IR metrics over the bucket.
 *
 * A query is a recall hit when any name in `expectAnyOf` appears in the
 * ranked list; MRR credits 1/rank of the FIRST such hit. Queries with an
 * empty ranked list count toward `noResultRate` (and score zero elsewhere).
 *
 * Returns `{recallAt5, mrrAt5, noResultRate, total}`; all rates are 0 for
 * an empty bucket.
 */
function computeBucketMetrics(queries, solutions) {
  let recallHits = 0;
  let reciprocalSum = 0;
  let noResultCount = 0;

  for (const q of queries) {
    const ranked = rankCandidates(extractTags(q.query), q.query.toLowerCase(), solutions);
    if (ranked.length === 0) {
      noResultCount++;
      continue;
    }
    // 0-based rank of the first expected solution, or -1 when none matched.
    const hitIndex = ranked.findIndex((c) => q.expectAnyOf.includes(c.solution.name));
    if (hitIndex !== -1) {
      recallHits++;
      reciprocalSum += 1 / (hitIndex + 1);
    }
  }

  const total = queries.length;
  const rate = (count) => (total > 0 ? count / total : 0);
  return {
    recallAt5: rate(recallHits),
    mrrAt5: rate(reciprocalSum),
    noResultRate: rate(noResultCount),
    total,
  };
}
|
|
550
|
+
/**
 * Test/diagnostic helper: run one query through the production ranker against
 * a fixture solution set and return the top-5 candidates as plain
 * `{name, relevance, matchedTags}` records.
 *
 * Thin wrapper over `rankCandidates` so per-query regression tests (e.g. the
 * R4-T1 hard-positive guards in `tests/solution-matcher-eval.test.ts`) can
 * assert exact ranking outcomes — "expected solution at rank 1" — without
 * scraping aggregate metrics, and without drifting from production behaviour.
 */
export function evaluateQuery(query, solutions) {
  const tags = extractTags(query);
  const ranked = rankCandidates(tags, query.toLowerCase(), solutions);
  return ranked.map(({ solution, relevance, matchedTags }) => ({
    name: solution.name,
    relevance,
    matchedTags,
  }));
}
|
|
571
|
+
/**
 * Evaluate the current matcher against a labeled fixture and return IR
 * metrics. This is the Round 3 baseline — each downstream PR (T2/T3/T4) must
 * not regress any of the thresholds asserted in `solution-matcher-eval.test.ts`.
 *
 * Uses `rankCandidates` (shared with `matchSolutions`) so the evaluator can't
 * silently drift from production ranking behaviour.
 *
 * Metrics are reported both aggregated (positive ∪ paraphrase) and per-bucket,
 * so paraphrase-only regressions surface in `byBucket.paraphrase` even if the
 * aggregate looks fine.
 */
export function evaluateSolutionMatcher(fixture) {
  const positiveM = computeBucketMetrics(fixture.positive, fixture.solutions);
  const paraphraseM = computeBucketMetrics(fixture.paraphrase, fixture.solutions);
  const combinedTotal = positiveM.total + paraphraseM.total;

  // Weighted aggregation: counts, not means — so a large positive bucket
  // doesn't drown a small paraphrase bucket but also a single-query bucket
  // doesn't dominate. One helper replaces three previously copy-pasted
  // ternary expressions (identical shape, different metric key).
  const weighted = (metric) => combinedTotal > 0
    ? (positiveM[metric] * positiveM.total + paraphraseM[metric] * paraphraseM.total) / combinedTotal
    : 0;

  // A negative query counts against precision if it yields ANY candidate.
  let negAnyResult = 0;
  for (const q of fixture.negative) {
    const promptTags = extractTags(q.query);
    const ranked = rankCandidates(promptTags, q.query.toLowerCase(), fixture.solutions);
    if (ranked.length >= 1)
      negAnyResult++;
  }
  const negTotal = fixture.negative.length;

  return {
    recallAt5: weighted('recallAt5'),
    mrrAt5: weighted('mrrAt5'),
    noResultRate: weighted('noResultRate'),
    negativeAnyResultRate: negTotal > 0 ? negAnyResult / negTotal : 0,
    byBucket: {
      positive: positiveM,
      paraphrase: paraphraseM,
    },
    total: {
      positive: fixture.positive.length,
      paraphrase: fixture.paraphrase.length,
      negative: fixture.negative.length,
    },
  };
}
|
|
623
|
+
/**
 * Hook-path entry point: rank all in-scope solutions against a user prompt.
 *
 * Builds the solution directory list in scope-precedence order
 * (me → team, when present → project), loads the cached index, and delegates
 * ranking to the shared `rankCandidates` core so the hook and MCP paths stay
 * in sync. Returns flat result records for the hook consumer.
 */
export function matchSolutions(prompt, scope, cwd) {
  // Solution directories feeding the index cache.
  const dirs = [{ dir: ME_SOLUTIONS, scope: 'me' }];
  if (scope.team) {
    dirs.push({ dir: path.join(PACKS_DIR, scope.team.name, 'solutions'), scope: 'team' });
  }
  dirs.push({ dir: path.join(cwd, '.compound', 'solutions'), scope: 'project' });

  // Cached index — rebuilt only when the directory set changes.
  const { entries } = getOrBuildIndex(dirs);
  const candidates = entries.map((entry) => ({ ...entry }));

  const promptTags = extractTags(prompt);
  const loweredPrompt = prompt.toLowerCase();

  // `rankCandidates` is generic: each ranked candidate carries its original
  // LoadedSolution reference, so two scopes sharing a name (e.g. me/foo and
  // project/foo) can both appear in the result — no name-based re-lookup,
  // no Map last-wins scope-precedence bug.
  const ranked = rankCandidates(promptTags, loweredPrompt, candidates);
  return ranked.map((candidate) => {
    const sol = candidate.solution;
    return {
      name: sol.name,
      path: sol.filePath,
      scope: sol.scope,
      relevance: candidate.relevance,
      // NOTE(review): summary mirrors name — presumably a placeholder until
      // a real summary field exists on the index entry; confirm with callers.
      summary: sol.name,
      status: sol.status,
      confidence: sol.confidence,
      type: sol.type,
      tags: sol.tags,
      identifiers: sol.identifiers,
      matchedTags: [...candidate.matchedTags, ...candidate.matchedIdentifiers],
    };
  });
}
|