@wooojin/forgen 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268) hide show
  1. package/.claude-plugin/plugin.json +20 -0
  2. package/CHANGELOG.md +353 -0
  3. package/CONTRIBUTING.md +98 -0
  4. package/LICENSE +21 -0
  5. package/README.ja.md +469 -0
  6. package/README.ko.md +469 -0
  7. package/README.md +483 -0
  8. package/README.zh.md +469 -0
  9. package/agents/analyst.md +98 -0
  10. package/agents/architect.md +62 -0
  11. package/agents/code-reviewer.md +120 -0
  12. package/agents/code-simplifier.md +197 -0
  13. package/agents/critic.md +70 -0
  14. package/agents/debugger.md +117 -0
  15. package/agents/designer.md +131 -0
  16. package/agents/executor.md +54 -0
  17. package/agents/explore.md +145 -0
  18. package/agents/git-master.md +212 -0
  19. package/agents/performance-reviewer.md +172 -0
  20. package/agents/planner.md +29 -0
  21. package/agents/qa-tester.md +158 -0
  22. package/agents/refactoring-expert.md +168 -0
  23. package/agents/scientist.md +144 -0
  24. package/agents/security-reviewer.md +137 -0
  25. package/agents/test-engineer.md +153 -0
  26. package/agents/verifier.md +133 -0
  27. package/agents/writer.md +184 -0
  28. package/commands/api-design.md +268 -0
  29. package/commands/architecture-decision.md +314 -0
  30. package/commands/ci-cd.md +270 -0
  31. package/commands/code-review.md +233 -0
  32. package/commands/compound.md +117 -0
  33. package/commands/database.md +263 -0
  34. package/commands/debug-detective.md +99 -0
  35. package/commands/docker.md +274 -0
  36. package/commands/documentation.md +276 -0
  37. package/commands/ecomode.md +51 -0
  38. package/commands/frontend.md +271 -0
  39. package/commands/git-master.md +90 -0
  40. package/commands/incident-response.md +292 -0
  41. package/commands/migrate.md +101 -0
  42. package/commands/performance.md +288 -0
  43. package/commands/refactor.md +105 -0
  44. package/commands/security-review.md +288 -0
  45. package/commands/tdd.md +183 -0
  46. package/commands/testing-strategy.md +265 -0
  47. package/dist/cli.d.ts +2 -0
  48. package/dist/cli.js +295 -0
  49. package/dist/core/auto-compound-runner.d.ts +12 -0
  50. package/dist/core/auto-compound-runner.js +460 -0
  51. package/dist/core/config-hooks.d.ts +10 -0
  52. package/dist/core/config-hooks.js +112 -0
  53. package/dist/core/config-injector.d.ts +50 -0
  54. package/dist/core/config-injector.js +455 -0
  55. package/dist/core/doctor.d.ts +1 -0
  56. package/dist/core/doctor.js +163 -0
  57. package/dist/core/errors.d.ts +81 -0
  58. package/dist/core/errors.js +133 -0
  59. package/dist/core/global-config.d.ts +43 -0
  60. package/dist/core/global-config.js +25 -0
  61. package/dist/core/harness.d.ts +24 -0
  62. package/dist/core/harness.js +621 -0
  63. package/dist/core/init.d.ts +7 -0
  64. package/dist/core/init.js +37 -0
  65. package/dist/core/inspect-cli.d.ts +7 -0
  66. package/dist/core/inspect-cli.js +47 -0
  67. package/dist/core/legacy-detector.d.ts +33 -0
  68. package/dist/core/legacy-detector.js +66 -0
  69. package/dist/core/logger.d.ts +34 -0
  70. package/dist/core/logger.js +121 -0
  71. package/dist/core/mcp-config.d.ts +44 -0
  72. package/dist/core/mcp-config.js +177 -0
  73. package/dist/core/notepad.d.ts +31 -0
  74. package/dist/core/notepad.js +88 -0
  75. package/dist/core/paths.d.ts +85 -0
  76. package/dist/core/paths.js +101 -0
  77. package/dist/core/plugin-detector.d.ts +44 -0
  78. package/dist/core/plugin-detector.js +226 -0
  79. package/dist/core/runtime-detector.d.ts +8 -0
  80. package/dist/core/runtime-detector.js +49 -0
  81. package/dist/core/scope-resolver.d.ts +8 -0
  82. package/dist/core/scope-resolver.js +45 -0
  83. package/dist/core/session-logger.d.ts +6 -0
  84. package/dist/core/session-logger.js +111 -0
  85. package/dist/core/session-store.d.ts +28 -0
  86. package/dist/core/session-store.js +218 -0
  87. package/dist/core/settings-lock.d.ts +18 -0
  88. package/dist/core/settings-lock.js +125 -0
  89. package/dist/core/spawn.d.ts +3 -0
  90. package/dist/core/spawn.js +135 -0
  91. package/dist/core/types.d.ts +108 -0
  92. package/dist/core/types.js +1 -0
  93. package/dist/core/uninstall.d.ts +4 -0
  94. package/dist/core/uninstall.js +307 -0
  95. package/dist/core/v1-bootstrap.d.ts +26 -0
  96. package/dist/core/v1-bootstrap.js +155 -0
  97. package/dist/engine/compound-cli.d.ts +24 -0
  98. package/dist/engine/compound-cli.js +250 -0
  99. package/dist/engine/compound-extractor.d.ts +68 -0
  100. package/dist/engine/compound-extractor.js +860 -0
  101. package/dist/engine/compound-lifecycle.d.ts +32 -0
  102. package/dist/engine/compound-lifecycle.js +305 -0
  103. package/dist/engine/compound-loop.d.ts +32 -0
  104. package/dist/engine/compound-loop.js +511 -0
  105. package/dist/engine/match-eval-log.d.ts +139 -0
  106. package/dist/engine/match-eval-log.js +270 -0
  107. package/dist/engine/phrase-blocklist.d.ts +119 -0
  108. package/dist/engine/phrase-blocklist.js +208 -0
  109. package/dist/engine/skill-promoter.d.ts +20 -0
  110. package/dist/engine/skill-promoter.js +115 -0
  111. package/dist/engine/solution-format.d.ts +160 -0
  112. package/dist/engine/solution-format.js +432 -0
  113. package/dist/engine/solution-index.d.ts +13 -0
  114. package/dist/engine/solution-index.js +252 -0
  115. package/dist/engine/solution-matcher.d.ts +364 -0
  116. package/dist/engine/solution-matcher.js +656 -0
  117. package/dist/engine/solution-writer.d.ts +76 -0
  118. package/dist/engine/solution-writer.js +157 -0
  119. package/dist/engine/term-matcher.d.ts +81 -0
  120. package/dist/engine/term-matcher.js +268 -0
  121. package/dist/engine/term-normalizer.d.ts +116 -0
  122. package/dist/engine/term-normalizer.js +171 -0
  123. package/dist/fgx.d.ts +6 -0
  124. package/dist/fgx.js +42 -0
  125. package/dist/forge/cli.d.ts +11 -0
  126. package/dist/forge/cli.js +100 -0
  127. package/dist/forge/evidence-processor.d.ts +21 -0
  128. package/dist/forge/evidence-processor.js +87 -0
  129. package/dist/forge/mismatch-detector.d.ts +44 -0
  130. package/dist/forge/mismatch-detector.js +83 -0
  131. package/dist/forge/onboarding-cli.d.ts +6 -0
  132. package/dist/forge/onboarding-cli.js +89 -0
  133. package/dist/forge/onboarding.d.ts +25 -0
  134. package/dist/forge/onboarding.js +122 -0
  135. package/dist/hooks/compound-reflection.d.ts +45 -0
  136. package/dist/hooks/compound-reflection.js +82 -0
  137. package/dist/hooks/context-guard.d.ts +24 -0
  138. package/dist/hooks/context-guard.js +156 -0
  139. package/dist/hooks/dangerous-patterns.json +18 -0
  140. package/dist/hooks/db-guard.d.ts +17 -0
  141. package/dist/hooks/db-guard.js +105 -0
  142. package/dist/hooks/hook-config.d.ts +29 -0
  143. package/dist/hooks/hook-config.js +92 -0
  144. package/dist/hooks/hook-registry.d.ts +43 -0
  145. package/dist/hooks/hook-registry.js +31 -0
  146. package/dist/hooks/hooks-generator.d.ts +49 -0
  147. package/dist/hooks/hooks-generator.js +99 -0
  148. package/dist/hooks/intent-classifier.d.ts +12 -0
  149. package/dist/hooks/intent-classifier.js +62 -0
  150. package/dist/hooks/keyword-detector.d.ts +25 -0
  151. package/dist/hooks/keyword-detector.js +389 -0
  152. package/dist/hooks/notepad-injector.d.ts +18 -0
  153. package/dist/hooks/notepad-injector.js +51 -0
  154. package/dist/hooks/permission-handler.d.ts +14 -0
  155. package/dist/hooks/permission-handler.js +114 -0
  156. package/dist/hooks/post-tool-failure.d.ts +11 -0
  157. package/dist/hooks/post-tool-failure.js +118 -0
  158. package/dist/hooks/post-tool-handlers.d.ts +17 -0
  159. package/dist/hooks/post-tool-handlers.js +115 -0
  160. package/dist/hooks/post-tool-use.d.ts +29 -0
  161. package/dist/hooks/post-tool-use.js +151 -0
  162. package/dist/hooks/pre-compact.d.ts +10 -0
  163. package/dist/hooks/pre-compact.js +165 -0
  164. package/dist/hooks/pre-tool-use.d.ts +31 -0
  165. package/dist/hooks/pre-tool-use.js +325 -0
  166. package/dist/hooks/prompt-injection-filter.d.ts +56 -0
  167. package/dist/hooks/prompt-injection-filter.js +287 -0
  168. package/dist/hooks/rate-limiter.d.ts +21 -0
  169. package/dist/hooks/rate-limiter.js +86 -0
  170. package/dist/hooks/secret-filter.d.ts +14 -0
  171. package/dist/hooks/secret-filter.js +65 -0
  172. package/dist/hooks/session-recovery.d.ts +27 -0
  173. package/dist/hooks/session-recovery.js +406 -0
  174. package/dist/hooks/shared/atomic-write.d.ts +41 -0
  175. package/dist/hooks/shared/atomic-write.js +148 -0
  176. package/dist/hooks/shared/context-budget.d.ts +37 -0
  177. package/dist/hooks/shared/context-budget.js +45 -0
  178. package/dist/hooks/shared/file-lock.d.ts +56 -0
  179. package/dist/hooks/shared/file-lock.js +253 -0
  180. package/dist/hooks/shared/hook-response.d.ts +33 -0
  181. package/dist/hooks/shared/hook-response.js +62 -0
  182. package/dist/hooks/shared/injection-caps.d.ts +39 -0
  183. package/dist/hooks/shared/injection-caps.js +52 -0
  184. package/dist/hooks/shared/plugin-signal.d.ts +23 -0
  185. package/dist/hooks/shared/plugin-signal.js +104 -0
  186. package/dist/hooks/shared/read-stdin.d.ts +8 -0
  187. package/dist/hooks/shared/read-stdin.js +63 -0
  188. package/dist/hooks/shared/sanitize-id.d.ts +7 -0
  189. package/dist/hooks/shared/sanitize-id.js +9 -0
  190. package/dist/hooks/shared/sanitize.d.ts +7 -0
  191. package/dist/hooks/shared/sanitize.js +22 -0
  192. package/dist/hooks/skill-injector.d.ts +38 -0
  193. package/dist/hooks/skill-injector.js +285 -0
  194. package/dist/hooks/slop-detector.d.ts +18 -0
  195. package/dist/hooks/slop-detector.js +93 -0
  196. package/dist/hooks/solution-injector.d.ts +58 -0
  197. package/dist/hooks/solution-injector.js +436 -0
  198. package/dist/hooks/subagent-tracker.d.ts +10 -0
  199. package/dist/hooks/subagent-tracker.js +90 -0
  200. package/dist/i18n/index.d.ts +43 -0
  201. package/dist/i18n/index.js +224 -0
  202. package/dist/lib.d.ts +14 -0
  203. package/dist/lib.js +14 -0
  204. package/dist/mcp/server.d.ts +8 -0
  205. package/dist/mcp/server.js +40 -0
  206. package/dist/mcp/solution-reader.d.ts +90 -0
  207. package/dist/mcp/solution-reader.js +273 -0
  208. package/dist/mcp/tools.d.ts +16 -0
  209. package/dist/mcp/tools.js +302 -0
  210. package/dist/preset/facet-catalog.d.ts +17 -0
  211. package/dist/preset/facet-catalog.js +46 -0
  212. package/dist/preset/preset-manager.d.ts +31 -0
  213. package/dist/preset/preset-manager.js +111 -0
  214. package/dist/renderer/inspect-renderer.d.ts +11 -0
  215. package/dist/renderer/inspect-renderer.js +123 -0
  216. package/dist/renderer/rule-renderer.d.ts +18 -0
  217. package/dist/renderer/rule-renderer.js +159 -0
  218. package/dist/store/evidence-store.d.ts +23 -0
  219. package/dist/store/evidence-store.js +58 -0
  220. package/dist/store/profile-store.d.ts +12 -0
  221. package/dist/store/profile-store.js +53 -0
  222. package/dist/store/recommendation-store.d.ts +22 -0
  223. package/dist/store/recommendation-store.js +64 -0
  224. package/dist/store/rule-store.d.ts +22 -0
  225. package/dist/store/rule-store.js +62 -0
  226. package/dist/store/session-state-store.d.ts +11 -0
  227. package/dist/store/session-state-store.js +44 -0
  228. package/dist/store/types.d.ts +159 -0
  229. package/dist/store/types.js +7 -0
  230. package/hooks/hook-registry.json +21 -0
  231. package/hooks/hooks.json +185 -0
  232. package/package.json +89 -0
  233. package/plugin.json +20 -0
  234. package/scripts/postinstall.js +826 -0
  235. package/skills/api-design/SKILL.md +262 -0
  236. package/skills/architecture-decision/SKILL.md +309 -0
  237. package/skills/ci-cd/SKILL.md +264 -0
  238. package/skills/code-review/SKILL.md +228 -0
  239. package/skills/compound/SKILL.md +101 -0
  240. package/skills/database/SKILL.md +257 -0
  241. package/skills/debug-detective/SKILL.md +95 -0
  242. package/skills/docker/SKILL.md +268 -0
  243. package/skills/documentation/SKILL.md +270 -0
  244. package/skills/ecomode/SKILL.md +46 -0
  245. package/skills/frontend/SKILL.md +265 -0
  246. package/skills/git-master/SKILL.md +86 -0
  247. package/skills/incident-response/SKILL.md +286 -0
  248. package/skills/migrate/SKILL.md +96 -0
  249. package/skills/performance/SKILL.md +282 -0
  250. package/skills/refactor/SKILL.md +100 -0
  251. package/skills/security-review/SKILL.md +282 -0
  252. package/skills/tdd/SKILL.md +178 -0
  253. package/skills/testing-strategy/SKILL.md +260 -0
  254. package/starter-pack/solutions/starter-api-error-responses.md +37 -0
  255. package/starter-pack/solutions/starter-async-patterns.md +40 -0
  256. package/starter-pack/solutions/starter-caching-strategy.md +40 -0
  257. package/starter-pack/solutions/starter-code-review-checklist.md +39 -0
  258. package/starter-pack/solutions/starter-debugging-systematic.md +40 -0
  259. package/starter-pack/solutions/starter-dependency-injection.md +40 -0
  260. package/starter-pack/solutions/starter-error-handling-patterns.md +38 -0
  261. package/starter-pack/solutions/starter-git-atomic-commits.md +36 -0
  262. package/starter-pack/solutions/starter-input-validation.md +40 -0
  263. package/starter-pack/solutions/starter-n-plus-one-queries.md +37 -0
  264. package/starter-pack/solutions/starter-refactor-safely.md +38 -0
  265. package/starter-pack/solutions/starter-secret-management.md +37 -0
  266. package/starter-pack/solutions/starter-separation-of-concerns.md +36 -0
  267. package/starter-pack/solutions/starter-tdd-red-green-refactor.md +40 -0
  268. package/starter-pack/solutions/starter-typescript-strict-types.md +39 -0
@@ -0,0 +1,656 @@
1
+ import * as path from 'node:path';
2
+ import { ME_SOLUTIONS, PACKS_DIR } from '../core/paths.js';
3
+ import { extractTags, expandCompoundTags, expandQueryBigrams } from './solution-format.js';
4
+ import { getOrBuildIndex } from './solution-index.js';
5
+ import { defaultNormalizer } from './term-normalizer.js';
6
+ import { maskBlockedTokens } from './phrase-blocklist.js';
7
+ // ── Synonym expansion (delegates to term-normalizer) ──
8
+ //
9
+ // The old `SYNONYM_MAP` + `expandTagsWithSynonyms` pair had two problems:
10
+ // 1. The reverse-lookup `Object.entries(SYNONYM_MAP).filter(v => v.includes(tag))`
11
+ // was O(N) per term and ran once per (query, solution) pair — quadratic
12
+ // on the solution count.
13
+ // 2. Korean↔English cross-mapping was maintained as two separate map entries
14
+ // that drifted (fixed in 5.1.2 but fragile).
15
+ //
16
+ // Both are now handled by `src/engine/term-normalizer.ts`. See that file for
17
+ // the canonical registry (`DEFAULT_MATCH_TERMS`) and the `buildTermNormalizer`
18
+ // implementation. Reverse lookup is an O(1) `Map<term, canonicals>` fetch.
19
+ //
20
+ // The export below is kept as a thin backwards-compatible wrapper so
21
+ // downstream callers (and the existing `synonym-tfidf.test.ts` spot-checks)
22
+ // continue to work — but the hot path in this module now passes
23
+ // pre-normalized query tags via the new `calculateRelevance` options arg
24
+ // and skips the wrapper entirely.
25
/**
 * Backwards-compatible synonym expansion entry point.
 *
 * @deprecated Call `defaultNormalizer.normalizeTerms` from
 * `./term-normalizer.js` directly. This wrapper only forwards to it and is
 * retained for the existing `synonym-tfidf.test.ts` and external consumers.
 *
 * @param tags - tags to expand; handed to the normalizer unchanged
 * @returns whatever `defaultNormalizer.normalizeTerms(tags)` returns
 */
export function expandTagsWithSynonyms(tags) {
    const expanded = defaultNormalizer.normalizeTerms(tags);
    return expanded;
}
33
+ // ── TF-IDF weighting for common tags ──
34
/**
 * Tags that show up so frequently that a match on one of them carries
 * little signal. Membership is checked verbatim (case-sensitive).
 */
const COMMON_TAGS = new Set([
    // language names
    'typescript', 'ts', 'javascript', 'js',
    // generic change verbs
    'fix', 'update', 'add', 'change',
    // generic code vocabulary
    'file', 'code', 'function', 'import', 'export', 'error', 'type', 'string',
    'number', 'object', 'array', 'return', 'const', 'class', 'module',
    // Korean equivalents
    '코드', '파일', '함수', '수정', '추가', '변경', '에러', '타입',
]);
/**
 * IDF-like weight for a single tag.
 *
 * @param tag - tag to weigh
 * @returns 0.5 when the tag is listed in `COMMON_TAGS`, otherwise 1.0
 */
export function tagWeight(tag) {
    if (COMMON_TAGS.has(tag)) {
        return 0.5;
    }
    return 1.0;
}
45
/**
 * Score how relevant a solution's tags are to a prompt.
 *
 * The call shape is selected by the type of the first argument:
 *
 *  - Legacy mode (`promptOrTags` is a string): tags are extracted from the
 *    prompt with `extractTags` and compared to `keywordsOrTags` by equality
 *    or, when both sides are longer than 3 characters, by prefix. Returns a
 *    NUMBER in [0, 1]. `confidence` and `options` are ignored.
 *
 *  - v3 mode (`promptOrTags` is an array of tags): synonym-expanded tag
 *    matching with common-tag down-weighting. Returns an OBJECT
 *    `{ relevance, matchedTags }`.
 *
 * @param promptOrTags - raw prompt string (legacy) or raw prompt tags (v3)
 * @param keywordsOrTags - the solution's raw keyword/tag list
 * @param confidence - multiplier applied to the v3 score; treated as 1 when
 *   null or undefined
 * @param options - optional v3 inputs:
 *   `normalizedPromptTags` (pre-expanded prompt tags; when absent the
 *   expansion is computed here via `defaultNormalizer.normalizeTerms`) and
 *   `solutionTagsExpanded` (compound-expanded solution tags used for
 *   matching instead of `keywordsOrTags`)
 * @returns a number in legacy mode, `{ relevance, matchedTags }` in v3 mode
 */
export function calculateRelevance(promptOrTags, keywordsOrTags, confidence, options) {
    if (typeof promptOrTags === 'string') {
        // Legacy mode: equality/prefix matching for backwards compatibility.
        // Not a hot path — only hit by the (old) solution-matcher.test.ts cases.
        const promptTags = extractTags(promptOrTags);
        const matchesKeyword = (pt, kw) => pt === kw
            || (pt.length > 3 && kw.length > 3 && (pt.startsWith(kw) || kw.startsWith(pt)));
        const intersection = keywordsOrTags.filter(kw => promptTags.some(pt => matchesKeyword(pt, kw)));
        return Math.min(1, intersection.length / Math.max(promptTags.length * 0.5, 1));
    }
    // v3 mode: tag matching with synonym expansion + common-tag weighting.
    //
    // T2: hot-path callers pre-compute the expansion once per query and pass
    // it via `options.normalizedPromptTags`, so the normalizer is only
    // invoked here as a fallback.
    const expandedPromptTags = options?.normalizedPromptTags
        ?? defaultNormalizer.normalizeTerms(promptOrTags);
    // R4-T1: when the caller supplies a compound-expanded solution tag set,
    // intersection and partial matching run against the expanded set (so
    // `api-key` matches `api`/`key` queries via the split parts), but the
    // union denominator below still uses the RAW `keywordsOrTags`.
    const matchTags = options?.solutionTagsExpanded ?? keywordsOrTags;
    // One Set gives O(1) membership for both the exact-match test and the
    // "already matched exactly" exclusion: a tag taken from `matchTags` is in
    // the intersection exactly when it is in the prompt set.
    const promptTagSet = new Set(expandedPromptTags);
    const intersection = matchTags.filter(t => promptTagSet.has(t));
    // Partial/substring matches only consider tags longer than 3 characters
    // on both sides; the prompt-side length filter is loop-invariant.
    const longPromptTags = expandedPromptTags.filter(pt => pt.length > 3);
    const partialMatches = matchTags.filter(t => t.length > 3
        && !promptTagSet.has(t)
        && longPromptTags.some(pt => pt.includes(t) || t.includes(pt)));
    // Weighted numerator: exact matches count at their tag weight, partial
    // matches at half of it.
    const weightedMatched = intersection.reduce((sum, t) => sum + tagWeight(t), 0)
        + partialMatches.reduce((sum, t) => sum + tagWeight(t) * 0.5, 0);
    // Relaxed threshold: a weighted score of at least 0.5 makes a candidate.
    if (weightedMatched < 0.5) {
        return { relevance: 0, matchedTags: [] };
    }
    // Jaccard-like: weighted matched / union.
    // The union uses RAW prompt tags and RAW solution tags — not the expanded
    // sets — so the denominator semantics are unchanged from pre-T2
    // behaviour. Expanding both sides would inflate recall and silently
    // shift all baseline metrics.
    const union = new Set([...promptOrTags, ...keywordsOrTags]).size;
    const tagScore = weightedMatched / Math.max(union, 1);
    return {
        relevance: tagScore * (confidence ?? 1),
        matchedTags: [...intersection, ...partialMatches],
    };
}
92
+ // ── R4-T3: query-side specificity guards (orchestration layer) ──
93
+ //
94
+ // Two narrow precision rules applied AFTER `calculateRelevance` returns,
95
+ // at the orchestration layer (`rankCandidates`, `searchSolutions`).
96
+ // These rules fix the 2 surviving false positives from R4-T2 — the
97
+ // "validation of insurance claims" and "database backup recovery
98
+ // procedure" residuals — WITHOUT regressing any legitimate fixture
99
+ // positive or paraphrase.
100
+ //
101
+ // Why orchestration-level (not inside calculateRelevance):
102
+ // `calculateRelevance` is a pure scoring function with a stable
103
+ // contract: given (promptTags, solutionTags, confidence), return the
104
+ // relevance and the matched tag set. Several internal tests
105
+ // (synonym-tfidf.test.ts) call it directly with single-token inputs
106
+ // to verify synonym expansion in isolation. Embedding precision
107
+ // filters in the scoring path would break those tests AND break the
108
+ // semantics of "scoring is a pure function". The two rules below are
109
+ // policy-layer decisions about which scored candidates to surface,
110
+ // so they belong at the caller — not at the scorer.
111
+ //
112
+ // Rule A — single-token query AND single-tag match → reject.
113
+ // Rationale: a query that's been reduced to a single dev token (after
114
+ // R4-T2 phrase masking) is unlikely to be a real dev question. Combined
115
+ // with a single-tag match, this is the "validation of insurance
116
+ // claims" shape: masked to `[validation]`, matched a single ambiguous
117
+ // tag `validation` on error-handling-patterns. No legitimate fixture
118
+ // positive or paraphrase has both promptTags.length === 1 AND
119
+ // matchedTags.length === 1.
120
+ //
121
+ // Rule B — all matched tags came via SYNONYM EXPANSION (none appear
122
+ // literally in the prompt tokens) AND match is single-tag → reject.
123
+ // Rationale: the "database backup recovery procedure" shape. After
124
+ // R4-T2 masks `database`/`backup`, the residual tokens are `[recovery,
125
+ // procedure]`. The matched tag is `handling` — which appears nowhere
126
+ // in the query. It only matches because the term-normalizer's
127
+ // `handling` canonical includes `recovery` as a matchTerm (legitimate
128
+ // for "error recovery handler" queries). The rule rejects this
129
+ // expansion-only single-tag match because the query carries no
130
+ // LITERAL signal that the matched solution is relevant. Multi-tag
131
+ // expansion matches are NOT rejected — those indicate the canonical
132
+ // family is being hit from multiple angles ("버그 재현 시스템적으로"
133
+ // hits debugging-systematic via both `debug` and `debugging` — two
134
+ // distinct matches survive).
135
+ //
136
+ // Literal hit: a matched tag is "literal" with respect to the query if
137
+ // any of the following holds for some prompt token `pt`:
138
+ // 1. `pt === tag` (exact verbatim match in the query)
139
+ // 2. `pt` is a substring of `tag` or vice versa, with both length > 3
140
+ // (mirrors the partialMatches discovery rule in calculateRelevance —
141
+ // e.g., `code` (query) ↔ `code-review` (matched tag))
142
+ // 3. `pt` and `tag` share a common prefix of length ≥ 4 (catches
143
+ // morphological variants like `caching` ↔ `cache`, `cached` ↔
144
+ // `cache`, `documents` ↔ `document` where neither is a substring
145
+ // of the other but both clearly come from the same stem)
146
+ //
147
+ // Rule (3) is the defensive precision fix: without it, a query like
148
+ // "caching strategy" (which the term-normalizer expands `caching → cache`
149
+ // via the cache canonical) would have its single-tag `cache` match
150
+ // rejected by Rule B, even though `caching` is morphologically the same
151
+ // concept. The 4-char threshold is the same as the partialMatches rule
152
+ // to keep the literal-hit semantics consistent across the matcher.
153
+ //
154
+ // Returns true if the candidate should be rejected (caller filters
155
+ // it out), false if the candidate passes both rules.
156
/**
 * R4-T3 precision guards for single-tag matches.
 *
 * @param promptTags - prompt tags the candidate was scored against
 * @param matchedTags - tags reported as matched for the candidate
 * @returns true when the candidate should be dropped, false when it passes
 */
export function shouldRejectByR4T3Rules(promptTags, matchedTags) {
    // Both rules only ever fire on a single-tag match.
    if (matchedTags.length !== 1) {
        return false;
    }
    // Rule A: single-token query with a single-tag match.
    if (promptTags.length === 1) {
        return true;
    }
    // Rule B: reject unless the one matched tag has a literal footprint in
    // the prompt tags.
    const tag = matchedTags[0];
    if (promptTags.includes(tag)) {
        return false;
    }
    // Substring and stem comparisons require both sides to exceed 3 chars,
    // so a short tag can only ever be a literal hit via the exact check above.
    if (tag.length <= 3) {
        return true;
    }
    const sharedPrefixLength = (a, b) => {
        const limit = Math.min(a.length, b.length);
        let n = 0;
        for (; n < limit; n++) {
            if (a[n] !== b[n]) {
                break;
            }
        }
        return n;
    };
    const hasLiteralVariant = promptTags.some(pt => {
        if (pt.length <= 3) {
            return false;
        }
        const substringHit = pt.includes(tag) || tag.includes(pt);
        // Morphological stem: shared prefix of length >= 4.
        return substringHit || sharedPrefixLength(pt, tag) >= 4;
    });
    return !hasLiteralVariant;
}
182
/**
 * Shared ranking core: tag relevance + identifier boost, sorted, top 5.
 *
 * Both `matchSolutions` (production, index-backed) and
 * `evaluateSolutionMatcher` (bootstrap eval, fixture-backed) are meant to
 * route through this function so eval metrics follow production behaviour.
 *
 * Contract:
 *  - an identifier contributes a boost only when `id.length >= 4` and its
 *    lowercase form is a substring of `promptLower`;
 *  - candidates with no matched tags and no matched identifiers are dropped;
 *  - results are ordered by `relevance` descending and capped at 5;
 *  - duplicate solution names are NOT deduplicated here — callers wanting
 *    first-wins scope precedence must dedupe themselves.
 *
 * @param promptTags - tags extracted from the prompt
 * @param promptLower - lowercased prompt text, used for phrase masking and
 *   identifier substring checks
 * @param solutions - entries exposing `tags`, `confidence` and optionally
 *   `identifiers`
 * @returns up to five `{ solution, relevance, matchedTags, matchedIdentifiers }`
 */
function rankCandidates(promptTags, promptLower, solutions) {
    // R4-T2: drop tokens belonging to blocked phrases BEFORE any expansion,
    // so neither bigram expansion nor normalization can re-introduce them.
    const maskedPromptTags = maskBlockedTokens(promptLower, promptTags);
    if (maskedPromptTags.length === 0) {
        return [];
    }
    // R4-T1 then T2: add adjacent-token bigram compounds first, then run the
    // canonical normalizer — once per query rather than once per solution.
    //
    // `sol.normalizedTags` is intentionally NOT used for matching: expanding
    // both sides inflates the intersection and shifts every baseline metric.
    const normalizedPromptTags = defaultNormalizer.normalizeTerms(expandQueryBigrams(maskedPromptTags));
    const scoreSolution = (sol) => {
        // R4-T1: solution-side compound expansion (`api-key` also yields its
        // parts). R4-T2: the masked tags go in as the first argument so the
        // union denominator reflects the post-mask tag set.
        const scored = calculateRelevance(maskedPromptTags, sol.tags, sol.confidence, {
            normalizedPromptTags,
            solutionTagsExpanded: expandCompoundTags(sol.tags),
        });
        // Identifier evidence is gathered independently of tag scoring so
        // the R4-T3 rules below cannot discard it.
        const matchedIdentifiers = [];
        let identifierBoost = 0;
        for (const id of sol.identifiers ?? []) {
            if (id.length < 4 || !promptLower.includes(id.toLowerCase())) {
                continue;
            }
            identifierBoost += 0.15;
            matchedIdentifiers.push(id);
        }
        // R4-T3: without any identifier hit, a tag match that fails the
        // specificity rules has its tag evidence zeroed out. An identifier
        // hit bypasses the rules entirely.
        const dropTagEvidence = matchedIdentifiers.length === 0
            && scored.matchedTags.length > 0
            && shouldRejectByR4T3Rules(maskedPromptTags, scored.matchedTags);
        const tagRelevance = dropTagEvidence ? 0 : scored.relevance;
        const matchedTags = dropTagEvidence ? [] : scored.matchedTags;
        return {
            solution: sol,
            relevance: tagRelevance + identifierBoost,
            matchedTags,
            matchedIdentifiers,
        };
    };
    const hasEvidence = (candidate) => candidate.matchedTags.length + candidate.matchedIdentifiers.length >= 1;
    const byRelevanceDesc = (left, right) => right.relevance - left.relevance;
    const survivors = solutions.map(sol => scoreSolution(sol)).filter(hasEvidence);
    survivors.sort(byRelevanceDesc);
    return survivors.slice(0, 5);
}
292
+ /**
293
+ * Round 3 baseline metrics, recorded against the current `term-normalizer`
294
+ * + `calculateRelevance` + fixture `solution-match-bootstrap.json`. Used as
295
+ * a relative regression guard in `tests/solution-matcher-eval.test.ts` —
296
+ * downstream PRs must not regress any field by more than `BASELINE_TOLERANCE`.
297
+ *
298
+ * History (chronological ascending — v1 at top, latest at bottom):
299
+ * - v1 (2026-04-08, fixture v1, 41+10+10 queries): 1.0 / 1.0 / 0.0 / 0.1
300
+ * Recorded against the original 61-query fixture, all positive queries
301
+ * PASS@1. Indicated a measurement plateau but masked the matcher's true
302
+ * ranking and false-positive weaknesses because the fixture queries were
303
+ * too tag-aligned.
304
+ *
305
+ * - v2 (2026-04-08, fixture v2, 53+16+14 queries): 1.0 / 0.969 / 0.0 / 0.357
306
+ * Expanded with 12 hard positive (multi-canonical / compound-tag tug-of-
307
+ * war), 6 Korean subtle paraphrase, and 4 tricky negative queries. The
308
+ * drops are intentional and represent genuine matcher behaviour:
309
+ * * positive mrrAt5 1.0 → 0.959: 4 of 12 added positives rank #2-3:
310
+ * (1) "managing api keys and credentials safely" → secret @3 vs
311
+ * api-error-responses @1 — the `api` canonical in
312
+ * DEFAULT_MATCH_TERMS expands to {api, rest, graphql, endpoint,
313
+ * route}, so query `api` hits BOTH `api` AND `rest` on
314
+ * starter-api-error-responses (matched=['api','rest']) — a
315
+ * double-count numerator. starter-secret-management only scores
316
+ * a single weak partial match on `credential`. The compound
317
+ * `api-key` tag on secret-management is never reached because
318
+ * extractTags strips the query-side hyphen and yields
319
+ * ['api','keys'] (the solution-side tag remains hyphenated in
320
+ * the index but has no query token to intersect with). T4 IDF
321
+ * would down-weight both `api` and `rest`, neutralising the
322
+ * double-count and letting `credential` outscore the noise.
323
+ * (2) "avoiding hardcoded credentials in source code" → secret @2
324
+ * vs code-review @1 — `code` partial-matches `code-review`
325
+ * (len>3, code-review.includes('code')=true) at half weight.
326
+ * secret-management's `credential` matches by partial too but
327
+ * the union size differs.
328
+ * (3) "red green refactor cycle for new features" → tdd @2 vs
329
+ * refactor-safely @1 — `refactor` is a full-weight intersection
330
+ * with both refactor-safely's `refactor` and `리팩토링` (via
331
+ * the refactor canonical), giving 2 hits at 1.0 each. tdd-red-
332
+ * green-refactor only matches the literal compound tag
333
+ * `red-green-refactor` (one weighted hit) — the full-weight
334
+ * generic `refactor` term overpowers the compound-tag specificity.
335
+ * (4) "writing unit tests for a function with side effects" → tdd
336
+ * @2 vs separation-of-concerns @1 — both solutions have a
337
+ * SINGLE matching tag with weighted score 0.5: separation gets
338
+ * `function` (COMMON_TAG, exact intersection, weight 0.5);
339
+ * tdd-red-green-refactor gets `tests` partial-matching `test`
340
+ * (len>3, partial weight 1.0 × 0.5 = 0.5). Both numerators are
341
+ * identical. Separation wins because the `function` co-occurs
342
+ * in both promptTags and solution.tags, shrinking its Jaccard
343
+ * union by one element vs tdd's — a 1-element union-size
344
+ * advantage drives the entire ranking. starter-dependency-
345
+ * injection is *not* in top-5 despite having `testing`/`mock`/
346
+ * `dependency` tags (`tests` does not partial-match `testing`
347
+ * — neither is a substring of the other), so listing `di` in
348
+ * expectAnyOf is purely defensive recall, not a live candidate.
349
+ * T4 BM25 with proper length normalization would attack the
350
+ * union-size tie-breaker more rigorously than current Jaccard.
351
+ * * paraphrase mrrAt5 stays at 1.0: all 6 added Korean paraphrases
352
+ * rank @1 (the originally hard "테스트 먼저 작성하고 리팩토링" is
353
+ * documented in the fixture as legitimately matching either tdd
354
+ * OR refactor-safely, since starter-refactor-safely's README also
355
+ * covers test-first workflows — both are defensible answers).
356
+ * * negativeAnyResultRate 0.1 → 0.357: 4 added tricky negatives all
357
+ * trigger false positives via single common dev-adjacent words —
358
+ * "performance review meeting notes" → caching (matches
359
+ * `performance`), "system architecture overview document" →
360
+ * separation-of-concerns (matches `architecture`), "database backup
361
+ * recovery procedure" → n-plus-one-queries (matches `database`,
362
+ * `query`, `데이터베이스`), "validation of insurance claims" →
363
+ * error-handling (matches `validation`).
364
+ * The original Round 3 plan staged these for T4 (BM25 + IDF). T4 was
365
+ * EMPIRICALLY SKIPPED on 2026-04-08 — see
366
+ * `docs/plans/2026-04-08-t4-bm25-skip-adr.md` for the full decision
367
+ * record. Summary: BM25 prototypes (naive, hybrid Jaccard×IDF,
368
+ * precision filter, soft penalty) all matched or underperformed the
369
+ * current scorer on every metric. The starter corpus (N=15) is too
370
+ * small for IDF to be informative, and the false positives are
371
+ * semantic ("performance" is both a dev tag and an English noun) — not
372
+ * statistical, so no frequency-based weighting can fix them. The real
373
+ * follow-up candidates are tokenizer fix for compound tags, an n-gram
374
+ * phrase matcher, and corpus growth — all deferred to Round 4 per the
375
+ * ADR.
376
+ *
377
+ * - v3 (2026-04-08, fixture v2 + R4-T1 compound-tag fix): 1.0 / 0.986 / 0.0 / 0.357
378
+ * R4-T1 added `expandCompoundTags` (solution-side) and
379
+ * `expandQueryBigrams` (query-side) so hyphenated solution tags like
380
+ * `api-key`, `code-review`, `red-green-refactor` participate in direct
381
+ * intersection rather than relying on the half-weight partialMatches
382
+ * fallback. positive `mrrAt5` improved 0.959 → 0.981 (+0.022). 2 of
383
+ * the 4 v2 hard positive cases were resolved (`managing api keys and
384
+ * credentials safely` and `red green refactor cycle for new features`
385
+ * now rank @1). The remaining 2 (`avoiding hardcoded credentials …`
386
+ * and `writing unit tests for a function with side effects`) require
387
+ * R4-T2 (phrase matcher) or R4-T3 (specificity classifier) — they're
388
+ * about query-side English semantics, not compound-tag tokenization.
389
+ * `negativeAnyResultRate` is unchanged at 0.357 because R4-T1 is a
390
+ * ranking-quality fix, not a false-positive filter.
391
+ *
392
+ * - v4 (2026-04-08, fixture v2 + R4-T1 + R4-T2 phrase blocklist):
393
+ * 1.0 / 0.986 / 0.0 / 0.143
394
+ * R4-T2 added `phrase-blocklist.ts` with 17 curated 2-word English
395
+ * non-dev compounds ("performance review", "system architecture",
396
+ * "database backup", etc.) and a `maskBlockedTokens` step at the
397
+ * top of `rankCandidates` and `searchSolutions`. When a query
398
+ * contains a blocked phrase, the constituent tokens are removed
399
+ * from the prompt tag list before bigram expansion / canonical
400
+ * normalization runs — so the false-positive evidence is removed
401
+ * at the source rather than demoted in scoring.
402
+ *
403
+ * `negativeAnyResultRate` dropped 0.357 → 0.143 (3 of 5 v2 trigger
404
+ * negatives fully blocked):
405
+ * * "performance review meeting notes" — blocked via
406
+ * `performance review` + `meeting notes`
407
+ * * "system architecture overview document" — blocked via
408
+ * `system architecture` + `overview document`
409
+ * * "solar system planets astronomy" — blocked via `solar system`
410
+ *
411
+ * 2 false positives remain (both deferred to R4-T3 query-side
412
+ * specificity classifier — the residuals share a common shape:
413
+ * a single dev-tag homograph survives whatever masking is applied,
414
+ * and the term-normalizer expansion still surfaces a false match):
415
+ *
416
+ * * "database backup recovery procedure" → error-handling-patterns:
417
+ * `database backup` is blocked, but the residual tokens
418
+ * {`recovery`, `procedure`} survive. `recovery` is in the
419
+ * `handling` canonical's matchTerms (intentional, for legitimate
420
+ * "error recovery handler" queries), so the masked query still
421
+ * hits `starter-error-handling-patterns` via the handling
422
+ * family. A `recovery procedure` blocklist entry was
423
+ * considered and rejected — it would silently mask legitimate
424
+ * dev SRE queries like "disaster recovery procedure" or
425
+ * "rollback recovery procedure" without a fixture-driven
426
+ * signal. The right fix is at the query-specificity layer
427
+ * (R4-T3): require ≥ 2 distinct dev-context signals before any
428
+ * match is returned, not at the phrase-blocklist layer.
429
+ *
430
+ * * "validation of insurance claims" → error-handling-patterns:
431
+ * `insurance claim` is blocked, but the residual `validation`
432
+ * token IS a legitimate dev tag (input-validation,
433
+ * error-handling-patterns both have it). Same R4-T3 target.
434
+ *
435
+ * positive/paraphrase mrrAt5 are unchanged from v3 because no
436
+ * legitimate dev query in the fixture contains a blocked phrase.
437
+ *
438
+ * - v5 (2026-04-08, fixture v2 + R4-T1 + R4-T2 + R4-T3 specificity guards):
439
+ * 1.0 / 0.986 / 0.0 / 0.000
440
+ * R4-T3 added two narrow precision rules at the ORCHESTRATION LAYER —
441
+ * NOT inside `calculateRelevance` (which remains a pure scoring
442
+ * function for test symmetry). The rules are implemented as the
443
+ * exported helper `shouldRejectByR4T3Rules(promptTags, matchedTags)`
444
+ * and called from both `rankCandidates` (hook path) and
445
+ * `searchSolutions` (MCP path) right after the per-solution
446
+ * `calculateRelevance` call:
447
+ * (Rule A) single-token query AND single-tag match → reject;
448
+ * (Rule B) single-tag match with no literal hit in the prompt
449
+ * (verbatim match, or substring partial length > 3, or
450
+ * shared prefix ≥ 4 for morphological stems) → reject.
451
+ * Both rules are scoped narrowly enough to fix exactly the 2 R4-T2
452
+ * residuals without recall regression — every fixture positive and
453
+ * paraphrase still ranks identically:
454
+ * * "validation of insurance claims" → masked to `[validation]`
455
+ * (length 1) with single-tag match `validation` → Rule A reject.
456
+ * * "database backup recovery procedure" → masked to
457
+ * `[recovery, procedure]` with single-tag match `handling`
458
+ * (zero literal hit; `handling` is reached via the `recovery`
459
+ * canonical-family expansion in term-normalizer) → Rule B reject.
460
+ * `negativeAnyResultRate` is now 0.000 — every fixture v2 negative
461
+ * produces zero candidates. positive/paraphrase metrics unchanged
462
+ * from v4 because no fixture positive matches the (single-token AND
463
+ * single-tag) or (all-expansion AND single-tag) shape.
464
+ *
465
+ * Escape hatch: identifier-boost evidence (hook path) or name-match
466
+ * evidence (MCP path) BYPASSES the R4-T3 rules. A candidate with
467
+ * even a single weak tag match plus an identifier hit still
468
+ * surfaces — the precision rules only fire when the candidate's
469
+ * entire evidence pool is a single ambiguous tag.
470
+ *
471
+ * Defensive precision note: Rule B's "shared prefix ≥ 4"
472
+ * morphological check is currently NOT fixture-driven (no fixture
473
+ * query masks down to the `caching/cache`-style morphological gap).
474
+ * It exists as a pre-emptive fix against silently rejecting
475
+ * legitimate future queries where the term-normalizer synonym
476
+ * expansion is the only bridge between the query token and the
477
+ * solution tag. If a production query surfaces a case the prefix
478
+ * check misses, extend it (e.g. by lowering the threshold or
479
+ * adding a Levenshtein-1 check) rather than removing it.
480
+ *
481
+ * Known matcher quirks (separate from the T4 BM25 investigation):
482
+ * - `term-normalizer.ts` `error` canonical contains `debug` as a matchTerm
483
+ * (intentional for `bug → error` recall), which causes any prompt
484
+ * containing `error` to expand to `debug` and over-rank
485
+ * `starter-debugging-systematic` on otherwise unrelated queries. This
486
+ * is why `async await error propagation` could not be added as a hard
487
+ * case — the matcher returns debugging-systematic at #1, which is
488
+ * defensible-but-noisy. The fix is at the normalizer level (split
489
+ * `debug` out of the `error` family or remove the `error → debug`
490
+ * edge entirely) and is queued as a Round 4 follow-up. T4 BM25 was
491
+ * considered as a partial mitigation but the T4 skip ADR (referenced
492
+ * in the Round 3 outcome paragraph above) shows it does not help.
493
+ *
494
+ * Long-tail caveat:
495
+ * - `"trying to handle authentication errors gracefully when our backend
496
+ * api returns inconsistent response formats from different
497
+ * microservices"` is a 17-word query intentionally added to exercise
498
+ * long-tail behaviour. Currently PASS@1. Originally flagged as BM25
499
+ * length-normalization sensitive, but since T4 BM25 was skipped this
500
+ * caveat is now informational only — no length-norm code path is
501
+ * planned in Round 3.
502
+ *
503
+ * If a PR legitimately improves a metric, update this constant in the same
504
+ * commit so future PRs guard against the new floor.
505
+ */
506
export const ROUND3_BASELINE = {
  // Headline figures, same field order as the evaluator's return value.
  recallAt5: 1.0,
  mrrAt5: 0.986,
  noResultRate: 0.0,
  negativeAnyResultRate: 0.0,

  // Per-bucket breakdown for the two buckets that carry expected answers.
  byBucket: {
    positive: {
      recallAt5: 1.0,
      mrrAt5: 0.981,
      noResultRate: 0.0,
      total: 53,
    },
    paraphrase: {
      recallAt5: 1.0,
      mrrAt5: 1.0,
      noResultRate: 0.0,
      total: 16,
    },
  },

  // Number of fixture queries in each bucket when the baseline was recorded.
  total: {
    positive: 53,
    paraphrase: 16,
    negative: 14,
  },
};
517
/**
 * Largest absolute drop any single baseline metric may show before the
 * regression guard fails.
 *
 * With 69 queries in the combined positive + paraphrase bucket, 0.05 works
 * out to roughly 3-4 queries: small enough to flag a real regression, large
 * enough that editing one fixture entry does not trip the guard by accident.
 */
export const BASELINE_TOLERANCE = 0.05;
522
/**
 * Feed every query of one fixture bucket through `rankCandidates` and
 * summarise the outcome as retrieval metrics.
 *
 * Per query:
 *  - no candidates at all        → counts toward `noResultRate`;
 *  - an expected name is ranked  → counts toward `recallAt5` and adds
 *    1 / rank (1-based, first expected hit only) to the reciprocal-rank sum;
 *  - candidates but none expected → contributes to neither.
 *
 * @param {Array<{query: string, expectAnyOf: string[]}>} queries - labelled queries of the bucket.
 * @param {Array<object>} solutions - solution set handed to `rankCandidates` unchanged.
 * @returns {{recallAt5: number, mrrAt5: number, noResultRate: number, total: number}}
 *   Rates are fractions of `queries.length`; all three are 0 for an empty bucket.
 */
function computeBucketMetrics(queries, solutions) {
  const tally = { hits: 0, reciprocalRanks: 0, empty: 0 };

  for (const { query, expectAnyOf } of queries) {
    const candidates = rankCandidates(extractTags(query), query.toLowerCase(), solutions);
    if (candidates.length === 0) {
      tally.empty += 1;
      continue;
    }
    // Position of the first candidate whose name is an accepted answer.
    const firstHit = candidates.findIndex((c) => expectAnyOf.includes(c.solution.name));
    if (firstHit !== -1) {
      tally.hits += 1;
      tally.reciprocalRanks += 1 / (firstHit + 1);
    }
  }

  const total = queries.length;
  // Guard against division by zero for an empty bucket.
  const perQuery = (amount) => (total > 0 ? amount / total : 0);

  return {
    recallAt5: perQuery(tally.hits),
    mrrAt5: perQuery(tally.reciprocalRanks),
    noResultRate: perQuery(tally.empty),
    total,
  };
}
550
/**
 * Test/diagnostic helper: rank a single query against a solution set and
 * return a trimmed view of the ranked candidates.
 *
 * The query is tagged with `extractTags`, lower-cased, and passed to
 * `rankCandidates`, the same ranking function `matchSolutions` uses, so
 * per-query assertions exercise the same ranker. Each candidate is reduced to
 * its solution name, relevance score and matched tags; the solution object
 * itself is dropped.
 *
 * @param {string} query - raw query text.
 * @param {Array<object>} solutions - candidate solutions to rank.
 * @returns {Array<{name: string, relevance: number, matchedTags: string[]}>}
 *   Candidates in the order `rankCandidates` returned them.
 */
export function evaluateQuery(query, solutions) {
  const promptTags = extractTags(query);
  const ranked = rankCandidates(promptTags, query.toLowerCase(), solutions);
  return ranked.map(({ solution, relevance, matchedTags }) => ({
    name: solution.name,
    relevance,
    matchedTags,
  }));
}
571
/**
 * Score the current matcher against a labelled fixture and return retrieval
 * metrics suitable for comparison with a recorded baseline.
 *
 * The positive and paraphrase buckets are each evaluated with
 * `computeBucketMetrics`; the negative bucket is evaluated here by counting
 * queries for which `rankCandidates` returns at least one candidate. All
 * ranking goes through `rankCandidates`, the same function `matchSolutions`
 * calls.
 *
 * @param {{positive: Array, paraphrase: Array, negative: Array, solutions: Array}} fixture
 *   Labelled queries per bucket plus the solution set to rank against.
 * @returns {{
 *   recallAt5: number, mrrAt5: number, noResultRate: number,
 *   negativeAnyResultRate: number,
 *   byBucket: {positive: object, paraphrase: object},
 *   total: {positive: number, paraphrase: number, negative: number}
 * }} Top-level rates pool positive and paraphrase queries; `byBucket` keeps
 *   the two buckets separate so a regression confined to one of them stays
 *   visible even when the pooled figure barely moves.
 */
export function evaluateSolutionMatcher(fixture) {
  const positiveM = computeBucketMetrics(fixture.positive, fixture.solutions);
  const paraphraseM = computeBucketMetrics(fixture.paraphrase, fixture.solutions);
  const combinedTotal = positiveM.total + paraphraseM.total;

  // Pool the two buckets weighted by query count: each bucket's rate is
  // scaled back up by its size, so every query carries the same weight in
  // the aggregate regardless of which bucket it came from.
  const pooled = (metric) => (combinedTotal > 0
    ? (positiveM[metric] * positiveM.total + paraphraseM[metric] * paraphraseM.total) / combinedTotal
    : 0);

  // Negatives have no expected answer: any returned candidate is a false positive.
  let negAnyResult = 0;
  for (const { query } of fixture.negative) {
    const ranked = rankCandidates(extractTags(query), query.toLowerCase(), fixture.solutions);
    if (ranked.length >= 1) {
      negAnyResult += 1;
    }
  }
  const negTotal = fixture.negative.length;

  return {
    recallAt5: pooled('recallAt5'),
    mrrAt5: pooled('mrrAt5'),
    noResultRate: pooled('noResultRate'),
    negativeAnyResultRate: negTotal > 0 ? negAnyResult / negTotal : 0,
    byBucket: {
      positive: positiveM,
      paraphrase: paraphraseM,
    },
    total: {
      positive: fixture.positive.length,
      paraphrase: fixture.paraphrase.length,
      negative: negTotal,
    },
  };
}
623
/**
 * Find the stored solutions most relevant to a prompt across the personal,
 * team (optional) and project scopes.
 *
 * @param {string} prompt - raw prompt text; tagged with `extractTags` and
 *   also passed lower-cased to the ranker.
 * @param {{team?: {name: string}}} scope - when `scope.team` is set, that
 *   team's pack directory is searched as well.
 * @param {string} cwd - project root; `<cwd>/.compound/solutions` is searched.
 * @returns {Array<object>} One record per ranked candidate, in ranker order,
 *   with solution metadata, `relevance`, and `matchedTags` (tag matches
 *   followed by identifier matches).
 */
export function matchSolutions(prompt, scope, cwd) {
  // Directories handed to the index lookup, in me → team → project order.
  const teamDirs = scope.team
    ? [{ dir: path.join(PACKS_DIR, scope.team.name, 'solutions'), scope: 'team' }]
    : [];
  const dirs = [
    { dir: ME_SOLUTIONS, scope: 'me' },
    ...teamDirs,
    { dir: path.join(cwd, '.compound', 'solutions'), scope: 'project' },
  ];

  // Shallow-copy each index entry so the ranker works on its own objects
  // rather than the ones held by the index.
  const { entries } = getOrBuildIndex(dirs);
  const allSolutions = entries.map((entry) => ({ ...entry }));

  const promptTags = extractTags(prompt);
  const ranked = rankCandidates(promptTags, prompt.toLowerCase(), allSolutions);

  // Each candidate carries its own solution object, so metadata is read
  // straight off it. There is no lookup by name, which means same-named
  // solutions from different scopes stay distinct in the output.
  return ranked.map(({ solution, relevance, matchedTags, matchedIdentifiers }) => ({
    name: solution.name,
    path: solution.filePath,
    scope: solution.scope,
    relevance,
    summary: solution.name,
    status: solution.status,
    confidence: solution.confidence,
    type: solution.type,
    tags: solution.tags,
    identifiers: solution.identifiers,
    matchedTags: [...matchedTags, ...matchedIdentifiers],
  }));
}