pcl-mcp 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +137 -28
- package/dist/benchmarks/evaluators/context-retrieval-quality.d.ts +30 -0
- package/dist/benchmarks/evaluators/context-retrieval-quality.d.ts.map +1 -0
- package/dist/benchmarks/evaluators/context-retrieval-quality.js +50 -0
- package/dist/benchmarks/evaluators/context-retrieval-quality.js.map +1 -0
- package/dist/benchmarks/evaluators/ir-metrics.d.ts +32 -0
- package/dist/benchmarks/evaluators/ir-metrics.d.ts.map +1 -0
- package/dist/benchmarks/evaluators/ir-metrics.js +98 -0
- package/dist/benchmarks/evaluators/ir-metrics.js.map +1 -0
- package/dist/benchmarks/evaluators/structured-judge.d.ts +34 -0
- package/dist/benchmarks/evaluators/structured-judge.d.ts.map +1 -0
- package/dist/benchmarks/evaluators/structured-judge.js +153 -0
- package/dist/benchmarks/evaluators/structured-judge.js.map +1 -0
- package/dist/benchmarks/evaluators/token-counter.d.ts +9 -0
- package/dist/benchmarks/evaluators/token-counter.d.ts.map +1 -0
- package/dist/benchmarks/evaluators/token-counter.js +24 -0
- package/dist/benchmarks/evaluators/token-counter.js.map +1 -0
- package/dist/benchmarks/generators/generate-corpus.d.ts +2 -0
- package/dist/benchmarks/generators/generate-corpus.d.ts.map +1 -0
- package/dist/benchmarks/generators/generate-corpus.js +243 -0
- package/dist/benchmarks/generators/generate-corpus.js.map +1 -0
- package/dist/benchmarks/lib/harness.d.ts +23 -0
- package/dist/benchmarks/lib/harness.d.ts.map +1 -0
- package/dist/benchmarks/lib/harness.js +44 -0
- package/dist/benchmarks/lib/harness.js.map +1 -0
- package/dist/benchmarks/lib/types.d.ts +79 -0
- package/dist/benchmarks/lib/types.d.ts.map +1 -0
- package/dist/benchmarks/lib/types.js +2 -0
- package/dist/benchmarks/lib/types.js.map +1 -0
- package/dist/benchmarks/reporters/markdown-reporter.d.ts +2 -0
- package/dist/benchmarks/reporters/markdown-reporter.d.ts.map +1 -0
- package/dist/benchmarks/reporters/markdown-reporter.js +80 -0
- package/dist/benchmarks/reporters/markdown-reporter.js.map +1 -0
- package/dist/benchmarks/runners/bench-ablation.d.ts +2 -0
- package/dist/benchmarks/runners/bench-ablation.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-ablation.js +49 -0
- package/dist/benchmarks/runners/bench-ablation.js.map +1 -0
- package/dist/benchmarks/runners/bench-ai-quality.d.ts +2 -0
- package/dist/benchmarks/runners/bench-ai-quality.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-ai-quality.js +297 -0
- package/dist/benchmarks/runners/bench-ai-quality.js.map +1 -0
- package/dist/benchmarks/runners/bench-interactive-eval.d.ts +2 -0
- package/dist/benchmarks/runners/bench-interactive-eval.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-interactive-eval.js +119 -0
- package/dist/benchmarks/runners/bench-interactive-eval.js.map +1 -0
- package/dist/benchmarks/runners/bench-performance.bench.d.ts +2 -0
- package/dist/benchmarks/runners/bench-performance.bench.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-performance.bench.js +50 -0
- package/dist/benchmarks/runners/bench-performance.bench.js.map +1 -0
- package/dist/benchmarks/runners/bench-search-quality.d.ts +2 -0
- package/dist/benchmarks/runners/bench-search-quality.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-search-quality.js +70 -0
- package/dist/benchmarks/runners/bench-search-quality.js.map +1 -0
- package/dist/benchmarks/runners/bench-token-efficiency.d.ts +2 -0
- package/dist/benchmarks/runners/bench-token-efficiency.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-token-efficiency.js +89 -0
- package/dist/benchmarks/runners/bench-token-efficiency.js.map +1 -0
- package/dist/benchmarks/runners/diag.d.ts +2 -0
- package/dist/benchmarks/runners/diag.d.ts.map +1 -0
- package/dist/benchmarks/runners/diag.js +30 -0
- package/dist/benchmarks/runners/diag.js.map +1 -0
- package/dist/benchmarks/vitest.config.bench.d.ts +3 -0
- package/dist/benchmarks/vitest.config.bench.d.ts.map +1 -0
- package/dist/benchmarks/vitest.config.bench.js +14 -0
- package/dist/benchmarks/vitest.config.bench.js.map +1 -0
- package/dist/bin/pcl.js +36 -23
- package/dist/bin/pcl.js.map +1 -1
- package/dist/src/db.d.ts +2 -1
- package/dist/src/db.d.ts.map +1 -1
- package/dist/src/db.js +25 -21
- package/dist/src/db.js.map +1 -1
- package/dist/src/embeddings.d.ts +1 -1
- package/dist/src/embeddings.js +2 -2
- package/dist/src/embeddings.js.map +1 -1
- package/dist/src/indexer.d.ts +1 -1
- package/dist/src/indexer.d.ts.map +1 -1
- package/dist/src/indexer.js +6 -2
- package/dist/src/indexer.js.map +1 -1
- package/dist/src/search.d.ts.map +1 -1
- package/dist/src/search.js +138 -26
- package/dist/src/search.js.map +1 -1
- package/dist/src/server.js +6 -0
- package/dist/src/server.js.map +1 -1
- package/dist/src/types.d.ts +1 -0
- package/dist/src/types.d.ts.map +1 -1
- package/dist/tests/db.test.d.ts +2 -0
- package/dist/tests/db.test.d.ts.map +1 -0
- package/dist/tests/db.test.js +459 -0
- package/dist/tests/db.test.js.map +1 -0
- package/dist/tests/embeddings.test.d.ts +2 -0
- package/dist/tests/embeddings.test.d.ts.map +1 -0
- package/dist/tests/embeddings.test.js +165 -0
- package/dist/tests/embeddings.test.js.map +1 -0
- package/dist/tests/helpers/test-harness.d.ts +26 -0
- package/dist/tests/helpers/test-harness.d.ts.map +1 -0
- package/dist/tests/helpers/test-harness.js +80 -0
- package/dist/tests/helpers/test-harness.js.map +1 -0
- package/dist/tests/indexer.test.d.ts +2 -0
- package/dist/tests/indexer.test.d.ts.map +1 -0
- package/dist/tests/indexer.test.js +299 -0
- package/dist/tests/indexer.test.js.map +1 -0
- package/dist/tests/schemas.test.d.ts +2 -0
- package/dist/tests/schemas.test.d.ts.map +1 -0
- package/dist/tests/schemas.test.js +378 -0
- package/dist/tests/schemas.test.js.map +1 -0
- package/dist/tests/search.test.d.ts +2 -0
- package/dist/tests/search.test.d.ts.map +1 -0
- package/dist/tests/search.test.js +129 -0
- package/dist/tests/search.test.js.map +1 -0
- package/dist/tests/tools.test.d.ts +2 -0
- package/dist/tests/tools.test.d.ts.map +1 -0
- package/dist/tests/tools.test.js +232 -0
- package/dist/tests/tools.test.js.map +1 -0
- package/package.json +14 -2
package/README.md
CHANGED
|
@@ -1,17 +1,24 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
1
3
|
# PCL — Product Context Layer
|
|
2
4
|
|
|
3
5
|
**Give AI coding agents persistent, structured knowledge of your product.**
|
|
4
6
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
7
|
+
[pcl-mcp on npm](https://www.npmjs.com/package/pcl-mcp)
|
|
8
|
+
[](https://www.npmjs.com/package/pcl-mcp)
|
|
9
|
+
[Node.js](https://nodejs.org)
|
|
10
|
+
[License](LICENSE)
|
|
8
11
|
|
|
9
12
|
```bash
|
|
10
|
-
|
|
11
|
-
npx pcl init
|
|
12
|
-
# add MCP config (see Agent Configuration below), then start a new agent session
|
|
13
|
+
npx pcl-mcp init
|
|
13
14
|
```
|
|
14
15
|
|
|
16
|
+
</div>
|
|
17
|
+
|
|
18
|
+
Instead of re-explaining your personas, journeys, and architecture decisions every session, PCL serves them via MCP on demand. Any agent (Claude Code, Cursor, Windsurf) queries exactly what it needs, when it needs it.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
15
22
|
## Why PCL?
|
|
16
23
|
|
|
17
24
|
**Without PCL**, every coding session starts from scratch:
|
|
@@ -34,21 +41,40 @@ You ask your agent: *"Build the checkout flow"*
|
|
|
34
41
|
|
|
35
42
|
**With PCL:** The agent auto-loads critical billing rules at session start (~200 tokens). When it starts the checkout feature, it pulls the relevant persona, fetches the journey steps, and checks the spec's acceptance criteria — all on-demand, only what's needed. Every session, automatically.
|
|
36
43
|
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## Quick Start
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
npm install pcl-mcp
|
|
50
|
+
npx pcl init # prompts before adding example files, sets up CLAUDE.md
|
|
51
|
+
# add MCP config (see Agent Configuration below), then start a new agent session
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
37
56
|
## Stack
|
|
38
57
|
|
|
39
58
|
| Layer | Technology | Why |
|
|
40
59
|
|---|---|---|
|
|
41
60
|
| Protocol | MCP (stdio) | Universal — works with every major agent |
|
|
42
61
|
| Storage | SQLite + FTS5 | Zero infra, git-friendly, offline |
|
|
43
|
-
| Keyword search | BM25 via FTS5 | Best-in-class for exact
|
|
44
|
-
| Semantic search | `all-
|
|
45
|
-
|
|
|
46
|
-
|
|
|
62
|
+
| Keyword search | BM25 via FTS5 (title-weighted 10×) | Best-in-class for exact terms, IDs, proper nouns |
|
|
63
|
+
| Semantic search | `all-mpnet-base-v2` (local, 768d) | Higher quality than MiniLM, zero API cost, ~3ms/doc |
|
|
64
|
+
| Embedding strategy | Split body + title embeddings | Separate semantic channels for body and title matching |
|
|
65
|
+
| Hybrid fusion | Adaptive RRF (corpus-size-aware k) | Better recall on both small and large corpora |
|
|
66
|
+
| Score filtering | 15% gap threshold | Prevents low-quality tail results from surfacing |
|
|
67
|
+
| Cross-references | Auto frontmatter link resolution | Pulls related files into results automatically |
|
|
68
|
+
| Validation | Zod schemas | Agents get predictable, parseable frontmatter |
|
|
47
69
|
| File watching | Chokidar v4 | Live reindex on save |
|
|
48
70
|
|
|
71
|
+
---
|
|
72
|
+
|
|
49
73
|
## Prerequisites
|
|
50
74
|
|
|
51
|
-
|
|
75
|
+
**Node.js >= 22** (required — PCL uses modern Node APIs)
|
|
76
|
+
|
|
77
|
+
---
|
|
52
78
|
|
|
53
79
|
## Install
|
|
54
80
|
|
|
@@ -59,6 +85,8 @@ npx pcl init # creates ./product with templates
|
|
|
59
85
|
|
|
60
86
|
Also available on GitHub Packages as `@michaelgorski/pcl-mcp`.
|
|
61
87
|
|
|
88
|
+
---
|
|
89
|
+
|
|
62
90
|
## Import existing docs
|
|
63
91
|
|
|
64
92
|
If you already have markdown documentation in your repo, PCL can scan, classify, and import it automatically:
|
|
@@ -77,8 +105,12 @@ The scanner:
|
|
|
77
105
|
|
|
78
106
|
Supported classifications: **persona**, **journey**, **spec**, **decision**, **domain**, **product**
|
|
79
107
|
|
|
108
|
+
---
|
|
109
|
+
|
|
80
110
|
## Agent configuration
|
|
81
111
|
|
|
112
|
+
Works with any MCP-compatible agent. Configuration examples below.
|
|
113
|
+
|
|
82
114
|
### Claude Code — `.claude/mcp.json`
|
|
83
115
|
```json
|
|
84
116
|
{
|
|
@@ -113,6 +145,8 @@ Supported classifications: **persona**, **journey**, **spec**, **decision**, **d
|
|
|
113
145
|
}
|
|
114
146
|
```
|
|
115
147
|
|
|
148
|
+
---
|
|
149
|
+
|
|
116
150
|
## File structure
|
|
117
151
|
|
|
118
152
|
```
|
|
@@ -131,19 +165,23 @@ Supported classifications: **persona**, **journey**, **spec**, **decision**, **d
|
|
|
131
165
|
.pcl.db ← SQLite index (auto-generated, gitignore this)
|
|
132
166
|
```
|
|
133
167
|
|
|
168
|
+
---
|
|
169
|
+
|
|
134
170
|
## Tools available to agents
|
|
135
171
|
|
|
136
172
|
| Tool | Params | Description |
|
|
137
173
|
|---|---|---|
|
|
138
174
|
| `pcl_product_summary` | — | Load the product north-star document. Call at session start. |
|
|
139
|
-
| `pcl_get_persona
|
|
140
|
-
| `pcl_get_journey
|
|
141
|
-
| `pcl_get_spec
|
|
142
|
-
| `pcl_get_decision
|
|
143
|
-
| `pcl_get_domain
|
|
144
|
-
| `pcl_list
|
|
145
|
-
| `pcl_search
|
|
146
|
-
| `pcl_related
|
|
175
|
+
| `pcl_get_persona` | `id` | Get a user persona by ID. Call before any user-facing feature. |
|
|
176
|
+
| `pcl_get_journey` | `id` | Get a user journey by ID including step-by-step detail. |
|
|
177
|
+
| `pcl_get_spec` | `id` | Get a feature spec by ID including acceptance criteria. |
|
|
178
|
+
| `pcl_get_decision` | `id` | Get an architecture decision record (ADR) by ID. |
|
|
179
|
+
| `pcl_get_domain` | `id` or `"*critical"` | Get domain rules by ID. Pass `"*critical"` to load all critical rules. |
|
|
180
|
+
| `pcl_list` | `type`: `"personas"` \| `"journeys"` \| `"specs"` \| `"decisions"` \| `"domain"` | List all files of a given type with IDs, titles, and summaries. |
|
|
181
|
+
| `pcl_search` | `query`, `mode?` (`"hybrid"` \| `"semantic"` \| `"keyword"`), `types?`, `top_k?` | Hybrid semantic + keyword search across all product files. |
|
|
182
|
+
| `pcl_related` | `id`, `top_k?` | Find files semantically related to a given file ID. |
|
|
183
|
+
|
|
184
|
+
---
|
|
147
185
|
|
|
148
186
|
## Prompts & Resources
|
|
149
187
|
|
|
@@ -153,25 +191,92 @@ In addition to tools, PCL exposes MCP prompts and resources:
|
|
|
153
191
|
|
|
154
192
|
**Resources: `pcl://files/{type}/{id}`** — Each indexed file is available as an MCP resource. Agents can browse and read individual files directly via the resource URI (e.g., `pcl://files/persona/example-user`).
|
|
155
193
|
|
|
194
|
+
---
|
|
195
|
+
|
|
156
196
|
## How hybrid search works
|
|
157
197
|
|
|
198
|
+
PCL runs three parallel retrieval signals and fuses them with Reciprocal Rank Fusion:
|
|
199
|
+
|
|
158
200
|
```
|
|
159
201
|
query: "what does Max find frustrating about onboarding"
|
|
160
202
|
|
|
161
|
-
BM25 (FTS5
|
|
162
|
-
|
|
203
|
+
BM25 (FTS5, title-weighted 10×):
|
|
204
|
+
→ persona-max, journey-onboarding, spec-magic-link
|
|
205
|
+
↓ ranked by bm25(title=10×, body=1×) — exact terms, IDs, proper nouns
|
|
206
|
+
|
|
207
|
+
Semantic — body embedding (all-mpnet-base-v2, 768d):
|
|
208
|
+
→ journey-onboarding, persona-max, domain-core-rules
|
|
209
|
+
↓ cosine similarity on full-text embedding
|
|
163
210
|
|
|
164
|
-
|
|
165
|
-
|
|
211
|
+
Semantic — title embedding (all-mpnet-base-v2, 768d):
|
|
212
|
+
→ persona-max, journey-onboarding, spec-onboarding-ux
|
|
213
|
+
↓ cosine similarity on title + summary embedding
|
|
166
214
|
|
|
167
|
-
RRF
|
|
168
|
-
|
|
215
|
+
Adaptive RRF (k = corpus_size / 10):
|
|
216
|
+
score(d) = Σ 1 / (k + rank(d)) fused across all three lists
|
|
169
217
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
218
|
+
Score gap filter (15% threshold):
|
|
219
|
+
Drops results below 0.15 × top_score — removes noise
|
|
220
|
+
|
|
221
|
+
Cross-reference resolution:
|
|
222
|
+
journey-onboarding.frontmatter.persona = "max"
|
|
223
|
+
→ auto-includes persona-max even if it ranked outside top-k
|
|
224
|
+
|
|
225
|
+
Result: 1. journey-onboarding (0.94)
|
|
226
|
+
2. persona-max (0.87)
|
|
227
|
+
3. spec-onboarding-ux (0.71)
|
|
173
228
|
```
|
|
174
229
|
|
|
230
|
+
**Why split embeddings?** Body and title carry different semantic signals. A query like *"checkout persona"* should match a persona file by title even if its body content is mostly demographic data. Indexing them separately gives the fusion step two distinct semantic channels rather than one diluted one.
|
|
231
|
+
|
|
232
|
+
**Why adaptive RRF k?** Fixed k=60 over-smooths rankings on small corpora (10–20 files). Corpus-aware k scales down on small collections to let strong matches separate from weak ones.
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## Testing & Benchmarks
|
|
237
|
+
|
|
238
|
+
PCL ships with a full test suite and a multi-dimensional benchmark framework.
|
|
239
|
+
|
|
240
|
+
### Tests
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
npm test # run all tests (vitest)
|
|
244
|
+
npm run test:watch # watch mode
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
Six test suites cover the full stack:
|
|
248
|
+
|
|
249
|
+
| Suite | Coverage |
|
|
250
|
+
|---|---|
|
|
251
|
+
| `db.test.ts` | SQLite operations, FTS5 queries, embedding storage |
|
|
252
|
+
| `embeddings.test.ts` | Embedding generation, cache hits, dimension checks |
|
|
253
|
+
| `indexer.test.ts` | File discovery, schema extraction, change detection |
|
|
254
|
+
| `schemas.test.ts` | Zod frontmatter validation for all file types |
|
|
255
|
+
| `search.test.ts` | Hybrid search, RRF, multi-hop decomposition, cross-refs |
|
|
256
|
+
| `tools.test.ts` | MCP tool handlers, response formatting, error paths |
|
|
257
|
+
|
|
258
|
+
### Benchmarks
|
|
259
|
+
|
|
260
|
+
```bash
|
|
261
|
+
npm run bench # all benchmarks
|
|
262
|
+
npm run bench:perf # latency benchmarks (search + embedding speed)
|
|
263
|
+
npm run bench:quality # search quality: Precision@k, Recall@k, NDCG, MRR
|
|
264
|
+
npm run bench:tokens # token efficiency across search modes
|
|
265
|
+
npm run bench:ablation # hybrid vs keyword-only vs semantic-only comparison
|
|
266
|
+
npm run bench:ai # Claude-judged result quality (requires ANTHROPIC_API_KEY)
|
|
267
|
+
npm run bench:report # generate markdown report from results
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
| Suite | Measures |
|
|
271
|
+
|---|---|
|
|
272
|
+
| Performance | Search + embedding latency (p50/p95) |
|
|
273
|
+
| Search quality | Precision@k, Recall@k, NDCG, MRR on labeled corpus |
|
|
274
|
+
| Token efficiency | Tokens consumed per query across search modes |
|
|
275
|
+
| Ablation | Quality delta: hybrid vs keyword-only vs semantic-only |
|
|
276
|
+
| AI quality | Claude-judged relevance score for top-k results |
|
|
277
|
+
|
|
278
|
+
---
|
|
279
|
+
|
|
175
280
|
## Human workflow
|
|
176
281
|
|
|
177
282
|
The system is only as good as what you put in. Discipline:
|
|
@@ -184,12 +289,16 @@ The system is only as good as what you put in. Discipline:
|
|
|
184
289
|
|
|
185
290
|
The agent does the rest.
|
|
186
291
|
|
|
292
|
+
---
|
|
293
|
+
|
|
187
294
|
## Gitignore
|
|
188
295
|
|
|
189
296
|
```gitignore
|
|
190
297
|
product/.pcl.db # SQLite index — auto-regenerated
|
|
191
298
|
```
|
|
192
299
|
|
|
300
|
+
---
|
|
301
|
+
|
|
193
302
|
## License
|
|
194
303
|
|
|
195
304
|
MIT
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Context Retrieval Quality — measures if PCL retrieves the RIGHT documents.
|
|
3
|
+
* Disentangles "did PCL find the right docs" from "did the LLM use them well."
|
|
4
|
+
*/
|
|
5
|
+
export interface ContextMetrics {
|
|
6
|
+
recall: number;
|
|
7
|
+
precision: number;
|
|
8
|
+
f1: number;
|
|
9
|
+
retrieved: string[];
|
|
10
|
+
required: string[];
|
|
11
|
+
hits: string[];
|
|
12
|
+
misses: string[];
|
|
13
|
+
noise: string[];
|
|
14
|
+
}
|
|
15
|
+
/**
|
|
16
|
+
* Compute context retrieval quality metrics.
|
|
17
|
+
*
|
|
18
|
+
* @param retrievedIds - Document IDs returned by PCL search
|
|
19
|
+
* @param requiredIds - Document IDs that the task actually needs
|
|
20
|
+
*/
|
|
21
|
+
export declare function measureContextRetrieval(retrievedIds: string[], requiredIds: string[]): ContextMetrics;
|
|
22
|
+
/**
|
|
23
|
+
* Aggregate context metrics across multiple tasks.
|
|
24
|
+
*/
|
|
25
|
+
export declare function averageContextMetrics(metrics: ContextMetrics[]): {
|
|
26
|
+
recall: number;
|
|
27
|
+
precision: number;
|
|
28
|
+
f1: number;
|
|
29
|
+
};
|
|
30
|
+
//# sourceMappingURL=context-retrieval-quality.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"context-retrieval-quality.d.ts","sourceRoot":"","sources":["../../../benchmarks/evaluators/context-retrieval-quality.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,EAAE,EAAE,MAAM,CAAC;IACX,SAAS,EAAE,MAAM,EAAE,CAAC;IACpB,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,IAAI,EAAE,MAAM,EAAE,CAAC;IACf,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,KAAK,EAAE,MAAM,EAAE,CAAC;CACjB;AAED;;;;;GAKG;AACH,wBAAgB,uBAAuB,CACrC,YAAY,EAAE,MAAM,EAAE,EACtB,WAAW,EAAE,MAAM,EAAE,GACpB,cAAc,CA0BhB;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACnC,OAAO,EAAE,cAAc,EAAE,GACxB;IAAE,MAAM,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,EAAE,EAAE,MAAM,CAAA;CAAE,CAenD"}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Context Retrieval Quality — measures if PCL retrieves the RIGHT documents.
|
|
3
|
+
* Disentangles "did PCL find the right docs" from "did the LLM use them well."
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Compute context retrieval quality metrics.
 *
 * Metrics are computed over *distinct* IDs: a document listed twice in either
 * input is counted once, so recall, precision and F1 always stay within [0, 1].
 *
 * @param retrievedIds - Document IDs returned by PCL search
 * @param requiredIds - Document IDs that the task actually needs
 * @returns recall, precision and F1, the two input arrays echoed back as
 *   `retrieved` / `required`, and the distinct `hits` (required and retrieved),
 *   `misses` (required but not retrieved) and `noise` (retrieved but not required).
 */
export function measureContextRetrieval(retrievedIds, requiredIds) {
    const retrievedSet = new Set(retrievedIds);
    const requiredSet = new Set(requiredIds);
    // Iterate the sets (not the raw arrays) so duplicates cannot inflate counts.
    // Sets iterate in insertion order, so first-occurrence order is preserved.
    const hits = [...requiredSet].filter((id) => retrievedSet.has(id));
    const misses = [...requiredSet].filter((id) => !retrievedSet.has(id));
    const noise = [...retrievedSet].filter((id) => !requiredSet.has(id));
    // An empty denominator is treated as a vacuous perfect score.
    const recall = requiredSet.size > 0 ? hits.length / requiredSet.size : 1;
    const precision = retrievedSet.size > 0 ? hits.length / retrievedSet.size : 1;
    // Harmonic mean; guarded so 0/0 yields 0 rather than NaN.
    const f1 = recall + precision > 0
        ? (2 * recall * precision) / (recall + precision)
        : 0;
    return {
        recall,
        precision,
        f1,
        retrieved: retrievedIds,
        required: requiredIds,
        hits,
        misses,
        noise,
    };
}
|
|
33
|
+
/**
 * Average recall, precision and F1 over a list of per-task context metrics.
 *
 * @param metrics - Per-task results, each carrying `recall`, `precision` and `f1`.
 * @returns The arithmetic mean of each field; all zeros when `metrics` is empty.
 */
export function averageContextMetrics(metrics) {
    const count = metrics.length;
    if (count === 0) {
        return { recall: 0, precision: 0, f1: 0 };
    }
    let recallTotal = 0;
    let precisionTotal = 0;
    let f1Total = 0;
    for (const entry of metrics) {
        recallTotal += entry.recall;
        precisionTotal += entry.precision;
        f1Total += entry.f1;
    }
    return {
        recall: recallTotal / count,
        precision: precisionTotal / count,
        f1: f1Total / count,
    };
}
|
|
50
|
+
//# sourceMappingURL=context-retrieval-quality.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"context-retrieval-quality.js","sourceRoot":"","sources":["../../../benchmarks/evaluators/context-retrieval-quality.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAaH;;;;;GAKG;AACH,MAAM,UAAU,uBAAuB,CACrC,YAAsB,EACtB,WAAqB;IAErB,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,YAAY,CAAC,CAAC;IAC3C,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,WAAW,CAAC,CAAC;IAEzC,MAAM,IAAI,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IAC9D,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IACjE,MAAM,KAAK,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,WAAW,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IAEhE,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7E,MAAM,SAAS,GACb,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAClE,MAAM,EAAE,GACN,MAAM,GAAG,SAAS,GAAG,CAAC;QACpB,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,MAAM,GAAG,SAAS,CAAC;QACjD,CAAC,CAAC,CAAC,CAAC;IAER,OAAO;QACL,MAAM;QACN,SAAS;QACT,EAAE;QACF,SAAS,EAAE,YAAY;QACvB,QAAQ,EAAE,WAAW;QACrB,IAAI;QACJ,MAAM;QACN,KAAK;KACN,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACnC,OAAyB;IAEzB,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC;IACpE,MAAM,GAAG,GAAG,OAAO,CAAC,MAAM,CACxB,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QACX,MAAM,EAAE,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM;QAC7B,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS;QACtC,EAAE,EAAE,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC,EAAE;KAClB,CAAC,EACF,EAAE,MAAM,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CACnC,CAAC;IACF,OAAO;QACL,MAAM,EAAE,GAAG,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM;QACnC,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,OAAO,CAAC,MAAM;QACzC,EAAE,EAAE,GAAG,CAAC,EAAE,GAAG,OAAO,CAAC,MAAM;KAC5B,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Precision@K: fraction of top-K results that are relevant.
|
|
3
|
+
*/
|
|
4
|
+
export declare function precisionAtK(retrieved: string[], relevant: Set<string>, k: number): number;
|
|
5
|
+
/**
|
|
6
|
+
* Recall@K: fraction of relevant docs found in top-K.
|
|
7
|
+
*/
|
|
8
|
+
export declare function recallAtK(retrieved: string[], relevant: Set<string>, k: number): number;
|
|
9
|
+
/**
|
|
10
|
+
* Reciprocal rank for a single query: 1 / rank of the first relevant result (0 if none). Averaging this across queries gives MRR.
|
|
11
|
+
*/
|
|
12
|
+
export declare function reciprocalRank(retrieved: string[], relevant: Set<string>): number;
|
|
13
|
+
/**
|
|
14
|
+
* NDCG@K: normalized DCG using ideal ranking.
|
|
15
|
+
*/
|
|
16
|
+
export declare function ndcgAtK(retrieved: string[], relevanceScores: Map<string, number>, k: number): number;
|
|
17
|
+
/**
|
|
18
|
+
* Compute all metrics for a single query.
|
|
19
|
+
*/
|
|
20
|
+
export declare function computeMetrics(retrieved: string[], relevanceScores: Map<string, number>, relevantThreshold?: number): {
|
|
21
|
+
precisionAt1: number;
|
|
22
|
+
precisionAt3: number;
|
|
23
|
+
precisionAt5: number;
|
|
24
|
+
recallAt5: number;
|
|
25
|
+
mrr: number;
|
|
26
|
+
ndcgAt5: number;
|
|
27
|
+
};
|
|
28
|
+
/**
|
|
29
|
+
* Average metrics across multiple queries.
|
|
30
|
+
*/
|
|
31
|
+
export declare function averageMetrics(results: Array<ReturnType<typeof computeMetrics>>): ReturnType<typeof computeMetrics>;
|
|
32
|
+
//# sourceMappingURL=ir-metrics.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ir-metrics.d.ts","sourceRoot":"","sources":["../../../benchmarks/evaluators/ir-metrics.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,wBAAgB,YAAY,CAC1B,SAAS,EAAE,MAAM,EAAE,EACnB,QAAQ,EAAE,GAAG,CAAC,MAAM,CAAC,EACrB,CAAC,EAAE,MAAM,GACR,MAAM,CAKR;AAED;;GAEG;AACH,wBAAgB,SAAS,CACvB,SAAS,EAAE,MAAM,EAAE,EACnB,QAAQ,EAAE,GAAG,CAAC,MAAM,CAAC,EACrB,CAAC,EAAE,MAAM,GACR,MAAM,CAKR;AAED;;GAEG;AACH,wBAAgB,cAAc,CAC5B,SAAS,EAAE,MAAM,EAAE,EACnB,QAAQ,EAAE,GAAG,CAAC,MAAM,CAAC,GACpB,MAAM,CAKR;AAmBD;;GAEG;AACH,wBAAgB,OAAO,CACrB,SAAS,EAAE,MAAM,EAAE,EACnB,eAAe,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EACpC,CAAC,EAAE,MAAM,GACR,MAAM,CASR;AAED;;GAEG;AACH,wBAAgB,cAAc,CAC5B,SAAS,EAAE,MAAM,EAAE,EACnB,eAAe,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EACpC,iBAAiB,SAAI,GACpB;IACD,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,EAAE,MAAM,CAAC;CACjB,CAeA;AAED;;GAEG;AACH,wBAAgB,cAAc,CAC5B,OAAO,EAAE,KAAK,CAAC,UAAU,CAAC,OAAO,cAAc,CAAC,CAAC,GAChD,UAAU,CAAC,OAAO,cAAc,CAAC,CA0BnC"}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
/**
 * Precision@K: share of the inspected top-K results that are relevant.
 *
 * The denominator is the number of results actually inspected, so a list
 * shorter than `k` is scored against its own length.
 *
 * @param retrieved - Result IDs in ranked order.
 * @param relevant - Set of IDs considered relevant.
 * @param k - Cut-off; only the first `k` results are inspected.
 * @returns Fraction of inspected results that are relevant; 0 when nothing is inspected.
 */
export function precisionAtK(retrieved, relevant, k) {
    const inspected = retrieved.slice(0, k);
    const inspectedCount = inspected.length;
    if (inspectedCount === 0) {
        return 0;
    }
    let relevantCount = 0;
    for (const id of inspected) {
        if (relevant.has(id)) {
            relevantCount += 1;
        }
    }
    return relevantCount / inspectedCount;
}
|
|
11
|
+
/**
|
|
12
|
+
* Recall@K: fraction of relevant docs found in top-K.
|
|
13
|
+
*/
|
|
14
|
+
export function recallAtK(retrieved, relevant, k) {
|
|
15
|
+
if (relevant.size === 0)
|
|
16
|
+
return 1; // no relevant docs = perfect recall vacuously
|
|
17
|
+
const topK = retrieved.slice(0, k);
|
|
18
|
+
const hits = topK.filter(id => relevant.has(id)).length;
|
|
19
|
+
return hits / relevant.size;
|
|
20
|
+
}
|
|
21
|
+
/**
 * Reciprocal rank for a single query: 1 / (1-based position of the first
 * relevant result). Averaging this value across queries yields MRR.
 *
 * @param retrieved - Result IDs in ranked order.
 * @param relevant - Set of IDs considered relevant.
 * @returns 1 / rank of the first relevant result, or 0 when none is present.
 */
export function reciprocalRank(retrieved, relevant) {
    const firstHitIndex = retrieved.findIndex((id) => relevant.has(id));
    return firstHitIndex === -1 ? 0 : 1 / (firstHitIndex + 1);
}
|
|
31
|
+
/**
|
|
32
|
+
* DCG@K with graded relevance (relevance scores 0-3).
|
|
33
|
+
*/
|
|
34
|
+
function dcgAtK(retrieved, relevanceScores, k) {
|
|
35
|
+
let dcg = 0;
|
|
36
|
+
const topK = retrieved.slice(0, k);
|
|
37
|
+
for (let i = 0; i < topK.length; i++) {
|
|
38
|
+
const rel = relevanceScores.get(topK[i]) ?? 0;
|
|
39
|
+
dcg += (Math.pow(2, rel) - 1) / Math.log2(i + 2); // i+2 because log2(1)=0
|
|
40
|
+
}
|
|
41
|
+
return dcg;
|
|
42
|
+
}
|
|
43
|
+
/**
|
|
44
|
+
* NDCG@K: normalized DCG using ideal ranking.
|
|
45
|
+
*/
|
|
46
|
+
export function ndcgAtK(retrieved, relevanceScores, k) {
|
|
47
|
+
const dcg = dcgAtK(retrieved, relevanceScores, k);
|
|
48
|
+
// Ideal ranking: sort all docs by relevance descending
|
|
49
|
+
const idealOrder = [...relevanceScores.entries()]
|
|
50
|
+
.sort(([, a], [, b]) => b - a)
|
|
51
|
+
.map(([id]) => id);
|
|
52
|
+
const idcg = dcgAtK(idealOrder, relevanceScores, k);
|
|
53
|
+
if (idcg === 0)
|
|
54
|
+
return 0;
|
|
55
|
+
return dcg / idcg;
|
|
56
|
+
}
|
|
57
|
+
/**
 * Evaluate one query's ranked results against graded relevance judgments.
 *
 * A document counts as relevant for the set-based metrics (precision, recall,
 * reciprocal rank) when its score is at least `relevantThreshold`; NDCG uses
 * the graded scores directly.
 *
 * @param retrieved - Result IDs in ranked order.
 * @param relevanceScores - Map from document ID to graded relevance.
 * @param relevantThreshold - Minimum score for a document to count as relevant (default 1).
 * @returns Precision@1/3/5, Recall@5, reciprocal rank (`mrr`) and NDCG@5.
 */
export function computeMetrics(retrieved, relevanceScores, relevantThreshold = 1) {
    const relevant = new Set();
    for (const [id, score] of relevanceScores.entries()) {
        if (score >= relevantThreshold) {
            relevant.add(id);
        }
    }
    const precisionAt1 = precisionAtK(retrieved, relevant, 1);
    const precisionAt3 = precisionAtK(retrieved, relevant, 3);
    const precisionAt5 = precisionAtK(retrieved, relevant, 5);
    const recallAt5 = recallAtK(retrieved, relevant, 5);
    const mrr = reciprocalRank(retrieved, relevant);
    const ndcgAt5 = ndcgAtK(retrieved, relevanceScores, 5);
    return { precisionAt1, precisionAt3, precisionAt5, recallAt5, mrr, ndcgAt5 };
}
|
|
73
|
+
/**
 * Average per-query metric objects field by field.
 *
 * @param results - Metric objects as produced for individual queries.
 * @returns The arithmetic mean of every metric; all zeros when `results` is empty.
 */
export function averageMetrics(results) {
    const totals = { precisionAt1: 0, precisionAt3: 0, precisionAt5: 0, recallAt5: 0, mrr: 0, ndcgAt5: 0 };
    const queryCount = results.length;
    if (queryCount === 0) {
        return totals;
    }
    for (const row of results) {
        totals.precisionAt1 += row.precisionAt1;
        totals.precisionAt3 += row.precisionAt3;
        totals.precisionAt5 += row.precisionAt5;
        totals.recallAt5 += row.recallAt5;
        totals.mrr += row.mrr;
        totals.ndcgAt5 += row.ndcgAt5;
    }
    return {
        precisionAt1: totals.precisionAt1 / queryCount,
        precisionAt3: totals.precisionAt3 / queryCount,
        precisionAt5: totals.precisionAt5 / queryCount,
        recallAt5: totals.recallAt5 / queryCount,
        mrr: totals.mrr / queryCount,
        ndcgAt5: totals.ndcgAt5 / queryCount,
    };
}
|
|
98
|
+
//# sourceMappingURL=ir-metrics.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ir-metrics.js","sourceRoot":"","sources":["../../../benchmarks/evaluators/ir-metrics.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,MAAM,UAAU,YAAY,CAC1B,SAAmB,EACnB,QAAqB,EACrB,CAAS;IAET,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACnC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAChC,MAAM,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;IACxD,OAAO,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC;AAC5B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,SAAS,CACvB,SAAmB,EACnB,QAAqB,EACrB,CAAS;IAET,IAAI,QAAQ,CAAC,IAAI,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC,CAAC,8CAA8C;IACjF,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACnC,MAAM,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;IACxD,OAAO,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC;AAC9B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAC5B,SAAmB,EACnB,QAAqB;IAErB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,IAAI,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC;YAAE,OAAO,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACtD,CAAC;IACD,OAAO,CAAC,CAAC;AACX,CAAC;AAED;;GAEG;AACH,SAAS,MAAM,CACb,SAAmB,EACnB,eAAoC,EACpC,CAAS;IAET,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,eAAe,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,CAAC;QAC/C,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,wBAAwB;IAC5E,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,OAAO,CACrB,SAAmB,EACnB,eAAoC,EACpC,CAAS;IAET,MAAM,GAAG,GAAG,MAAM,CAAC,SAAS,EAAE,eAAe,EAAE,CAAC,CAAC,CAAC;IAClD,uDAAuD;IACvD,MAAM,UAAU,GAAG,CAAC,GAAG,eAAe,CAAC,OAAO,EAAE,CAAC;SAC9C,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC;SAC7B,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CAA
C;IACrB,MAAM,IAAI,GAAG,MAAM,CAAC,UAAU,EAAE,eAAe,EAAE,CAAC,CAAC,CAAC;IACpD,IAAI,IAAI,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IACzB,OAAO,GAAG,GAAG,IAAI,CAAC;AACpB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAC5B,SAAmB,EACnB,eAAoC,EACpC,iBAAiB,GAAG,CAAC;IASrB,MAAM,QAAQ,GAAG,IAAI,GAAG,CACtB,CAAC,GAAG,eAAe,CAAC,OAAO,EAAE,CAAC;SAC3B,MAAM,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,KAAK,IAAI,iBAAiB,CAAC;SACjD,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CACrB,CAAC;IAEF,OAAO;QACL,YAAY,EAAE,YAAY,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAClD,YAAY,EAAE,YAAY,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAClD,YAAY,EAAE,YAAY,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAClD,SAAS,EAAE,SAAS,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAC5C,GAAG,EAAE,cAAc,CAAC,SAAS,EAAE,QAAQ,CAAC;QACxC,OAAO,EAAE,OAAO,CAAC,SAAS,EAAE,eAAe,EAAE,CAAC,CAAC;KAChD,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAC5B,OAAiD;IAEjD,MAAM,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;IACzB,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACZ,OAAO,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC;IACjG,CAAC;IAED,MAAM,GAAG,GAAG,OAAO,CAAC,MAAM,CACxB,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QACX,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,YAAY;QAC/C,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,YAAY;QAC/C,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,YAAY;QAC/C,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS;QACtC,GAAG,EAAE,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG;QACpB,OAAO,EAAE,GAAG,CAAC,OAAO,GAAG,CAAC,CAAC,OAAO;KACjC,CAAC,EACF,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,CACxF,CAAC;IAEF,OAAO;QACL,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC;QAClC,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC;QAClC,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC;QAClC,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,CAAC;QAC5B,GAAG,EAAE,GAAG,CAAC,GAAG,GAAG,CAAC;QAChB,OAAO,EAAE,GAAG,CAAC,OAAO,GAAG,CAAC;KACzB,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Structured Judge — evaluates AI output with specific yes/no questions per criterion.
|
|
3
|
+
*
|
|
4
|
+
* Instead of "rate 0-10", asks: "Does the code implement X? YES/NO"
|
|
5
|
+
* The score is then (yes_count / total_questions) * 10.
|
|
6
|
+
*/
|
|
7
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
8
|
+
import type { CodingTask } from "../lib/types.js";
|
|
9
|
+
export interface JudgmentResult { // Shape of the value that structuredJudge resolves to.
|
|
10
|
+
taskId: string; // Identifier of the judged task. NOTE(review): presumably copied from the CodingTask — confirm in the implementation.
|
|
11
|
+
totalQuestions: number; // Number of questions asked; the denominator of the score formula in the file header.
|
|
12
|
+
yesCount: number; // Number of YES answers; the numerator of the score formula in the file header.
|
|
13
|
+
score: number; // Per the file header: (yes_count / total_questions) * 10. NOTE(review): how "PARTIAL" answers are weighted is not visible here — confirm.
|
|
14
|
+
details: Array<{ // Per-criterion breakdown. NOTE(review): presumably one entry per question asked — confirm.
|
|
15
|
+
criterion: string; // The criterion this entry reports on.
|
|
16
|
+
answer: "YES" | "NO" | "PARTIAL"; // Verdict recorded for this criterion; one of the three literal values.
|
|
17
|
+
reasoning: string; // Free-text justification for the answer. NOTE(review): presumably produced by the judge model — confirm.
|
|
18
|
+
}>;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Evaluate an AI-generated output against structured criteria.
*
* Only the declaration is visible here; the notes below describe the signature,
* not the implementation.
*
* @param client - Anthropic SDK client (type imported from "@anthropic-ai/sdk").
* @param task - The coding task being judged. NOTE(review): presumably supplies the criteria to ask about — confirm in the implementation.
* @param output - The AI-generated output to evaluate.
* @param contextDocs - Context documents passed as a single string. NOTE(review): how they are used is not visible in this declaration.
* @returns A promise that resolves to a JudgmentResult.
|
|
22
|
+
*/
|
|
23
|
+
export declare function structuredJudge(client: Anthropic, task: CodingTask, output: string, contextDocs: string): Promise<JudgmentResult>;
|
|
24
|
+
/**
|
|
25
|
+
* Check if generated code contains valid TypeScript/JSX syntax.
|
|
26
|
+
* Uses the TypeScript compiler API in syntax-only mode for accurate parsing
|
|
27
|
+
* of template literals, JSX, and other complex syntax.
*
* @param output - The generated output text to check.
* @returns An object with `valid`, `errorCount`, and `codeBlockCount` fields.
|
|
28
|
+
*/
|
|
29
|
+
export declare function checkTypeScriptSyntax(output: string): {
|
|
30
|
+
valid: boolean; // NOTE(review): presumably true when no syntax errors were found — confirm in the implementation.
|
|
31
|
+
errorCount: number; // NOTE(review): presumably the number of syntax errors reported — confirm.
|
|
32
|
+
codeBlockCount: number; // NOTE(review): presumably the number of code blocks found in `output` — confirm.
|
|
33
|
+
};
|
|
34
|
+
//# sourceMappingURL=structured-judge.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"structured-judge.d.ts","sourceRoot":"","sources":["../../../benchmarks/evaluators/structured-judge.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AACH,OAAO,SAAS,MAAM,mBAAmB,CAAC;AAC1C,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAIlD,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,cAAc,EAAE,MAAM,CAAC;IACvB,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,KAAK,CAAC;QACb,SAAS,EAAE,MAAM,CAAC;QAClB,MAAM,EAAE,KAAK,GAAG,IAAI,GAAG,SAAS,CAAC;QACjC,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC,CAAC;CACJ;AAiCD;;GAEG;AACH,wBAAsB,eAAe,CACnC,MAAM,EAAE,SAAS,EACjB,IAAI,EAAE,UAAU,EAChB,MAAM,EAAE,MAAM,EACd,WAAW,EAAE,MAAM,GAClB,OAAO,CAAC,cAAc,CAAC,CAsGzB;AAED;;;;GAIG;AACH,wBAAgB,qBAAqB,CACnC,MAAM,EAAE,MAAM,GACb;IAAE,KAAK,EAAE,OAAO,CAAC;IAAC,UAAU,EAAE,MAAM,CAAC;IAAC,cAAc,EAAE,MAAM,CAAA;CAAE,CAgDhE"}
|