pcl-mcp 0.2.4 → 0.3.0

Files changed (114)
  1. package/README.md +137 -28
  2. package/dist/benchmarks/evaluators/context-retrieval-quality.d.ts +30 -0
  3. package/dist/benchmarks/evaluators/context-retrieval-quality.d.ts.map +1 -0
  4. package/dist/benchmarks/evaluators/context-retrieval-quality.js +50 -0
  5. package/dist/benchmarks/evaluators/context-retrieval-quality.js.map +1 -0
  6. package/dist/benchmarks/evaluators/ir-metrics.d.ts +32 -0
  7. package/dist/benchmarks/evaluators/ir-metrics.d.ts.map +1 -0
  8. package/dist/benchmarks/evaluators/ir-metrics.js +98 -0
  9. package/dist/benchmarks/evaluators/ir-metrics.js.map +1 -0
  10. package/dist/benchmarks/evaluators/structured-judge.d.ts +34 -0
  11. package/dist/benchmarks/evaluators/structured-judge.d.ts.map +1 -0
  12. package/dist/benchmarks/evaluators/structured-judge.js +153 -0
  13. package/dist/benchmarks/evaluators/structured-judge.js.map +1 -0
  14. package/dist/benchmarks/evaluators/token-counter.d.ts +9 -0
  15. package/dist/benchmarks/evaluators/token-counter.d.ts.map +1 -0
  16. package/dist/benchmarks/evaluators/token-counter.js +24 -0
  17. package/dist/benchmarks/evaluators/token-counter.js.map +1 -0
  18. package/dist/benchmarks/generators/generate-corpus.d.ts +2 -0
  19. package/dist/benchmarks/generators/generate-corpus.d.ts.map +1 -0
  20. package/dist/benchmarks/generators/generate-corpus.js +243 -0
  21. package/dist/benchmarks/generators/generate-corpus.js.map +1 -0
  22. package/dist/benchmarks/lib/harness.d.ts +23 -0
  23. package/dist/benchmarks/lib/harness.d.ts.map +1 -0
  24. package/dist/benchmarks/lib/harness.js +44 -0
  25. package/dist/benchmarks/lib/harness.js.map +1 -0
  26. package/dist/benchmarks/lib/types.d.ts +79 -0
  27. package/dist/benchmarks/lib/types.d.ts.map +1 -0
  28. package/dist/benchmarks/lib/types.js +2 -0
  29. package/dist/benchmarks/lib/types.js.map +1 -0
  30. package/dist/benchmarks/reporters/markdown-reporter.d.ts +2 -0
  31. package/dist/benchmarks/reporters/markdown-reporter.d.ts.map +1 -0
  32. package/dist/benchmarks/reporters/markdown-reporter.js +80 -0
  33. package/dist/benchmarks/reporters/markdown-reporter.js.map +1 -0
  34. package/dist/benchmarks/runners/bench-ablation.d.ts +2 -0
  35. package/dist/benchmarks/runners/bench-ablation.d.ts.map +1 -0
  36. package/dist/benchmarks/runners/bench-ablation.js +49 -0
  37. package/dist/benchmarks/runners/bench-ablation.js.map +1 -0
  38. package/dist/benchmarks/runners/bench-ai-quality.d.ts +2 -0
  39. package/dist/benchmarks/runners/bench-ai-quality.d.ts.map +1 -0
  40. package/dist/benchmarks/runners/bench-ai-quality.js +297 -0
  41. package/dist/benchmarks/runners/bench-ai-quality.js.map +1 -0
  42. package/dist/benchmarks/runners/bench-interactive-eval.d.ts +2 -0
  43. package/dist/benchmarks/runners/bench-interactive-eval.d.ts.map +1 -0
  44. package/dist/benchmarks/runners/bench-interactive-eval.js +119 -0
  45. package/dist/benchmarks/runners/bench-interactive-eval.js.map +1 -0
  46. package/dist/benchmarks/runners/bench-performance.bench.d.ts +2 -0
  47. package/dist/benchmarks/runners/bench-performance.bench.d.ts.map +1 -0
  48. package/dist/benchmarks/runners/bench-performance.bench.js +50 -0
  49. package/dist/benchmarks/runners/bench-performance.bench.js.map +1 -0
  50. package/dist/benchmarks/runners/bench-search-quality.d.ts +2 -0
  51. package/dist/benchmarks/runners/bench-search-quality.d.ts.map +1 -0
  52. package/dist/benchmarks/runners/bench-search-quality.js +70 -0
  53. package/dist/benchmarks/runners/bench-search-quality.js.map +1 -0
  54. package/dist/benchmarks/runners/bench-token-efficiency.d.ts +2 -0
  55. package/dist/benchmarks/runners/bench-token-efficiency.d.ts.map +1 -0
  56. package/dist/benchmarks/runners/bench-token-efficiency.js +89 -0
  57. package/dist/benchmarks/runners/bench-token-efficiency.js.map +1 -0
  58. package/dist/benchmarks/runners/diag.d.ts +2 -0
  59. package/dist/benchmarks/runners/diag.d.ts.map +1 -0
  60. package/dist/benchmarks/runners/diag.js +30 -0
  61. package/dist/benchmarks/runners/diag.js.map +1 -0
  62. package/dist/benchmarks/vitest.config.bench.d.ts +3 -0
  63. package/dist/benchmarks/vitest.config.bench.d.ts.map +1 -0
  64. package/dist/benchmarks/vitest.config.bench.js +14 -0
  65. package/dist/benchmarks/vitest.config.bench.js.map +1 -0
  66. package/dist/bin/pcl.js +36 -23
  67. package/dist/bin/pcl.js.map +1 -1
  68. package/dist/src/db.d.ts +2 -1
  69. package/dist/src/db.d.ts.map +1 -1
  70. package/dist/src/db.js +25 -21
  71. package/dist/src/db.js.map +1 -1
  72. package/dist/src/embeddings.d.ts +1 -1
  73. package/dist/src/embeddings.js +2 -2
  74. package/dist/src/embeddings.js.map +1 -1
  75. package/dist/src/indexer.d.ts +1 -1
  76. package/dist/src/indexer.d.ts.map +1 -1
  77. package/dist/src/indexer.js +6 -2
  78. package/dist/src/indexer.js.map +1 -1
  79. package/dist/src/search.d.ts.map +1 -1
  80. package/dist/src/search.js +138 -26
  81. package/dist/src/search.js.map +1 -1
  82. package/dist/src/server.js +6 -0
  83. package/dist/src/server.js.map +1 -1
  84. package/dist/src/types.d.ts +1 -0
  85. package/dist/src/types.d.ts.map +1 -1
  86. package/dist/tests/db.test.d.ts +2 -0
  87. package/dist/tests/db.test.d.ts.map +1 -0
  88. package/dist/tests/db.test.js +459 -0
  89. package/dist/tests/db.test.js.map +1 -0
  90. package/dist/tests/embeddings.test.d.ts +2 -0
  91. package/dist/tests/embeddings.test.d.ts.map +1 -0
  92. package/dist/tests/embeddings.test.js +165 -0
  93. package/dist/tests/embeddings.test.js.map +1 -0
  94. package/dist/tests/helpers/test-harness.d.ts +26 -0
  95. package/dist/tests/helpers/test-harness.d.ts.map +1 -0
  96. package/dist/tests/helpers/test-harness.js +80 -0
  97. package/dist/tests/helpers/test-harness.js.map +1 -0
  98. package/dist/tests/indexer.test.d.ts +2 -0
  99. package/dist/tests/indexer.test.d.ts.map +1 -0
  100. package/dist/tests/indexer.test.js +299 -0
  101. package/dist/tests/indexer.test.js.map +1 -0
  102. package/dist/tests/schemas.test.d.ts +2 -0
  103. package/dist/tests/schemas.test.d.ts.map +1 -0
  104. package/dist/tests/schemas.test.js +378 -0
  105. package/dist/tests/schemas.test.js.map +1 -0
  106. package/dist/tests/search.test.d.ts +2 -0
  107. package/dist/tests/search.test.d.ts.map +1 -0
  108. package/dist/tests/search.test.js +129 -0
  109. package/dist/tests/search.test.js.map +1 -0
  110. package/dist/tests/tools.test.d.ts +2 -0
  111. package/dist/tests/tools.test.d.ts.map +1 -0
  112. package/dist/tests/tools.test.js +232 -0
  113. package/dist/tests/tools.test.js.map +1 -0
  114. package/package.json +14 -2
package/README.md CHANGED
@@ -1,17 +1,24 @@
+ <div align="center">
+
  # PCL — Product Context Layer

  **Give AI coding agents persistent, structured knowledge of your product.**

- Instead of re-explaining your personas, journeys, and architecture decisions every session, PCL serves them via MCP on demand. Any agent (Claude Code, Cursor, Windsurf) queries exactly what it needs, when it needs it.
-
- ## Quick Start
+ [![npm version](https://img.shields.io/npm/v/pcl-mcp?color=brightgreen)](https://www.npmjs.com/package/pcl-mcp)
+ [![npm downloads](https://img.shields.io/npm/dm/pcl-mcp)](https://www.npmjs.com/package/pcl-mcp)
+ [![Node >=22](https://img.shields.io/badge/node-%3E%3D22-blue)](https://nodejs.org)
+ [![MIT License](https://img.shields.io/badge/license-MIT-green)](LICENSE)

  ```bash
- npm install pcl-mcp
- npx pcl init
- # add MCP config (see Agent Configuration below), then start a new agent session
+ npx pcl-mcp init
  ```

+ </div>
+
+ Instead of re-explaining your personas, journeys, and architecture decisions every session, PCL serves them via MCP on demand. Any agent (Claude Code, Cursor, Windsurf) queries exactly what it needs, when it needs it.
+
+ ---
+
  ## Why PCL?

  **Without PCL**, every coding session starts from scratch:
@@ -34,21 +41,40 @@ You ask your agent: *"Build the checkout flow"*

  **With PCL:** The agent auto-loads critical billing rules at session start (~200 tokens). When it starts the checkout feature, it pulls the relevant persona, fetches the journey steps, and checks the spec's acceptance criteria — all on-demand, only what's needed. Every session, automatically.

+ ---
+
+ ## Quick Start
+
+ ```bash
+ npm install pcl-mcp
+ npx pcl init   # prompts before adding example files, sets up CLAUDE.md
+ # add MCP config (see Agent Configuration below), then start a new agent session
+ ```
+
+ ---
+
  ## Stack

  | Layer | Technology | Why |
  |---|---|---|
  | Protocol | MCP (stdio) | Universal — works with every major agent |
  | Storage | SQLite + FTS5 | Zero infra, git-friendly, offline |
- | Keyword search | BM25 via FTS5 | Best-in-class for exact term matching |
- | Semantic search | `all-MiniLM-L6-v2` (local) | 23MB, zero API cost, ~3ms/doc |
- | Hybrid fusion | Reciprocal Rank Fusion (k=60) | Better than either alone, no tuning |
- | Validation | Zod schemas | Agents rely on predictable frontmatter |
+ | Keyword search | BM25 via FTS5 (title-weighted 10×) | Best-in-class for exact terms, IDs, proper nouns |
+ | Semantic search | `all-mpnet-base-v2` (local, 768d) | Higher quality than MiniLM, zero API cost, ~3ms/doc |
+ | Embedding strategy | Split body + title embeddings | Separate semantic channels for body and title matching |
+ | Hybrid fusion | Adaptive RRF (corpus-size-aware k) | Better recall on both small and large corpora |
+ | Score filtering | 15% gap threshold | Prevents low-quality tail results from surfacing |
+ | Cross-references | Auto frontmatter link resolution | Pulls related files into results automatically |
+ | Validation | Zod schemas | Agents get predictable, parseable frontmatter |
  | File watching | Chokidar v4 | Live reindex on save |
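The 10× title weight in the table maps onto FTS5's per-column `bm25()` arguments. A minimal sketch of the idea, assuming a `better-sqlite3` connection and a two-column `files_fts` table (both are illustrative, not PCL's actual schema):

```ts
// Title-weighted BM25 via SQLite FTS5. Table name, columns, and the use of
// better-sqlite3 are assumptions for illustration only.
import Database from "better-sqlite3";

const db = new Database(":memory:");
db.exec(`CREATE VIRTUAL TABLE files_fts USING fts5(title, body)`);
db.prepare(`INSERT INTO files_fts VALUES (?, ?)`)
  .run("Persona: Max", "Max is a power user frustrated by onboarding ...");

// bm25(table, titleWeight, bodyWeight): FTS5 returns smaller (more negative)
// scores for better matches, hence the ascending sort.
const rows = db
  .prepare(
    `SELECT rowid, title, bm25(files_fts, 10.0, 1.0) AS score
       FROM files_fts
      WHERE files_fts MATCH ?
      ORDER BY score
      LIMIT 10`,
  )
  .all("max");
```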

+ ---
+
  ## Prerequisites

- - **Node.js >= 22** (required — PCL uses modern Node APIs)
+ **Node.js >= 22** (required — PCL uses modern Node APIs)
+
+ ---

  ## Install

@@ -59,6 +85,8 @@ npx pcl init # creates ./product with templates

  Also available on GitHub Packages as `@michaelgorski/pcl-mcp`.

+ ---
+
  ## Import existing docs

  If you already have markdown documentation in your repo, PCL can scan, classify, and import it automatically:
@@ -77,8 +105,12 @@ The scanner:

  Supported classifications: **persona**, **journey**, **spec**, **decision**, **domain**, **product**

+ ---
+
  ## Agent configuration

+ Works with any MCP-compatible agent. Configuration examples below.
+
  ### Claude Code — `.claude/mcp.json`
  ```json
  {
@@ -113,6 +145,8 @@ Supported classifications: **persona**, **journey**, **spec**, **decision**, **d
  }
  ```

+ ---
+
  ## File structure

  ```
@@ -131,19 +165,23 @@ Supported classifications: **persona**, **journey**, **spec**, **decision**, **d
  .pcl.db   ← SQLite index (auto-generated, gitignore this)
  ```

+ ---
+
  ## Tools available to agents

  | Tool | Params | Description |
  |---|---|---|
  | `pcl_product_summary` | — | Load the product north-star document. Call at session start. |
- | `pcl_get_persona(id)` | `id`: persona ID | Get a user persona by ID. Call before any user-facing feature. |
- | `pcl_get_journey(id)` | `id`: journey ID | Get a user journey by ID including step-by-step detail. |
- | `pcl_get_spec(id)` | `id`: spec ID | Get a feature spec by ID including acceptance criteria. |
- | `pcl_get_decision(id)` | `id`: decision ID | Get an architecture decision record (ADR) by ID. |
- | `pcl_get_domain(id)` | `id`: domain ID or `"*critical"` | Get domain rules by ID. Pass `"*critical"` to load all critical rules. |
- | `pcl_list({ type })` | `type`: `"personas"` \| `"journeys"` \| `"specs"` \| `"decisions"` \| `"domain"` | List all files of a given type with IDs, titles, and summaries. |
- | `pcl_search({ query })` | `query`, `mode?` (`"hybrid"` \| `"semantic"` \| `"keyword"`), `types?`, `top_k?` | Hybrid semantic + keyword search across all product files. |
- | `pcl_related(id)` | `id`: source file ID, `top_k?` | Find files semantically related to a given file ID. |
+ | `pcl_get_persona` | `id` | Get a user persona by ID. Call before any user-facing feature. |
+ | `pcl_get_journey` | `id` | Get a user journey by ID including step-by-step detail. |
+ | `pcl_get_spec` | `id` | Get a feature spec by ID including acceptance criteria. |
+ | `pcl_get_decision` | `id` | Get an architecture decision record (ADR) by ID. |
+ | `pcl_get_domain` | `id` or `"*critical"` | Get domain rules by ID. Pass `"*critical"` to load all critical rules. |
+ | `pcl_list` | `type`: `"personas"` \| `"journeys"` \| `"specs"` \| `"decisions"` \| `"domain"` | List all files of a given type with IDs, titles, and summaries. |
+ | `pcl_search` | `query`, `mode?` (`"hybrid"` \| `"semantic"` \| `"keyword"`), `types?`, `top_k?` | Hybrid semantic + keyword search across all product files. |
+ | `pcl_related` | `id`, `top_k?` | Find files semantically related to a given file ID. |
+
+ ---
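For orientation, this is roughly what a `pcl_search` call looks like from code. A sketch assuming the official `@modelcontextprotocol/sdk` client and that the server is launched as `npx pcl-mcp` (adjust to match your MCP config):

```ts
// Calling pcl_search over stdio with the MCP TypeScript SDK (illustrative).
import { Client } from "@modelcontextprotocol/sdk/client/index.js";
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";

const transport = new StdioClientTransport({ command: "npx", args: ["pcl-mcp"] });
const client = new Client({ name: "example-client", version: "1.0.0" });
await client.connect(transport);

// Argument names mirror the tools table above; only `query` is required.
const result = await client.callTool({
  name: "pcl_search",
  arguments: { query: "checkout billing rules", mode: "hybrid", top_k: 5 },
});
console.log(result.content);
```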

  ## Prompts & Resources

@@ -153,25 +191,92 @@ In addition to tools, PCL exposes MCP prompts and resources:

  **Resources: `pcl://files/{type}/{id}`** — Each indexed file is available as an MCP resource. Agents can browse and read individual files directly via the resource URI (e.g., `pcl://files/persona/example-user`).

+ ---
+
  ## How hybrid search works

+ PCL runs three parallel retrieval signals and fuses them with Reciprocal Rank Fusion:
+
  ```
  query: "what does Max find frustrating about onboarding"

- BM25 (FTS5): [persona-max, journey-onboarding, spec-magic-link, ...]
-              ranked by term frequency + IDF
+ BM25 (FTS5, title-weighted 10×):
+   persona-max, journey-onboarding, spec-magic-link
+   ↓ ranked by bm25(title=10×, body=1×) — exact terms, IDs, proper nouns
+
+ Semantic — body embedding (all-mpnet-base-v2, 768d):
+   → journey-onboarding, persona-max, domain-core-rules
+   ↓ cosine similarity on full-text embedding

- Cosine similarity: [journey-onboarding, persona-max, domain-core-rules, ...]
-                    ranked by embedding dot product (MiniLM-L6-v2)
+ Semantic title embedding (all-mpnet-base-v2, 768d):
+   persona-max, journey-onboarding, spec-onboarding-ux
+   ↓ cosine similarity on title + summary embedding

- RRF fusion: score(d) = Σ 1 / (60 + rank(d))
-             combines both rankings without weight tuning
+ Adaptive RRF (k = corpus_size / 10):
+   score(d) = Σ 1 / (k + rank(d))   fused across all three lists

- Result: 1. journey-onboarding (0.94)
-         2. persona-max (0.87)
-         3. spec-onboarding-ux (0.71)
+ Score gap filter (15% threshold):
+   Drops results below 0.15 × top_score — removes noise
+
+ Cross-reference resolution:
+   journey-onboarding.frontmatter.persona = "max"
+   → auto-includes persona-max even if it ranked outside top-k
+
+ Result: 1. journey-onboarding (0.94)
+         2. persona-max (0.87)
+         3. spec-onboarding-ux (0.71)
  ```

+ **Why split embeddings?** Body and title carry different semantic signals. A query like *"checkout persona"* should match a persona file by title even if its body content is mostly demographic data. Indexing them separately gives the fusion step two distinct semantic channels rather than one diluted one.
+
+ **Why adaptive RRF k?** Fixed k=60 over-smooths rankings on small corpora (10–20 files). Corpus-aware k scales down on small collections to let strong matches separate from weak ones.
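In code, the fusion and filtering described above reduce to a few lines. A minimal sketch over pre-computed rankings; the function shape is illustrative, not PCL's internal API:

```ts
// Adaptive RRF over three ranked ID lists, plus the 15% score-gap filter.
// Illustrative only: names and the exact k formula are taken from the
// description above, not from PCL's source.
function fuseRrf(
  rankings: string[][], // [bm25Ids, bodyEmbeddingIds, titleEmbeddingIds]
  corpusSize: number,
  gapThreshold = 0.15,
): Array<{ id: string; score: number }> {
  const k = Math.max(1, corpusSize / 10); // adaptive k: small corpus → sharper separation
  const scores = new Map<string, number>();
  for (const list of rankings) {
    list.forEach((id, i) => {
      const rank = i + 1; // ranks are 1-based in score(d) = Σ 1 / (k + rank(d))
      scores.set(id, (scores.get(id) ?? 0) + 1 / (k + rank));
    });
  }
  const fused = [...scores.entries()]
    .map(([id, score]) => ({ id, score }))
    .sort((a, b) => b.score - a.score);
  const top = fused[0]?.score ?? 0;
  // Drop the low-quality tail below 15% of the top score.
  return fused.filter((r) => r.score >= gapThreshold * top);
}

// Example: the three signals from the diagram, over a 30-file corpus.
fuseRrf(
  [
    ["persona-max", "journey-onboarding", "spec-magic-link"],
    ["journey-onboarding", "persona-max", "domain-core-rules"],
    ["persona-max", "journey-onboarding", "spec-onboarding-ux"],
  ],
  30,
);
```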
+
+ ---
+
+ ## Testing & Benchmarks
+
+ PCL ships with a full test suite and a multi-dimensional benchmark framework.
+
+ ### Tests
+
+ ```bash
+ npm test             # run all tests (vitest)
+ npm run test:watch   # watch mode
+ ```
+
+ Six test suites cover the full stack:
+
+ | Suite | Coverage |
+ |---|---|
+ | `db.test.ts` | SQLite operations, FTS5 queries, embedding storage |
+ | `embeddings.test.ts` | Embedding generation, cache hits, dimension checks |
+ | `indexer.test.ts` | File discovery, schema extraction, change detection |
+ | `schemas.test.ts` | Zod frontmatter validation for all file types |
+ | `search.test.ts` | Hybrid search, RRF, multi-hop decomposition, cross-refs |
+ | `tools.test.ts` | MCP tool handlers, response formatting, error paths |
+
+ ### Benchmarks
+
+ ```bash
+ npm run bench            # all benchmarks
+ npm run bench:perf       # latency benchmarks (search + embedding speed)
+ npm run bench:quality    # search quality: Precision@k, Recall@k, NDCG, MRR
+ npm run bench:tokens     # token efficiency across search modes
+ npm run bench:ablation   # hybrid vs keyword-only vs semantic-only comparison
+ npm run bench:ai         # Claude-judged result quality (requires ANTHROPIC_API_KEY)
+ npm run bench:report     # generate markdown report from results
+ ```
+
+ | Suite | Measures |
+ |---|---|
+ | Performance | Search + embedding latency (p50/p95) |
+ | Search quality | Precision@k, Recall@k, NDCG, MRR on labeled corpus |
+ | Token efficiency | Tokens consumed per query across search modes |
+ | Ablation | Quality delta: hybrid vs keyword-only vs semantic-only |
+ | AI quality | Claude-judged relevance score for top-k results |
+
+ ---
+
  ## Human workflow

  The system is only as good as what you put in. Discipline:
@@ -184,12 +289,16 @@ The system is only as good as what you put in. Discipline:

  The agent does the rest.

+ ---
+
  ## Gitignore

  ```gitignore
  product/.pcl.db   # SQLite index — auto-regenerated
  ```

+ ---
+
  ## License

  MIT
package/dist/benchmarks/evaluators/context-retrieval-quality.d.ts ADDED
@@ -0,0 +1,30 @@
+ /**
+  * Context Retrieval Quality — measures if PCL retrieves the RIGHT documents.
+  * Disentangles "did PCL find the right docs" from "did the LLM use them well."
+  */
+ export interface ContextMetrics {
+     recall: number;
+     precision: number;
+     f1: number;
+     retrieved: string[];
+     required: string[];
+     hits: string[];
+     misses: string[];
+     noise: string[];
+ }
+ /**
+  * Compute context retrieval quality metrics.
+  *
+  * @param retrievedIds - Document IDs returned by PCL search
+  * @param requiredIds - Document IDs that the task actually needs
+  */
+ export declare function measureContextRetrieval(retrievedIds: string[], requiredIds: string[]): ContextMetrics;
+ /**
+  * Aggregate context metrics across multiple tasks.
+  */
+ export declare function averageContextMetrics(metrics: ContextMetrics[]): {
+     recall: number;
+     precision: number;
+     f1: number;
+ };
+ //# sourceMappingURL=context-retrieval-quality.d.ts.map
package/dist/benchmarks/evaluators/context-retrieval-quality.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"context-retrieval-quality.d.ts","sourceRoot":"","sources":["../../../benchmarks/evaluators/context-retrieval-quality.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,EAAE,EAAE,MAAM,CAAC;IACX,SAAS,EAAE,MAAM,EAAE,CAAC;IACpB,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,IAAI,EAAE,MAAM,EAAE,CAAC;IACf,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,KAAK,EAAE,MAAM,EAAE,CAAC;CACjB;AAED;;;;;GAKG;AACH,wBAAgB,uBAAuB,CACrC,YAAY,EAAE,MAAM,EAAE,EACtB,WAAW,EAAE,MAAM,EAAE,GACpB,cAAc,CA0BhB;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACnC,OAAO,EAAE,cAAc,EAAE,GACxB;IAAE,MAAM,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,EAAE,EAAE,MAAM,CAAA;CAAE,CAenD"}
package/dist/benchmarks/evaluators/context-retrieval-quality.js ADDED
@@ -0,0 +1,50 @@
+ /**
+  * Context Retrieval Quality — measures if PCL retrieves the RIGHT documents.
+  * Disentangles "did PCL find the right docs" from "did the LLM use them well."
+  */
+ /**
+  * Compute context retrieval quality metrics.
+  *
+  * @param retrievedIds - Document IDs returned by PCL search
+  * @param requiredIds - Document IDs that the task actually needs
+  */
+ export function measureContextRetrieval(retrievedIds, requiredIds) {
+     const retrievedSet = new Set(retrievedIds);
+     const requiredSet = new Set(requiredIds);
+     const hits = requiredIds.filter((id) => retrievedSet.has(id));
+     const misses = requiredIds.filter((id) => !retrievedSet.has(id));
+     const noise = retrievedIds.filter((id) => !requiredSet.has(id));
+     const recall = requiredIds.length > 0 ? hits.length / requiredIds.length : 1;
+     const precision = retrievedIds.length > 0 ? hits.length / retrievedIds.length : 1;
+     const f1 = recall + precision > 0
+         ? (2 * recall * precision) / (recall + precision)
+         : 0;
+     return {
+         recall,
+         precision,
+         f1,
+         retrieved: retrievedIds,
+         required: requiredIds,
+         hits,
+         misses,
+         noise,
+     };
+ }
+ /**
+  * Aggregate context metrics across multiple tasks.
+  */
+ export function averageContextMetrics(metrics) {
+     if (metrics.length === 0)
+         return { recall: 0, precision: 0, f1: 0 };
+     const sum = metrics.reduce((acc, m) => ({
+         recall: acc.recall + m.recall,
+         precision: acc.precision + m.precision,
+         f1: acc.f1 + m.f1,
+     }), { recall: 0, precision: 0, f1: 0 });
+     return {
+         recall: sum.recall / metrics.length,
+         precision: sum.precision / metrics.length,
+         f1: sum.f1 / metrics.length,
+     };
+ }
+ //# sourceMappingURL=context-retrieval-quality.js.map
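A hypothetical call, showing how the evaluator separates "found the right docs" from "used them well" (document IDs invented for illustration):

```ts
import { measureContextRetrieval } from "./context-retrieval-quality.js";

const m = measureContextRetrieval(
  ["persona-max", "journey-onboarding", "spec-magic-link"], // retrieved by PCL
  ["persona-max", "domain-billing"],                        // required by the task
);
// m.recall = 0.5 (1 of 2 required docs found)
// m.precision ≈ 0.33 (1 of 3 retrieved docs was needed)
// m.misses = ["domain-billing"]; m.noise = ["journey-onboarding", "spec-magic-link"]
```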
package/dist/benchmarks/evaluators/context-retrieval-quality.js.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"context-retrieval-quality.js","sourceRoot":"","sources":["../../../benchmarks/evaluators/context-retrieval-quality.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAaH;;;;;GAKG;AACH,MAAM,UAAU,uBAAuB,CACrC,YAAsB,EACtB,WAAqB;IAErB,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,YAAY,CAAC,CAAC;IAC3C,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,WAAW,CAAC,CAAC;IAEzC,MAAM,IAAI,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IAC9D,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IACjE,MAAM,KAAK,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,WAAW,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IAEhE,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7E,MAAM,SAAS,GACb,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAClE,MAAM,EAAE,GACN,MAAM,GAAG,SAAS,GAAG,CAAC;QACpB,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,MAAM,GAAG,SAAS,CAAC;QACjD,CAAC,CAAC,CAAC,CAAC;IAER,OAAO;QACL,MAAM;QACN,SAAS;QACT,EAAE;QACF,SAAS,EAAE,YAAY;QACvB,QAAQ,EAAE,WAAW;QACrB,IAAI;QACJ,MAAM;QACN,KAAK;KACN,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACnC,OAAyB;IAEzB,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC;IACpE,MAAM,GAAG,GAAG,OAAO,CAAC,MAAM,CACxB,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QACX,MAAM,EAAE,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM;QAC7B,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS;QACtC,EAAE,EAAE,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC,EAAE;KAClB,CAAC,EACF,EAAE,MAAM,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CACnC,CAAC;IACF,OAAO;QACL,MAAM,EAAE,GAAG,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM;QACnC,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,OAAO,CAAC,MAAM;QACzC,EAAE,EAAE,GAAG,CAAC,EAAE,GAAG,OAAO,CAAC,MAAM;KAC5B,CAAC;AACJ,CAAC"}
package/dist/benchmarks/evaluators/ir-metrics.d.ts ADDED
@@ -0,0 +1,32 @@
+ /**
+  * Precision@K: fraction of top-K results that are relevant.
+  */
+ export declare function precisionAtK(retrieved: string[], relevant: Set<string>, k: number): number;
+ /**
+  * Recall@K: fraction of relevant docs found in top-K.
+  */
+ export declare function recallAtK(retrieved: string[], relevant: Set<string>, k: number): number;
+ /**
+  * MRR (Mean Reciprocal Rank): 1 / rank of first relevant result.
+  */
+ export declare function reciprocalRank(retrieved: string[], relevant: Set<string>): number;
+ /**
+  * NDCG@K: normalized DCG using ideal ranking.
+  */
+ export declare function ndcgAtK(retrieved: string[], relevanceScores: Map<string, number>, k: number): number;
+ /**
+  * Compute all metrics for a single query.
+  */
+ export declare function computeMetrics(retrieved: string[], relevanceScores: Map<string, number>, relevantThreshold?: number): {
+     precisionAt1: number;
+     precisionAt3: number;
+     precisionAt5: number;
+     recallAt5: number;
+     mrr: number;
+     ndcgAt5: number;
+ };
+ /**
+  * Average metrics across multiple queries.
+  */
+ export declare function averageMetrics(results: Array<ReturnType<typeof computeMetrics>>): ReturnType<typeof computeMetrics>;
+ //# sourceMappingURL=ir-metrics.d.ts.map
package/dist/benchmarks/evaluators/ir-metrics.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"ir-metrics.d.ts","sourceRoot":"","sources":["../../../benchmarks/evaluators/ir-metrics.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,wBAAgB,YAAY,CAC1B,SAAS,EAAE,MAAM,EAAE,EACnB,QAAQ,EAAE,GAAG,CAAC,MAAM,CAAC,EACrB,CAAC,EAAE,MAAM,GACR,MAAM,CAKR;AAED;;GAEG;AACH,wBAAgB,SAAS,CACvB,SAAS,EAAE,MAAM,EAAE,EACnB,QAAQ,EAAE,GAAG,CAAC,MAAM,CAAC,EACrB,CAAC,EAAE,MAAM,GACR,MAAM,CAKR;AAED;;GAEG;AACH,wBAAgB,cAAc,CAC5B,SAAS,EAAE,MAAM,EAAE,EACnB,QAAQ,EAAE,GAAG,CAAC,MAAM,CAAC,GACpB,MAAM,CAKR;AAmBD;;GAEG;AACH,wBAAgB,OAAO,CACrB,SAAS,EAAE,MAAM,EAAE,EACnB,eAAe,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EACpC,CAAC,EAAE,MAAM,GACR,MAAM,CASR;AAED;;GAEG;AACH,wBAAgB,cAAc,CAC5B,SAAS,EAAE,MAAM,EAAE,EACnB,eAAe,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EACpC,iBAAiB,SAAI,GACpB;IACD,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,EAAE,MAAM,CAAC;CACjB,CAeA;AAED;;GAEG;AACH,wBAAgB,cAAc,CAC5B,OAAO,EAAE,KAAK,CAAC,UAAU,CAAC,OAAO,cAAc,CAAC,CAAC,GAChD,UAAU,CAAC,OAAO,cAAc,CAAC,CA0BnC"}
package/dist/benchmarks/evaluators/ir-metrics.js ADDED
@@ -0,0 +1,98 @@
+ /**
+  * Precision@K: fraction of top-K results that are relevant.
+  */
+ export function precisionAtK(retrieved, relevant, k) {
+     const topK = retrieved.slice(0, k);
+     if (topK.length === 0)
+         return 0;
+     const hits = topK.filter(id => relevant.has(id)).length;
+     return hits / topK.length;
+ }
+ /**
+  * Recall@K: fraction of relevant docs found in top-K.
+  */
+ export function recallAtK(retrieved, relevant, k) {
+     if (relevant.size === 0)
+         return 1; // no relevant docs = perfect recall vacuously
+     const topK = retrieved.slice(0, k);
+     const hits = topK.filter(id => relevant.has(id)).length;
+     return hits / relevant.size;
+ }
+ /**
+  * MRR (Mean Reciprocal Rank): 1 / rank of first relevant result.
+  */
+ export function reciprocalRank(retrieved, relevant) {
+     for (let i = 0; i < retrieved.length; i++) {
+         if (relevant.has(retrieved[i]))
+             return 1 / (i + 1);
+     }
+     return 0;
+ }
+ /**
+  * DCG@K with graded relevance (relevance scores 0-3).
+  */
+ function dcgAtK(retrieved, relevanceScores, k) {
+     let dcg = 0;
+     const topK = retrieved.slice(0, k);
+     for (let i = 0; i < topK.length; i++) {
+         const rel = relevanceScores.get(topK[i]) ?? 0;
+         dcg += (Math.pow(2, rel) - 1) / Math.log2(i + 2); // i+2 because log2(1)=0
+     }
+     return dcg;
+ }
+ /**
+  * NDCG@K: normalized DCG using ideal ranking.
+  */
+ export function ndcgAtK(retrieved, relevanceScores, k) {
+     const dcg = dcgAtK(retrieved, relevanceScores, k);
+     // Ideal ranking: sort all docs by relevance descending
+     const idealOrder = [...relevanceScores.entries()]
+         .sort(([, a], [, b]) => b - a)
+         .map(([id]) => id);
+     const idcg = dcgAtK(idealOrder, relevanceScores, k);
+     if (idcg === 0)
+         return 0;
+     return dcg / idcg;
+ }
+ /**
+  * Compute all metrics for a single query.
+  */
+ export function computeMetrics(retrieved, relevanceScores, relevantThreshold = 1) {
+     const relevant = new Set([...relevanceScores.entries()]
+         .filter(([, score]) => score >= relevantThreshold)
+         .map(([id]) => id));
+     return {
+         precisionAt1: precisionAtK(retrieved, relevant, 1),
+         precisionAt3: precisionAtK(retrieved, relevant, 3),
+         precisionAt5: precisionAtK(retrieved, relevant, 5),
+         recallAt5: recallAtK(retrieved, relevant, 5),
+         mrr: reciprocalRank(retrieved, relevant),
+         ndcgAt5: ndcgAtK(retrieved, relevanceScores, 5),
+     };
+ }
+ /**
+  * Average metrics across multiple queries.
+  */
+ export function averageMetrics(results) {
+     const n = results.length;
+     if (n === 0) {
+         return { precisionAt1: 0, precisionAt3: 0, precisionAt5: 0, recallAt5: 0, mrr: 0, ndcgAt5: 0 };
+     }
+     const sum = results.reduce((acc, r) => ({
+         precisionAt1: acc.precisionAt1 + r.precisionAt1,
+         precisionAt3: acc.precisionAt3 + r.precisionAt3,
+         precisionAt5: acc.precisionAt5 + r.precisionAt5,
+         recallAt5: acc.recallAt5 + r.recallAt5,
+         mrr: acc.mrr + r.mrr,
+         ndcgAt5: acc.ndcgAt5 + r.ndcgAt5,
+     }), { precisionAt1: 0, precisionAt3: 0, precisionAt5: 0, recallAt5: 0, mrr: 0, ndcgAt5: 0 });
+     return {
+         precisionAt1: sum.precisionAt1 / n,
+         precisionAt3: sum.precisionAt3 / n,
+         precisionAt5: sum.precisionAt5 / n,
+         recallAt5: sum.recallAt5 / n,
+         mrr: sum.mrr / n,
+         ndcgAt5: sum.ndcgAt5 / n,
+     };
+ }
+ //# sourceMappingURL=ir-metrics.js.map
package/dist/benchmarks/evaluators/ir-metrics.js.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"ir-metrics.js","sourceRoot":"","sources":["../../../benchmarks/evaluators/ir-metrics.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,MAAM,UAAU,YAAY,CAC1B,SAAmB,EACnB,QAAqB,EACrB,CAAS;IAET,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACnC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAChC,MAAM,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;IACxD,OAAO,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC;AAC5B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,SAAS,CACvB,SAAmB,EACnB,QAAqB,EACrB,CAAS;IAET,IAAI,QAAQ,CAAC,IAAI,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC,CAAC,8CAA8C;IACjF,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACnC,MAAM,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;IACxD,OAAO,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC;AAC9B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAC5B,SAAmB,EACnB,QAAqB;IAErB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,IAAI,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC;YAAE,OAAO,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACtD,CAAC;IACD,OAAO,CAAC,CAAC;AACX,CAAC;AAED;;GAEG;AACH,SAAS,MAAM,CACb,SAAmB,EACnB,eAAoC,EACpC,CAAS;IAET,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,eAAe,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,CAAC;QAC/C,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,wBAAwB;IAC5E,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,OAAO,CACrB,SAAmB,EACnB,eAAoC,EACpC,CAAS;IAET,MAAM,GAAG,GAAG,MAAM,CAAC,SAAS,EAAE,eAAe,EAAE,CAAC,CAAC,CAAC;IAClD,uDAAuD;IACvD,MAAM,UAAU,GAAG,CAAC,GAAG,eAAe,CAAC,OAAO,EAAE,CAAC;SAC9C,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC;SAC7B,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC;IACrB,MAAM,IAAI,GAAG,MAAM,CAAC,UAAU,EAAE,eAAe,EAAE,CAAC,CAAC,CAAC;IACpD,IAAI,IAAI,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IACzB,OAAO,GAAG,GAAG,IAAI,CAAC;AACpB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAC5B,SAAmB,EACnB,eAAoC,EACpC,iBAAiB,GAAG,CAAC;IASrB,MAAM,QAAQ,GAAG,IAAI,GAAG,CACtB,CAAC,GAAG,eAAe,CAAC,OAAO,EAAE,CAAC;SAC3B,MAAM,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,KAAK,IAAI,iBAAiB,CAAC;SACjD,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CACrB,CAAC;IAEF,OAAO;QACL,YAAY,EAAE,YAAY,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAClD,YAAY,EAAE,YAAY,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAClD,YAAY,EAAE,YAAY,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAClD,SAAS,EAAE,SAAS,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAC5C,GAAG,EAAE,cAAc,CAAC,SAAS,EAAE,QAAQ,CAAC;QACxC,OAAO,EAAE,OAAO,CAAC,SAAS,EAAE,eAAe,EAAE,CAAC,CAAC;KAChD,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAC5B,OAAiD;IAEjD,MAAM,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;IACzB,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACZ,OAAO,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC;IACjG,CAAC;IAED,MAAM,GAAG,GAAG,OAAO,CAAC,MAAM,CACxB,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QACX,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,YAAY;QAC/C,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,YAAY;QAC/C,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,YAAY;QAC/C,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS;QACtC,GAAG,EAAE,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG;QACpB,OAAO,EAAE,GAAG,CAAC,OAAO,GAAG,CAAC,CAAC,OAAO;KACjC,CAAC,EACF,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,CACxF,CAAC;IAEF,OAAO;QACL,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC;QAClC,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC;QAClC,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC;QAClC,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,CAAC;QAC5B,GAAG,EAAE,GAAG,CAAC,GAAG,GAAG,CAAC;QAChB,OAAO,EAAE,GAAG,CAAC,OAAO,GAAG,CAAC;KACzB,CAAC;AACJ,CAAC"}
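For orientation, a hypothetical single-query evaluation with these metrics (relevance labels 0-3 and document IDs are invented):

```ts
import { computeMetrics, averageMetrics } from "./ir-metrics.js";

const retrieved = ["journey-onboarding", "persona-max", "spec-magic-link"];
const relevance = new Map([
  ["journey-onboarding", 3], // highly relevant
  ["persona-max", 2],
  ["domain-core-rules", 1],  // relevant but never retrieved, so recall suffers
]);

const perQuery = computeMetrics(retrieved, relevance);
// perQuery.precisionAt1 = 1, precisionAt3 ≈ 0.67, recallAt5 ≈ 0.67, mrr = 1
const overall = averageMetrics([perQuery]); // aggregate over all queries in a run
```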
package/dist/benchmarks/evaluators/structured-judge.d.ts ADDED
@@ -0,0 +1,34 @@
+ /**
+  * Structured Judge — evaluates AI output with specific yes/no questions per criterion.
+  *
+  * Instead of "rate 0-10", asks: "Does the code implement X? YES/NO"
+  * Then scores = (yes_count / total_questions) * 10.
+  */
+ import Anthropic from "@anthropic-ai/sdk";
+ import type { CodingTask } from "../lib/types.js";
+ export interface JudgmentResult {
+     taskId: string;
+     totalQuestions: number;
+     yesCount: number;
+     score: number;
+     details: Array<{
+         criterion: string;
+         answer: "YES" | "NO" | "PARTIAL";
+         reasoning: string;
+     }>;
+ }
+ /**
+  * Evaluate an AI-generated output against structured criteria.
+  */
+ export declare function structuredJudge(client: Anthropic, task: CodingTask, output: string, contextDocs: string): Promise<JudgmentResult>;
+ /**
+  * Check if generated code contains valid TypeScript/JSX syntax.
+  * Uses the TypeScript compiler API in syntax-only mode for accurate parsing
+  * of template literals, JSX, and other complex syntax.
+  */
+ export declare function checkTypeScriptSyntax(output: string): {
+     valid: boolean;
+     errorCount: number;
+     codeBlockCount: number;
+ };
+ //# sourceMappingURL=structured-judge.d.ts.map
package/dist/benchmarks/evaluators/structured-judge.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"structured-judge.d.ts","sourceRoot":"","sources":["../../../benchmarks/evaluators/structured-judge.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AACH,OAAO,SAAS,MAAM,mBAAmB,CAAC;AAC1C,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAIlD,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,cAAc,EAAE,MAAM,CAAC;IACvB,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,KAAK,CAAC;QACb,SAAS,EAAE,MAAM,CAAC;QAClB,MAAM,EAAE,KAAK,GAAG,IAAI,GAAG,SAAS,CAAC;QACjC,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC,CAAC;CACJ;AAiCD;;GAEG;AACH,wBAAsB,eAAe,CACnC,MAAM,EAAE,SAAS,EACjB,IAAI,EAAE,UAAU,EAChB,MAAM,EAAE,MAAM,EACd,WAAW,EAAE,MAAM,GAClB,OAAO,CAAC,cAAc,CAAC,CAsGzB;AAED;;;;GAIG;AACH,wBAAgB,qBAAqB,CACnC,MAAM,EAAE,MAAM,GACb;IAAE,KAAK,EAAE,OAAO,CAAC;IAAC,UAAU,EAAE,MAAM,CAAC;IAAC,cAAc,EAAE,MAAM,CAAA;CAAE,CAgDhE"}