pcl-mcp 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmarks/evaluators/context-retrieval-quality.d.ts +30 -0
- package/dist/benchmarks/evaluators/context-retrieval-quality.d.ts.map +1 -0
- package/dist/benchmarks/evaluators/context-retrieval-quality.js +50 -0
- package/dist/benchmarks/evaluators/context-retrieval-quality.js.map +1 -0
- package/dist/benchmarks/evaluators/ir-metrics.d.ts +32 -0
- package/dist/benchmarks/evaluators/ir-metrics.d.ts.map +1 -0
- package/dist/benchmarks/evaluators/ir-metrics.js +98 -0
- package/dist/benchmarks/evaluators/ir-metrics.js.map +1 -0
- package/dist/benchmarks/evaluators/structured-judge.d.ts +34 -0
- package/dist/benchmarks/evaluators/structured-judge.d.ts.map +1 -0
- package/dist/benchmarks/evaluators/structured-judge.js +153 -0
- package/dist/benchmarks/evaluators/structured-judge.js.map +1 -0
- package/dist/benchmarks/evaluators/token-counter.d.ts +9 -0
- package/dist/benchmarks/evaluators/token-counter.d.ts.map +1 -0
- package/dist/benchmarks/evaluators/token-counter.js +24 -0
- package/dist/benchmarks/evaluators/token-counter.js.map +1 -0
- package/dist/benchmarks/generators/generate-corpus.d.ts +2 -0
- package/dist/benchmarks/generators/generate-corpus.d.ts.map +1 -0
- package/dist/benchmarks/generators/generate-corpus.js +243 -0
- package/dist/benchmarks/generators/generate-corpus.js.map +1 -0
- package/dist/benchmarks/lib/harness.d.ts +23 -0
- package/dist/benchmarks/lib/harness.d.ts.map +1 -0
- package/dist/benchmarks/lib/harness.js +44 -0
- package/dist/benchmarks/lib/harness.js.map +1 -0
- package/dist/benchmarks/lib/types.d.ts +79 -0
- package/dist/benchmarks/lib/types.d.ts.map +1 -0
- package/dist/benchmarks/lib/types.js +2 -0
- package/dist/benchmarks/lib/types.js.map +1 -0
- package/dist/benchmarks/reporters/markdown-reporter.d.ts +2 -0
- package/dist/benchmarks/reporters/markdown-reporter.d.ts.map +1 -0
- package/dist/benchmarks/reporters/markdown-reporter.js +80 -0
- package/dist/benchmarks/reporters/markdown-reporter.js.map +1 -0
- package/dist/benchmarks/runners/bench-ablation.d.ts +2 -0
- package/dist/benchmarks/runners/bench-ablation.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-ablation.js +49 -0
- package/dist/benchmarks/runners/bench-ablation.js.map +1 -0
- package/dist/benchmarks/runners/bench-ai-quality.d.ts +2 -0
- package/dist/benchmarks/runners/bench-ai-quality.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-ai-quality.js +297 -0
- package/dist/benchmarks/runners/bench-ai-quality.js.map +1 -0
- package/dist/benchmarks/runners/bench-interactive-eval.d.ts +2 -0
- package/dist/benchmarks/runners/bench-interactive-eval.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-interactive-eval.js +119 -0
- package/dist/benchmarks/runners/bench-interactive-eval.js.map +1 -0
- package/dist/benchmarks/runners/bench-performance.bench.d.ts +2 -0
- package/dist/benchmarks/runners/bench-performance.bench.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-performance.bench.js +50 -0
- package/dist/benchmarks/runners/bench-performance.bench.js.map +1 -0
- package/dist/benchmarks/runners/bench-search-quality.d.ts +2 -0
- package/dist/benchmarks/runners/bench-search-quality.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-search-quality.js +70 -0
- package/dist/benchmarks/runners/bench-search-quality.js.map +1 -0
- package/dist/benchmarks/runners/bench-token-efficiency.d.ts +2 -0
- package/dist/benchmarks/runners/bench-token-efficiency.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-token-efficiency.js +89 -0
- package/dist/benchmarks/runners/bench-token-efficiency.js.map +1 -0
- package/dist/benchmarks/runners/diag.d.ts +2 -0
- package/dist/benchmarks/runners/diag.d.ts.map +1 -0
- package/dist/benchmarks/runners/diag.js +30 -0
- package/dist/benchmarks/runners/diag.js.map +1 -0
- package/dist/benchmarks/vitest.config.bench.d.ts +3 -0
- package/dist/benchmarks/vitest.config.bench.d.ts.map +1 -0
- package/dist/benchmarks/vitest.config.bench.js +14 -0
- package/dist/benchmarks/vitest.config.bench.js.map +1 -0
- package/dist/src/db.d.ts +2 -1
- package/dist/src/db.d.ts.map +1 -1
- package/dist/src/db.js +25 -21
- package/dist/src/db.js.map +1 -1
- package/dist/src/embeddings.d.ts +1 -1
- package/dist/src/embeddings.js +2 -2
- package/dist/src/embeddings.js.map +1 -1
- package/dist/src/indexer.d.ts +1 -1
- package/dist/src/indexer.d.ts.map +1 -1
- package/dist/src/indexer.js +6 -2
- package/dist/src/indexer.js.map +1 -1
- package/dist/src/search.d.ts.map +1 -1
- package/dist/src/search.js +138 -26
- package/dist/src/search.js.map +1 -1
- package/dist/src/server.js +6 -0
- package/dist/src/server.js.map +1 -1
- package/dist/src/types.d.ts +1 -0
- package/dist/src/types.d.ts.map +1 -1
- package/dist/tests/db.test.d.ts +2 -0
- package/dist/tests/db.test.d.ts.map +1 -0
- package/dist/tests/db.test.js +459 -0
- package/dist/tests/db.test.js.map +1 -0
- package/dist/tests/embeddings.test.d.ts +2 -0
- package/dist/tests/embeddings.test.d.ts.map +1 -0
- package/dist/tests/embeddings.test.js +165 -0
- package/dist/tests/embeddings.test.js.map +1 -0
- package/dist/tests/helpers/test-harness.d.ts +26 -0
- package/dist/tests/helpers/test-harness.d.ts.map +1 -0
- package/dist/tests/helpers/test-harness.js +80 -0
- package/dist/tests/helpers/test-harness.js.map +1 -0
- package/dist/tests/indexer.test.d.ts +2 -0
- package/dist/tests/indexer.test.d.ts.map +1 -0
- package/dist/tests/indexer.test.js +299 -0
- package/dist/tests/indexer.test.js.map +1 -0
- package/dist/tests/schemas.test.d.ts +2 -0
- package/dist/tests/schemas.test.d.ts.map +1 -0
- package/dist/tests/schemas.test.js +378 -0
- package/dist/tests/schemas.test.js.map +1 -0
- package/dist/tests/search.test.d.ts +2 -0
- package/dist/tests/search.test.d.ts.map +1 -0
- package/dist/tests/search.test.js +129 -0
- package/dist/tests/search.test.js.map +1 -0
- package/dist/tests/tools.test.d.ts +2 -0
- package/dist/tests/tools.test.d.ts.map +1 -0
- package/dist/tests/tools.test.js +232 -0
- package/dist/tests/tools.test.js.map +1 -0
- package/package.json +14 -2
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Context Retrieval Quality — measures if PCL retrieves the RIGHT documents.
|
|
3
|
+
* Disentangles "did PCL find the right docs" from "did the LLM use them well."
|
|
4
|
+
*/
|
|
5
|
+
export interface ContextMetrics {
|
|
6
|
+
recall: number;
|
|
7
|
+
precision: number;
|
|
8
|
+
f1: number;
|
|
9
|
+
retrieved: string[];
|
|
10
|
+
required: string[];
|
|
11
|
+
hits: string[];
|
|
12
|
+
misses: string[];
|
|
13
|
+
noise: string[];
|
|
14
|
+
}
|
|
15
|
+
/**
|
|
16
|
+
* Compute context retrieval quality metrics.
|
|
17
|
+
*
|
|
18
|
+
* @param retrievedIds - Document IDs returned by PCL search
|
|
19
|
+
* @param requiredIds - Document IDs that the task actually needs
|
|
20
|
+
*/
|
|
21
|
+
export declare function measureContextRetrieval(retrievedIds: string[], requiredIds: string[]): ContextMetrics;
|
|
22
|
+
/**
|
|
23
|
+
* Aggregate context metrics across multiple tasks.
|
|
24
|
+
*/
|
|
25
|
+
export declare function averageContextMetrics(metrics: ContextMetrics[]): {
|
|
26
|
+
recall: number;
|
|
27
|
+
precision: number;
|
|
28
|
+
f1: number;
|
|
29
|
+
};
|
|
30
|
+
//# sourceMappingURL=context-retrieval-quality.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"context-retrieval-quality.d.ts","sourceRoot":"","sources":["../../../benchmarks/evaluators/context-retrieval-quality.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,EAAE,EAAE,MAAM,CAAC;IACX,SAAS,EAAE,MAAM,EAAE,CAAC;IACpB,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,IAAI,EAAE,MAAM,EAAE,CAAC;IACf,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,KAAK,EAAE,MAAM,EAAE,CAAC;CACjB;AAED;;;;;GAKG;AACH,wBAAgB,uBAAuB,CACrC,YAAY,EAAE,MAAM,EAAE,EACtB,WAAW,EAAE,MAAM,EAAE,GACpB,cAAc,CA0BhB;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACnC,OAAO,EAAE,cAAc,EAAE,GACxB;IAAE,MAAM,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,EAAE,EAAE,MAAM,CAAA;CAAE,CAenD"}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Context Retrieval Quality — measures if PCL retrieves the RIGHT documents.
|
|
3
|
+
* Disentangles "did PCL find the right docs" from "did the LLM use them well."
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Compute context retrieval quality metrics for a single task.
 *
 * Recall, precision and F1 are computed over the *distinct* document IDs of
 * each input, so an ID that appears more than once in either list can neither
 * inflate a score above 1 nor deflate it. For duplicate-free inputs the result
 * is the same as a plain count-based computation.
 *
 * Conventions for empty inputs: recall is 1 when nothing is required, and
 * precision is 1 when nothing was retrieved. F1 is 0 when both are 0.
 *
 * @param {string[]} retrievedIds - Document IDs returned by PCL search
 * @param {string[]} requiredIds - Document IDs that the task actually needs
 * @returns {{recall: number, precision: number, f1: number, retrieved: string[],
 *   required: string[], hits: string[], misses: string[], noise: string[]}}
 *   The scores, the two input arrays as passed in, and the de-duplicated
 *   hit / miss / noise ID lists (first-occurrence order).
 */
export function measureContextRetrieval(retrievedIds, requiredIds) {
  const retrievedSet = new Set(retrievedIds);
  const requiredSet = new Set(requiredIds);

  // Iterate the sets (insertion-ordered) rather than the raw arrays so that
  // repeated IDs are only classified once.
  const uniqueRequired = [...requiredSet];
  const uniqueRetrieved = [...retrievedSet];
  const hits = uniqueRequired.filter((id) => retrievedSet.has(id));
  const misses = uniqueRequired.filter((id) => !retrievedSet.has(id));
  const noise = uniqueRetrieved.filter((id) => !requiredSet.has(id));

  const recall = requiredSet.size > 0 ? hits.length / requiredSet.size : 1;
  const precision = retrievedSet.size > 0 ? hits.length / retrievedSet.size : 1;
  const f1 = recall + precision > 0
    ? (2 * recall * precision) / (recall + precision)
    : 0;

  return {
    recall,
    precision,
    f1,
    retrieved: retrievedIds,
    required: requiredIds,
    hits,
    misses,
    noise,
  };
}
|
|
33
|
+
/**
 * Average recall, precision and F1 over a list of per-task context metrics.
 *
 * @param {Array<{recall: number, precision: number, f1: number}>} metrics
 *   Per-task metrics; only the three score fields are read.
 * @returns {{recall: number, precision: number, f1: number}}
 *   Arithmetic means of the three scores, or all zeros for an empty list.
 */
export function averageContextMetrics(metrics) {
  const count = metrics.length;
  if (count === 0) {
    return { recall: 0, precision: 0, f1: 0 };
  }

  let recallTotal = 0;
  let precisionTotal = 0;
  let f1Total = 0;
  for (const entry of metrics) {
    recallTotal += entry.recall;
    precisionTotal += entry.precision;
    f1Total += entry.f1;
  }

  return {
    recall: recallTotal / count,
    precision: precisionTotal / count,
    f1: f1Total / count,
  };
}
|
|
50
|
+
//# sourceMappingURL=context-retrieval-quality.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"context-retrieval-quality.js","sourceRoot":"","sources":["../../../benchmarks/evaluators/context-retrieval-quality.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAaH;;;;;GAKG;AACH,MAAM,UAAU,uBAAuB,CACrC,YAAsB,EACtB,WAAqB;IAErB,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,YAAY,CAAC,CAAC;IAC3C,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,WAAW,CAAC,CAAC;IAEzC,MAAM,IAAI,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IAC9D,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IACjE,MAAM,KAAK,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,WAAW,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IAEhE,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7E,MAAM,SAAS,GACb,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAClE,MAAM,EAAE,GACN,MAAM,GAAG,SAAS,GAAG,CAAC;QACpB,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,MAAM,GAAG,SAAS,CAAC;QACjD,CAAC,CAAC,CAAC,CAAC;IAER,OAAO;QACL,MAAM;QACN,SAAS;QACT,EAAE;QACF,SAAS,EAAE,YAAY;QACvB,QAAQ,EAAE,WAAW;QACrB,IAAI;QACJ,MAAM;QACN,KAAK;KACN,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACnC,OAAyB;IAEzB,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC;IACpE,MAAM,GAAG,GAAG,OAAO,CAAC,MAAM,CACxB,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QACX,MAAM,EAAE,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM;QAC7B,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS;QACtC,EAAE,EAAE,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC,EAAE;KAClB,CAAC,EACF,EAAE,MAAM,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CACnC,CAAC;IACF,OAAO;QACL,MAAM,EAAE,GAAG,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM;QACnC,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,OAAO,CAAC,MAAM;QACzC,EAAE,EAAE,GAAG,CAAC,EAAE,GAAG,OAAO,CAAC,MAAM;KAC5B,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Precision@K: fraction of top-K results that are relevant.
|
|
3
|
+
*/
|
|
4
|
+
export declare function precisionAtK(retrieved: string[], relevant: Set<string>, k: number): number;
|
|
5
|
+
/**
|
|
6
|
+
* Recall@K: fraction of relevant docs found in top-K.
|
|
7
|
+
*/
|
|
8
|
+
export declare function recallAtK(retrieved: string[], relevant: Set<string>, k: number): number;
|
|
9
|
+
/**
|
|
10
|
+
* MRR (Mean Reciprocal Rank): 1 / rank of first relevant result.
|
|
11
|
+
*/
|
|
12
|
+
export declare function reciprocalRank(retrieved: string[], relevant: Set<string>): number;
|
|
13
|
+
/**
|
|
14
|
+
* NDCG@K: normalized DCG using ideal ranking.
|
|
15
|
+
*/
|
|
16
|
+
export declare function ndcgAtK(retrieved: string[], relevanceScores: Map<string, number>, k: number): number;
|
|
17
|
+
/**
|
|
18
|
+
* Compute all metrics for a single query.
|
|
19
|
+
*/
|
|
20
|
+
export declare function computeMetrics(retrieved: string[], relevanceScores: Map<string, number>, relevantThreshold?: number): {
|
|
21
|
+
precisionAt1: number;
|
|
22
|
+
precisionAt3: number;
|
|
23
|
+
precisionAt5: number;
|
|
24
|
+
recallAt5: number;
|
|
25
|
+
mrr: number;
|
|
26
|
+
ndcgAt5: number;
|
|
27
|
+
};
|
|
28
|
+
/**
|
|
29
|
+
* Average metrics across multiple queries.
|
|
30
|
+
*/
|
|
31
|
+
export declare function averageMetrics(results: Array<ReturnType<typeof computeMetrics>>): ReturnType<typeof computeMetrics>;
|
|
32
|
+
//# sourceMappingURL=ir-metrics.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ir-metrics.d.ts","sourceRoot":"","sources":["../../../benchmarks/evaluators/ir-metrics.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,wBAAgB,YAAY,CAC1B,SAAS,EAAE,MAAM,EAAE,EACnB,QAAQ,EAAE,GAAG,CAAC,MAAM,CAAC,EACrB,CAAC,EAAE,MAAM,GACR,MAAM,CAKR;AAED;;GAEG;AACH,wBAAgB,SAAS,CACvB,SAAS,EAAE,MAAM,EAAE,EACnB,QAAQ,EAAE,GAAG,CAAC,MAAM,CAAC,EACrB,CAAC,EAAE,MAAM,GACR,MAAM,CAKR;AAED;;GAEG;AACH,wBAAgB,cAAc,CAC5B,SAAS,EAAE,MAAM,EAAE,EACnB,QAAQ,EAAE,GAAG,CAAC,MAAM,CAAC,GACpB,MAAM,CAKR;AAmBD;;GAEG;AACH,wBAAgB,OAAO,CACrB,SAAS,EAAE,MAAM,EAAE,EACnB,eAAe,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EACpC,CAAC,EAAE,MAAM,GACR,MAAM,CASR;AAED;;GAEG;AACH,wBAAgB,cAAc,CAC5B,SAAS,EAAE,MAAM,EAAE,EACnB,eAAe,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EACpC,iBAAiB,SAAI,GACpB;IACD,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,EAAE,MAAM,CAAC;CACjB,CAeA;AAED;;GAEG;AACH,wBAAgB,cAAc,CAC5B,OAAO,EAAE,KAAK,CAAC,UAAU,CAAC,OAAO,cAAc,CAAC,CAAC,GAChD,UAAU,CAAC,OAAO,cAAc,CAAC,CA0BnC"}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
/**
 * Precision@K: the share of the first K retrieved IDs that are relevant.
 *
 * The denominator is the number of results actually present in the top-K
 * slice, so a list shorter than K is not penalised for its length.
 *
 * @param {string[]} retrieved - Ranked result IDs, best first
 * @param {Set<string>} relevant - IDs considered relevant
 * @param {number} k - Cut-off rank
 * @returns {number} Fraction of relevant results, or 0 when the slice is empty
 */
export function precisionAtK(retrieved, relevant, k) {
  const head = retrieved.slice(0, k);
  const size = head.length;
  if (size === 0) {
    return 0;
  }

  let relevantCount = 0;
  for (const docId of head) {
    if (relevant.has(docId)) {
      relevantCount += 1;
    }
  }
  return relevantCount / size;
}
|
|
11
|
+
/**
 * Recall@K: the share of relevant IDs that appear in the first K results.
 *
 * Each relevant ID is counted at most once, so a result list that repeats the
 * same relevant ID cannot push recall above 1.
 *
 * @param {string[]} retrieved - Ranked result IDs, best first
 * @param {Set<string>} relevant - IDs considered relevant
 * @param {number} k - Cut-off rank
 * @returns {number} Value in [0, 1]; 1 when there are no relevant IDs at all
 */
export function recallAtK(retrieved, relevant, k) {
  if (relevant.size === 0) {
    return 1; // no relevant docs = perfect recall vacuously
  }

  // Collect distinct relevant IDs seen in the top-K slice.
  const found = new Set();
  for (const id of retrieved.slice(0, k)) {
    if (relevant.has(id)) {
      found.add(id);
    }
  }
  return found.size / relevant.size;
}
|
|
21
|
+
/**
 * Reciprocal rank for one query: 1 / (1-based rank of the first relevant
 * result). Averaging this value over queries yields MRR.
 *
 * @param {string[]} retrieved - Ranked result IDs, best first
 * @param {Set<string>} relevant - IDs considered relevant
 * @returns {number} 1 / rank of the first relevant result, or 0 if none is found
 */
export function reciprocalRank(retrieved, relevant) {
  const firstHit = retrieved.findIndex((docId) => relevant.has(docId));
  return firstHit === -1 ? 0 : 1 / (firstHit + 1);
}
|
|
31
|
+
/**
 * DCG@K with graded relevance.
 *
 * Each of the first K results contributes (2^grade - 1) / log2(rank + 1),
 * where rank is 1-based and IDs missing from the score map have grade 0.
 * The original comment describes grades as 0-3; the code does not enforce
 * that range.
 *
 * @param {string[]} retrieved - Ranked result IDs, best first
 * @param {Map<string, number>} relevanceScores - Relevance grade per ID
 * @param {number} k - Cut-off rank
 * @returns {number} Discounted cumulative gain of the top-K slice
 */
function dcgAtK(retrieved, relevanceScores, k) {
  return retrieved.slice(0, k).reduce((total, docId, position) => {
    const grade = relevanceScores.get(docId) ?? 0;
    const gain = Math.pow(2, grade) - 1;
    // position is 0-based, so position + 2 keeps the first discount at log2(2) = 1.
    const discount = Math.log2(position + 2);
    return total + gain / discount;
  }, 0);
}
|
|
43
|
+
/**
 * NDCG@K: DCG@K of the retrieved ranking divided by the DCG@K of the ideal
 * ranking, where the ideal ranking lists every scored ID by descending grade.
 *
 * @param {string[]} retrieved - Ranked result IDs, best first
 * @param {Map<string, number>} relevanceScores - Relevance grade per ID
 * @param {number} k - Cut-off rank
 * @returns {number} Normalised DCG, or 0 when the ideal DCG is 0
 */
export function ndcgAtK(retrieved, relevanceScores, k) {
  const byGradeDescending = (left, right) => right[1] - left[1];
  const idealOrder = Array.from(relevanceScores.entries())
    .sort(byGradeDescending)
    .map((entry) => entry[0]);

  const idealDcg = dcgAtK(idealOrder, relevanceScores, k);
  if (idealDcg === 0) {
    return 0;
  }
  return dcgAtK(retrieved, relevanceScores, k) / idealDcg;
}
|
|
57
|
+
/**
 * Compute the full metric set for a single query.
 *
 * An ID counts as relevant for the binary metrics (precision, recall, MRR)
 * when its grade is at least `relevantThreshold`; NDCG uses the raw grades.
 *
 * @param {string[]} retrieved - Ranked result IDs, best first
 * @param {Map<string, number>} relevanceScores - Relevance grade per ID
 * @param {number} [relevantThreshold=1] - Minimum grade treated as relevant
 * @returns {{precisionAt1: number, precisionAt3: number, precisionAt5: number,
 *   recallAt5: number, mrr: number, ndcgAt5: number}}
 */
export function computeMetrics(retrieved, relevanceScores, relevantThreshold = 1) {
  const relevant = new Set();
  for (const [docId, grade] of relevanceScores.entries()) {
    if (grade >= relevantThreshold) {
      relevant.add(docId);
    }
  }

  const precisionAt1 = precisionAtK(retrieved, relevant, 1);
  const precisionAt3 = precisionAtK(retrieved, relevant, 3);
  const precisionAt5 = precisionAtK(retrieved, relevant, 5);
  const recallAt5 = recallAtK(retrieved, relevant, 5);
  const mrr = reciprocalRank(retrieved, relevant);
  const ndcgAt5 = ndcgAtK(retrieved, relevanceScores, 5);

  return { precisionAt1, precisionAt3, precisionAt5, recallAt5, mrr, ndcgAt5 };
}
|
|
73
|
+
/**
 * Average each metric across the results of multiple queries.
 *
 * @param {Array<{precisionAt1: number, precisionAt3: number, precisionAt5: number,
 *   recallAt5: number, mrr: number, ndcgAt5: number}>} results - Per-query metrics
 * @returns {{precisionAt1: number, precisionAt3: number, precisionAt5: number,
 *   recallAt5: number, mrr: number, ndcgAt5: number}}
 *   Arithmetic mean per metric, or all zeros for an empty list.
 */
export function averageMetrics(results) {
  const metricNames = [
    "precisionAt1",
    "precisionAt3",
    "precisionAt5",
    "recallAt5",
    "mrr",
    "ndcgAt5",
  ];
  const queryCount = results.length;

  const totals = Object.fromEntries(metricNames.map((name) => [name, 0]));
  if (queryCount === 0) {
    return totals;
  }

  for (const result of results) {
    for (const name of metricNames) {
      totals[name] += result[name];
    }
  }

  return Object.fromEntries(
    metricNames.map((name) => [name, totals[name] / queryCount]),
  );
}
|
|
98
|
+
//# sourceMappingURL=ir-metrics.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ir-metrics.js","sourceRoot":"","sources":["../../../benchmarks/evaluators/ir-metrics.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,MAAM,UAAU,YAAY,CAC1B,SAAmB,EACnB,QAAqB,EACrB,CAAS;IAET,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACnC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAChC,MAAM,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;IACxD,OAAO,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC;AAC5B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,SAAS,CACvB,SAAmB,EACnB,QAAqB,EACrB,CAAS;IAET,IAAI,QAAQ,CAAC,IAAI,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC,CAAC,8CAA8C;IACjF,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACnC,MAAM,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;IACxD,OAAO,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC;AAC9B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAC5B,SAAmB,EACnB,QAAqB;IAErB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,IAAI,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC;YAAE,OAAO,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACtD,CAAC;IACD,OAAO,CAAC,CAAC;AACX,CAAC;AAED;;GAEG;AACH,SAAS,MAAM,CACb,SAAmB,EACnB,eAAoC,EACpC,CAAS;IAET,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,eAAe,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,CAAC;QAC/C,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,wBAAwB;IAC5E,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,OAAO,CACrB,SAAmB,EACnB,eAAoC,EACpC,CAAS;IAET,MAAM,GAAG,GAAG,MAAM,CAAC,SAAS,EAAE,eAAe,EAAE,CAAC,CAAC,CAAC;IAClD,uDAAuD;IACvD,MAAM,UAAU,GAAG,CAAC,GAAG,eAAe,CAAC,OAAO,EAAE,CAAC;SAC9C,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC;SAC7B,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CAA
C;IACrB,MAAM,IAAI,GAAG,MAAM,CAAC,UAAU,EAAE,eAAe,EAAE,CAAC,CAAC,CAAC;IACpD,IAAI,IAAI,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IACzB,OAAO,GAAG,GAAG,IAAI,CAAC;AACpB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAC5B,SAAmB,EACnB,eAAoC,EACpC,iBAAiB,GAAG,CAAC;IASrB,MAAM,QAAQ,GAAG,IAAI,GAAG,CACtB,CAAC,GAAG,eAAe,CAAC,OAAO,EAAE,CAAC;SAC3B,MAAM,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,KAAK,IAAI,iBAAiB,CAAC;SACjD,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CACrB,CAAC;IAEF,OAAO;QACL,YAAY,EAAE,YAAY,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAClD,YAAY,EAAE,YAAY,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAClD,YAAY,EAAE,YAAY,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAClD,SAAS,EAAE,SAAS,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAC5C,GAAG,EAAE,cAAc,CAAC,SAAS,EAAE,QAAQ,CAAC;QACxC,OAAO,EAAE,OAAO,CAAC,SAAS,EAAE,eAAe,EAAE,CAAC,CAAC;KAChD,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAC5B,OAAiD;IAEjD,MAAM,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;IACzB,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACZ,OAAO,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC;IACjG,CAAC;IAED,MAAM,GAAG,GAAG,OAAO,CAAC,MAAM,CACxB,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QACX,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,YAAY;QAC/C,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,YAAY;QAC/C,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,YAAY;QAC/C,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS;QACtC,GAAG,EAAE,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG;QACpB,OAAO,EAAE,GAAG,CAAC,OAAO,GAAG,CAAC,CAAC,OAAO;KACjC,CAAC,EACF,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,CACxF,CAAC;IAEF,OAAO;QACL,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC;QAClC,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC;QAClC,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC;QAClC,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,CAAC;QAC5B,GAAG,EAAE,GAAG,CAAC,GAAG,GAAG,CAAC;QAChB,OAAO,EAAE,GAAG,CAAC,OAAO,GAAG,CAAC;KACzB,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Structured Judge — evaluates AI output with specific yes/no questions per criterion.
|
|
3
|
+
*
|
|
4
|
+
* Instead of "rate 0-10", asks: "Does the code implement X? YES/NO"
|
|
5
|
+
* Then scores = (yes_count / total_questions) * 10.
|
|
6
|
+
*/
|
|
7
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
8
|
+
import type { CodingTask } from "../lib/types.js";
|
|
9
|
+
export interface JudgmentResult {
|
|
10
|
+
taskId: string;
|
|
11
|
+
totalQuestions: number;
|
|
12
|
+
yesCount: number;
|
|
13
|
+
score: number;
|
|
14
|
+
details: Array<{
|
|
15
|
+
criterion: string;
|
|
16
|
+
answer: "YES" | "NO" | "PARTIAL";
|
|
17
|
+
reasoning: string;
|
|
18
|
+
}>;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Evaluate an AI-generated output against structured criteria.
|
|
22
|
+
*/
|
|
23
|
+
export declare function structuredJudge(client: Anthropic, task: CodingTask, output: string, contextDocs: string): Promise<JudgmentResult>;
|
|
24
|
+
/**
|
|
25
|
+
* Check if generated code contains valid TypeScript/JSX syntax.
|
|
26
|
+
* Uses the TypeScript compiler API in syntax-only mode for accurate parsing
|
|
27
|
+
* of template literals, JSX, and other complex syntax.
|
|
28
|
+
*/
|
|
29
|
+
export declare function checkTypeScriptSyntax(output: string): {
|
|
30
|
+
valid: boolean;
|
|
31
|
+
errorCount: number;
|
|
32
|
+
codeBlockCount: number;
|
|
33
|
+
};
|
|
34
|
+
//# sourceMappingURL=structured-judge.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"structured-judge.d.ts","sourceRoot":"","sources":["../../../benchmarks/evaluators/structured-judge.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AACH,OAAO,SAAS,MAAM,mBAAmB,CAAC;AAC1C,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAIlD,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,cAAc,EAAE,MAAM,CAAC;IACvB,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,KAAK,CAAC;QACb,SAAS,EAAE,MAAM,CAAC;QAClB,MAAM,EAAE,KAAK,GAAG,IAAI,GAAG,SAAS,CAAC;QACjC,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC,CAAC;CACJ;AAiCD;;GAEG;AACH,wBAAsB,eAAe,CACnC,MAAM,EAAE,SAAS,EACjB,IAAI,EAAE,UAAU,EAChB,MAAM,EAAE,MAAM,EACd,WAAW,EAAE,MAAM,GAClB,OAAO,CAAC,cAAc,CAAC,CAsGzB;AAED;;;;GAIG;AACH,wBAAgB,qBAAqB,CACnC,MAAM,EAAE,MAAM,GACb;IAAE,KAAK,EAAE,OAAO,CAAC;IAAC,UAAU,EAAE,MAAM,CAAC;IAAC,cAAc,EAAE,MAAM,CAAA;CAAE,CAgDhE"}
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
// Model identifier passed as the `model` field of the judge's messages.create request.
const JUDGE_MODEL = "claude-sonnet-4-20250514";
|
|
2
|
+
/**
 * Turn a task's evaluation criteria and forbidden patterns into the ordered
 * list of yes/no questions put to the judge.
 *
 * A criterion with its own `prompt` is used verbatim; otherwise a question is
 * generated from `criterion.criterion`, worded differently depending on
 * whether the criterion carries a `pattern`. Forbidden-pattern questions come
 * after all criterion questions.
 *
 * @param {object} task - Task with `evaluationCriteria` and `forbiddenPatterns`
 * @param {string} contextDocs - Accepted for signature compatibility; not read here
 * @returns {string[]} Questions in the order they will be numbered
 */
function buildQuestions(task, contextDocs) {
  const criterionQuestions = Array.from(task.evaluationCriteria, (criterion) => {
    if (criterion.prompt) {
      return criterion.prompt;
    }
    return criterion.pattern
      ? `Does the code contain or implement: ${criterion.criterion}?`
      : `Does the code satisfy: ${criterion.criterion}?`;
  });

  // Forbidden patterns are phrased so that YES is always the passing answer.
  const forbiddenQuestions = Array.from(
    task.forbiddenPatterns,
    (pattern) => `Does the code AVOID the forbidden pattern: ${pattern}? (YES means it correctly avoids it)`,
  );

  return [...criterionQuestions, ...forbiddenQuestions];
}
|
|
24
|
+
/**
 * Parse the judge's raw reply into an array of answer objects.
 *
 * Tries the whole reply first, then the outermost `[...]` span embedded in it
 * (covers replies wrapped in prose or a code fence). Anything that does not
 * parse to a JSON array yields an empty array, so callers can index into the
 * result without further checks.
 */
function parseJudgeAnswers(text) {
  const candidates = [text];
  const embedded = text.match(/\[[\s\S]*\]/);
  if (embedded) {
    candidates.push(embedded[0]);
  }

  for (const candidate of candidates) {
    try {
      const parsed = JSON.parse(candidate);
      if (Array.isArray(parsed)) {
        return parsed;
      }
    } catch {
      // Not valid JSON — fall through to the next candidate.
    }
  }
  return [];
}

/**
 * Evaluate an AI-generated output against structured criteria.
 *
 * Builds one yes/no question per criterion and forbidden pattern, asks the
 * judge model to answer them all in a single request, and scores the result
 * as (YES + 0.5 * PARTIAL) / total * 10, rounded to one decimal place.
 * A missing, malformed or unrecognised answer counts as "NO".
 *
 * @param {object} client - Anthropic SDK client; `client.messages.create` is called once
 * @param {object} task - Task providing `id`, `description` and the question sources
 * @param {string} output - The agent's output; only the first 12000 characters are sent
 * @param {string} contextDocs - Product context; only the first 8000 characters are sent
 * @returns {Promise<{taskId: string, totalQuestions: number, yesCount: number,
 *   score: number, details: Array<{criterion: string, answer: string, reasoning: string}>}>}
 *   `yesCount` may be fractional because PARTIAL answers count as 0.5.
 * @throws Whatever `client.messages.create` rejects with; request errors are not caught here.
 */
export async function structuredJudge(client, task, output, contextDocs) {
  const questions = buildQuestions(task, contextDocs);
  const questionsBlock = questions
    .map((q, i) => `Q${i + 1}: ${q}`)
    .join("\n");
  const prompt = `You are evaluating an AI coding agent's output against specific product requirements.

TASK DESCRIPTION:
${task.description}

RELEVANT PRODUCT CONTEXT:
${contextDocs.slice(0, 8000)}

AI AGENT'S CODE OUTPUT:
${output.slice(0, 12000)}

EVALUATION QUESTIONS:
${questionsBlock}

For each question, answer with:
- "YES" if the code clearly satisfies the criterion
- "NO" if the code clearly fails the criterion
- "PARTIAL" if partially satisfied

Return a JSON array with one object per question:
[{"answer": "YES"|"NO"|"PARTIAL", "reasoning": "brief explanation"}]

Return ONLY the JSON array, no other text.`;
  const response = await client.messages.create({
    model: JUDGE_MODEL,
    max_tokens: 2048,
    temperature: 0,
    system: "You are a precise code evaluation judge. Return only valid JSON arrays.",
    messages: [{ role: "user", content: prompt }],
  });

  // Use the first text block wherever it sits in the content list; with no
  // text block at all, fall back to an empty answer list.
  const textBlock = response.content.find((block) => block?.type === "text");
  const answers = parseJudgeAnswers(
    typeof textBlock?.text === "string" ? textBlock.text : "[]",
  );

  const details = questions.map((q, i) => {
    const a = answers[i];
    // Guard against non-string answers (e.g. `true`) instead of throwing.
    const normalized = typeof a?.answer === "string"
      ? a.answer.trim().toUpperCase()
      : "NO";
    return {
      criterion: q,
      answer: normalized === "YES" || normalized === "NO" || normalized === "PARTIAL"
        ? normalized
        : "NO",
      reasoning: typeof a?.reasoning === "string"
        ? a.reasoning
        : "No response from judge",
    };
  });

  const yesCount = details.reduce((sum, d) => {
    if (d.answer === "YES") {
      return sum + 1;
    }
    if (d.answer === "PARTIAL") {
      return sum + 0.5;
    }
    return sum;
  }, 0);
  const totalQuestions = details.length;
  const score = totalQuestions > 0 ? (yesCount / totalQuestions) * 10 : 0;

  return {
    taskId: task.id,
    totalQuestions,
    yesCount,
    score: Math.round(score * 10) / 10,
    details,
  };
}
|
|
110
|
+
/**
 * Check whether the fenced code blocks in a generated answer parse as
 * TypeScript/JSX.
 *
 * Every fenced block is located first and then filtered by its language tag
 * (none, `typescript`, `ts`, `tsx`, `javascript`, `js`, `jsx`; case-insensitive).
 * Matching all fences keeps opening and closing fences paired, so a block in
 * another language (e.g. `python`, `json`) is skipped instead of causing the
 * prose after it to be captured as code.
 *
 * Blocks are parsed with the TypeScript compiler API as TSX and only
 * parser-level diagnostics are counted (no type checking).
 *
 * @param {string} output - Text that may contain Markdown code fences
 * @returns {{valid: boolean, errorCount: number, codeBlockCount: number}}
 *   `valid` is true when no syntax diagnostics were found — including when
 *   there are no eligible blocks or the `typescript` package cannot be loaded.
 */
export function checkTypeScriptSyntax(output) {
  const checkedTags = new Set(["", "typescript", "ts", "tsx", "javascript", "js", "jsx"]);
  // Group 1: info string after the opening fence; group 2: block content.
  const fenceRegex = /```([^\n`]*)\n([\s\S]*?)```/g;

  const blocks = [];
  for (const [, info, code] of output.matchAll(fenceRegex)) {
    // Only the first word of the info string is the language tag.
    const tag = info.trim().split(/\s+/)[0].toLowerCase();
    if (checkedTags.has(tag) && code.trim()) {
      blocks.push(code);
    }
  }
  if (blocks.length === 0) {
    return { valid: true, errorCount: 0, codeBlockCount: 0 };
  }

  // Load typescript lazily — it may not be available in all environments.
  // NOTE(review): `require` is not defined in native ES module scope, so under
  // plain Node ESM this always takes the permissive fallback; it only runs the
  // real check where the host injects `require`. Confirm that is intended.
  let ts;
  try {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    ts = require("typescript");
  } catch {
    // TypeScript not available — fall back to permissive (assume valid)
    return { valid: true, errorCount: 0, codeBlockCount: blocks.length };
  }

  let totalErrors = 0;
  for (const block of blocks) {
    // Parse as TSX to handle both TypeScript and JSX syntax
    const sourceFile = ts.createSourceFile(
      "check.tsx",
      block,
      ts.ScriptTarget.Latest,
      false,
      ts.ScriptKind.TSX,
    );
    // parseDiagnostics is an internal property of the source file; treat a
    // missing property as "no syntax errors".
    totalErrors += sourceFile.parseDiagnostics?.length ?? 0;
  }

  return {
    valid: totalErrors === 0,
    errorCount: totalErrors,
    codeBlockCount: blocks.length,
  };
}
|
|
153
|
+
//# sourceMappingURL=structured-judge.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"structured-judge.js","sourceRoot":"","sources":["../../../benchmarks/evaluators/structured-judge.ts"],"names":[],"mappings":"AASA,MAAM,WAAW,GAAG,0BAA0B,CAAC;AAc/C;;GAEG;AACH,SAAS,cAAc,CACrB,IAAgB,EAChB,WAAmB;IAEnB,MAAM,SAAS,GAAa,EAAE,CAAC;IAE/B,KAAK,MAAM,SAAS,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;QAChD,IAAI,SAAS,CAAC,MAAM,EAAE,CAAC;YACrB,SAAS,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACnC,CAAC;aAAM,IAAI,SAAS,CAAC,OAAO,EAAE,CAAC;YAC7B,SAAS,CAAC,IAAI,CACZ,uCAAuC,SAAS,CAAC,SAAS,GAAG,CAC9D,CAAC;QACJ,CAAC;aAAM,CAAC;YACN,SAAS,CAAC,IAAI,CAAC,0BAA0B,SAAS,CAAC,SAAS,GAAG,CAAC,CAAC;QACnE,CAAC;IACH,CAAC;IAED,+BAA+B;IAC/B,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,iBAAiB,EAAE,CAAC;QAC7C,SAAS,CAAC,IAAI,CACZ,8CAA8C,OAAO,sCAAsC,CAC5F,CAAC;IACJ,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,MAAiB,EACjB,IAAgB,EAChB,MAAc,EACd,WAAmB;IAEnB,MAAM,SAAS,GAAG,cAAc,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;IAEpD,MAAM,cAAc,GAAG,SAAS;SAC7B,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;SAChC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,MAAM,MAAM,GAAG;;;EAGf,IAAI,CAAC,WAAW;;;EAGhB,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC;;;EAG1B,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC;;;EAGtB,cAAc;;;;;;;;;;2CAU2B,CAAC;IAE1C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;QAC5C,KAAK,EAAE,WAAW;QAClB,UAAU,EAAE,IAAI;QAChB,WAAW,EAAE,CAAC;QACd,MAAM,EACJ,yEAAyE;QAC3E,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;KAC9C,CAAC,CAAC;IAEH,MAAM,IAAI,GACR,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,IAAI,KAAK,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC;IAEzE,IAAI,OAGF,CAAC;IAEH,IAAI,CAAC;QACH,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAGvB,CAAC;IACL,CAAC;IAAC,MAAM,CAAC;QACP,0CAA0C;QAC1C,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QACxC,IAAI,KAAK,EAAE,CAAC;YACV,IAAI,CAAC;gBACH,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAG3B,CAAC;YACL,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,GAAG,EAAE,CAAC;YACf,CAAC;QACH,CAAC;aAAM,CAAC;YACN,OAAO,GAAG,
EAAE,CAAC;QACf,CAAC;IACH,CAAC;IAED,MAAM,OAAO,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACrC,MAAM,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QACrB,MAAM,MAAM,GAAG,CAAC,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,IAAI,IAAI,CAGnC,CAAC;QACd,OAAO;YACL,SAAS,EAAE,CAAC;YACZ,MAAM,EAAE,MAAM,KAAK,KAAK,IAAI,MAAM,KAAK,IAAI,IAAI,MAAM,KAAK,SAAS;gBACjE,CAAC,CAAC,MAAM;gBACR,CAAC,CAAE,IAAc;YACnB,SAAS,EAAE,CAAC,EAAE,SAAS,IAAI,wBAAwB;SACpD,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE;QACzC,IAAI,CAAC,CAAC,MAAM,KAAK,KAAK;YAAE,OAAO,GAAG,GAAG,CAAC,CAAC;QACvC,IAAI,CAAC,CAAC,MAAM,KAAK,SAAS;YAAE,OAAO,GAAG,GAAG,GAAG,CAAC;QAC7C,OAAO,GAAG,CAAC;IACb,CAAC,EAAE,CAAC,CAAC,CAAC;IAEN,MAAM,cAAc,GAAG,OAAO,CAAC,MAAM,CAAC;IACtC,MAAM,KAAK,GACT,cAAc,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,GAAG,cAAc,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAE5D,OAAO;QACL,MAAM,EAAE,IAAI,CAAC,EAAE;QACf,cAAc;QACd,QAAQ;QACR,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,EAAE,CAAC,GAAG,EAAE;QAClC,OAAO;KACR,CAAC;AACJ,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,qBAAqB,CACnC,MAAc;IAEd,6BAA6B;IAC7B,MAAM,cAAc,GAAG,oDAAoD,CAAC;IAC5E,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,KAAK,CAAC;IACV,OAAO,CAAC,KAAK,GAAG,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACtD,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE;YAAE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAC9C,CAAC;IAED,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,EAAE,CAAC;IAC3D,CAAC;IAED,IAAI,WAAW,GAAG,CAAC,CAAC;IAEpB,0EAA0E;IAC1E,IAAI,EAA2C,CAAC;IAChD,IAAI,CAAC;QACH,iEAAiE;QACjE,EAAE,GAAG,OAAO,CAAC,YAAY,CAAgC,CAAC;IAC5D,CAAC;IAAC,MAAM,CAAC;QACP,oEAAoE;QACpE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,EAAE,cAAc,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC;IACvE,CAAC;IAED,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,wDAAwD;QACxD,MAAM,UAAU,GAAG,EAAE,CAAC,gBAAgB,CACpC,WAAW,EACX,KAAK,EACL,EAAE,CAAC,YAAY,CAAC,MAAM,EACtB,KAAK,EACL,EAAE,CAAC,UAAU,CAAC,GAAG,CAClB,CAAC;QAEF,iEAAiE;QACjE,0DAA0D;QAC1D,MAAM,WAAW,GACf,CAAE,UAA0D;a
ACzD,gBAAgB,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC;QACpC,WAAW,IAAI,WAAW,CAAC;IAC7B,CAAC;IAED,OAAO;QACL,KAAK,EAAE,WAAW,KAAK,CAAC;QACxB,UAAU,EAAE,WAAW;QACvB,cAAc,EAAE,MAAM,CAAC,MAAM;KAC9B,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Count tokens in a string using cl100k_base (the OpenAI GPT-4 tokenizer; only an approximation for Claude models).
|
|
3
|
+
*/
|
|
4
|
+
export declare function countTokens(text: string): number;
|
|
5
|
+
/**
|
|
6
|
+
* Dispose the encoder to free memory.
|
|
7
|
+
*/
|
|
8
|
+
export declare function disposeEncoder(): void;
|
|
9
|
+
//# sourceMappingURL=token-counter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"token-counter.d.ts","sourceRoot":"","sources":["../../../benchmarks/evaluators/token-counter.ts"],"names":[],"mappings":"AAWA;;GAEG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEhD;AAED;;GAEG;AACH,wBAAgB,cAAc,IAAI,IAAI,CAKrC"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { get_encoding } from "tiktoken";
|
|
2
|
+
let _encoder = null;
|
|
3
|
+
// Returns the module-wide cl100k_base encoder, building it on first use and
// reusing the cached instance on every later call.
function getEncoder() {
    if (_encoder) {
        return _encoder;
    }
    _encoder = get_encoding("cl100k_base");
    return _encoder;
}
|
|
9
|
+
/**
 * Count tokens in a string using the cl100k_base encoding (the OpenAI
 * GPT-4 tokenizer). For Claude models the result is only an approximation.
 *
 * Special-token markers that happen to appear in the text (for example
 * "<|endoftext|>") are counted as ordinary text instead of being rejected,
 * so arbitrary corpus content never makes the counter throw.
 *
 * @param {string} text - The text to tokenize.
 * @returns {number} Number of cl100k_base tokens in `text`.
 */
export function countTokens(text) {
    // encode() disallows special tokens by default and throws when the input
    // contains one; encode_ordinary() treats them as plain text.
    return getEncoder().encode_ordinary(text).length;
}
|
|
15
|
+
/**
 * Release the cached encoder, if one was created, and clear the cache so
 * the next token count builds a fresh encoder.
 */
export function disposeEncoder() {
    if (!_encoder) {
        return;
    }
    _encoder.free();
    _encoder = null;
}
|
|
24
|
+
//# sourceMappingURL=token-counter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"token-counter.js","sourceRoot":"","sources":["../../../benchmarks/evaluators/token-counter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAiB,MAAM,UAAU,CAAC;AAEvD,IAAI,QAAQ,GAAoB,IAAI,CAAC;AAErC,SAAS,UAAU;IACjB,IAAI,CAAC,QAAQ,EAAE,CAAC;QACd,QAAQ,GAAG,YAAY,CAAC,aAAa,CAAC,CAAC;IACzC,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,IAAY;IACtC,OAAO,UAAU,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc;IAC5B,IAAI,QAAQ,EAAE,CAAC;QACb,QAAQ,CAAC,IAAI,EAAE,CAAC;QAChB,QAAQ,GAAG,IAAI,CAAC;IAClB,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"generate-corpus.d.ts","sourceRoot":"","sources":["../../../benchmarks/generators/generate-corpus.ts"],"names":[],"mappings":""}
|