pcl-mcp 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. package/dist/benchmarks/evaluators/context-retrieval-quality.d.ts +30 -0
  2. package/dist/benchmarks/evaluators/context-retrieval-quality.d.ts.map +1 -0
  3. package/dist/benchmarks/evaluators/context-retrieval-quality.js +50 -0
  4. package/dist/benchmarks/evaluators/context-retrieval-quality.js.map +1 -0
  5. package/dist/benchmarks/evaluators/ir-metrics.d.ts +32 -0
  6. package/dist/benchmarks/evaluators/ir-metrics.d.ts.map +1 -0
  7. package/dist/benchmarks/evaluators/ir-metrics.js +98 -0
  8. package/dist/benchmarks/evaluators/ir-metrics.js.map +1 -0
  9. package/dist/benchmarks/evaluators/structured-judge.d.ts +34 -0
  10. package/dist/benchmarks/evaluators/structured-judge.d.ts.map +1 -0
  11. package/dist/benchmarks/evaluators/structured-judge.js +153 -0
  12. package/dist/benchmarks/evaluators/structured-judge.js.map +1 -0
  13. package/dist/benchmarks/evaluators/token-counter.d.ts +9 -0
  14. package/dist/benchmarks/evaluators/token-counter.d.ts.map +1 -0
  15. package/dist/benchmarks/evaluators/token-counter.js +24 -0
  16. package/dist/benchmarks/evaluators/token-counter.js.map +1 -0
  17. package/dist/benchmarks/generators/generate-corpus.d.ts +2 -0
  18. package/dist/benchmarks/generators/generate-corpus.d.ts.map +1 -0
  19. package/dist/benchmarks/generators/generate-corpus.js +243 -0
  20. package/dist/benchmarks/generators/generate-corpus.js.map +1 -0
  21. package/dist/benchmarks/lib/harness.d.ts +23 -0
  22. package/dist/benchmarks/lib/harness.d.ts.map +1 -0
  23. package/dist/benchmarks/lib/harness.js +44 -0
  24. package/dist/benchmarks/lib/harness.js.map +1 -0
  25. package/dist/benchmarks/lib/types.d.ts +79 -0
  26. package/dist/benchmarks/lib/types.d.ts.map +1 -0
  27. package/dist/benchmarks/lib/types.js +2 -0
  28. package/dist/benchmarks/lib/types.js.map +1 -0
  29. package/dist/benchmarks/reporters/markdown-reporter.d.ts +2 -0
  30. package/dist/benchmarks/reporters/markdown-reporter.d.ts.map +1 -0
  31. package/dist/benchmarks/reporters/markdown-reporter.js +80 -0
  32. package/dist/benchmarks/reporters/markdown-reporter.js.map +1 -0
  33. package/dist/benchmarks/runners/bench-ablation.d.ts +2 -0
  34. package/dist/benchmarks/runners/bench-ablation.d.ts.map +1 -0
  35. package/dist/benchmarks/runners/bench-ablation.js +49 -0
  36. package/dist/benchmarks/runners/bench-ablation.js.map +1 -0
  37. package/dist/benchmarks/runners/bench-ai-quality.d.ts +2 -0
  38. package/dist/benchmarks/runners/bench-ai-quality.d.ts.map +1 -0
  39. package/dist/benchmarks/runners/bench-ai-quality.js +297 -0
  40. package/dist/benchmarks/runners/bench-ai-quality.js.map +1 -0
  41. package/dist/benchmarks/runners/bench-interactive-eval.d.ts +2 -0
  42. package/dist/benchmarks/runners/bench-interactive-eval.d.ts.map +1 -0
  43. package/dist/benchmarks/runners/bench-interactive-eval.js +119 -0
  44. package/dist/benchmarks/runners/bench-interactive-eval.js.map +1 -0
  45. package/dist/benchmarks/runners/bench-performance.bench.d.ts +2 -0
  46. package/dist/benchmarks/runners/bench-performance.bench.d.ts.map +1 -0
  47. package/dist/benchmarks/runners/bench-performance.bench.js +50 -0
  48. package/dist/benchmarks/runners/bench-performance.bench.js.map +1 -0
  49. package/dist/benchmarks/runners/bench-search-quality.d.ts +2 -0
  50. package/dist/benchmarks/runners/bench-search-quality.d.ts.map +1 -0
  51. package/dist/benchmarks/runners/bench-search-quality.js +70 -0
  52. package/dist/benchmarks/runners/bench-search-quality.js.map +1 -0
  53. package/dist/benchmarks/runners/bench-token-efficiency.d.ts +2 -0
  54. package/dist/benchmarks/runners/bench-token-efficiency.d.ts.map +1 -0
  55. package/dist/benchmarks/runners/bench-token-efficiency.js +89 -0
  56. package/dist/benchmarks/runners/bench-token-efficiency.js.map +1 -0
  57. package/dist/benchmarks/runners/diag.d.ts +2 -0
  58. package/dist/benchmarks/runners/diag.d.ts.map +1 -0
  59. package/dist/benchmarks/runners/diag.js +30 -0
  60. package/dist/benchmarks/runners/diag.js.map +1 -0
  61. package/dist/benchmarks/vitest.config.bench.d.ts +3 -0
  62. package/dist/benchmarks/vitest.config.bench.d.ts.map +1 -0
  63. package/dist/benchmarks/vitest.config.bench.js +14 -0
  64. package/dist/benchmarks/vitest.config.bench.js.map +1 -0
  65. package/dist/src/db.d.ts +2 -1
  66. package/dist/src/db.d.ts.map +1 -1
  67. package/dist/src/db.js +25 -21
  68. package/dist/src/db.js.map +1 -1
  69. package/dist/src/embeddings.d.ts +1 -1
  70. package/dist/src/embeddings.js +2 -2
  71. package/dist/src/embeddings.js.map +1 -1
  72. package/dist/src/indexer.d.ts +1 -1
  73. package/dist/src/indexer.d.ts.map +1 -1
  74. package/dist/src/indexer.js +6 -2
  75. package/dist/src/indexer.js.map +1 -1
  76. package/dist/src/search.d.ts.map +1 -1
  77. package/dist/src/search.js +138 -26
  78. package/dist/src/search.js.map +1 -1
  79. package/dist/src/server.js +6 -0
  80. package/dist/src/server.js.map +1 -1
  81. package/dist/src/types.d.ts +1 -0
  82. package/dist/src/types.d.ts.map +1 -1
  83. package/dist/tests/db.test.d.ts +2 -0
  84. package/dist/tests/db.test.d.ts.map +1 -0
  85. package/dist/tests/db.test.js +459 -0
  86. package/dist/tests/db.test.js.map +1 -0
  87. package/dist/tests/embeddings.test.d.ts +2 -0
  88. package/dist/tests/embeddings.test.d.ts.map +1 -0
  89. package/dist/tests/embeddings.test.js +165 -0
  90. package/dist/tests/embeddings.test.js.map +1 -0
  91. package/dist/tests/helpers/test-harness.d.ts +26 -0
  92. package/dist/tests/helpers/test-harness.d.ts.map +1 -0
  93. package/dist/tests/helpers/test-harness.js +80 -0
  94. package/dist/tests/helpers/test-harness.js.map +1 -0
  95. package/dist/tests/indexer.test.d.ts +2 -0
  96. package/dist/tests/indexer.test.d.ts.map +1 -0
  97. package/dist/tests/indexer.test.js +299 -0
  98. package/dist/tests/indexer.test.js.map +1 -0
  99. package/dist/tests/schemas.test.d.ts +2 -0
  100. package/dist/tests/schemas.test.d.ts.map +1 -0
  101. package/dist/tests/schemas.test.js +378 -0
  102. package/dist/tests/schemas.test.js.map +1 -0
  103. package/dist/tests/search.test.d.ts +2 -0
  104. package/dist/tests/search.test.d.ts.map +1 -0
  105. package/dist/tests/search.test.js +129 -0
  106. package/dist/tests/search.test.js.map +1 -0
  107. package/dist/tests/tools.test.d.ts +2 -0
  108. package/dist/tests/tools.test.d.ts.map +1 -0
  109. package/dist/tests/tools.test.js +232 -0
  110. package/dist/tests/tools.test.js.map +1 -0
  111. package/package.json +14 -2
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Context Retrieval Quality — measures if PCL retrieves the RIGHT documents.
3
+ * Disentangles "did PCL find the right docs" from "did the LLM use them well."
4
+ */
5
+ export interface ContextMetrics {
6
+ recall: number;
7
+ precision: number;
8
+ f1: number;
9
+ retrieved: string[];
10
+ required: string[];
11
+ hits: string[];
12
+ misses: string[];
13
+ noise: string[];
14
+ }
15
+ /**
16
+ * Compute context retrieval quality metrics.
17
+ *
18
+ * @param retrievedIds - Document IDs returned by PCL search
19
+ * @param requiredIds - Document IDs that the task actually needs
20
+ */
21
+ export declare function measureContextRetrieval(retrievedIds: string[], requiredIds: string[]): ContextMetrics;
22
+ /**
23
+ * Aggregate context metrics across multiple tasks.
24
+ */
25
+ export declare function averageContextMetrics(metrics: ContextMetrics[]): {
26
+ recall: number;
27
+ precision: number;
28
+ f1: number;
29
+ };
30
+ //# sourceMappingURL=context-retrieval-quality.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"context-retrieval-quality.d.ts","sourceRoot":"","sources":["../../../benchmarks/evaluators/context-retrieval-quality.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,EAAE,EAAE,MAAM,CAAC;IACX,SAAS,EAAE,MAAM,EAAE,CAAC;IACpB,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,IAAI,EAAE,MAAM,EAAE,CAAC;IACf,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,KAAK,EAAE,MAAM,EAAE,CAAC;CACjB;AAED;;;;;GAKG;AACH,wBAAgB,uBAAuB,CACrC,YAAY,EAAE,MAAM,EAAE,EACtB,WAAW,EAAE,MAAM,EAAE,GACpB,cAAc,CA0BhB;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACnC,OAAO,EAAE,cAAc,EAAE,GACxB;IAAE,MAAM,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,EAAE,EAAE,MAAM,CAAA;CAAE,CAenD"}
@@ -0,0 +1,50 @@
1
+ /**
2
+ * Context Retrieval Quality — measures if PCL retrieves the RIGHT documents.
3
+ * Disentangles "did PCL find the right docs" from "did the LLM use them well."
4
+ */
5
+ /**
6
+ * Compute context retrieval quality metrics.
7
+ *
8
+ * @param retrievedIds - Document IDs returned by PCL search
9
+ * @param requiredIds - Document IDs that the task actually needs
10
+ */
11
+ export function measureContextRetrieval(retrievedIds, requiredIds) {
12
+ const retrievedSet = new Set(retrievedIds);
13
+ const requiredSet = new Set(requiredIds);
14
+ const hits = requiredIds.filter((id) => retrievedSet.has(id));
15
+ const misses = requiredIds.filter((id) => !retrievedSet.has(id));
16
+ const noise = retrievedIds.filter((id) => !requiredSet.has(id));
17
+ const recall = requiredIds.length > 0 ? hits.length / requiredIds.length : 1;
18
+ const precision = retrievedIds.length > 0 ? hits.length / retrievedIds.length : 1;
19
+ const f1 = recall + precision > 0
20
+ ? (2 * recall * precision) / (recall + precision)
21
+ : 0;
22
+ return {
23
+ recall,
24
+ precision,
25
+ f1,
26
+ retrieved: retrievedIds,
27
+ required: requiredIds,
28
+ hits,
29
+ misses,
30
+ noise,
31
+ };
32
+ }
33
+ /**
34
+ * Aggregate context metrics across multiple tasks.
35
+ */
36
+ export function averageContextMetrics(metrics) {
37
+ if (metrics.length === 0)
38
+ return { recall: 0, precision: 0, f1: 0 };
39
+ const sum = metrics.reduce((acc, m) => ({
40
+ recall: acc.recall + m.recall,
41
+ precision: acc.precision + m.precision,
42
+ f1: acc.f1 + m.f1,
43
+ }), { recall: 0, precision: 0, f1: 0 });
44
+ return {
45
+ recall: sum.recall / metrics.length,
46
+ precision: sum.precision / metrics.length,
47
+ f1: sum.f1 / metrics.length,
48
+ };
49
+ }
50
+ //# sourceMappingURL=context-retrieval-quality.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"context-retrieval-quality.js","sourceRoot":"","sources":["../../../benchmarks/evaluators/context-retrieval-quality.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAaH;;;;;GAKG;AACH,MAAM,UAAU,uBAAuB,CACrC,YAAsB,EACtB,WAAqB;IAErB,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,YAAY,CAAC,CAAC;IAC3C,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,WAAW,CAAC,CAAC;IAEzC,MAAM,IAAI,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IAC9D,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IACjE,MAAM,KAAK,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,WAAW,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IAEhE,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7E,MAAM,SAAS,GACb,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAClE,MAAM,EAAE,GACN,MAAM,GAAG,SAAS,GAAG,CAAC;QACpB,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,MAAM,GAAG,SAAS,CAAC;QACjD,CAAC,CAAC,CAAC,CAAC;IAER,OAAO;QACL,MAAM;QACN,SAAS;QACT,EAAE;QACF,SAAS,EAAE,YAAY;QACvB,QAAQ,EAAE,WAAW;QACrB,IAAI;QACJ,MAAM;QACN,KAAK;KACN,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,qBAAqB,CACnC,OAAyB;IAEzB,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,MAAM,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC;IACpE,MAAM,GAAG,GAAG,OAAO,CAAC,MAAM,CACxB,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QACX,MAAM,EAAE,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM;QAC7B,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS;QACtC,EAAE,EAAE,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC,EAAE;KAClB,CAAC,EACF,EAAE,MAAM,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CACnC,CAAC;IACF,OAAO;QACL,MAAM,EAAE,GAAG,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM;QACnC,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,OAAO,CAAC,MAAM;QACzC,EAAE,EAAE,GAAG,CAAC,EAAE,GAAG,OAAO,CAAC,MAAM;KAC5B,CAAC;AACJ,CAAC"}
@@ -0,0 +1,32 @@
1
+ /**
2
+ * Precision@K: fraction of top-K results that are relevant.
3
+ */
4
+ export declare function precisionAtK(retrieved: string[], relevant: Set<string>, k: number): number;
5
+ /**
6
+ * Recall@K: fraction of relevant docs found in top-K.
7
+ */
8
+ export declare function recallAtK(retrieved: string[], relevant: Set<string>, k: number): number;
9
+ /**
10
+ * Reciprocal Rank: 1 / rank of the first relevant result (0 if none is retrieved). Averaging this across queries yields MRR (Mean Reciprocal Rank).
11
+ */
12
+ export declare function reciprocalRank(retrieved: string[], relevant: Set<string>): number;
13
+ /**
14
+ * NDCG@K: normalized DCG using ideal ranking.
15
+ */
16
+ export declare function ndcgAtK(retrieved: string[], relevanceScores: Map<string, number>, k: number): number;
17
+ /**
18
+ * Compute all metrics for a single query.
19
+ */
20
+ export declare function computeMetrics(retrieved: string[], relevanceScores: Map<string, number>, relevantThreshold?: number): {
21
+ precisionAt1: number;
22
+ precisionAt3: number;
23
+ precisionAt5: number;
24
+ recallAt5: number;
25
+ mrr: number;
26
+ ndcgAt5: number;
27
+ };
28
+ /**
29
+ * Average metrics across multiple queries.
30
+ */
31
+ export declare function averageMetrics(results: Array<ReturnType<typeof computeMetrics>>): ReturnType<typeof computeMetrics>;
32
+ //# sourceMappingURL=ir-metrics.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ir-metrics.d.ts","sourceRoot":"","sources":["../../../benchmarks/evaluators/ir-metrics.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,wBAAgB,YAAY,CAC1B,SAAS,EAAE,MAAM,EAAE,EACnB,QAAQ,EAAE,GAAG,CAAC,MAAM,CAAC,EACrB,CAAC,EAAE,MAAM,GACR,MAAM,CAKR;AAED;;GAEG;AACH,wBAAgB,SAAS,CACvB,SAAS,EAAE,MAAM,EAAE,EACnB,QAAQ,EAAE,GAAG,CAAC,MAAM,CAAC,EACrB,CAAC,EAAE,MAAM,GACR,MAAM,CAKR;AAED;;GAEG;AACH,wBAAgB,cAAc,CAC5B,SAAS,EAAE,MAAM,EAAE,EACnB,QAAQ,EAAE,GAAG,CAAC,MAAM,CAAC,GACpB,MAAM,CAKR;AAmBD;;GAEG;AACH,wBAAgB,OAAO,CACrB,SAAS,EAAE,MAAM,EAAE,EACnB,eAAe,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EACpC,CAAC,EAAE,MAAM,GACR,MAAM,CASR;AAED;;GAEG;AACH,wBAAgB,cAAc,CAC5B,SAAS,EAAE,MAAM,EAAE,EACnB,eAAe,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EACpC,iBAAiB,SAAI,GACpB;IACD,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,EAAE,MAAM,CAAC;CACjB,CAeA;AAED;;GAEG;AACH,wBAAgB,cAAc,CAC5B,OAAO,EAAE,KAAK,CAAC,UAAU,CAAC,OAAO,cAAc,CAAC,CAAC,GAChD,UAAU,CAAC,OAAO,cAAc,CAAC,CA0BnC"}
@@ -0,0 +1,98 @@
1
+ /**
2
+ * Precision@K: fraction of top-K results that are relevant.
3
+ */
4
+ export function precisionAtK(retrieved, relevant, k) {
5
+ const topK = retrieved.slice(0, k);
6
+ if (topK.length === 0)
7
+ return 0;
8
+ const hits = topK.filter(id => relevant.has(id)).length;
9
+ return hits / topK.length;
10
+ }
11
+ /**
12
+ * Recall@K: fraction of relevant docs found in top-K.
13
+ */
14
+ export function recallAtK(retrieved, relevant, k) {
15
+ if (relevant.size === 0)
16
+ return 1; // no relevant docs = perfect recall vacuously
17
+ const topK = retrieved.slice(0, k);
18
+ const hits = topK.filter(id => relevant.has(id)).length;
19
+ return hits / relevant.size;
20
+ }
21
+ /**
22
+ * Reciprocal Rank: 1 / rank of the first relevant result (0 if none is retrieved). Averaging this across queries yields MRR (Mean Reciprocal Rank).
23
+ */
24
+ export function reciprocalRank(retrieved, relevant) {
25
+ for (let i = 0; i < retrieved.length; i++) {
26
+ if (relevant.has(retrieved[i]))
27
+ return 1 / (i + 1);
28
+ }
29
+ return 0;
30
+ }
31
+ /**
32
+ * DCG@K with graded relevance (relevance scores 0-3).
33
+ */
34
+ function dcgAtK(retrieved, relevanceScores, k) {
35
+ let dcg = 0;
36
+ const topK = retrieved.slice(0, k);
37
+ for (let i = 0; i < topK.length; i++) {
38
+ const rel = relevanceScores.get(topK[i]) ?? 0;
39
+ dcg += (Math.pow(2, rel) - 1) / Math.log2(i + 2); // i+2 because log2(1)=0
40
+ }
41
+ return dcg;
42
+ }
43
+ /**
44
+ * NDCG@K: normalized DCG using ideal ranking.
45
+ */
46
+ export function ndcgAtK(retrieved, relevanceScores, k) {
47
+ const dcg = dcgAtK(retrieved, relevanceScores, k);
48
+ // Ideal ranking: sort all docs by relevance descending
49
+ const idealOrder = [...relevanceScores.entries()]
50
+ .sort(([, a], [, b]) => b - a)
51
+ .map(([id]) => id);
52
+ const idcg = dcgAtK(idealOrder, relevanceScores, k);
53
+ if (idcg === 0)
54
+ return 0;
55
+ return dcg / idcg;
56
+ }
57
+ /**
58
+ * Compute all metrics for a single query.
59
+ */
60
+ export function computeMetrics(retrieved, relevanceScores, relevantThreshold = 1) {
61
+ const relevant = new Set([...relevanceScores.entries()]
62
+ .filter(([, score]) => score >= relevantThreshold)
63
+ .map(([id]) => id));
64
+ return {
65
+ precisionAt1: precisionAtK(retrieved, relevant, 1),
66
+ precisionAt3: precisionAtK(retrieved, relevant, 3),
67
+ precisionAt5: precisionAtK(retrieved, relevant, 5),
68
+ recallAt5: recallAtK(retrieved, relevant, 5),
69
+ mrr: reciprocalRank(retrieved, relevant),
70
+ ndcgAt5: ndcgAtK(retrieved, relevanceScores, 5),
71
+ };
72
+ }
73
+ /**
74
+ * Average metrics across multiple queries.
75
+ */
76
+ export function averageMetrics(results) {
77
+ const n = results.length;
78
+ if (n === 0) {
79
+ return { precisionAt1: 0, precisionAt3: 0, precisionAt5: 0, recallAt5: 0, mrr: 0, ndcgAt5: 0 };
80
+ }
81
+ const sum = results.reduce((acc, r) => ({
82
+ precisionAt1: acc.precisionAt1 + r.precisionAt1,
83
+ precisionAt3: acc.precisionAt3 + r.precisionAt3,
84
+ precisionAt5: acc.precisionAt5 + r.precisionAt5,
85
+ recallAt5: acc.recallAt5 + r.recallAt5,
86
+ mrr: acc.mrr + r.mrr,
87
+ ndcgAt5: acc.ndcgAt5 + r.ndcgAt5,
88
+ }), { precisionAt1: 0, precisionAt3: 0, precisionAt5: 0, recallAt5: 0, mrr: 0, ndcgAt5: 0 });
89
+ return {
90
+ precisionAt1: sum.precisionAt1 / n,
91
+ precisionAt3: sum.precisionAt3 / n,
92
+ precisionAt5: sum.precisionAt5 / n,
93
+ recallAt5: sum.recallAt5 / n,
94
+ mrr: sum.mrr / n,
95
+ ndcgAt5: sum.ndcgAt5 / n,
96
+ };
97
+ }
98
+ //# sourceMappingURL=ir-metrics.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ir-metrics.js","sourceRoot":"","sources":["../../../benchmarks/evaluators/ir-metrics.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,MAAM,UAAU,YAAY,CAC1B,SAAmB,EACnB,QAAqB,EACrB,CAAS;IAET,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACnC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAChC,MAAM,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;IACxD,OAAO,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC;AAC5B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,SAAS,CACvB,SAAmB,EACnB,QAAqB,EACrB,CAAS;IAET,IAAI,QAAQ,CAAC,IAAI,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC,CAAC,8CAA8C;IACjF,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACnC,MAAM,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;IACxD,OAAO,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC;AAC9B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAC5B,SAAmB,EACnB,QAAqB;IAErB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC1C,IAAI,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC;YAAE,OAAO,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACtD,CAAC;IACD,OAAO,CAAC,CAAC;AACX,CAAC;AAED;;GAEG;AACH,SAAS,MAAM,CACb,SAAmB,EACnB,eAAoC,EACpC,CAAS;IAET,IAAI,GAAG,GAAG,CAAC,CAAC;IACZ,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,eAAe,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,CAAC;QAC/C,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,wBAAwB;IAC5E,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,OAAO,CACrB,SAAmB,EACnB,eAAoC,EACpC,CAAS;IAET,MAAM,GAAG,GAAG,MAAM,CAAC,SAAS,EAAE,eAAe,EAAE,CAAC,CAAC,CAAC;IAClD,uDAAuD;IACvD,MAAM,UAAU,GAAG,CAAC,GAAG,eAAe,CAAC,OAAO,EAAE,CAAC;SAC9C,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC;SAC7B,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,C
AAC;IACrB,MAAM,IAAI,GAAG,MAAM,CAAC,UAAU,EAAE,eAAe,EAAE,CAAC,CAAC,CAAC;IACpD,IAAI,IAAI,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IACzB,OAAO,GAAG,GAAG,IAAI,CAAC;AACpB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAC5B,SAAmB,EACnB,eAAoC,EACpC,iBAAiB,GAAG,CAAC;IASrB,MAAM,QAAQ,GAAG,IAAI,GAAG,CACtB,CAAC,GAAG,eAAe,CAAC,OAAO,EAAE,CAAC;SAC3B,MAAM,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,KAAK,IAAI,iBAAiB,CAAC;SACjD,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CACrB,CAAC;IAEF,OAAO;QACL,YAAY,EAAE,YAAY,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAClD,YAAY,EAAE,YAAY,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAClD,YAAY,EAAE,YAAY,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAClD,SAAS,EAAE,SAAS,CAAC,SAAS,EAAE,QAAQ,EAAE,CAAC,CAAC;QAC5C,GAAG,EAAE,cAAc,CAAC,SAAS,EAAE,QAAQ,CAAC;QACxC,OAAO,EAAE,OAAO,CAAC,SAAS,EAAE,eAAe,EAAE,CAAC,CAAC;KAChD,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAC5B,OAAiD;IAEjD,MAAM,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;IACzB,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACZ,OAAO,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC;IACjG,CAAC;IAED,MAAM,GAAG,GAAG,OAAO,CAAC,MAAM,CACxB,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QACX,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,YAAY;QAC/C,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,YAAY;QAC/C,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC,CAAC,YAAY;QAC/C,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS;QACtC,GAAG,EAAE,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG;QACpB,OAAO,EAAE,GAAG,CAAC,OAAO,GAAG,CAAC,CAAC,OAAO;KACjC,CAAC,EACF,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,CACxF,CAAC;IAEF,OAAO;QACL,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC;QAClC,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC;QAClC,YAAY,EAAE,GAAG,CAAC,YAAY,GAAG,CAAC;QAClC,SAAS,EAAE,GAAG,CAAC,SAAS,GAAG,CAAC;QAC5B,GAAG,EAAE,GAAG,CAAC,GAAG,GAAG,CAAC;QAChB,OAAO,EAAE,GAAG,CAAC,OAAO,GAAG,CAAC;KACzB,CAAC;AACJ,CAAC"}
@@ -0,0 +1,34 @@
1
+ /**
2
+ * Structured Judge — evaluates AI output with specific yes/no questions per criterion.
3
+ *
4
+ * Instead of "rate 0-10", asks: "Does the code implement X? YES/NO"
5
+ * Then score = (yes_count / total_questions) * 10, where a PARTIAL answer counts as half a YES.
6
+ */
7
+ import Anthropic from "@anthropic-ai/sdk";
8
+ import type { CodingTask } from "../lib/types.js";
9
+ export interface JudgmentResult {
10
+ taskId: string;
11
+ totalQuestions: number;
12
+ yesCount: number;
13
+ score: number;
14
+ details: Array<{
15
+ criterion: string;
16
+ answer: "YES" | "NO" | "PARTIAL";
17
+ reasoning: string;
18
+ }>;
19
+ }
20
+ /**
21
+ * Evaluate an AI-generated output against structured criteria.
22
+ */
23
+ export declare function structuredJudge(client: Anthropic, task: CodingTask, output: string, contextDocs: string): Promise<JudgmentResult>;
24
+ /**
25
+ * Check if generated code contains valid TypeScript/JSX syntax.
26
+ * Uses the TypeScript compiler API in syntax-only mode for accurate parsing
27
+ * of template literals, JSX, and other complex syntax.
28
+ */
29
+ export declare function checkTypeScriptSyntax(output: string): {
30
+ valid: boolean;
31
+ errorCount: number;
32
+ codeBlockCount: number;
33
+ };
34
+ //# sourceMappingURL=structured-judge.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"structured-judge.d.ts","sourceRoot":"","sources":["../../../benchmarks/evaluators/structured-judge.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AACH,OAAO,SAAS,MAAM,mBAAmB,CAAC;AAC1C,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAIlD,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,cAAc,EAAE,MAAM,CAAC;IACvB,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,KAAK,CAAC;QACb,SAAS,EAAE,MAAM,CAAC;QAClB,MAAM,EAAE,KAAK,GAAG,IAAI,GAAG,SAAS,CAAC;QACjC,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC,CAAC;CACJ;AAiCD;;GAEG;AACH,wBAAsB,eAAe,CACnC,MAAM,EAAE,SAAS,EACjB,IAAI,EAAE,UAAU,EAChB,MAAM,EAAE,MAAM,EACd,WAAW,EAAE,MAAM,GAClB,OAAO,CAAC,cAAc,CAAC,CAsGzB;AAED;;;;GAIG;AACH,wBAAgB,qBAAqB,CACnC,MAAM,EAAE,MAAM,GACb;IAAE,KAAK,EAAE,OAAO,CAAC;IAAC,UAAU,EAAE,MAAM,CAAC;IAAC,cAAc,EAAE,MAAM,CAAA;CAAE,CAgDhE"}
@@ -0,0 +1,153 @@
1
+ const JUDGE_MODEL = "claude-sonnet-4-20250514";
2
+ /**
3
+ * Build structured yes/no questions from task criteria and context.
4
+ */
5
+ function buildQuestions(task, contextDocs) {
6
+ const questions = [];
7
+ for (const criterion of task.evaluationCriteria) {
8
+ if (criterion.prompt) {
9
+ questions.push(criterion.prompt);
10
+ }
11
+ else if (criterion.pattern) {
12
+ questions.push(`Does the code contain or implement: ${criterion.criterion}?`);
13
+ }
14
+ else {
15
+ questions.push(`Does the code satisfy: ${criterion.criterion}?`);
16
+ }
17
+ }
18
+ // Add forbidden pattern checks
19
+ for (const pattern of task.forbiddenPatterns) {
20
+ questions.push(`Does the code AVOID the forbidden pattern: ${pattern}? (YES means it correctly avoids it)`);
21
+ }
22
+ return questions;
23
+ }
24
+ /**
25
+ * Evaluate an AI-generated output against structured criteria.
26
+ */
27
+ export async function structuredJudge(client, task, output, contextDocs) {
28
+ const questions = buildQuestions(task, contextDocs);
29
+ const questionsBlock = questions
30
+ .map((q, i) => `Q${i + 1}: ${q}`)
31
+ .join("\n");
32
+ const prompt = `You are evaluating an AI coding agent's output against specific product requirements.
33
+
34
+ TASK DESCRIPTION:
35
+ ${task.description}
36
+
37
+ RELEVANT PRODUCT CONTEXT:
38
+ ${contextDocs.slice(0, 8000)}
39
+
40
+ AI AGENT'S CODE OUTPUT:
41
+ ${output.slice(0, 12000)}
42
+
43
+ EVALUATION QUESTIONS:
44
+ ${questionsBlock}
45
+
46
+ For each question, answer with:
47
+ - "YES" if the code clearly satisfies the criterion
48
+ - "NO" if the code clearly fails the criterion
49
+ - "PARTIAL" if partially satisfied
50
+
51
+ Return a JSON array with one object per question:
52
+ [{"answer": "YES"|"NO"|"PARTIAL", "reasoning": "brief explanation"}]
53
+
54
+ Return ONLY the JSON array, no other text.`;
55
+ const response = await client.messages.create({
56
+ model: JUDGE_MODEL,
57
+ max_tokens: 2048,
58
+ temperature: 0,
59
+ system: "You are a precise code evaluation judge. Return only valid JSON arrays.",
60
+ messages: [{ role: "user", content: prompt }],
61
+ });
62
+ const text = response.content[0]?.type === "text" ? response.content[0].text : "[]";
63
+ let answers;
64
+ try {
65
+ answers = JSON.parse(text);
66
+ }
67
+ catch {
68
+ // Fallback: try to extract JSON from text
69
+ const match = text.match(/\[[\s\S]*\]/);
70
+ if (match) {
71
+ try {
72
+ answers = JSON.parse(match[0]);
73
+ }
74
+ catch {
75
+ answers = [];
76
+ }
77
+ }
78
+ else {
79
+ answers = [];
80
+ }
81
+ }
82
+ const details = questions.map((q, i) => {
83
+ const a = answers[i];
84
+ const answer = (a?.answer?.toUpperCase() ?? "NO");
85
+ return {
86
+ criterion: q,
87
+ answer: answer === "YES" || answer === "NO" || answer === "PARTIAL"
88
+ ? answer
89
+ : "NO",
90
+ reasoning: a?.reasoning ?? "No response from judge",
91
+ };
92
+ });
93
+ const yesCount = details.reduce((sum, d) => {
94
+ if (d.answer === "YES")
95
+ return sum + 1;
96
+ if (d.answer === "PARTIAL")
97
+ return sum + 0.5;
98
+ return sum;
99
+ }, 0);
100
+ const totalQuestions = details.length;
101
+ const score = totalQuestions > 0 ? (yesCount / totalQuestions) * 10 : 0;
102
+ return {
103
+ taskId: task.id,
104
+ totalQuestions,
105
+ yesCount,
106
+ score: Math.round(score * 10) / 10,
107
+ details,
108
+ };
109
+ }
110
+ /**
111
+ * Check if generated code contains valid TypeScript/JSX syntax.
112
+ * Uses the TypeScript compiler API in syntax-only mode for accurate parsing
113
+ * of template literals, JSX, and other complex syntax.
114
+ */
115
+ export function checkTypeScriptSyntax(output) {
116
+ // Extract fenced code blocks
117
+ const codeBlockRegex = /```(?:typescript|tsx?|jsx?|js)?\s*\n([\s\S]*?)```/g;
118
+ const blocks = [];
119
+ let match;
120
+ while ((match = codeBlockRegex.exec(output)) !== null) {
121
+ if (match[1]?.trim())
122
+ blocks.push(match[1]);
123
+ }
124
+ if (blocks.length === 0) {
125
+ return { valid: true, errorCount: 0, codeBlockCount: 0 };
126
+ }
127
+ let totalErrors = 0;
128
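+ // Lazily require typescript — may not be available in all environments. NOTE(review): this file is emitted as an ES module, where `require` is normally undefined, so this may always take the permissive fallback below — confirm.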
129
+ let ts;
130
+ try {
131
+ // eslint-disable-next-line @typescript-eslint/no-require-imports
132
+ ts = require("typescript");
133
+ }
134
+ catch {
135
+ // TypeScript not available — fall back to permissive (assume valid)
136
+ return { valid: true, errorCount: 0, codeBlockCount: blocks.length };
137
+ }
138
+ for (const block of blocks) {
139
+ // Parse as TSX to handle both TypeScript and JSX syntax
140
+ const sourceFile = ts.createSourceFile("check.tsx", block, ts.ScriptTarget.Latest, false, ts.ScriptKind.TSX);
141
+ // Count syntax-level diagnostics only (not semantic/type errors)
142
+ // parseDiagnostics is internal; access via type assertion
143
+ const syntaxDiags = (sourceFile
144
+ .parseDiagnostics?.length) ?? 0;
145
+ totalErrors += syntaxDiags;
146
+ }
147
+ return {
148
+ valid: totalErrors === 0,
149
+ errorCount: totalErrors,
150
+ codeBlockCount: blocks.length,
151
+ };
152
+ }
153
+ //# sourceMappingURL=structured-judge.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"structured-judge.js","sourceRoot":"","sources":["../../../benchmarks/evaluators/structured-judge.ts"],"names":[],"mappings":"AASA,MAAM,WAAW,GAAG,0BAA0B,CAAC;AAc/C;;GAEG;AACH,SAAS,cAAc,CACrB,IAAgB,EAChB,WAAmB;IAEnB,MAAM,SAAS,GAAa,EAAE,CAAC;IAE/B,KAAK,MAAM,SAAS,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;QAChD,IAAI,SAAS,CAAC,MAAM,EAAE,CAAC;YACrB,SAAS,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACnC,CAAC;aAAM,IAAI,SAAS,CAAC,OAAO,EAAE,CAAC;YAC7B,SAAS,CAAC,IAAI,CACZ,uCAAuC,SAAS,CAAC,SAAS,GAAG,CAC9D,CAAC;QACJ,CAAC;aAAM,CAAC;YACN,SAAS,CAAC,IAAI,CAAC,0BAA0B,SAAS,CAAC,SAAS,GAAG,CAAC,CAAC;QACnE,CAAC;IACH,CAAC;IAED,+BAA+B;IAC/B,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,iBAAiB,EAAE,CAAC;QAC7C,SAAS,CAAC,IAAI,CACZ,8CAA8C,OAAO,sCAAsC,CAC5F,CAAC;IACJ,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,MAAiB,EACjB,IAAgB,EAChB,MAAc,EACd,WAAmB;IAEnB,MAAM,SAAS,GAAG,cAAc,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;IAEpD,MAAM,cAAc,GAAG,SAAS;SAC7B,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;SAChC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,MAAM,MAAM,GAAG;;;EAGf,IAAI,CAAC,WAAW;;;EAGhB,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC;;;EAG1B,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC;;;EAGtB,cAAc;;;;;;;;;;2CAU2B,CAAC;IAE1C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;QAC5C,KAAK,EAAE,WAAW;QAClB,UAAU,EAAE,IAAI;QAChB,WAAW,EAAE,CAAC;QACd,MAAM,EACJ,yEAAyE;QAC3E,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;KAC9C,CAAC,CAAC;IAEH,MAAM,IAAI,GACR,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,IAAI,KAAK,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC;IAEzE,IAAI,OAGF,CAAC;IAEH,IAAI,CAAC;QACH,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAGvB,CAAC;IACL,CAAC;IAAC,MAAM,CAAC;QACP,0CAA0C;QAC1C,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QACxC,IAAI,KAAK,EAAE,CAAC;YACV,IAAI,CAAC;gBACH,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAG3B,CAAC;YACL,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,GAAG,EAAE,CAAC;YACf,CAAC;QACH,CAAC;aAAM,CAAC;YACN,OAAO,GAA
G,EAAE,CAAC;QACf,CAAC;IACH,CAAC;IAED,MAAM,OAAO,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACrC,MAAM,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QACrB,MAAM,MAAM,GAAG,CAAC,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,IAAI,IAAI,CAGnC,CAAC;QACd,OAAO;YACL,SAAS,EAAE,CAAC;YACZ,MAAM,EAAE,MAAM,KAAK,KAAK,IAAI,MAAM,KAAK,IAAI,IAAI,MAAM,KAAK,SAAS;gBACjE,CAAC,CAAC,MAAM;gBACR,CAAC,CAAE,IAAc;YACnB,SAAS,EAAE,CAAC,EAAE,SAAS,IAAI,wBAAwB;SACpD,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE;QACzC,IAAI,CAAC,CAAC,MAAM,KAAK,KAAK;YAAE,OAAO,GAAG,GAAG,CAAC,CAAC;QACvC,IAAI,CAAC,CAAC,MAAM,KAAK,SAAS;YAAE,OAAO,GAAG,GAAG,GAAG,CAAC;QAC7C,OAAO,GAAG,CAAC;IACb,CAAC,EAAE,CAAC,CAAC,CAAC;IAEN,MAAM,cAAc,GAAG,OAAO,CAAC,MAAM,CAAC;IACtC,MAAM,KAAK,GACT,cAAc,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,GAAG,cAAc,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAE5D,OAAO;QACL,MAAM,EAAE,IAAI,CAAC,EAAE;QACf,cAAc;QACd,QAAQ;QACR,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,EAAE,CAAC,GAAG,EAAE;QAClC,OAAO;KACR,CAAC;AACJ,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,qBAAqB,CACnC,MAAc;IAEd,6BAA6B;IAC7B,MAAM,cAAc,GAAG,oDAAoD,CAAC;IAC5E,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,KAAK,CAAC;IACV,OAAO,CAAC,KAAK,GAAG,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACtD,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE;YAAE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAC9C,CAAC;IAED,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,EAAE,CAAC;IAC3D,CAAC;IAED,IAAI,WAAW,GAAG,CAAC,CAAC;IAEpB,0EAA0E;IAC1E,IAAI,EAA2C,CAAC;IAChD,IAAI,CAAC;QACH,iEAAiE;QACjE,EAAE,GAAG,OAAO,CAAC,YAAY,CAAgC,CAAC;IAC5D,CAAC;IAAC,MAAM,CAAC;QACP,oEAAoE;QACpE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,EAAE,cAAc,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC;IACvE,CAAC;IAED,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,wDAAwD;QACxD,MAAM,UAAU,GAAG,EAAE,CAAC,gBAAgB,CACpC,WAAW,EACX,KAAK,EACL,EAAE,CAAC,YAAY,CAAC,MAAM,EACtB,KAAK,EACL,EAAE,CAAC,UAAU,CAAC,GAAG,CAClB,CAAC;QAEF,iEAAiE;QACjE,0DAA0D;QAC1D,MAAM,WAAW,GACf,CAAE,UAA0D
;aACzD,gBAAgB,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC;QACpC,WAAW,IAAI,WAAW,CAAC;IAC7B,CAAC;IAED,OAAO;QACL,KAAK,EAAE,WAAW,KAAK,CAAC;QACxB,UAAU,EAAE,WAAW;QACvB,cAAc,EAAE,MAAM,CAAC,MAAM;KAC9B,CAAC;AACJ,CAAC"}
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Count tokens in a string using tiktoken's cl100k_base encoding (the GPT-4 tokenizer; only an approximation for Claude models).
3
+ */
4
+ export declare function countTokens(text: string): number;
5
+ /**
6
+ * Dispose the encoder to free memory.
7
+ */
8
+ export declare function disposeEncoder(): void;
9
+ //# sourceMappingURL=token-counter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"token-counter.d.ts","sourceRoot":"","sources":["../../../benchmarks/evaluators/token-counter.ts"],"names":[],"mappings":"AAWA;;GAEG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEhD;AAED;;GAEG;AACH,wBAAgB,cAAc,IAAI,IAAI,CAKrC"}
@@ -0,0 +1,24 @@
1
+ import { get_encoding } from "tiktoken";
2
+ let _encoder = null;
3
+ function getEncoder() {
4
+ if (!_encoder) {
5
+ _encoder = get_encoding("cl100k_base");
6
+ }
7
+ return _encoder;
8
+ }
9
+ /**
10
+ * Count tokens in a string using tiktoken's cl100k_base encoding (the GPT-4 tokenizer; only an approximation for Claude models).
11
+ */
12
+ export function countTokens(text) {
13
+ return getEncoder().encode(text).length;
14
+ }
15
+ /**
16
+ * Dispose the encoder to free memory.
17
+ */
18
+ export function disposeEncoder() {
19
+ if (_encoder) {
20
+ _encoder.free();
21
+ _encoder = null;
22
+ }
23
+ }
24
+ //# sourceMappingURL=token-counter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"token-counter.js","sourceRoot":"","sources":["../../../benchmarks/evaluators/token-counter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAiB,MAAM,UAAU,CAAC;AAEvD,IAAI,QAAQ,GAAoB,IAAI,CAAC;AAErC,SAAS,UAAU;IACjB,IAAI,CAAC,QAAQ,EAAE,CAAC;QACd,QAAQ,GAAG,YAAY,CAAC,aAAa,CAAC,CAAC;IACzC,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,IAAY;IACtC,OAAO,UAAU,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc;IAC5B,IAAI,QAAQ,EAAE,CAAC;QACb,QAAQ,CAAC,IAAI,EAAE,CAAC;QAChB,QAAQ,GAAG,IAAI,CAAC;IAClB,CAAC;AACH,CAAC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=generate-corpus.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"generate-corpus.d.ts","sourceRoot":"","sources":["../../../benchmarks/generators/generate-corpus.ts"],"names":[],"mappings":""}