@goreal-ai/echo-pdk 0.6.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119):
  1. package/README.md +1 -1
  2. package/dist/ai-judge/index.d.ts +11 -21
  3. package/dist/ai-judge/index.d.ts.map +1 -1
  4. package/dist/ai-judge/index.js +36 -90
  5. package/dist/ai-judge/index.js.map +1 -1
  6. package/dist/embeddings/cosine.d.ts +15 -0
  7. package/dist/embeddings/cosine.d.ts.map +1 -0
  8. package/dist/embeddings/cosine.js +37 -0
  9. package/dist/embeddings/cosine.js.map +1 -0
  10. package/dist/embeddings/index.d.ts +9 -0
  11. package/dist/embeddings/index.d.ts.map +1 -0
  12. package/dist/embeddings/index.js +11 -0
  13. package/dist/embeddings/index.js.map +1 -0
  14. package/dist/embeddings/openai.d.ts +11 -0
  15. package/dist/embeddings/openai.d.ts.map +1 -0
  16. package/dist/embeddings/openai.js +38 -0
  17. package/dist/embeddings/openai.js.map +1 -0
  18. package/dist/embeddings/registry.d.ts +13 -0
  19. package/dist/embeddings/registry.d.ts.map +1 -0
  20. package/dist/embeddings/registry.js +29 -0
  21. package/dist/embeddings/registry.js.map +1 -0
  22. package/dist/embeddings/types.d.ts +35 -0
  23. package/dist/embeddings/types.d.ts.map +1 -0
  24. package/dist/embeddings/types.js +8 -0
  25. package/dist/embeddings/types.js.map +1 -0
  26. package/dist/embeddings/voyage.d.ts +12 -0
  27. package/dist/embeddings/voyage.d.ts.map +1 -0
  28. package/dist/embeddings/voyage.js +39 -0
  29. package/dist/embeddings/voyage.js.map +1 -0
  30. package/dist/eval/assertions.d.ts +35 -0
  31. package/dist/eval/assertions.d.ts.map +1 -0
  32. package/dist/eval/assertions.js +349 -0
  33. package/dist/eval/assertions.js.map +1 -0
  34. package/dist/eval/dataset.d.ts +42 -0
  35. package/dist/eval/dataset.d.ts.map +1 -0
  36. package/dist/eval/dataset.js +101 -0
  37. package/dist/eval/dataset.js.map +1 -0
  38. package/dist/eval/index.d.ts +14 -0
  39. package/dist/eval/index.d.ts.map +1 -0
  40. package/dist/eval/index.js +17 -0
  41. package/dist/eval/index.js.map +1 -0
  42. package/dist/eval/loader.d.ts +30 -0
  43. package/dist/eval/loader.d.ts.map +1 -0
  44. package/dist/eval/loader.js +170 -0
  45. package/dist/eval/loader.js.map +1 -0
  46. package/dist/eval/reporter.d.ts +26 -0
  47. package/dist/eval/reporter.d.ts.map +1 -0
  48. package/dist/eval/reporter.js +164 -0
  49. package/dist/eval/reporter.js.map +1 -0
  50. package/dist/eval/runner.d.ts +28 -0
  51. package/dist/eval/runner.d.ts.map +1 -0
  52. package/dist/eval/runner.js +232 -0
  53. package/dist/eval/runner.js.map +1 -0
  54. package/dist/eval/types.d.ts +257 -0
  55. package/dist/eval/types.d.ts.map +1 -0
  56. package/dist/eval/types.js +11 -0
  57. package/dist/eval/types.js.map +1 -0
  58. package/dist/evaluator/evaluator.d.ts +2 -2
  59. package/dist/evaluator/evaluator.js +5 -5
  60. package/dist/evaluator/evaluator.js.map +1 -1
  61. package/dist/evaluator/index.d.ts +1 -1
  62. package/dist/evaluator/index.d.ts.map +1 -1
  63. package/dist/evaluator/index.js +1 -1
  64. package/dist/evaluator/index.js.map +1 -1
  65. package/dist/evaluator/operators.d.ts +9 -5
  66. package/dist/evaluator/operators.d.ts.map +1 -1
  67. package/dist/evaluator/operators.js +26 -33
  68. package/dist/evaluator/operators.js.map +1 -1
  69. package/dist/index.d.ts +8 -2
  70. package/dist/index.d.ts.map +1 -1
  71. package/dist/index.js +38 -20
  72. package/dist/index.js.map +1 -1
  73. package/dist/parser/ast.js +1 -1
  74. package/dist/parser/ast.js.map +1 -1
  75. package/dist/parser/lexer.d.ts +1 -1
  76. package/dist/parser/lexer.js +1 -1
  77. package/dist/project/index.d.ts.map +1 -1
  78. package/dist/project/index.js +10 -3
  79. package/dist/project/index.js.map +1 -1
  80. package/dist/project/types.d.ts +19 -4
  81. package/dist/project/types.d.ts.map +1 -1
  82. package/dist/project/types.js +3 -0
  83. package/dist/project/types.js.map +1 -1
  84. package/dist/providers/anthropic.d.ts +18 -0
  85. package/dist/providers/anthropic.d.ts.map +1 -0
  86. package/dist/providers/anthropic.js +123 -0
  87. package/dist/providers/anthropic.js.map +1 -0
  88. package/dist/providers/base.d.ts +45 -0
  89. package/dist/providers/base.d.ts.map +1 -0
  90. package/dist/providers/base.js +107 -0
  91. package/dist/providers/base.js.map +1 -0
  92. package/dist/providers/index.d.ts +14 -0
  93. package/dist/providers/index.d.ts.map +1 -0
  94. package/dist/providers/index.js +16 -0
  95. package/dist/providers/index.js.map +1 -0
  96. package/dist/providers/openai.d.ts +18 -0
  97. package/dist/providers/openai.d.ts.map +1 -0
  98. package/dist/providers/openai.js +106 -0
  99. package/dist/providers/openai.js.map +1 -0
  100. package/dist/providers/registry.d.ts +80 -0
  101. package/dist/providers/registry.d.ts.map +1 -0
  102. package/dist/providers/registry.js +118 -0
  103. package/dist/providers/registry.js.map +1 -0
  104. package/dist/providers/run-prompt.d.ts +69 -0
  105. package/dist/providers/run-prompt.d.ts.map +1 -0
  106. package/dist/providers/run-prompt.js +79 -0
  107. package/dist/providers/run-prompt.js.map +1 -0
  108. package/dist/providers/types.d.ts +123 -0
  109. package/dist/providers/types.d.ts.map +1 -0
  110. package/dist/providers/types.js +9 -0
  111. package/dist/providers/types.js.map +1 -0
  112. package/dist/types.d.ts +5 -5
  113. package/dist/types.d.ts.map +1 -1
  114. package/dist/utils/file-utils.d.ts.map +1 -1
  115. package/dist/utils/file-utils.js +9 -2
  116. package/dist/utils/file-utils.js.map +1 -1
  117. package/package.json +12 -6
  118. package/scripts/bundle-for-graaljs.mjs +220 -0
  119. package/scripts/stubs/crypto.mjs +12 -0
@@ -0,0 +1,349 @@
1
+ /**
2
+ * @fileoverview Assertion implementations for eval tests
3
+ *
4
+ * Each assertion takes a text output and returns a pass/fail result.
5
+ * Assertions are organized by category:
6
+ * - Text assertions (contains, equals, matches, etc.)
7
+ * - Structural assertions (json_valid, json_schema)
8
+ * - AI/Semantic assertions (llm_judge, similar_to, sentiment)
9
+ * - Performance assertions (latency, token_count, cost)
10
+ */
/**
 * Run a single assertion against the given context.
 *
 * An assertion is an object whose first own key names the operator and whose
 * value is that operator's argument, e.g. `{ contains: "hello" }`. Any further
 * keys are ignored.
 *
 * This function does not reject: malformed input, unknown operators and
 * exceptions thrown by a handler are all reported as a result object with
 * `status: 'error'`.
 *
 * @param assertion - Assertion object keyed by operator name.
 * @param ctx - Evaluation context, passed through unchanged to the handler.
 * @returns The handler's result, or an `error` result describing the problem.
 */
export async function runAssertion(assertion, ctx) {
    // Object.entries() throws on null/undefined, which would escape as a
    // rejection instead of an error result.
    if (assertion == null || typeof assertion !== 'object') {
        return {
            operator: 'unknown',
            status: 'error',
            message: 'Invalid assertion: expected an object',
        };
    }
    // Extract the operator (first key of the assertion object)
    const entries = Object.entries(assertion);
    if (entries.length === 0) {
        return { operator: 'unknown', status: 'error', message: 'Empty assertion' };
    }
    const [operator, value] = entries[0];
    // Own-property lookup only: a plain index would also find inherited members
    // such as `toString` or `constructor` and call them as if they were handlers.
    const handler = Object.prototype.hasOwnProperty.call(assertionHandlers, operator)
        ? assertionHandlers[operator]
        : undefined;
    if (typeof handler !== 'function') {
        return {
            operator,
            status: 'error',
            message: `Unknown assertion operator: ${operator}`,
        };
    }
    try {
        return await handler(value, ctx);
    }
    catch (err) {
        // Anything can be thrown, not just Error instances.
        const detail = err instanceof Error ? err.message : String(err);
        return {
            operator,
            status: 'error',
            message: `Assertion error: ${detail}`,
        };
    }
}
/**
 * Run multiple assertions against the given context.
 *
 * Assertions are awaited one at a time, in list order, and the results are
 * returned in that same order. A missing list (`null`/`undefined`) is treated
 * as "nothing to check" and yields an empty array.
 *
 * @param assertions - Assertion objects to evaluate; may be null/undefined.
 * @param ctx - Evaluation context shared by every assertion.
 * @returns One result per assertion.
 */
export async function runAssertions(assertions, ctx) {
    const results = [];
    // Iterating null/undefined would throw a TypeError.
    for (const assertion of assertions ?? []) {
        results.push(await runAssertion(assertion, ctx));
    }
    return results;
}
/**
 * Build the result reported when an assertion's own configuration is unusable.
 */
function invalidConfig(operator, message) {
    return { operator, status: 'error', message };
}
/**
 * Report whether `value` is an options object in which at least one of `keys`
 * is set (neither null nor undefined).
 */
function hasAnyOption(value, keys) {
    return value != null && typeof value === 'object' && keys.some((key) => value[key] != null);
}
/**
 * Operator name -> handler.
 *
 * Every handler receives the assertion's argument (`value`) and the evaluation
 * context (`ctx`) and resolves to a result object with an `operator`, a
 * `status` of 'pass' | 'fail' | 'error', and optional `expected`, `actual`
 * and `message` fields.
 *
 * Handlers that take an options object report `status: 'error'` when the
 * options they need are missing, instead of silently passing or failing
 * against `undefined`.
 */
const assertionHandlers = {
    // Text assertions
    /** Passes when the output includes `value` as a substring. */
    contains: async (value, ctx) => {
        const expected = String(value);
        const pass = ctx.text.includes(expected);
        return {
            operator: 'contains',
            status: pass ? 'pass' : 'fail',
            expected,
            actual: pass ? undefined : truncate(ctx.text, 100),
            message: pass ? undefined : `Expected output to contain "${expected}"`,
        };
    },
    /** Passes when the output does not include `value` as a substring. */
    not_contains: async (value, ctx) => {
        const expected = String(value);
        const pass = !ctx.text.includes(expected);
        return {
            operator: 'not_contains',
            status: pass ? 'pass' : 'fail',
            expected: `not "${expected}"`,
            message: pass ? undefined : `Expected output NOT to contain "${expected}"`,
        };
    },
    /** Passes when output and `value` are equal after trimming both. */
    equals: async (value, ctx) => {
        const expected = String(value);
        const pass = ctx.text.trim() === expected.trim();
        return {
            operator: 'equals',
            status: pass ? 'pass' : 'fail',
            expected,
            actual: pass ? undefined : truncate(ctx.text, 100),
            message: pass ? undefined : 'Output does not match expected text',
        };
    },
    /**
     * Passes when the output matches `value` compiled as a regular expression
     * (no flags). An invalid pattern makes `new RegExp` throw.
     */
    matches: async (value, ctx) => {
        const pattern = String(value);
        const regex = new RegExp(pattern);
        const pass = regex.test(ctx.text);
        return {
            operator: 'matches',
            status: pass ? 'pass' : 'fail',
            expected: pattern,
            message: pass ? undefined : `Output does not match pattern: ${pattern}`,
        };
    },
    /** Passes when the output, ignoring leading whitespace, starts with `value`. */
    starts_with: async (value, ctx) => {
        const expected = String(value);
        const pass = ctx.text.trimStart().startsWith(expected);
        return {
            operator: 'starts_with',
            status: pass ? 'pass' : 'fail',
            expected,
            message: pass ? undefined : `Expected output to start with "${expected}"`,
        };
    },
    /** Passes when the output, ignoring trailing whitespace, ends with `value`. */
    ends_with: async (value, ctx) => {
        const expected = String(value);
        const pass = ctx.text.trimEnd().endsWith(expected);
        return {
            operator: 'ends_with',
            status: pass ? 'pass' : 'fail',
            expected,
            message: pass ? undefined : `Expected output to end with "${expected}"`,
        };
    },
    /** Checks `ctx.text.length` (UTF-16 code units) against `min` and/or `max`. */
    length: async (value, ctx) => {
        // Without any bound the check below could never fail.
        if (!hasAnyOption(value, ['min', 'max'])) {
            return invalidConfig('length', 'length assertion requires "min" and/or "max"');
        }
        const opts = value;
        const len = ctx.text.length;
        let pass = true;
        const messages = [];
        if (opts.min != null && len < opts.min) {
            pass = false;
            messages.push(`length ${len} < min ${opts.min}`);
        }
        if (opts.max != null && len > opts.max) {
            pass = false;
            messages.push(`length ${len} > max ${opts.max}`);
        }
        return {
            operator: 'length',
            status: pass ? 'pass' : 'fail',
            expected: `${opts.min ?? '0'}..${opts.max ?? '∞'}`,
            actual: String(len),
            message: pass ? undefined : messages.join(', '),
        };
    },
    /** Checks the number of whitespace-separated words against `min` and/or `max`. */
    word_count: async (value, ctx) => {
        if (!hasAnyOption(value, ['min', 'max'])) {
            return invalidConfig('word_count', 'word_count assertion requires "min" and/or "max"');
        }
        const opts = value;
        const words = ctx.text.trim().split(/\s+/).filter(Boolean).length;
        let pass = true;
        const messages = [];
        if (opts.min != null && words < opts.min) {
            pass = false;
            messages.push(`word count ${words} < min ${opts.min}`);
        }
        if (opts.max != null && words > opts.max) {
            pass = false;
            messages.push(`word count ${words} > max ${opts.max}`);
        }
        return {
            operator: 'word_count',
            status: pass ? 'pass' : 'fail',
            expected: `${opts.min ?? '0'}..${opts.max ?? '∞'} words`,
            actual: `${words} words`,
            message: pass ? undefined : messages.join(', '),
        };
    },
    // Structural assertions
    /** Passes when the output parses with `JSON.parse`. */
    json_valid: async (_value, ctx) => {
        try {
            JSON.parse(ctx.text);
            return { operator: 'json_valid', status: 'pass' };
        }
        catch {
            return {
                operator: 'json_valid',
                status: 'fail',
                message: 'Output is not valid JSON',
            };
        }
    },
    /**
     * Currently only checks that the output is valid JSON; the schema passed
     * as `_value` is not consulted.
     */
    json_schema: async (_value, ctx) => {
        // Schema validation is a stretch goal — for now just validate JSON
        try {
            JSON.parse(ctx.text);
            return {
                operator: 'json_schema',
                status: 'pass',
                message: 'JSON is valid (schema validation not yet implemented)',
            };
        }
        catch {
            return {
                operator: 'json_schema',
                status: 'fail',
                message: 'Output is not valid JSON',
            };
        }
    },
    // AI/Semantic assertions (require LLM provider)
    /** Asks `ctx.llmProvider.judge` the question given as `value` about the output. */
    llm_judge: async (value, ctx) => {
        const question = String(value);
        if (!ctx.llmProvider) {
            return {
                operator: 'llm_judge',
                status: 'error',
                message: 'LLM provider not configured — cannot run llm_judge assertion',
            };
        }
        const { pass, reasoning } = await ctx.llmProvider.judge(ctx.text, question);
        return {
            operator: 'llm_judge',
            status: pass ? 'pass' : 'fail',
            expected: question,
            actual: reasoning,
            message: pass
                ? `LLM judge: ${reasoning}`
                : `LLM judge answered "no": ${reasoning}`,
        };
    },
    /**
     * Compares the output with the golden response of `value.dataset` and
     * passes when the similarity score is at least `value.threshold`.
     */
    similar_to: async (value, ctx) => {
        // Comparing against an undefined threshold would always "fail".
        if (!hasAnyOption(value, ['threshold'])) {
            return invalidConfig('similar_to', 'similar_to assertion requires "threshold"');
        }
        const opts = value;
        if (!ctx.embeddingSimilarity && !ctx.llmProvider) {
            return {
                operator: 'similar_to',
                status: 'error',
                message: 'No embedding or LLM provider configured — cannot run similar_to assertion',
            };
        }
        if (!ctx.loadGolden) {
            return {
                operator: 'similar_to',
                status: 'error',
                message: 'Dataset loader not configured — cannot run similar_to assertion',
            };
        }
        const golden = await ctx.loadGolden(opts.dataset);
        if (!golden) {
            return {
                operator: 'similar_to',
                status: 'error',
                message: `No golden response found in dataset "${opts.dataset}"`,
            };
        }
        // Prefer embeddings over LLM — deterministic, cheaper, faster
        let score;
        let method;
        if (ctx.embeddingSimilarity) {
            score = await ctx.embeddingSimilarity(ctx.text, golden);
            method = 'embeddings';
        }
        else {
            score = await ctx.llmProvider.similarity(ctx.text, golden);
            method = 'llm';
        }
        // A non-numeric score is a provider problem, not a failed assertion.
        if (typeof score !== 'number' || Number.isNaN(score)) {
            return invalidConfig('similar_to', `Similarity score is not a number (${method})`);
        }
        const pass = score >= opts.threshold;
        return {
            operator: 'similar_to',
            status: pass ? 'pass' : 'fail',
            expected: `similarity >= ${opts.threshold}`,
            actual: `similarity = ${score.toFixed(3)} (${method})`,
            message: pass ? undefined : `Similarity ${score.toFixed(3)} below threshold ${opts.threshold}`,
        };
    },
    /** Asks `ctx.llmProvider.judge` whether the output has the `value` sentiment or tone. */
    sentiment: async (value, ctx) => {
        const expected = String(value);
        if (!ctx.llmProvider) {
            return {
                operator: 'sentiment',
                status: 'error',
                message: 'LLM provider not configured — cannot run sentiment assertion',
            };
        }
        const question = `Does this text have a ${expected} sentiment or tone?`;
        const { pass, reasoning } = await ctx.llmProvider.judge(ctx.text, question);
        return {
            operator: 'sentiment',
            status: pass ? 'pass' : 'fail',
            expected,
            actual: reasoning,
            message: pass ? undefined : `Sentiment does not match expected: ${expected}. ${reasoning}`,
        };
    },
    // Performance assertions (use LLM response metadata)
    /** Passes when `ctx.llmResponse.latencyMs` is at most `value.max`. */
    latency: async (value, ctx) => {
        if (!hasAnyOption(value, ['max'])) {
            return invalidConfig('latency', 'latency assertion requires "max"');
        }
        const opts = value;
        const actual = ctx.llmResponse?.latencyMs;
        if (actual == null) {
            return {
                operator: 'latency',
                status: 'error',
                message: 'No latency data available',
            };
        }
        const pass = actual <= opts.max;
        return {
            operator: 'latency',
            status: pass ? 'pass' : 'fail',
            expected: `<= ${opts.max}ms`,
            actual: `${actual}ms`,
            message: pass ? undefined : `Latency ${actual}ms exceeds max ${opts.max}ms`,
        };
    },
    /** Checks `ctx.llmResponse.tokens.completion` against `min` and/or `max`. */
    token_count: async (value, ctx) => {
        if (!hasAnyOption(value, ['min', 'max'])) {
            return invalidConfig('token_count', 'token_count assertion requires "min" and/or "max"');
        }
        const opts = value;
        const tokens = ctx.llmResponse?.tokens?.completion;
        if (tokens == null) {
            return {
                operator: 'token_count',
                status: 'error',
                message: 'No token count data available',
            };
        }
        let pass = true;
        const messages = [];
        if (opts.max != null && tokens > opts.max) {
            pass = false;
            messages.push(`tokens ${tokens} > max ${opts.max}`);
        }
        if (opts.min != null && tokens < opts.min) {
            pass = false;
            messages.push(`tokens ${tokens} < min ${opts.min}`);
        }
        return {
            operator: 'token_count',
            status: pass ? 'pass' : 'fail',
            expected: `${opts.min ?? '0'}..${opts.max ?? '∞'} tokens`,
            actual: `${tokens} tokens`,
            message: pass ? undefined : messages.join(', '),
        };
    },
    /** Passes when `ctx.llmResponse.costUsd` is at most `value.max`. */
    cost: async (value, ctx) => {
        if (!hasAnyOption(value, ['max'])) {
            return invalidConfig('cost', 'cost assertion requires "max"');
        }
        const opts = value;
        const actual = ctx.llmResponse?.costUsd;
        if (actual == null) {
            return {
                operator: 'cost',
                status: 'error',
                message: 'No cost data available',
            };
        }
        const pass = actual <= opts.max;
        return {
            operator: 'cost',
            status: pass ? 'pass' : 'fail',
            expected: `<= $${opts.max}`,
            actual: `$${actual.toFixed(4)}`,
            message: pass ? undefined : `Cost $${actual.toFixed(4)} exceeds max $${opts.max}`,
        };
    },
};
341
+ // =============================================================================
342
+ // HELPERS
343
+ // =============================================================================
/**
 * Shorten `text` to at most `maxLen` UTF-16 code units, appending '...' when
 * anything was cut off. Text that already fits is returned unchanged.
 *
 * The cut never lands between the two halves of a surrogate pair: if it would,
 * the dangling high surrogate is dropped as well, so the result is always
 * well-formed text.
 *
 * @param text - Text to shorten.
 * @param maxLen - Maximum number of code units kept before the ellipsis.
 * @returns `text` itself, or its prefix followed by '...'.
 */
function truncate(text, maxLen) {
    if (text.length <= maxLen)
        return text;
    let end = maxLen;
    // 0xD800–0xDBFF is the high-surrogate range; charCodeAt yields NaN for an
    // out-of-range index, which simply fails both comparisons.
    const last = text.charCodeAt(end - 1);
    if (last >= 0xd800 && last <= 0xdbff) {
        end -= 1;
    }
    return text.slice(0, end) + '...';
}
349
+ //# sourceMappingURL=assertions.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"assertions.js","sourceRoot":"","sources":["../../src/eval/assertions.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAwBH;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,SAAoB,EACpB,GAAqB;IAErB,2DAA2D;IAC3D,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;IAC1C,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,iBAAiB,EAAE,CAAC;IAC9E,CAAC;IAED,MAAM,CAAC,QAAQ,EAAE,KAAK,CAAC,GAAG,OAAO,CAAC,CAAC,CAAsB,CAAC;IAC1D,MAAM,OAAO,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAC;IAE5C,IAAI,CAAC,OAAO,EAAE,CAAC;QACb,OAAO;YACL,QAAQ;YACR,MAAM,EAAE,OAAO;YACf,OAAO,EAAE,+BAA+B,QAAQ,EAAE;SACnD,CAAC;IACJ,CAAC;IAED,IAAI,CAAC;QACH,OAAO,MAAM,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;IACnC,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,OAAO;YACL,QAAQ;YACR,MAAM,EAAE,OAAO;YACf,OAAO,EAAE,oBAAqB,GAAa,CAAC,OAAO,EAAE;SACtD,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,UAAuB,EACvB,GAAqB;IAErB,MAAM,OAAO,GAAsB,EAAE,CAAC;IACtC,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACnC,OAAO,CAAC,IAAI,CAAC,MAAM,YAAY,CAAC,SAAS,EAAE,GAAG,CAAC,CAAC,CAAC;IACnD,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAWD,MAAM,iBAAiB,GAAqC;IAC1D,kBAAkB;IAClB,QAAQ,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;QAC7B,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;QAC/B,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;QACzC,OAAO;YACL,QAAQ,EAAE,UAAU;YACpB,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YAC9B,QAAQ;YACR,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,EAAE,GAAG,CAAC;YAClD,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,+BAA+B,QAAQ,GAAG;SACvE,CAAC;IACJ,CAAC;IAED,YAAY,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;QACjC,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;QAC/B,MAAM,IAAI,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;QAC1C,OAAO;YACL,QAAQ,EAAE,cAAc;YACxB,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YAC9B,QAAQ,EAAE,QAAQ,QAAQ,GAAG;YAC7B,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,mCAAmC,QAAQ,GAAG;SAC3E,CAAC;IACJ,CAAC;IAED,MAAM
,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;QAC3B,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;QAC/B,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,QAAQ,CAAC,IAAI,EAAE,CAAC;QACjD,OAAO;YACL,QAAQ,EAAE,QAAQ;YAClB,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YAC9B,QAAQ;YACR,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,EAAE,GAAG,CAAC;YAClD,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,qCAAqC;SAClE,CAAC;IACJ,CAAC;IAED,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;QAC5B,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;QAC9B,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,OAAO,CAAC,CAAC;QAClC,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QAClC,OAAO;YACL,QAAQ,EAAE,SAAS;YACnB,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YAC9B,QAAQ,EAAE,OAAO;YACjB,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,kCAAkC,OAAO,EAAE;SACxE,CAAC;IACJ,CAAC;IAED,WAAW,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;QAChC,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;QAC/B,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC,SAAS,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC;QACvD,OAAO;YACL,QAAQ,EAAE,aAAa;YACvB,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YAC9B,QAAQ;YACR,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,kCAAkC,QAAQ,GAAG;SAC1E,CAAC;IACJ,CAAC;IAED,SAAS,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;QAC9B,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;QAC/B,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;QACnD,OAAO;YACL,QAAQ,EAAE,WAAW;YACrB,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YAC9B,QAAQ;YACR,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,gCAAgC,QAAQ,GAAG;SACxE,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;QAC3B,MAAM,IAAI,GAAG,KAAuC,CAAC;QACrD,MAAM,GAAG,GAAG,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC;QAC5B,IAAI,IAAI,GAAG,IAAI,CAAC;QAChB,MAAM,QAAQ,GAAa,EAAE,CAAC;QAE9B,IAAI,IAAI,CAAC,GAAG,IAAI,IAAI,IAAI,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YACvC,IAAI,GAAG,KAAK,CAAC;YACb,QAAQ,CAAC,IAAI,CAAC,UAAU,GAAG,UAAU,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;QACnD,CAAC;QACD,I
AAI,IAAI,CAAC,GAAG,IAAI,IAAI,IAAI,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YACvC,IAAI,GAAG,KAAK,CAAC;YACb,QAAQ,CAAC,IAAI,CAAC,UAAU,GAAG,UAAU,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;QACnD,CAAC;QAED,OAAO;YACL,QAAQ,EAAE,QAAQ;YAClB,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YAC9B,QAAQ,EAAE,GAAG,IAAI,CAAC,GAAG,IAAI,GAAG,KAAK,IAAI,CAAC,GAAG,IAAI,GAAG,EAAE;YAClD,MAAM,EAAE,MAAM,CAAC,GAAG,CAAC;YACnB,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC;SAChD,CAAC;IACJ,CAAC;IAED,UAAU,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;QAC/B,MAAM,IAAI,GAAG,KAAuC,CAAC;QACrD,MAAM,KAAK,GAAG,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;QAClE,IAAI,IAAI,GAAG,IAAI,CAAC;QAChB,MAAM,QAAQ,GAAa,EAAE,CAAC;QAE9B,IAAI,IAAI,CAAC,GAAG,IAAI,IAAI,IAAI,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YACzC,IAAI,GAAG,KAAK,CAAC;YACb,QAAQ,CAAC,IAAI,CAAC,cAAc,KAAK,UAAU,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;QACzD,CAAC;QACD,IAAI,IAAI,CAAC,GAAG,IAAI,IAAI,IAAI,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YACzC,IAAI,GAAG,KAAK,CAAC;YACb,QAAQ,CAAC,IAAI,CAAC,cAAc,KAAK,UAAU,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;QACzD,CAAC;QAED,OAAO;YACL,QAAQ,EAAE,YAAY;YACtB,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YAC9B,QAAQ,EAAE,GAAG,IAAI,CAAC,GAAG,IAAI,GAAG,KAAK,IAAI,CAAC,GAAG,IAAI,GAAG,QAAQ;YACxD,MAAM,EAAE,GAAG,KAAK,QAAQ;YACxB,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC;SAChD,CAAC;IACJ,CAAC;IAED,wBAAwB;IACxB,UAAU,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,EAAE,EAAE;QAChC,IAAI,CAAC;YACH,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACrB,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC;QACpD,CAAC;QAAC,MAAM,CAAC;YACP,OAAO;gBACL,QAAQ,EAAE,YAAY;gBACtB,MAAM,EAAE,MAAM;gBACd,OAAO,EAAE,0BAA0B;aACpC,CAAC;QACJ,CAAC;IACH,CAAC;IAED,WAAW,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,EAAE,EAAE;QACjC,mEAAmE;QACnE,IAAI,CAAC;YACH,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACrB,OAAO;gBACL,QAAQ,EAAE,aAAa;gBACvB,MAAM,EAAE,MAAM;gBACd,OAAO,EAAE,uDAAuD;aACjE,CAAC;QACJ,CAAC;QAAC,MAAM,CAAC;YACP,OAAO
;gBACL,QAAQ,EAAE,aAAa;gBACvB,MAAM,EAAE,MAAM;gBACd,OAAO,EAAE,0BAA0B;aACpC,CAAC;QACJ,CAAC;IACH,CAAC;IAED,gDAAgD;IAChD,SAAS,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;QAC9B,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;QAC/B,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC;YACrB,OAAO;gBACL,QAAQ,EAAE,WAAW;gBACrB,MAAM,EAAE,OAAO;gBACf,OAAO,EAAE,8DAA8D;aACxE,CAAC;QACJ,CAAC;QAED,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,GAAG,MAAM,GAAG,CAAC,WAAW,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;QAC5E,OAAO;YACL,QAAQ,EAAE,WAAW;YACrB,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YAC9B,QAAQ,EAAE,QAAQ;YAClB,MAAM,EAAE,SAAS;YACjB,OAAO,EAAE,IAAI;gBACX,CAAC,CAAC,cAAc,SAAS,EAAE;gBAC3B,CAAC,CAAC,4BAA4B,SAAS,EAAE;SAC5C,CAAC;IACJ,CAAC;IAED,UAAU,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;QAC/B,MAAM,IAAI,GAAG,KAA+C,CAAC;QAC7D,IAAI,CAAC,GAAG,CAAC,mBAAmB,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC;YACjD,OAAO;gBACL,QAAQ,EAAE,YAAY;gBACtB,MAAM,EAAE,OAAO;gBACf,OAAO,EAAE,2EAA2E;aACrF,CAAC;QACJ,CAAC;QACD,IAAI,CAAC,GAAG,CAAC,UAAU,EAAE,CAAC;YACpB,OAAO;gBACL,QAAQ,EAAE,YAAY;gBACtB,MAAM,EAAE,OAAO;gBACf,OAAO,EAAE,iEAAiE;aAC3E,CAAC;QACJ,CAAC;QAED,MAAM,MAAM,GAAG,MAAM,GAAG,CAAC,UAAU,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAClD,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,OAAO;gBACL,QAAQ,EAAE,YAAY;gBACtB,MAAM,EAAE,OAAO;gBACf,OAAO,EAAE,wCAAwC,IAAI,CAAC,OAAO,GAAG;aACjE,CAAC;QACJ,CAAC;QAED,8DAA8D;QAC9D,IAAI,KAAa,CAAC;QAClB,IAAI,MAAc,CAAC;QACnB,IAAI,GAAG,CAAC,mBAAmB,EAAE,CAAC;YAC5B,KAAK,GAAG,MAAM,GAAG,CAAC,mBAAmB,CAAC,GAAG,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YACxD,MAAM,GAAG,YAAY,CAAC;QACxB,CAAC;aAAM,CAAC;YACN,KAAK,GAAG,MAAM,GAAG,CAAC,WAAY,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC5D,MAAM,GAAG,KAAK,CAAC;QACjB,CAAC;QAED,MAAM,IAAI,GAAG,KAAK,IAAI,IAAI,CAAC,SAAS,CAAC;QACrC,OAAO;YACL,QAAQ,EAAE,YAAY;YACtB,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YAC9B,QAAQ,EAAE,iBAAiB,IAAI,CAAC,SAAS,EAAE;YAC3C,MAAM,EAAE,gBAAgB,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,MAAM,GAAG;YACtD,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,cAAc,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,oBAAoB,IAAI,CAAC
,SAAS,EAAE;SAC/F,CAAC;IACJ,CAAC;IAED,SAAS,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;QAC9B,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;QAC/B,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC;YACrB,OAAO;gBACL,QAAQ,EAAE,WAAW;gBACrB,MAAM,EAAE,OAAO;gBACf,OAAO,EAAE,8DAA8D;aACxE,CAAC;QACJ,CAAC;QAED,MAAM,QAAQ,GAAG,yBAAyB,QAAQ,qBAAqB,CAAC;QACxE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,GAAG,MAAM,GAAG,CAAC,WAAW,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;QAC5E,OAAO;YACL,QAAQ,EAAE,WAAW;YACrB,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YAC9B,QAAQ;YACR,MAAM,EAAE,SAAS;YACjB,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,sCAAsC,QAAQ,KAAK,SAAS,EAAE;SAC3F,CAAC;IACJ,CAAC;IAED,qDAAqD;IACrD,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;QAC5B,MAAM,IAAI,GAAG,KAAwB,CAAC;QACtC,MAAM,MAAM,GAAG,GAAG,CAAC,WAAW,EAAE,SAAS,CAAC;QAC1C,IAAI,MAAM,IAAI,IAAI,EAAE,CAAC;YACnB,OAAO;gBACL,QAAQ,EAAE,SAAS;gBACnB,MAAM,EAAE,OAAO;gBACf,OAAO,EAAE,2BAA2B;aACrC,CAAC;QACJ,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,IAAI,IAAI,CAAC,GAAG,CAAC;QAChC,OAAO;YACL,QAAQ,EAAE,SAAS;YACnB,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YAC9B,QAAQ,EAAE,MAAM,IAAI,CAAC,GAAG,IAAI;YAC5B,MAAM,EAAE,GAAG,MAAM,IAAI;YACrB,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,WAAW,MAAM,kBAAkB,IAAI,CAAC,GAAG,IAAI;SAC5E,CAAC;IACJ,CAAC;IAED,WAAW,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;QAChC,MAAM,IAAI,GAAG,KAAuC,CAAC;QACrD,MAAM,MAAM,GAAG,GAAG,CAAC,WAAW,EAAE,MAAM,EAAE,UAAU,CAAC;QACnD,IAAI,MAAM,IAAI,IAAI,EAAE,CAAC;YACnB,OAAO;gBACL,QAAQ,EAAE,aAAa;gBACvB,MAAM,EAAE,OAAO;gBACf,OAAO,EAAE,+BAA+B;aACzC,CAAC;QACJ,CAAC;QAED,IAAI,IAAI,GAAG,IAAI,CAAC;QAChB,MAAM,QAAQ,GAAa,EAAE,CAAC;QAC9B,IAAI,IAAI,CAAC,GAAG,IAAI,IAAI,IAAI,MAAM,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAC1C,IAAI,GAAG,KAAK,CAAC;YACb,QAAQ,CAAC,IAAI,CAAC,UAAU,MAAM,UAAU,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;QACtD,CAAC;QACD,IAAI,IAAI,CAAC,GAAG,IAAI,IAAI,IAAI,MAAM,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAC1C,IAAI,GAAG,KAAK,CAAC;YACb,QAAQ,CAAC,IAAI,CAAC,UAAU,MAAM,UAAU,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;QACtD,CAAC;QAED,OAAO;YACL,QAAQ,EAAE,aAAa;YACvB,MAAM
,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YAC9B,QAAQ,EAAE,GAAG,IAAI,CAAC,GAAG,IAAI,GAAG,KAAK,IAAI,CAAC,GAAG,IAAI,GAAG,SAAS;YACzD,MAAM,EAAE,GAAG,MAAM,SAAS;YAC1B,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC;SAChD,CAAC;IACJ,CAAC;IAED,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE;QACzB,MAAM,IAAI,GAAG,KAAwB,CAAC;QACtC,MAAM,MAAM,GAAG,GAAG,CAAC,WAAW,EAAE,OAAO,CAAC;QACxC,IAAI,MAAM,IAAI,IAAI,EAAE,CAAC;YACnB,OAAO;gBACL,QAAQ,EAAE,MAAM;gBAChB,MAAM,EAAE,OAAO;gBACf,OAAO,EAAE,wBAAwB;aAClC,CAAC;QACJ,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,IAAI,IAAI,CAAC,GAAG,CAAC;QAChC,OAAO;YACL,QAAQ,EAAE,MAAM;YAChB,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;YAC9B,QAAQ,EAAE,OAAO,IAAI,CAAC,GAAG,EAAE;YAC3B,MAAM,EAAE,IAAI,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE;YAC/B,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,iBAAiB,IAAI,CAAC,GAAG,EAAE;SAClF,CAAC;IACJ,CAAC;CACF,CAAC;AAEF,gFAAgF;AAChF,UAAU;AACV,gFAAgF;AAEhF,SAAS,QAAQ,CAAC,IAAY,EAAE,MAAc;IAC5C,IAAI,IAAI,CAAC,MAAM,IAAI,MAAM;QAAE,OAAO,IAAI,CAAC;IACvC,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,GAAG,KAAK,CAAC;AACvC,CAAC"}
@@ -0,0 +1,42 @@
1
+ /**
2
+ * @fileoverview Dataset operations for eval system
3
+ *
4
+ * Handles loading datasets from .dset files, resolving parameter sets,
5
+ * and recording golden responses.
6
+ */
7
+ import type { EvalDataset, LLMResponse } from './types.js';
/**
 * Manages datasets within a prompt's eval/datasets/ directory.
 */
export declare class DatasetManager {
    private promptDir;
    private cache;
    /**
     * @param promptDir - Path to the prompt directory (contains eval/datasets/).
     */
    constructor(promptDir: string);
    /**
     * Load a dataset by name from eval/datasets/{name}.dset.
     *
     * @param name - Dataset name, without the `.dset` extension.
     */
    load(name: string): Promise<EvalDataset>;
    /**
     * Get a specific parameter set from a dataset.
     *
     * @param datasetName - Dataset to read.
     * @param paramsName - Name of the parameter set inside that dataset.
     */
    getParams(datasetName: string, paramsName: string): Promise<Record<string, unknown>>;
    /**
     * Get the golden response from a dataset.
     *
     * @param datasetName - Dataset to read.
     * @returns The golden response text, or `undefined` when there is none.
     */
    getGolden(datasetName: string): Promise<string | undefined>;
    /**
     * Record a golden response into a dataset, creating the dataset if it
     * doesn't exist.
     *
     * @param datasetName - Dataset to write to.
     * @param response - Response text to store as the golden response.
     * @param llmResponse - LLM response associated with `response`.
     */
    recordGolden(datasetName: string, response: string, llmResponse: LLMResponse): Promise<void>;
    /**
     * Save a dataset to disk.
     *
     * @param name - Dataset name.
     * @param dataset - Dataset contents to write.
     */
    save(name: string, dataset: EvalDataset): Promise<void>;
    private getDatasetPath;
}
42
+ //# sourceMappingURL=dataset.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dataset.d.ts","sourceRoot":"","sources":["../../src/eval/dataset.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAMH,OAAO,KAAK,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAM3D;;GAEG;AACH,qBAAa,cAAc;IAIvB,6DAA6D;IAC7D,OAAO,CAAC,SAAS;IAJnB,OAAO,CAAC,KAAK,CAAkC;;IAG7C,6DAA6D;IACrD,SAAS,EAAE,MAAM;IAG3B;;;OAGG;IACG,IAAI,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;IAU9C;;OAEG;IACG,SAAS,CACb,WAAW,EAAE,MAAM,EACnB,UAAU,EAAE,MAAM,GACjB,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAgBnC;;OAEG;IACG,SAAS,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,GAAG,SAAS,CAAC;IAKjE;;;OAGG;IACG,YAAY,CAChB,WAAW,EAAE,MAAM,EACnB,QAAQ,EAAE,MAAM,EAChB,WAAW,EAAE,WAAW,GACvB,OAAO,CAAC,IAAI,CAAC;IA6BhB;;OAEG;IACG,IAAI,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC;IAM7D,OAAO,CAAC,cAAc;CAGvB"}
@@ -0,0 +1,101 @@
1
+ /**
2
+ * @fileoverview Dataset operations for eval system
3
+ *
4
+ * Handles loading datasets from .dset files, resolving parameter sets,
5
+ * and recording golden responses.
6
+ */
7
import { mkdir, writeFile } from 'fs/promises';
import { dirname, join, resolve } from 'path';
import { stringify as stringifyYaml } from 'yaml';
import { loadDatasetFile } from './loader.js';
11
+ // =============================================================================
12
+ // DATASET MANAGER
13
+ // =============================================================================
14
+ /**
15
+ * Manages datasets within a prompt's eval/datasets/ directory.
16
+ */
17
+ export class DatasetManager {
18
+ promptDir;
19
+ cache = new Map();
20
+ constructor(
21
+ /** Path to the prompt directory (contains eval/datasets/) */
22
+ promptDir) {
23
+ this.promptDir = promptDir;
24
+ }
25
+ /**
26
+ * Load a dataset by name.
27
+ * Looks for eval/datasets/{name}.dset
28
+ */
29
+ async load(name) {
30
+ const cached = this.cache.get(name);
31
+ if (cached)
32
+ return cached;
33
+ const filePath = this.getDatasetPath(name);
34
+ const dataset = await loadDatasetFile(filePath);
35
+ this.cache.set(name, dataset);
36
+ return dataset;
37
+ }
38
+ /**
39
+ * Get a specific parameter set from a dataset.
40
+ */
41
+ async getParams(datasetName, paramsName) {
42
+ const dataset = await this.load(datasetName);
43
+ const paramSet = dataset.parameters.find((p) => p.name === paramsName);
44
+ if (!paramSet) {
45
+ throw new Error(`Parameter set "${paramsName}" not found in dataset "${datasetName}". ` +
46
+ `Available: ${dataset.parameters.map((p) => p.name).join(', ')}`);
47
+ }
48
+ // Return all fields except 'name' as the variable context
49
+ const { name: _name, ...vars } = paramSet;
50
+ return vars;
51
+ }
52
+ /**
53
+ * Get the golden response from a dataset.
54
+ */
55
+ async getGolden(datasetName) {
56
+ const dataset = await this.load(datasetName);
57
+ return dataset.golden?.response;
58
+ }
59
+ /**
60
+ * Record a golden response into a dataset.
61
+ * Creates the dataset if it doesn't exist.
62
+ */
63
+ async recordGolden(datasetName, response, llmResponse) {
64
+ let dataset;
65
+ try {
66
+ dataset = await this.load(datasetName);
67
+ }
68
+ catch (_err) {
69
+ // Dataset doesn't exist — create a minimal one
70
+ dataset = {
71
+ name: datasetName,
72
+ parameters: [],
73
+ };
74
+ }
75
+ // Update golden
76
+ dataset.golden = {
77
+ response,
78
+ model: llmResponse.model,
79
+ recorded_at: new Date().toISOString(),
80
+ metadata: {
81
+ tokens: llmResponse.tokens?.total,
82
+ latency_ms: llmResponse.latencyMs,
83
+ },
84
+ };
85
+ // Write back
86
+ await this.save(datasetName, dataset);
87
+ this.cache.set(datasetName, dataset);
88
+ }
89
+ /**
90
+ * Save a dataset to disk.
91
+ */
92
+ async save(name, dataset) {
93
+ const filePath = this.getDatasetPath(name);
94
+ const content = stringifyYaml(dataset, { lineWidth: 120 });
95
+ await writeFile(filePath, content, 'utf-8');
96
+ }
97
+ getDatasetPath(name) {
98
+ return resolve(join(this.promptDir, 'eval', 'datasets', `${name}.dset`));
99
+ }
100
+ }
101
+ //# sourceMappingURL=dataset.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dataset.js","sourceRoot":"","sources":["../../src/eval/dataset.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AACrC,OAAO,EAAE,SAAS,IAAI,aAAa,EAAE,MAAM,MAAM,CAAC;AAClD,OAAO,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAG9C,gFAAgF;AAChF,kBAAkB;AAClB,gFAAgF;AAEhF;;GAEG;AACH,MAAM,OAAO,cAAc;IAKf;IAJF,KAAK,GAAG,IAAI,GAAG,EAAuB,CAAC;IAE/C;IACE,6DAA6D;IACrD,SAAiB;QAAjB,cAAS,GAAT,SAAS,CAAQ;IACxB,CAAC;IAEJ;;;OAGG;IACH,KAAK,CAAC,IAAI,CAAC,IAAY;QACrB,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACpC,IAAI,MAAM;YAAE,OAAO,MAAM,CAAC;QAE1B,MAAM,QAAQ,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;QAC3C,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,QAAQ,CAAC,CAAC;QAChD,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QAC9B,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,SAAS,CACb,WAAmB,EACnB,UAAkB;QAElB,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QAC7C,MAAM,QAAQ,GAAG,OAAO,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,UAAU,CAAC,CAAC;QAEvE,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,MAAM,IAAI,KAAK,CACb,kBAAkB,UAAU,2BAA2B,WAAW,KAAK;gBACrE,cAAc,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CACnE,CAAC;QACJ,CAAC;QAED,0DAA0D;QAC1D,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,IAAI,EAAE,GAAG,QAAQ,CAAC;QAC1C,OAAO,IAAI,CAAC;IACd,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,SAAS,CAAC,WAAmB;QACjC,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QAC7C,OAAO,OAAO,CAAC,MAAM,EAAE,QAAQ,CAAC;IAClC,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,YAAY,CAChB,WAAmB,EACnB,QAAgB,EAChB,WAAwB;QAExB,IAAI,OAAoB,CAAC;QAEzB,IAAI,CAAC;YACH,OAAO,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACzC,CAAC;QAAC,OAAO,IAAI,EAAE,CAAC;YACd,+CAA+C;YAC/C,OAAO,GAAG;gBACR,IAAI,EAAE,WAAW;gBACjB,UAAU,EAAE,EAAE;aACf,CAAC;QACJ,CAAC;QAED,gBAAgB;QAChB,OAAO,CAAC,MAAM,GAAG;YACf,QAAQ;YACR,KAAK,EAAE,WAAW,CAAC,KAAK;YACxB,WAAW,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACrC,QAAQ,EAAE;gBACR,MAAM,EAAE,WAAW,CAAC
,MAAM,EAAE,KAAK;gBACjC,UAAU,EAAE,WAAW,CAAC,SAAS;aAClC;SACF,CAAC;QAEF,aAAa;QACb,MAAM,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;QACtC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;IACvC,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI,CAAC,IAAY,EAAE,OAAoB;QAC3C,MAAM,QAAQ,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;QAC3C,MAAM,OAAO,GAAG,aAAa,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,GAAG,EAAE,CAAC,CAAC;QAC3D,MAAM,SAAS,CAAC,QAAQ,EAAE,OAAO,EAAE,OAAO,CAAC,CAAC;IAC9C,CAAC;IAEO,cAAc,CAAC,IAAY;QACjC,OAAO,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,MAAM,EAAE,UAAU,EAAE,GAAG,IAAI,OAAO,CAAC,CAAC,CAAC;IAC3E,CAAC;CACF"}
@@ -0,0 +1,14 @@
1
+ /**
2
+ * @fileoverview Echo PDK Eval Module - Public API
3
+ *
4
+ * Exports the eval runner, loader, assertions, dataset manager,
5
+ * and reporter for prompt evaluation and testing.
6
+ */
7
// Types
export type {
    EvalSuite,
    EvalSuiteConfig,
    EvalTest,
    Assertion,
    AssertionOperator,
    EvalDataset,
    EvalGolden,
    EvalParameterSet,
    EvalSuiteResult,
    EvalTestResult,
    AssertionResult,
    EvalSummary,
    EvalStatus,
    EvalRunnerConfig,
    LLMProvider,
    LLMResponse,
} from './types.js';
// Loader
export {
    loadEvalFile,
    parseEvalContent,
    loadDatasetFile,
    parseDatasetContent,
    EvalLoadError,
} from './loader.js';
// Assertions
export { runAssertion, runAssertions } from './assertions.js';
export type { AssertionContext } from './assertions.js';
// Dataset
export { DatasetManager } from './dataset.js';
// Runner
export { runEvalFile, runEvalSuite } from './runner.js';
// Reporter
export {
    formatConsole,
    formatJson,
    formatJunit,
    formatResults,
} from './reporter.js';
14
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,YAAY,EACV,SAAS,EACT,eAAe,EACf,QAAQ,EACR,SAAS,EACT,iBAAiB,EACjB,WAAW,EACX,UAAU,EACV,gBAAgB,EAChB,eAAe,EACf,cAAc,EACd,eAAe,EACf,WAAW,EACX,UAAU,EACV,gBAAgB,EAChB,WAAW,EACX,WAAW,GACZ,MAAM,YAAY,CAAC;AAGpB,OAAO,EACL,YAAY,EACZ,gBAAgB,EAChB,eAAe,EACf,mBAAmB,EACnB,aAAa,GACd,MAAM,aAAa,CAAC;AAGrB,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAC9D,YAAY,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC;AAGxD,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAG9C,OAAO,EAAE,WAAW,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAGxD,OAAO,EACL,aAAa,EACb,UAAU,EACV,WAAW,EACX,aAAa,GACd,MAAM,eAAe,CAAC"}
@@ -0,0 +1,17 @@
1
+ /**
2
+ * @fileoverview Echo PDK Eval Module - Public API
3
+ *
4
+ * Exports the eval runner, loader, assertions, dataset manager,
5
+ * and reporter for prompt evaluation and testing.
6
+ */
7
// Loader: .eval / .dset file loading and parsing
export {
    loadEvalFile,
    parseEvalContent,
    loadDatasetFile,
    parseDatasetContent,
    EvalLoadError,
} from './loader.js';
// Assertions
export {
    runAssertion,
    runAssertions,
} from './assertions.js';
// Dataset
export { DatasetManager } from './dataset.js';
// Runner
export {
    runEvalFile,
    runEvalSuite,
} from './runner.js';
// Reporter
export {
    formatConsole,
    formatJson,
    formatJunit,
    formatResults,
} from './reporter.js';
17
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAsBH,SAAS;AACT,OAAO,EACL,YAAY,EACZ,gBAAgB,EAChB,eAAe,EACf,mBAAmB,EACnB,aAAa,GACd,MAAM,aAAa,CAAC;AAErB,aAAa;AACb,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAG9D,UAAU;AACV,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAE9C,SAAS;AACT,OAAO,EAAE,WAAW,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAExD,WAAW;AACX,OAAO,EACL,aAAa,EACb,UAAU,EACV,WAAW,EACX,aAAa,GACd,MAAM,eAAe,CAAC"}
@@ -0,0 +1,30 @@
1
+ /**
2
+ * @fileoverview Loader for .eval and .dset YAML files
3
+ *
4
+ * Parses and validates eval suite definitions and dataset files.
5
+ */
6
+ import type { EvalSuite, EvalDataset } from './types.js';
7
+ /**
8
+ * Load and validate an .eval file from disk.
+ *
+ * @param filePath - Path of the .eval file to read.
+ * @returns A promise for the eval suite described by the file.
+ * NOTE(review): failures are presumably reported as EvalLoadError; the
+ * implementation is not visible from this declaration, so confirm.
9
+ */
10
+ export declare function loadEvalFile(filePath: string): Promise<EvalSuite>;
11
+ /**
12
+ * Parse .eval YAML content into an EvalSuite.
+ *
+ * @param content - YAML text of an .eval file.
+ * @param source - Optional origin of the content. NOTE(review): presumably a
+ * file path or label used in error reporting; confirm against the implementation.
+ * @returns The eval suite described by the content.
13
+ */
14
+ export declare function parseEvalContent(content: string, source?: string): EvalSuite;
15
+ /**
16
+ * Load and validate a .dset file from disk.
+ *
+ * @param filePath - Path of the .dset file to read.
+ * @returns A promise for the dataset described by the file.
+ * NOTE(review): failures are presumably reported as EvalLoadError; the
+ * implementation is not visible from this declaration, so confirm.
17
+ */
18
+ export declare function loadDatasetFile(filePath: string): Promise<EvalDataset>;
19
+ /**
20
+ * Parse .dset YAML content into an EvalDataset.
+ *
+ * @param content - YAML text of a .dset file.
+ * @param source - Optional origin of the content. NOTE(review): presumably a
+ * file path or label used in error reporting; confirm against the implementation.
+ * @returns The dataset described by the content.
21
+ */
22
+ export declare function parseDatasetContent(content: string, source?: string): EvalDataset;
23
+ /**
24
+ * Error thrown when loading/parsing eval or dataset files.
25
+ */
26
+ export declare class EvalLoadError extends Error {
27
+ /**
+ * Optional origin associated with the error. NOTE(review): presumably the
+ * `source` constructor argument (a file path or label) — confirm against the implementation.
+ */
+ source?: string | undefined;
28
+ /**
+ * @param message - Human-readable description of the failure.
+ * @param source - Optional origin of the content that failed to load or parse.
+ */
+ constructor(message: string, source?: string | undefined);
29
+ }
30
+ //# sourceMappingURL=loader.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"loader.d.ts","sourceRoot":"","sources":["../../src/eval/loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAIH,OAAO,KAAK,EACV,SAAS,EAIT,WAAW,EAGZ,MAAM,YAAY,CAAC;AAMpB;;GAEG;AACH,wBAAsB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,CAGvE;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,SAAS,CA6B5E;AA6GD;;GAEG;AACH,wBAAsB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAG5E;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,WAAW,CA6CjF;AAMD;;GAEG;AACH,qBAAa,aAAc,SAAQ,KAAK;IAG7B,MAAM,CAAC,EAAE,MAAM;gBADtB,OAAO,EAAE,MAAM,EACR,MAAM,CAAC,EAAE,MAAM,YAAA;CAKzB"}