@goreal-ai/echo-pdk 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/README.md +1 -1
  2. package/dist/ai-judge/index.d.ts +11 -21
  3. package/dist/ai-judge/index.d.ts.map +1 -1
  4. package/dist/ai-judge/index.js +36 -90
  5. package/dist/ai-judge/index.js.map +1 -1
  6. package/dist/embeddings/cosine.d.ts +15 -0
  7. package/dist/embeddings/cosine.d.ts.map +1 -0
  8. package/dist/embeddings/cosine.js +37 -0
  9. package/dist/embeddings/cosine.js.map +1 -0
  10. package/dist/embeddings/index.d.ts +9 -0
  11. package/dist/embeddings/index.d.ts.map +1 -0
  12. package/dist/embeddings/index.js +11 -0
  13. package/dist/embeddings/index.js.map +1 -0
  14. package/dist/embeddings/openai.d.ts +11 -0
  15. package/dist/embeddings/openai.d.ts.map +1 -0
  16. package/dist/embeddings/openai.js +38 -0
  17. package/dist/embeddings/openai.js.map +1 -0
  18. package/dist/embeddings/registry.d.ts +13 -0
  19. package/dist/embeddings/registry.d.ts.map +1 -0
  20. package/dist/embeddings/registry.js +29 -0
  21. package/dist/embeddings/registry.js.map +1 -0
  22. package/dist/embeddings/types.d.ts +35 -0
  23. package/dist/embeddings/types.d.ts.map +1 -0
  24. package/dist/embeddings/types.js +8 -0
  25. package/dist/embeddings/types.js.map +1 -0
  26. package/dist/embeddings/voyage.d.ts +12 -0
  27. package/dist/embeddings/voyage.d.ts.map +1 -0
  28. package/dist/embeddings/voyage.js +39 -0
  29. package/dist/embeddings/voyage.js.map +1 -0
  30. package/dist/eval/assertions.d.ts +35 -0
  31. package/dist/eval/assertions.d.ts.map +1 -0
  32. package/dist/eval/assertions.js +349 -0
  33. package/dist/eval/assertions.js.map +1 -0
  34. package/dist/eval/dataset.d.ts +42 -0
  35. package/dist/eval/dataset.d.ts.map +1 -0
  36. package/dist/eval/dataset.js +101 -0
  37. package/dist/eval/dataset.js.map +1 -0
  38. package/dist/eval/index.d.ts +14 -0
  39. package/dist/eval/index.d.ts.map +1 -0
  40. package/dist/eval/index.js +17 -0
  41. package/dist/eval/index.js.map +1 -0
  42. package/dist/eval/loader.d.ts +30 -0
  43. package/dist/eval/loader.d.ts.map +1 -0
  44. package/dist/eval/loader.js +170 -0
  45. package/dist/eval/loader.js.map +1 -0
  46. package/dist/eval/reporter.d.ts +26 -0
  47. package/dist/eval/reporter.d.ts.map +1 -0
  48. package/dist/eval/reporter.js +164 -0
  49. package/dist/eval/reporter.js.map +1 -0
  50. package/dist/eval/runner.d.ts +28 -0
  51. package/dist/eval/runner.d.ts.map +1 -0
  52. package/dist/eval/runner.js +232 -0
  53. package/dist/eval/runner.js.map +1 -0
  54. package/dist/eval/types.d.ts +257 -0
  55. package/dist/eval/types.d.ts.map +1 -0
  56. package/dist/eval/types.js +11 -0
  57. package/dist/eval/types.js.map +1 -0
  58. package/dist/evaluator/evaluator.d.ts +2 -2
  59. package/dist/evaluator/evaluator.js +5 -5
  60. package/dist/evaluator/evaluator.js.map +1 -1
  61. package/dist/evaluator/index.d.ts +1 -1
  62. package/dist/evaluator/index.d.ts.map +1 -1
  63. package/dist/evaluator/index.js +1 -1
  64. package/dist/evaluator/index.js.map +1 -1
  65. package/dist/evaluator/operators.d.ts +9 -5
  66. package/dist/evaluator/operators.d.ts.map +1 -1
  67. package/dist/evaluator/operators.js +26 -33
  68. package/dist/evaluator/operators.js.map +1 -1
  69. package/dist/index.d.ts +8 -2
  70. package/dist/index.d.ts.map +1 -1
  71. package/dist/index.js +38 -20
  72. package/dist/index.js.map +1 -1
  73. package/dist/parser/ast.js +1 -1
  74. package/dist/parser/ast.js.map +1 -1
  75. package/dist/parser/lexer.d.ts +1 -1
  76. package/dist/parser/lexer.js +1 -1
  77. package/dist/project/index.d.ts.map +1 -1
  78. package/dist/project/index.js +10 -3
  79. package/dist/project/index.js.map +1 -1
  80. package/dist/project/types.d.ts +19 -4
  81. package/dist/project/types.d.ts.map +1 -1
  82. package/dist/project/types.js +3 -0
  83. package/dist/project/types.js.map +1 -1
  84. package/dist/providers/anthropic.d.ts +18 -0
  85. package/dist/providers/anthropic.d.ts.map +1 -0
  86. package/dist/providers/anthropic.js +123 -0
  87. package/dist/providers/anthropic.js.map +1 -0
  88. package/dist/providers/base.d.ts +45 -0
  89. package/dist/providers/base.d.ts.map +1 -0
  90. package/dist/providers/base.js +107 -0
  91. package/dist/providers/base.js.map +1 -0
  92. package/dist/providers/index.d.ts +14 -0
  93. package/dist/providers/index.d.ts.map +1 -0
  94. package/dist/providers/index.js +16 -0
  95. package/dist/providers/index.js.map +1 -0
  96. package/dist/providers/openai.d.ts +18 -0
  97. package/dist/providers/openai.d.ts.map +1 -0
  98. package/dist/providers/openai.js +106 -0
  99. package/dist/providers/openai.js.map +1 -0
  100. package/dist/providers/registry.d.ts +80 -0
  101. package/dist/providers/registry.d.ts.map +1 -0
  102. package/dist/providers/registry.js +118 -0
  103. package/dist/providers/registry.js.map +1 -0
  104. package/dist/providers/run-prompt.d.ts +69 -0
  105. package/dist/providers/run-prompt.d.ts.map +1 -0
  106. package/dist/providers/run-prompt.js +79 -0
  107. package/dist/providers/run-prompt.js.map +1 -0
  108. package/dist/providers/types.d.ts +123 -0
  109. package/dist/providers/types.d.ts.map +1 -0
  110. package/dist/providers/types.js +9 -0
  111. package/dist/providers/types.js.map +1 -0
  112. package/dist/types.d.ts +5 -5
  113. package/dist/types.d.ts.map +1 -1
  114. package/package.json +10 -7
  115. package/scripts/bundle-for-graaljs.mjs +45 -0
@@ -0,0 +1,170 @@
1
+ /**
2
+ * @fileoverview Loader for .eval and .dset YAML files
3
+ *
4
+ * Parses and validates eval suite definitions and dataset files.
5
+ */
6
+ import { readFile } from 'fs/promises';
7
+ import { parse as parseYaml } from 'yaml';
8
+ // =============================================================================
9
+ // EVAL FILE LOADER
10
+ // =============================================================================
11
/**
 * Read an .eval file from disk and parse it into an eval suite.
 *
 * @param {string} filePath - Path of the .eval file. It is also handed to the
 *   parser as the `source` used in error messages.
 * @returns {Promise<object>} The value produced by parseEvalContent for the file's text.
 */
export async function loadEvalFile(filePath) {
    return parseEvalContent(await readFile(filePath, 'utf-8'), filePath);
}
18
/**
 * Turn the YAML text of an .eval file into an eval suite object.
 *
 * @param {string} content - Raw YAML text.
 * @param {string} [source] - Origin of the text (e.g. a file path); attached to any EvalLoadError.
 * @returns {{suite: string, config: object, tests: object[]}} Suite name, parsed config and parsed tests.
 * @throws {EvalLoadError} If the document is not an object, `suite` is not a string,
 *   or `tests` is not a non-empty array.
 */
export function parseEvalContent(content, source) {
    const doc = parseYaml(content);
    const isObject = Boolean(doc) && typeof doc === 'object';
    if (!isObject) {
        throw new EvalLoadError('Invalid .eval file: expected YAML object', source);
    }

    // Required fields
    const { suite, tests: rawTests, config: rawConfig } = doc;
    if (typeof suite !== 'string') {
        throw new EvalLoadError('Missing required field: suite', source);
    }
    const hasTests = Array.isArray(rawTests) && rawTests.length > 0;
    if (!hasTests) {
        throw new EvalLoadError('Missing or empty required field: tests', source);
    }

    // Config is parsed before the tests, so a config problem surfaces first.
    const config = parseConfig(rawConfig, source);
    const tests = [];
    for (const [position, entry] of rawTests.entries()) {
        tests.push(parseTest(entry, position, source));
    }

    return { suite, config, tests };
}
43
/**
 * Normalise the optional `config` section of an .eval file.
 *
 * Missing or non-object input yields only the default target. Otherwise each
 * field is kept when it has the expected primitive type: a non-string target
 * falls back to 'prompt.pdk', and a non-string model or non-number timeout
 * becomes undefined.
 *
 * @param {unknown} raw - The `config` value as parsed from YAML.
 * @param {string} [_source] - Not used by this function.
 * @returns {{target: string, model?: string, timeout?: number}} The normalised config.
 */
function parseConfig(raw, _source) {
    const defaultTarget = 'prompt.pdk';
    if (raw === null || typeof raw !== 'object') {
        return { target: defaultTarget };
    }

    const { target, model, timeout } = raw;
    const config = { target: defaultTarget, model: undefined, timeout: undefined };
    if (typeof target === 'string') {
        config.target = target;
    }
    if (typeof model === 'string') {
        config.model = model;
    }
    if (typeof timeout === 'number') {
        config.timeout = timeout;
    }
    return config;
}
53
/**
 * Validate one entry of an .eval file's `tests` list and build the test definition.
 *
 * `name` is required. `given` (object), `dataset` (string) and `params`
 * (string) are copied when they have the expected type. `expect_render` and
 * `expect_llm` are parsed with parseAssertion when they are arrays; at least
 * one of the two must be present.
 *
 * @param {unknown} raw - The test entry as parsed from YAML.
 * @param {number} index - Position of the entry in the `tests` list (used in error messages).
 * @param {string} [source] - Origin of the document; attached to any EvalLoadError.
 * @returns {object} The parsed test definition.
 * @throws {EvalLoadError} If the entry is not an object, has no string `name`,
 *   or declares neither `expect_render` nor `expect_llm`.
 */
function parseTest(raw, index, source) {
    // A YAML list entry can be null or a scalar (e.g. a stray "-"). Report that
    // as a load error with the source attached, rather than letting the
    // property access below fail with a bare TypeError.
    if (raw === null || typeof raw !== 'object') {
        throw new EvalLoadError(`Test at index ${index} must be a YAML object`, source);
    }
    if (typeof raw.name !== 'string') {
        throw new EvalLoadError(`Test at index ${index} missing required field: name`, source);
    }
    const test = {
        name: raw.name,
    };
    if (raw.given && typeof raw.given === 'object') {
        test.given = raw.given;
    }
    if (typeof raw.dataset === 'string') {
        test.dataset = raw.dataset;
    }
    if (typeof raw.params === 'string') {
        test.params = raw.params;
    }
    if (Array.isArray(raw.expect_render)) {
        test.expect_render = raw.expect_render.map((a, i) => parseAssertion(a, i, raw.name, source));
    }
    if (Array.isArray(raw.expect_llm)) {
        test.expect_llm = raw.expect_llm.map((a, i) => parseAssertion(a, i, raw.name, source));
    }
    if (!test.expect_render && !test.expect_llm) {
        throw new EvalLoadError(`Test "${test.name}" has no assertions (need expect_render or expect_llm)`, source);
    }
    return test;
}
80
// Operators accepted in expect_render / expect_llm lists. Built once rather
// than on every parseAssertion call.
const KNOWN_ASSERTION_OPERATORS = new Set([
    'contains', 'not_contains', 'equals', 'matches',
    'starts_with', 'ends_with', 'length', 'word_count',
    'json_valid', 'json_schema', 'llm_judge', 'similar_to',
    'sentiment', 'latency', 'token_count', 'cost',
]);

/**
 * Parse a single assertion from YAML.
 * Each assertion is a single-key object; the first key is taken as the operator
 * and its value is passed through untouched.
 *
 * @param {unknown} raw - The assertion entry as parsed from YAML.
 * @param {number} index - Position of the assertion in its list (used in error messages).
 * @param {string} testName - Name of the owning test (used in error messages).
 * @param {string} [source] - Origin of the document; attached to any EvalLoadError.
 * @returns {object} `{ [operator]: value }`.
 * @throws {EvalLoadError} If the entry is not a plain object, is empty, or
 *   names an operator that is not in the known set.
 */
function parseAssertion(raw, index, testName, source) {
    // Guard against null, scalars and lists. Object.keys(null) throws a
    // TypeError, and Object.keys('json_valid') yields character indices, which
    // would surface as the misleading 'Unknown assertion operator "0"'.
    if (raw === null || typeof raw !== 'object' || Array.isArray(raw)) {
        throw new EvalLoadError(`Assertion at index ${index} in test "${testName}" must be a single-key object`, source);
    }
    const keys = Object.keys(raw);
    if (keys.length === 0) {
        throw new EvalLoadError(`Empty assertion at index ${index} in test "${testName}"`, source);
    }
    // The first key is the operator
    const operator = keys[0];
    const value = raw[operator];
    if (!KNOWN_ASSERTION_OPERATORS.has(operator)) {
        throw new EvalLoadError(`Unknown assertion operator "${operator}" in test "${testName}"`, source);
    }
    // Return as-is — the runner will handle type checking per operator
    return { [operator]: value };
}
105
+ // =============================================================================
106
+ // DATASET FILE LOADER
107
+ // =============================================================================
108
/**
 * Read a .dset file from disk and parse it into a dataset.
 *
 * @param {string} filePath - Path of the .dset file. It is also handed to the
 *   parser as the `source` used in error messages.
 * @returns {Promise<object>} The value produced by parseDatasetContent for the file's text.
 */
export async function loadDatasetFile(filePath) {
    return parseDatasetContent(await readFile(filePath, 'utf-8'), filePath);
}
115
/**
 * Parse .dset YAML content into an EvalDataset.
 *
 * @param {string} content - Raw YAML text.
 * @param {string} [source] - Origin of the text (e.g. a file path); attached to any EvalLoadError.
 * @returns {{name: string, description?: string, golden?: object, parameters: object[]}}
 *   The dataset name, optional description, optional golden record and the
 *   parameter sets (returned as parsed, after checking each has a string `name`).
 * @throws {EvalLoadError} If the document is not an object, `name` is not a
 *   string, `parameters` is not a non-empty array, or a parameter set is not
 *   an object with a string `name`.
 */
export function parseDatasetContent(content, source) {
    const raw = parseYaml(content);
    if (!raw || typeof raw !== 'object') {
        throw new EvalLoadError('Invalid .dset file: expected YAML object', source);
    }
    if (typeof raw.name !== 'string') {
        throw new EvalLoadError('Missing required field: name', source);
    }
    if (!Array.isArray(raw.parameters) || raw.parameters.length === 0) {
        throw new EvalLoadError('Missing or empty required field: parameters', source);
    }
    // Parse golden
    let golden;
    if (raw.golden && typeof raw.golden === 'object') {
        const g = raw.golden;
        golden = {
            // A golden record without a string response is kept with an empty response.
            response: typeof g.response === 'string' ? g.response : '',
            model: typeof g.model === 'string' ? g.model : undefined,
            recorded_at: typeof g.recorded_at === 'string' ? g.recorded_at : undefined,
            metadata: g.metadata && typeof g.metadata === 'object'
                ? g.metadata
                : undefined,
        };
    }
    // Parse parameters
    const parameters = raw.parameters.map((p, i) => {
        // A null or scalar list entry would make `p.name` throw a TypeError;
        // treat it the same as an entry without a name.
        if (p === null || typeof p !== 'object' || typeof p.name !== 'string') {
            throw new EvalLoadError(`Parameter set at index ${i} missing required field: name`, source);
        }
        return p;
    });
    return {
        name: raw.name,
        description: typeof raw.description === 'string' ? raw.description : undefined,
        golden,
        parameters,
    };
}
156
+ // =============================================================================
157
+ // ERROR TYPE
158
+ // =============================================================================
159
/**
 * Raised when an .eval or .dset document cannot be loaded or fails validation.
 *
 * The optional `source` (typically a file path) is stored on the instance and,
 * when truthy, appended to the message as " (in <source>)".
 */
export class EvalLoadError extends Error {
    source;
    constructor(message, source) {
        let fullMessage = message;
        if (source) {
            fullMessage = `${message} (in ${source})`;
        }
        super(fullMessage);
        this.name = 'EvalLoadError';
        this.source = source;
    }
}
170
+ //# sourceMappingURL=loader.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"loader.js","sourceRoot":"","sources":["../../src/eval/loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AACvC,OAAO,EAAE,KAAK,IAAI,SAAS,EAAE,MAAM,MAAM,CAAC;AAW1C,gFAAgF;AAChF,mBAAmB;AACnB,gFAAgF;AAEhF;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,QAAgB;IACjD,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAClD,OAAO,gBAAgB,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC;AAC7C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,gBAAgB,CAAC,OAAe,EAAE,MAAe;IAC/D,MAAM,GAAG,GAAG,SAAS,CAAC,OAAO,CAA4B,CAAC;IAE1D,IAAI,CAAC,GAAG,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;QACpC,MAAM,IAAI,aAAa,CAAC,0CAA0C,EAAE,MAAM,CAAC,CAAC;IAC9E,CAAC;IAED,2BAA2B;IAC3B,IAAI,OAAO,GAAG,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;QAClC,MAAM,IAAI,aAAa,CAAC,+BAA+B,EAAE,MAAM,CAAC,CAAC;IACnE,CAAC;IAED,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxD,MAAM,IAAI,aAAa,CAAC,wCAAwC,EAAE,MAAM,CAAC,CAAC;IAC5E,CAAC;IAED,eAAe;IACf,MAAM,MAAM,GAAG,WAAW,CAAC,GAAG,CAAC,MAA6C,EAAE,MAAM,CAAC,CAAC;IAEtF,cAAc;IACd,MAAM,KAAK,GAAI,GAAG,CAAC,KAAmC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAClE,SAAS,CAAC,CAAC,EAAE,CAAC,EAAE,MAAM,CAAC,CACxB,CAAC;IAEF,OAAO;QACL,KAAK,EAAE,GAAG,CAAC,KAAe;QAC1B,MAAM;QACN,KAAK;KACN,CAAC;AACJ,CAAC;AAED,SAAS,WAAW,CAClB,GAAwC,EACxC,OAAgB;IAEhB,IAAI,CAAC,GAAG,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;QACpC,OAAO,EAAE,MAAM,EAAE,YAAY,EAAE,CAAC;IAClC,CAAC;IAED,OAAO;QACL,MAAM,EAAE,OAAO,GAAG,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,YAAY;QAClE,KAAK,EAAE,OAAO,GAAG,CAAC,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;QAC5D,OAAO,EAAE,OAAO,GAAG,CAAC,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS;KACnE,CAAC;AACJ,CAAC;AAED,SAAS,SAAS,CAChB,GAA4B,EAC5B,KAAa,EACb,MAAe;IAEf,IAAI,OAAO,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;QACjC,MAAM,IAAI,aAAa,CAAC,iBAAiB,KAAK,+BAA+B,EAAE,MAAM,CAAC,CAAC;IACzF,CAAC;IAED,MAAM,IAAI,GAAa;QACrB,IAAI,EAAE,GAAG,CAAC,IAAc;KACzB,CAAC;IAEF,IAAI,GAAG,CAAC,KAAK,IAAI,OAAO,GAAG,CAAC,KAAK
,KAAK,QAAQ,EAAE,CAAC;QAC/C,IAAI,CAAC,KAAK,GAAG,GAAG,CAAC,KAAgC,CAAC;IACpD,CAAC;IAED,IAAI,OAAO,GAAG,CAAC,OAAO,KAAK,QAAQ,EAAE,CAAC;QACpC,IAAI,CAAC,OAAO,GAAG,GAAG,CAAC,OAAO,CAAC;IAC7B,CAAC;IAED,IAAI,OAAO,GAAG,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;QACnC,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC;IAC3B,CAAC;IAED,IAAI,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC,EAAE,CAAC;QACrC,IAAI,CAAC,aAAa,GAAG,GAAG,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAClD,cAAc,CAAC,CAA4B,EAAE,CAAC,EAAE,GAAG,CAAC,IAAc,EAAE,MAAM,CAAC,CAC5E,CAAC;IACJ,CAAC;IAED,IAAI,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;QAClC,IAAI,CAAC,UAAU,GAAG,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAC5C,cAAc,CAAC,CAA4B,EAAE,CAAC,EAAE,GAAG,CAAC,IAAc,EAAE,MAAM,CAAC,CAC5E,CAAC;IACJ,CAAC;IAED,IAAI,CAAC,IAAI,CAAC,aAAa,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC;QAC5C,MAAM,IAAI,aAAa,CACrB,SAAS,IAAI,CAAC,IAAI,wDAAwD,EAC1E,MAAM,CACP,CAAC;IACJ,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;;GAGG;AACH,SAAS,cAAc,CACrB,GAA4B,EAC5B,KAAa,EACb,QAAgB,EAChB,MAAe;IAEf,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC9B,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtB,MAAM,IAAI,aAAa,CACrB,4BAA4B,KAAK,aAAa,QAAQ,GAAG,EACzD,MAAM,CACP,CAAC;IACJ,CAAC;IAED,gCAAgC;IAChC,MAAM,QAAQ,GAAG,IAAI,CAAC,CAAC,CAAW,CAAC;IACnC,MAAM,KAAK,GAAG,GAAG,CAAC,QAAQ,CAAC,CAAC;IAE5B,2BAA2B;IAC3B,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC;QAC7B,UAAU,EAAE,cAAc,EAAE,QAAQ,EAAE,SAAS;QAC/C,aAAa,EAAE,WAAW,EAAE,QAAQ,EAAE,YAAY;QAClD,YAAY,EAAE,aAAa,EAAE,WAAW,EAAE,YAAY;QACtD,WAAW,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM;KAC9C,CAAC,CAAC;IAEH,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;QAClC,MAAM,IAAI,aAAa,CACrB,+BAA+B,QAAQ,cAAc,QAAQ,GAAG,EAChE,MAAM,CACP,CAAC;IACJ,CAAC;IAED,mEAAmE;IACnE,OAAO,EAAE,CAAC,QAAQ,CAAC,EAAE,KAAK,EAAe,CAAC;AAC5C,CAAC;AAED,gFAAgF;AAChF,sBAAsB;AACtB,gFAAgF;AAEhF;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CAAC,QAAgB;IACpD,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAClD,OAAO,mBAAmB,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC;AAChD,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,mBAAmB,CAAC,OAAe,
EAAE,MAAe;IAClE,MAAM,GAAG,GAAG,SAAS,CAAC,OAAO,CAA4B,CAAC;IAE1D,IAAI,CAAC,GAAG,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;QACpC,MAAM,IAAI,aAAa,CAAC,0CAA0C,EAAE,MAAM,CAAC,CAAC;IAC9E,CAAC;IAED,IAAI,OAAO,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;QACjC,MAAM,IAAI,aAAa,CAAC,8BAA8B,EAAE,MAAM,CAAC,CAAC;IAClE,CAAC;IAED,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,IAAI,GAAG,CAAC,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAClE,MAAM,IAAI,aAAa,CAAC,6CAA6C,EAAE,MAAM,CAAC,CAAC;IACjF,CAAC;IAED,eAAe;IACf,IAAI,MAA8B,CAAC;IACnC,IAAI,GAAG,CAAC,MAAM,IAAI,OAAO,GAAG,CAAC,MAAM,KAAK,QAAQ,EAAE,CAAC;QACjD,MAAM,CAAC,GAAG,GAAG,CAAC,MAAiC,CAAC;QAChD,MAAM,GAAG;YACP,QAAQ,EAAE,OAAO,CAAC,CAAC,QAAQ,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE;YAC1D,KAAK,EAAE,OAAO,CAAC,CAAC,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;YACxD,WAAW,EAAE,OAAO,CAAC,CAAC,WAAW,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,SAAS;YAC1E,QAAQ,EAAE,CAAC,CAAC,QAAQ,IAAI,OAAO,CAAC,CAAC,QAAQ,KAAK,QAAQ;gBACpD,CAAC,CAAC,CAAC,CAAC,QAAmC;gBACvC,CAAC,CAAC,SAAS;SACd,CAAC;IACJ,CAAC;IAED,mBAAmB;IACnB,MAAM,UAAU,GAAwB,GAAG,CAAC,UAAwC,CAAC,GAAG,CACtF,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACP,IAAI,OAAO,CAAC,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC/B,MAAM,IAAI,aAAa,CAAC,0BAA0B,CAAC,+BAA+B,EAAE,MAAM,CAAC,CAAC;QAC9F,CAAC;QACD,OAAO,CAAqB,CAAC;IAC/B,CAAC,CACF,CAAC;IAEF,OAAO;QACL,IAAI,EAAE,GAAG,CAAC,IAAc;QACxB,WAAW,EAAE,OAAO,GAAG,CAAC,WAAW,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC,CAAC,SAAS;QAC9E,MAAM;QACN,UAAU;KACX,CAAC;AACJ,CAAC;AAED,gFAAgF;AAChF,aAAa;AACb,gFAAgF;AAEhF;;GAEG;AACH,MAAM,OAAO,aAAc,SAAQ,KAAK;IAG7B;IAFT,YACE,OAAe,EACR,MAAe;QAEtB,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,OAAO,QAAQ,MAAM,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QAF/C,WAAM,GAAN,MAAM,CAAS;QAGtB,IAAI,CAAC,IAAI,GAAG,eAAe,CAAC;IAC9B,CAAC;CACF"}
@@ -0,0 +1,26 @@
1
+ /**
2
+ * @fileoverview Eval result reporters
3
+ *
4
+ * Formats EvalSuiteResult for different output targets:
5
+ * - Console: colorful human-readable output
6
+ * - JSON: structured data for programmatic use
7
+ * - JUnit XML: CI integration format
8
+ */
9
+ import type { EvalSuiteResult } from './types.js';
10
+ /**
11
+ * Format eval results for console output with colors (ANSI).
12
+ */
13
+ export declare function formatConsole(result: EvalSuiteResult): string;
14
+ /**
15
+ * Format eval results as JSON string.
16
+ */
17
+ export declare function formatJson(result: EvalSuiteResult): string;
18
+ /**
19
+ * Format eval results as JUnit XML for CI integration.
20
+ */
21
+ export declare function formatJunit(result: EvalSuiteResult): string;
22
+ /**
23
+ * Format eval results using the specified reporter.
24
+ */
25
+ export declare function formatResults(result: EvalSuiteResult, reporter?: 'console' | 'json' | 'junit'): string;
26
+ //# sourceMappingURL=reporter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"reporter.d.ts","sourceRoot":"","sources":["../../src/eval/reporter.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAMlD;;GAEG;AACH,wBAAgB,aAAa,CAAC,MAAM,EAAE,eAAe,GAAG,MAAM,CA4E7D;AAMD;;GAEG;AACH,wBAAgB,UAAU,CAAC,MAAM,EAAE,eAAe,GAAG,MAAM,CAE1D;AAMD;;GAEG;AACH,wBAAgB,WAAW,CAAC,MAAM,EAAE,eAAe,GAAG,MAAM,CAuC3D;AAMD;;GAEG;AACH,wBAAgB,aAAa,CAC3B,MAAM,EAAE,eAAe,EACvB,QAAQ,GAAE,SAAS,GAAG,MAAM,GAAG,OAAmB,GACjD,MAAM,CASR"}
@@ -0,0 +1,164 @@
1
+ /**
2
+ * @fileoverview Eval result reporters
3
+ *
4
+ * Formats EvalSuiteResult for different output targets:
5
+ * - Console: colorful human-readable output
6
+ * - JSON: structured data for programmatic use
7
+ * - JUnit XML: CI integration format
8
+ */
9
+ // =============================================================================
10
+ // CONSOLE REPORTER
11
+ // =============================================================================
12
/**
 * Format eval results for console output with colors (ANSI).
 *
 * Output layout: a header with the suite status and name, one entry per test
 * (status icon, name, optional duration, error, rendered prompt, LLM response
 * and non-passing assertions), then a summary line with the counts.
 * Passing assertions are only listed for `llm_judge` and `sentiment`.
 *
 * @param {object} result - Suite result; reads `suiteName`, `status`, `tests` and `summary`.
 * @returns {string} The report as newline-joined lines containing ANSI escape codes.
 */
export function formatConsole(result) {
    const lines = [];
    const { suiteName, tests, summary } = result;
    // Header
    const statusIcon = result.status === 'pass' ? '✓' : result.status === 'fail' ? '✗' : '⚠';
    const statusColor = result.status === 'pass' ? '\x1b[32m' : result.status === 'fail' ? '\x1b[31m' : '\x1b[33m';
    const reset = '\x1b[0m';
    const dim = '\x1b[2m';
    const bold = '\x1b[1m';
    lines.push(`\n${bold}${statusColor}${statusIcon} ${suiteName}${reset}`);
    lines.push(`${dim}${'─'.repeat(60)}${reset}`);
    // Tests
    for (const test of tests) {
        const icon = test.status === 'pass' ? '\x1b[32m✓' : test.status === 'fail' ? '\x1b[31m✗' : '\x1b[33m⚠';
        const duration = test.durationMs != null ? ` ${dim}(${test.durationMs}ms)${reset}` : '';
        lines.push(` ${icon} ${test.name}${reset}${duration}`);
        if (test.error) {
            lines.push(` \x1b[31mError: ${test.error}${reset}`);
        }
        // Show rendered prompt if available
        if (test.renderedOutput) {
            lines.push(` ${dim}Rendered prompt:${reset}`);
            for (const rl of test.renderedOutput.split('\n')) {
                lines.push(` ${dim} ${rl}${reset}`);
            }
        }
        // Show LLM response if available (use !== undefined to handle empty strings)
        if (test.llmResponse !== undefined) {
            lines.push(` ${dim}LLM response:${reset}`);
            if (test.llmResponse) {
                for (const rl of test.llmResponse.split('\n')) {
                    lines.push(` ${dim} ${rl}${reset}`);
                }
            }
            else {
                lines.push(` ${dim} (empty — model may have returned an error)${reset}`);
            }
        }
        // Show assertions
        for (const assertion of test.assertions) {
            if (assertion.status !== 'pass') {
                const aIcon = assertion.status === 'fail' ? '\x1b[31m✗' : '\x1b[33m⚠';
                const msg = assertion.message ?? `${assertion.operator} failed`;
                lines.push(` ${aIcon} ${assertion.operator}: ${msg}${reset}`);
                // Null-check rather than truthiness: 0, '' and false are
                // meaningful expected/actual values and must still be shown.
                if (assertion.expected != null) {
                    lines.push(` ${dim}expected: ${assertion.expected}${reset}`);
                }
                if (assertion.actual != null) {
                    lines.push(` ${dim}actual: ${assertion.actual}${reset}`);
                }
            }
            else if (assertion.operator === 'llm_judge' || assertion.operator === 'sentiment') {
                // Always show reasoning for AI assertions, even on pass
                lines.push(` \x1b[32m✓ ${assertion.operator}:${reset} ${dim}${assertion.message ?? 'passed'}${reset}`);
            }
        }
    }
    // Summary
    lines.push(`${dim}${'─'.repeat(60)}${reset}`);
    const parts = [];
    if (summary.passed > 0) {
        parts.push(`\x1b[32m${summary.passed} passed${reset}`);
    }
    if (summary.failed > 0) {
        parts.push(`\x1b[31m${summary.failed} failed${reset}`);
    }
    if (summary.errored > 0) {
        parts.push(`\x1b[33m${summary.errored} errored${reset}`);
    }
    parts.push(`${summary.total} total`);
    lines.push(` ${parts.join(', ')} ${dim}(${summary.durationMs}ms)${reset}`);
    lines.push('');
    return lines.join('\n');
}
88
+ // =============================================================================
89
+ // JSON REPORTER
90
+ // =============================================================================
91
/**
 * Serialise eval results as pretty-printed JSON (two-space indentation).
 *
 * @param {object} result - The suite result to serialise.
 * @returns {string} The JSON text.
 */
export function formatJson(result) {
    const indentWidth = 2;
    return JSON.stringify(result, null, indentWidth);
}
97
+ // =============================================================================
98
+ // JUNIT XML REPORTER
99
+ // =============================================================================
100
/**
 * Format eval results as JUnit XML for CI integration.
 *
 * Emits one `<testsuite>` with a `<testcase>` per test. Failed tests get a
 * `<failure>` element listing their failed assertions; errored tests get an
 * `<error>` element. All interpolated text, in attributes and in element
 * content, is passed through escapeXml.
 *
 * @param {object} result - Suite result; reads `suiteName`, `tests` and `summary`.
 * @returns {string} The XML document as newline-joined lines.
 */
export function formatJunit(result) {
    const { suiteName, tests, summary } = result;
    const lines = [];
    lines.push('<?xml version="1.0" encoding="UTF-8"?>');
    lines.push(`<testsuite name="${escapeXml(suiteName)}" tests="${summary.total}" ` +
        `failures="${summary.failed}" errors="${summary.errored}" ` +
        `time="${(summary.durationMs / 1000).toFixed(3)}">`);
    for (const test of tests) {
        // Durations are recorded in milliseconds; JUnit expects seconds.
        const time = test.durationMs != null ? ` time="${(test.durationMs / 1000).toFixed(3)}"` : '';
        lines.push(` <testcase name="${escapeXml(test.name)}"${time}>`);
        if (test.status === 'fail') {
            const failedAssertions = test.assertions.filter((a) => a.status === 'fail');
            const message = failedAssertions
                .map((a) => a.message ?? `${a.operator} failed`)
                .join('; ');
            lines.push(` <failure message="${escapeXml(message)}">`);
            for (const a of failedAssertions) {
                // Element content must be escaped too: assertion messages and
                // values routinely contain '<' or '&' (HTML, JSON, prompts),
                // which would otherwise make the document malformed.
                lines.push(escapeXml(` [${a.operator}] ${a.message ?? 'failed'}`));
                // Null-check rather than truthiness so 0, '' and false are reported.
                if (a.expected != null) {
                    lines.push(escapeXml(` expected: ${a.expected}`));
                }
                if (a.actual != null) {
                    lines.push(escapeXml(` actual: ${a.actual}`));
                }
            }
            lines.push(' </failure>');
        }
        if (test.status === 'error') {
            const errorMsg = test.error ?? 'Unknown error';
            lines.push(` <error message="${escapeXml(errorMsg)}">${escapeXml(errorMsg)}</error>`);
        }
        lines.push(' </testcase>');
    }
    lines.push('</testsuite>');
    return lines.join('\n');
}
137
+ // =============================================================================
138
+ // FORMAT SELECTOR
139
+ // =============================================================================
140
/**
 * Format eval results using the specified reporter.
 *
 * @param {object} result - The suite result to format.
 * @param {'console' | 'json' | 'junit'} [reporter='console'] - Output format.
 * @returns {string} The formatted report.
 * @throws {Error} If `reporter` is not one of the supported names. Without
 *   this, an unrecognised value (e.g. a mistyped CLI flag) would fall through
 *   the switch and return `undefined` instead of a string.
 */
export function formatResults(result, reporter = 'console') {
    switch (reporter) {
        case 'console':
            return formatConsole(result);
        case 'json':
            return formatJson(result);
        case 'junit':
            return formatJunit(result);
        default:
            throw new Error(`Unknown reporter "${reporter}" (expected "console", "json" or "junit")`);
    }
}
153
+ // =============================================================================
154
+ // HELPERS
155
+ // =============================================================================
156
/**
 * Replace the five XML special characters (& < > " ') with their predefined
 * entity references.
 *
 * @param {string} text - Text to escape.
 * @returns {string} The escaped text.
 */
function escapeXml(text) {
    const entities = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&apos;',
    };
    // One pass over the input: each special character maps straight to its
    // entity, so ampersands produced by a replacement are never re-escaped.
    return text.replace(/[&<>"']/g, (ch) => entities[ch]);
}
164
+ //# sourceMappingURL=reporter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"reporter.js","sourceRoot":"","sources":["../../src/eval/reporter.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,gFAAgF;AAChF,mBAAmB;AACnB,gFAAgF;AAEhF;;GAEG;AACH,MAAM,UAAU,aAAa,CAAC,MAAuB;IACnD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,MAAM,CAAC;IAE7C,SAAS;IACT,MAAM,UAAU,GAAG,MAAM,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;IACzF,MAAM,WAAW,GAAG,MAAM,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC;IAC/G,MAAM,KAAK,GAAG,SAAS,CAAC;IACxB,MAAM,GAAG,GAAG,SAAS,CAAC;IACtB,MAAM,IAAI,GAAG,SAAS,CAAC;IAEvB,KAAK,CAAC,IAAI,CAAC,KAAK,IAAI,GAAG,WAAW,GAAG,UAAU,IAAI,SAAS,GAAG,KAAK,EAAE,CAAC,CAAC;IACxE,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,CAAC;IAE9C,QAAQ;IACR,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,IAAI,GAAG,IAAI,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,WAAW,CAAC;QACvG,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,IAAI,IAAI,CAAC,CAAC,CAAC,IAAI,GAAG,IAAI,IAAI,CAAC,UAAU,MAAM,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACxF,KAAK,CAAC,IAAI,CAAC,KAAK,IAAI,IAAI,IAAI,CAAC,IAAI,GAAG,KAAK,GAAG,QAAQ,EAAE,CAAC,CAAC;QAExD,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YACf,KAAK,CAAC,IAAI,CAAC,sBAAsB,IAAI,CAAC,KAAK,GAAG,KAAK,EAAE,CAAC,CAAC;QACzD,CAAC;QAED,oCAAoC;QACpC,IAAI,IAAI,CAAC,cAAc,EAAE,CAAC;YACxB,KAAK,CAAC,IAAI,CAAC,OAAO,GAAG,mBAAmB,KAAK,EAAE,CAAC,CAAC;YACjD,MAAM,aAAa,GAAG,IAAI,CAAC,cAAc,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YACtD,KAAK,MAAM,EAAE,IAAI,aAAa,EAAE,CAAC;gBAC/B,KAAK,CAAC,IAAI,CAAC,OAAO,GAAG,KAAK,EAAE,GAAG,KAAK,EAAE,CAAC,CAAC;YAC1C,CAAC;QACH,CAAC;QAED,6EAA6E;QAC7E,IAAI,IAAI,CAAC,WAAW,KAAK,SAAS,EAAE,CAAC;YACnC,KAAK,CAAC,IAAI,CAAC,OAAO,GAAG,gBAAgB,KAAK,EAAE,CAAC,CAAC;YAC9C,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;gBACrB,MAAM,aAAa,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;gBACnD,KAAK,MAAM,EAAE,IAA
I,aAAa,EAAE,CAAC;oBAC/B,KAAK,CAAC,IAAI,CAAC,OAAO,GAAG,KAAK,EAAE,GAAG,KAAK,EAAE,CAAC,CAAC;gBAC1C,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,OAAO,GAAG,+CAA+C,KAAK,EAAE,CAAC,CAAC;YAC/E,CAAC;QACH,CAAC;QAED,kBAAkB;QAClB,KAAK,MAAM,SAAS,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;YACxC,IAAI,SAAS,CAAC,MAAM,KAAK,MAAM,EAAE,CAAC;gBAChC,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,WAAW,CAAC;gBACtE,MAAM,GAAG,GAAG,SAAS,CAAC,OAAO,IAAI,GAAG,SAAS,CAAC,QAAQ,SAAS,CAAC;gBAChE,KAAK,CAAC,IAAI,CAAC,OAAO,KAAK,IAAI,SAAS,CAAC,QAAQ,KAAK,GAAG,GAAG,KAAK,EAAE,CAAC,CAAC;gBACjE,IAAI,SAAS,CAAC,QAAQ,EAAE,CAAC;oBACvB,KAAK,CAAC,IAAI,CAAC,SAAS,GAAG,aAAa,SAAS,CAAC,QAAQ,GAAG,KAAK,EAAE,CAAC,CAAC;gBACpE,CAAC;gBACD,IAAI,SAAS,CAAC,MAAM,EAAE,CAAC;oBACrB,KAAK,CAAC,IAAI,CAAC,SAAS,GAAG,aAAa,SAAS,CAAC,MAAM,GAAG,KAAK,EAAE,CAAC,CAAC;gBAClE,CAAC;YACH,CAAC;iBAAM,IAAI,SAAS,CAAC,QAAQ,KAAK,WAAW,IAAI,SAAS,CAAC,QAAQ,KAAK,WAAW,EAAE,CAAC;gBACpF,wDAAwD;gBACxD,KAAK,CAAC,IAAI,CAAC,iBAAiB,SAAS,CAAC,QAAQ,IAAI,KAAK,IAAI,GAAG,GAAG,SAAS,CAAC,OAAO,IAAI,QAAQ,GAAG,KAAK,EAAE,CAAC,CAAC;YAC5G,CAAC;QACH,CAAC;IACH,CAAC;IAED,UAAU;IACV,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,CAAC;IAC9C,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,KAAK,CAAC,IAAI,CAAC,WAAW,OAAO,CAAC,MAAM,UAAU,KAAK,EAAE,CAAC,CAAC;IAC/E,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,KAAK,CAAC,IAAI,CAAC,WAAW,OAAO,CAAC,MAAM,UAAU,KAAK,EAAE,CAAC,CAAC;IAC/E,IAAI,OAAO,CAAC,OAAO,GAAG,CAAC;QAAE,KAAK,CAAC,IAAI,CAAC,WAAW,OAAO,CAAC,OAAO,WAAW,KAAK,EAAE,CAAC,CAAC;IAClF,KAAK,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,KAAK,QAAQ,CAAC,CAAC;IACrC,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,IAAI,OAAO,CAAC,UAAU,MAAM,KAAK,EAAE,CAAC,CAAC;IAC5E,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,gFAAgF;AAChF,gBAAgB;AAChB,gFAAgF;AAEhF;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,MAAuB;IAChD,OAAO,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;AACzC,CAAC;AAED,gFAAgF;AAChF,qBAAqB;AA
CrB,gFAAgF;AAEhF;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,MAAuB;IACjD,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,OAAO,EAAE,GAAG,MAAM,CAAC;IAE7C,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,KAAK,CAAC,IAAI,CAAC,wCAAwC,CAAC,CAAC;IACrD,KAAK,CAAC,IAAI,CACR,oBAAoB,SAAS,CAAC,SAAS,CAAC,YAAY,OAAO,CAAC,KAAK,IAAI;QACnE,aAAa,OAAO,CAAC,MAAM,aAAa,OAAO,CAAC,OAAO,IAAI;QAC3D,SAAS,CAAC,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CACtD,CAAC;IAEF,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,IAAI,GAAG,IAAI,CAAC,UAAU,IAAI,IAAI,CAAC,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;QAC7F,KAAK,CAAC,IAAI,CAAC,qBAAqB,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,GAAG,CAAC,CAAC;QAEjE,IAAI,IAAI,CAAC,MAAM,KAAK,MAAM,EAAE,CAAC;YAC3B,MAAM,gBAAgB,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC;YAC5E,MAAM,OAAO,GAAG,gBAAgB;iBAC7B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,IAAI,GAAG,CAAC,CAAC,QAAQ,SAAS,CAAC;iBAC/C,IAAI,CAAC,IAAI,CAAC,CAAC;YACd,KAAK,CAAC,IAAI,CAAC,yBAAyB,SAAS,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAC5D,KAAK,MAAM,CAAC,IAAI,gBAAgB,EAAE,CAAC;gBACjC,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,QAAQ,KAAK,CAAC,CAAC,OAAO,IAAI,QAAQ,EAAE,CAAC,CAAC;gBAC7D,IAAI,CAAC,CAAC,QAAQ;oBAAE,KAAK,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;gBAC9D,IAAI,CAAC,CAAC,MAAM;oBAAE,KAAK,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;YAC1D,CAAC;YACD,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;QAC/B,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,KAAK,OAAO,EAAE,CAAC;YAC5B,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,IAAI,eAAe,CAAC;YAC/C,KAAK,CAAC,IAAI,CAAC,uBAAuB,SAAS,CAAC,QAAQ,CAAC,KAAK,SAAS,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC;QAC3F,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;IAC9B,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;IAC3B,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,gFAAgF;AAChF,kBAAkB;AAClB,gFAAgF;AAEhF;;GAEG;AACH,MAAM,UAAU,aAAa,CAC3B,MAAuB,EACvB,WAAyC,SAAS;IAElD,QAAQ,QAAQ,EAAE,CAAC;QACjB,KAAK,SAAS;YACZ,OAAO,aAAa,CAAC,MAAM,CAAC,CAAC;QAC/B,KAAK,MAAM;YACT,OAAO,UAAU
,CAAC,MAAM,CAAC,CAAC;QAC5B,KAAK,OAAO;YACV,OAAO,WAAW,CAAC,MAAM,CAAC,CAAC;IAC/B,CAAC;AACH,CAAC;AAED,gFAAgF;AAChF,UAAU;AACV,gFAAgF;AAEhF,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,IAAI;SACR,OAAO,CAAC,IAAI,EAAE,OAAO,CAAC;SACtB,OAAO,CAAC,IAAI,EAAE,MAAM,CAAC;SACrB,OAAO,CAAC,IAAI,EAAE,MAAM,CAAC;SACrB,OAAO,CAAC,IAAI,EAAE,QAAQ,CAAC;SACvB,OAAO,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;AAC7B,CAAC"}
@@ -0,0 +1,28 @@
1
+ /**
2
+ * @fileoverview Eval test runner
3
+ *
4
+ * Pipeline:
5
+ * .eval file (YAML)
6
+ * ↓ load + validate
7
+ * EvalSuite { config, tests[] }
8
+ * ↓ for each test
9
+ * ├── Load variables (from given: or from dataset + params)
10
+ * ├── Read target prompt.pdk
11
+ * ├── Render with echo-pdk
12
+ * ├── If expect_render: run assertions on rendered output
13
+ * ├── If expect_llm: send to LLM → run assertions on response
14
+ * ├── If record mode: save response as golden in .dset file
15
+ * └── Collect results
16
+ * ↓
17
+ * EvalSuiteResult { tests[], summary }
18
+ */
19
+ import type { EvalSuite, EvalSuiteResult, EvalRunnerConfig } from './types.js';
20
+ /**
21
+ * Load the eval suite stored at `evalFilePath` and run it with `config`; resolves to the suite result.
22
+ */
23
+ export declare function runEvalFile(evalFilePath: string, config: EvalRunnerConfig): Promise<EvalSuiteResult>;
24
+ /**
25
+ * Run the tests of an already-loaded `suite` with `config` and resolve to the aggregated suite result; `evalFilePath` is the path of the eval file the suite came from.
26
+ */
27
+ export declare function runEvalSuite(suite: EvalSuite, evalFilePath: string, config: EvalRunnerConfig): Promise<EvalSuiteResult>;
28
+ //# sourceMappingURL=runner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/eval/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;GAiBG;AAaH,OAAO,KAAK,EACV,SAAS,EAET,eAAe,EAIf,gBAAgB,EAGjB,MAAM,YAAY,CAAC;AAMpB;;GAEG;AACH,wBAAsB,WAAW,CAC/B,YAAY,EAAE,MAAM,EACpB,MAAM,EAAE,gBAAgB,GACvB,OAAO,CAAC,eAAe,CAAC,CAG1B;AAED;;GAEG;AACH,wBAAsB,YAAY,CAChC,KAAK,EAAE,SAAS,EAChB,YAAY,EAAE,MAAM,EACpB,MAAM,EAAE,gBAAgB,GACvB,OAAO,CAAC,eAAe,CAAC,CAkF1B"}
@@ -0,0 +1,232 @@
1
+ /**
2
+ * @fileoverview Eval test runner
3
+ *
4
+ * Pipeline:
5
+ * .eval file (YAML)
6
+ * ↓ load + validate
7
+ * EvalSuite { config, tests[] }
8
+ * ↓ for each test
9
+ * ├── Load variables (from given: or from dataset + params)
10
+ * ├── Read target prompt.pdk
11
+ * ├── Render with echo-pdk
12
+ * ├── If expect_render: run assertions on rendered output
13
+ * ├── If expect_llm: send to LLM → run assertions on response
14
+ * ├── If record mode: save response as golden in .dset file
15
+ * └── Collect results
16
+ * ↓
17
+ * EvalSuiteResult { tests[], summary }
18
+ */
19
+ import { readFile } from 'fs/promises';
20
+ import { resolve, dirname } from 'path';
21
+ import { createEcho } from '../index.js';
22
+ import { loadEvalFile } from './loader.js';
23
+ import { runAssertions } from './assertions.js';
24
+ import { DatasetManager } from './dataset.js';
25
+ import { createProvider } from '../providers/registry.js';
26
+ import { toLLMProvider } from '../providers/base.js';
27
+ import { createEmbeddingProvider } from '../embeddings/registry.js';
28
+ import { cosineSimilarity } from '../embeddings/cosine.js';
29
+ // =============================================================================
30
+ // EVAL RUNNER
31
+ // =============================================================================
32
/**
 * Load the eval suite stored at `evalFilePath` and execute it.
 *
 * @param {string} evalFilePath - Path of the eval file; it is handed to the
 *   loader and then forwarded unchanged to the suite runner.
 * @param {object} config - Runner configuration, passed through as-is.
 * @returns {Promise<object>} Whatever `runEvalSuite` resolves to for the
 *   loaded suite.
 */
export async function runEvalFile(evalFilePath, config) {
    return runEvalSuite(await loadEvalFile(evalFilePath), evalFilePath, config);
}
39
/**
 * Execute an eval suite against its target prompt.
 *
 * The prompt directory is taken to be two levels above the eval file. The
 * template named by `suite.config.target` is read from there, the (optionally
 * filtered) tests are run one after another, and their outcomes are
 * aggregated into a single result.
 *
 * @param {object} suite - Suite providing `suite` (its name), `config` and `tests`.
 * @param {string} evalFilePath - Location of the eval file; used to derive the prompt directory.
 * @param {object} config - Runner configuration; `filter`, `aiProvider` and
 *   `embeddingProvider` are read here, and the whole object is forwarded to each test.
 * @returns {Promise<object>} `{ suiteName, status, tests, summary }`. The status is
 *   'error' if any test errored, otherwise 'fail' if any test failed, otherwise
 *   'pass'. When the target prompt cannot be read, an error result without
 *   tests is returned instead.
 */
export async function runEvalSuite(suite, evalFilePath, config) {
    const startedAt = Date.now();
    // Eval files live in <prompt>/eval/tests/, so the prompt directory is two levels up.
    const promptDir = resolve(dirname(resolve(evalFilePath)), '..', '..');
    const datasetManager = new DatasetManager(promptDir);
    // Read the prompt template the suite targets.
    const targetPath = resolve(promptDir, suite.config.target);
    let template;
    try {
        template = await readFile(targetPath, 'utf-8');
    }
    catch {
        return createErrorResult(suite.suite, `Failed to load target prompt: ${targetPath}`);
    }
    const echo = createEcho({ strict: false });
    // An explicit embedding config wins; otherwise reuse the OpenAI API key
    // of the AI provider when that provider is OpenAI.
    const embeddingProvider = config.embeddingProvider
        ? createEmbeddingProvider(config.embeddingProvider)
        : config.aiProvider?.type === 'openai'
            ? createEmbeddingProvider({
                type: 'openai',
                apiKey: config.aiProvider.apiKey,
            })
            : undefined;
    // Similarity closure handed to the assertions; stays undefined without a provider.
    const embeddingSimilarity = embeddingProvider
        ? async (textA, textB) => {
            const embedded = await embeddingProvider.embed([textA, textB]);
            return cosineSimilarity(embedded[0], embedded[1]);
        }
        : undefined;
    // Narrow the test list by case-insensitive name match when a filter is set.
    const needle = config.filter ? config.filter.toLowerCase() : undefined;
    const selectedTests = needle === undefined
        ? suite.tests
        : suite.tests.filter((candidate) => candidate.name.toLowerCase().includes(needle));
    // Tests run strictly in sequence.
    const testResults = [];
    for (const test of selectedTests) {
        testResults.push(await runSingleTest(test, {
            template,
            echo,
            datasetManager,
            suiteConfig: suite.config,
            runnerConfig: config,
            promptDir,
            embeddingSimilarity,
        }));
    }
    const summary = computeSummary(testResults, Date.now() - startedAt);
    // Worst outcome wins: error > fail > pass.
    const seenStatuses = new Set(testResults.map((result) => result.status));
    let status = 'pass';
    if (seenStatuses.has('error')) {
        status = 'error';
    }
    else if (seenStatuses.has('fail')) {
        status = 'fail';
    }
    return {
        suiteName: suite.suite,
        status,
        tests: testResults,
        summary,
    };
}
116
/**
 * Turn any thrown value into a printable message.
 *
 * JavaScript allows throwing non-Error values (strings, null, plain objects),
 * so reading `.message` directly can yield `undefined` or throw.
 *
 * @param {unknown} thrown - The value caught by a `catch` clause.
 * @returns {string} `thrown.message` when it is a string, otherwise `String(thrown)`.
 */
function describeThrown(thrown) {
    if (typeof thrown?.message === 'string') {
        return thrown.message;
    }
    return String(thrown);
}
/**
 * Run one eval test: resolve its variables, render the template, run the
 * render assertions and (when requested) the LLM assertions.
 *
 * Never rejects: any failure inside the test is reported as a result with
 * status 'error'.
 *
 * @param {object} test - Test definition; reads `name`, `given`, `dataset`,
 *   `params`, `expect_render` and `expect_llm`.
 * @param {object} ctx - Per-suite context; reads `template`, `echo`,
 *   `datasetManager`, `suiteConfig`, `runnerConfig` and `embeddingSimilarity`.
 * @returns {Promise<object>} On completion:
 *   `{ name, status, assertions, durationMs, renderedOutput, llmResponse }`,
 *   where status is 'error' if any assertion errored, else 'fail' if any
 *   failed, else 'pass'. On an unexpected failure:
 *   `{ name, status: 'error', assertions: [], durationMs, error }`.
 */
async function runSingleTest(test, ctx) {
    const testStart = Date.now();
    try {
        // 1. Resolve variables: inline `given` wins, then a named parameter
        //    set of the dataset, then the dataset's first parameter set.
        let variables;
        if (test.given) {
            variables = test.given;
        }
        else if (test.dataset && test.params) {
            variables = await ctx.datasetManager.getParams(test.dataset, test.params);
        }
        else if (test.dataset) {
            const dataset = await ctx.datasetManager.load(test.dataset);
            if (dataset.parameters.length === 0) {
                throw new Error(`Dataset "${test.dataset}" has no parameter sets`);
            }
            // The `name` key labels the parameter set; it is not a template variable.
            const { name: _name, ...vars } = dataset.parameters[0];
            variables = vars;
        }
        else {
            variables = {};
        }
        const allAssertions = [];
        let llmResponseText;
        // 2. Render the template
        const renderedOutput = await ctx.echo.render(ctx.template, variables);
        // 3. Run expect_render assertions
        if (test.expect_render) {
            const renderCtx = { text: renderedOutput };
            const results = await runAssertions(test.expect_render, renderCtx);
            allAssertions.push(...results);
        }
        // 4. Run expect_llm assertions (requires LLM call)
        if (test.expect_llm) {
            // The LLM provider is optional; without one the assertions run
            // against an empty text and no llmResponse/llmProvider.
            const llmProvider = ctx.runnerConfig.aiProvider
                ? toLLMProvider(createProvider(ctx.runnerConfig.aiProvider))
                : undefined;
            let llmResponse;
            if (llmProvider) {
                // Suite-level model overrides the provider's default model.
                const model = ctx.suiteConfig.model ?? ctx.runnerConfig.aiProvider?.model;
                try {
                    llmResponse = await llmProvider.complete(renderedOutput, model);
                    llmResponseText = llmResponse.text;
                }
                catch (llmErr) {
                    // Record the failed call as an errored assertion and keep
                    // going so the remaining assertions are still reported.
                    llmResponseText = '';
                    allAssertions.push({
                        operator: 'llm_call',
                        status: 'error',
                        message: `LLM call failed: ${describeThrown(llmErr)}`,
                    });
                }
                // Record mode — save golden (only when the call succeeded)
                if (ctx.runnerConfig.record && test.dataset && llmResponse) {
                    await ctx.datasetManager.recordGolden(test.dataset, llmResponse.text, llmResponse);
                }
            }
            const llmCtx = {
                text: llmResponseText ?? '',
                llmResponse,
                llmProvider,
                loadGolden: (name) => ctx.datasetManager.getGolden(name),
                embeddingSimilarity: ctx.embeddingSimilarity,
            };
            const results = await runAssertions(test.expect_llm, llmCtx);
            allAssertions.push(...results);
        }
        // 5. Determine status: error > fail > pass
        const status = allAssertions.some((a) => a.status === 'error')
            ? 'error'
            : allAssertions.some((a) => a.status === 'fail')
                ? 'fail'
                : 'pass';
        return {
            name: test.name,
            status,
            assertions: allAssertions,
            durationMs: Date.now() - testStart,
            renderedOutput,
            llmResponse: llmResponseText,
        };
    }
    catch (err) {
        // Anything thrown above (including non-Error values) becomes an
        // error result rather than a rejected promise.
        return {
            name: test.name,
            status: 'error',
            assertions: [],
            durationMs: Date.now() - testStart,
            error: describeThrown(err),
        };
    }
}
212
+ // =============================================================================
213
+ // HELPERS
214
+ // =============================================================================
215
/**
 * Tally test outcomes into a summary record.
 *
 * @param {Array<{status: string}>} tests - Test results; only `status` is read.
 * @param {number} durationMs - Elapsed time, copied into the summary unchanged.
 * @returns {{total: number, passed: number, failed: number, errored: number, durationMs: number}}
 *   `total` counts every entry; statuses other than 'pass', 'fail' and
 *   'error' contribute to `total` only.
 */
function computeSummary(tests, durationMs) {
    const countWithStatus = (wanted) => tests.reduce((count, entry) => (entry.status === wanted ? count + 1 : count), 0);
    const total = tests.length;
    const passed = countWithStatus('pass');
    const failed = countWithStatus('fail');
    const errored = countWithStatus('error');
    return { total, passed, failed, errored, durationMs };
}
224
/**
 * Build the result for a suite that could not be run at all.
 *
 * @param {string} suiteName - Name reported as `suiteName`.
 * @param {string} [_error] - Reason the suite could not run. It is surfaced
 *   as `error` on the result so the failure is not reported without a cause;
 *   the key is omitted when no message is supplied.
 * @returns {object} `{ suiteName, status: 'error', tests: [], summary }` with an
 *   all-zero summary, plus `error` when a message was given.
 */
function createErrorResult(suiteName, _error) {
    const result = {
        suiteName,
        status: 'error',
        tests: [],
        summary: { total: 0, passed: 0, failed: 0, errored: 0, durationMs: 0 },
    };
    // Additive field: callers that only read the fields above are unaffected.
    if (_error !== undefined) {
        result.error = _error;
    }
    return result;
}
232
+ //# sourceMappingURL=runner.js.map