@modular-prompt/experiment 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +60 -5
- package/dist/{src/evaluators/base-module.d.ts → base-evaluation-module.d.ts} +2 -2
- package/dist/base-evaluation-module.d.ts.map +1 -0
- package/dist/{src/evaluators/base-module.js → base-evaluation-module.js} +1 -1
- package/dist/base-evaluation-module.js.map +1 -0
- package/dist/cli/args.d.ts.map +1 -0
- package/dist/{src/cli → cli}/args.js +2 -0
- package/dist/cli/args.js.map +1 -0
- package/dist/config/dynamic-loader.d.ts.map +1 -0
- package/dist/{src/config → config}/dynamic-loader.js +40 -29
- package/dist/config/dynamic-loader.js.map +1 -0
- package/dist/config/loader.d.ts.map +1 -0
- package/dist/{src/config → config}/loader.js +2 -3
- package/dist/config/loader.js.map +1 -0
- package/dist/evaluators/index.d.ts +12 -0
- package/dist/evaluators/index.d.ts.map +1 -0
- package/dist/evaluators/index.js +16 -0
- package/dist/evaluators/index.js.map +1 -0
- package/dist/{src/evaluators/functional-correctness.d.ts → evaluators/llm-requirement-fulfillment.d.ts} +3 -3
- package/dist/evaluators/llm-requirement-fulfillment.d.ts.map +1 -0
- package/dist/{src/evaluators/functional-correctness.js → evaluators/llm-requirement-fulfillment.js} +7 -7
- package/dist/evaluators/llm-requirement-fulfillment.js.map +1 -0
- package/dist/{src/evaluators/json-validator.d.ts → evaluators/structured-output-presence.d.ts} +3 -3
- package/dist/evaluators/structured-output-presence.d.ts.map +1 -0
- package/dist/{src/evaluators/json-validator.js → evaluators/structured-output-presence.js} +6 -6
- package/dist/evaluators/structured-output-presence.js.map +1 -0
- package/dist/{src/index.d.ts → index.d.ts} +1 -1
- package/dist/index.d.ts.map +1 -0
- package/dist/{src/index.js → index.js} +1 -1
- package/dist/index.js.map +1 -0
- package/dist/reporter/statistics.d.ts.map +1 -0
- package/dist/reporter/statistics.js.map +1 -0
- package/dist/{src/run-comparison.d.ts → run-comparison.d.ts} +1 -0
- package/dist/run-comparison.d.ts.map +1 -0
- package/dist/{src/run-comparison.js → run-comparison.js} +17 -0
- package/dist/run-comparison.js.map +1 -0
- package/dist/runner/driver-manager.d.ts.map +1 -0
- package/dist/runner/driver-manager.js.map +1 -0
- package/dist/{src/runner → runner}/evaluator.d.ts +2 -1
- package/dist/runner/evaluator.d.ts.map +1 -0
- package/dist/{src/runner → runner}/evaluator.js +9 -1
- package/dist/runner/evaluator.js.map +1 -0
- package/dist/runner/experiment.d.ts.map +1 -0
- package/dist/{src/runner → runner}/experiment.js +1 -1
- package/dist/runner/experiment.js.map +1 -0
- package/dist/{src/types.d.ts → types.d.ts} +24 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js.map +1 -0
- package/examples/experiment.yaml +9 -17
- package/package.json +4 -4
- package/dist/src/cli/args.d.ts.map +0 -1
- package/dist/src/cli/args.js.map +0 -1
- package/dist/src/config/dynamic-loader.d.ts.map +0 -1
- package/dist/src/config/dynamic-loader.js.map +0 -1
- package/dist/src/config/loader.d.ts.map +0 -1
- package/dist/src/config/loader.js.map +0 -1
- package/dist/src/evaluators/base-module.d.ts.map +0 -1
- package/dist/src/evaluators/base-module.js.map +0 -1
- package/dist/src/evaluators/functional-correctness.d.ts.map +0 -1
- package/dist/src/evaluators/functional-correctness.js.map +0 -1
- package/dist/src/evaluators/json-validator.d.ts.map +0 -1
- package/dist/src/evaluators/json-validator.js.map +0 -1
- package/dist/src/index.d.ts.map +0 -1
- package/dist/src/index.js.map +0 -1
- package/dist/src/reporter/statistics.d.ts.map +0 -1
- package/dist/src/reporter/statistics.js.map +0 -1
- package/dist/src/run-comparison.d.ts.map +0 -1
- package/dist/src/run-comparison.js.map +0 -1
- package/dist/src/runner/driver-manager.d.ts.map +0 -1
- package/dist/src/runner/driver-manager.js.map +0 -1
- package/dist/src/runner/evaluator.d.ts.map +0 -1
- package/dist/src/runner/evaluator.js.map +0 -1
- package/dist/src/runner/experiment.d.ts.map +0 -1
- package/dist/src/runner/experiment.js.map +0 -1
- package/dist/src/types.d.ts.map +0 -1
- package/dist/src/types.js.map +0 -1
- package/dist/tsconfig.tsbuildinfo +0 -1
- /package/dist/{src/cli → cli}/args.d.ts +0 -0
- /package/dist/{src/config → config}/dynamic-loader.d.ts +0 -0
- /package/dist/{src/config → config}/loader.d.ts +0 -0
- /package/dist/{src/reporter → reporter}/statistics.d.ts +0 -0
- /package/dist/{src/reporter → reporter}/statistics.js +0 -0
- /package/dist/{src/runner → runner}/driver-manager.d.ts +0 -0
- /package/dist/{src/runner → runner}/driver-manager.js +0 -0
- /package/dist/{src/runner → runner}/experiment.d.ts +0 -0
- /package/dist/{src/types.js → types.js} +0 -0
package/README.md
CHANGED
|
@@ -69,8 +69,12 @@ testCases:
|
|
|
69
69
|
- gemini-fast
|
|
70
70
|
|
|
71
71
|
evaluators:
|
|
72
|
-
- name
|
|
73
|
-
|
|
72
|
+
# Built-in evaluators (name only)
|
|
73
|
+
- name: structured-output-presence
|
|
74
|
+
- name: llm-requirement-fulfillment
|
|
75
|
+
# Or external evaluator (with path)
|
|
76
|
+
- name: custom-validator
|
|
77
|
+
path: ./evaluators/custom-validator.ts
|
|
74
78
|
# Or inline prompt evaluator
|
|
75
79
|
- name: quality-check
|
|
76
80
|
prompt:
|
|
@@ -124,9 +128,13 @@ export default {
|
|
|
124
128
|
},
|
|
125
129
|
],
|
|
126
130
|
evaluators: [
|
|
131
|
+
// Built-in evaluators (name only)
|
|
132
|
+
{ name: 'structured-output-presence' },
|
|
133
|
+
{ name: 'llm-requirement-fulfillment' },
|
|
134
|
+
// Or external evaluator (with path)
|
|
127
135
|
{
|
|
128
|
-
name: '
|
|
129
|
-
path: './evaluators/
|
|
136
|
+
name: 'custom-validator',
|
|
137
|
+
path: './evaluators/custom-validator.ts',
|
|
130
138
|
},
|
|
131
139
|
],
|
|
132
140
|
evaluation: {
|
|
@@ -143,6 +151,9 @@ export default {
|
|
|
143
151
|
### 2. Run Experiment
|
|
144
152
|
|
|
145
153
|
```bash
|
|
154
|
+
# Validate configuration and display execution plan (recommended first step)
|
|
155
|
+
npx modular-experiment examples/experiment.yaml --dry-run
|
|
156
|
+
|
|
146
157
|
# Run with YAML config
|
|
147
158
|
npx modular-experiment examples/experiment.yaml
|
|
148
159
|
|
|
@@ -249,6 +260,48 @@ export default {
|
|
|
249
260
|
|
|
250
261
|
All prompt evaluators are automatically merged with the base evaluation module.
|
|
251
262
|
|
|
263
|
+
## Built-in Evaluators
|
|
264
|
+
|
|
265
|
+
The framework includes built-in evaluators that can be referenced by name only (no path required):
|
|
266
|
+
|
|
267
|
+
### structured-output-presence
|
|
268
|
+
|
|
269
|
+
- **Type**: Code Evaluator
|
|
270
|
+
- **What it measures**: Checks if `structuredOutput` exists and is a valid object
|
|
271
|
+
- **Evaluation logic**:
|
|
272
|
+
- Verifies presence of `structuredOutput` in query result
|
|
273
|
+
- Confirms it's a non-null object type
|
|
274
|
+
- **Score**: `(validCount / totalRuns) * 10`
|
|
275
|
+
- **Use case**: Verify that the model returns structured JSON output (essential for structured output workflows)
|
|
276
|
+
- **Usage**:
|
|
277
|
+
```yaml
|
|
278
|
+
evaluators:
|
|
279
|
+
- name: "structured-output-presence"
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
### llm-requirement-fulfillment
|
|
283
|
+
|
|
284
|
+
- **Type**: Prompt Evaluator (uses LLM for evaluation)
|
|
285
|
+
- **What it measures**: Uses LLM to comprehensively evaluate whether output meets functional requirements
|
|
286
|
+
- **Evaluation criteria**:
|
|
287
|
+
1. **Requirement Fulfillment**: Does it satisfy the intent described in the prompt?
|
|
288
|
+
2. **Parameter Correctness**: Are all required parameters present and correct?
|
|
289
|
+
3. **Parameter Completeness**: Are optional parameters appropriately used or omitted?
|
|
290
|
+
4. **Logical Consistency**: Is the output logically consistent with the facts?
|
|
291
|
+
- **Score**: 0-10 overall score with detailed sub-scores for each criterion
|
|
292
|
+
- **Use case**: Comprehensive quality assessment of output (requires evaluation model to be configured)
|
|
293
|
+
- **Usage**:
|
|
294
|
+
```yaml
|
|
295
|
+
evaluators:
|
|
296
|
+
- name: "llm-requirement-fulfillment"
|
|
297
|
+
|
|
298
|
+
evaluation:
|
|
299
|
+
enabled: true
|
|
300
|
+
model: "gemini-fast" # Model used for evaluation
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
**Note**: `llm-requirement-fulfillment` requires an evaluation model to be configured in the `evaluation` section.
|
|
304
|
+
|
|
252
305
|
## Architecture
|
|
253
306
|
|
|
254
307
|
```
|
|
@@ -280,7 +333,8 @@ All prompt evaluators are automatically merged with the base evaluation module.
|
|
|
280
333
|
| `runner/evaluator.ts` | Execute evaluations |
|
|
281
334
|
| `runner/driver-manager.ts` | Cache and manage AI drivers |
|
|
282
335
|
| `reporter/statistics.ts` | Generate statistical reports |
|
|
283
|
-
| `
|
|
336
|
+
| `base-evaluation-module.ts` | Base evaluation prompt module |
|
|
337
|
+
| `evaluators/index.ts` | Built-in evaluator registry |
|
|
284
338
|
|
|
285
339
|
## Examples
|
|
286
340
|
|
|
@@ -338,6 +392,7 @@ Options:
|
|
|
338
392
|
--repeat <count> Number of repetitions (default: 1)
|
|
339
393
|
--evaluate Enable evaluation phase
|
|
340
394
|
--evaluators <names> Comma-separated evaluator names (default: all)
|
|
395
|
+
--dry-run Display execution plan without running the experiment
|
|
341
396
|
```
|
|
342
397
|
|
|
343
398
|
**Note**: All paths specified in the config file are resolved relative to the config file's directory.
|
|
@@ -5,6 +5,6 @@
|
|
|
5
5
|
* It defines how test data is presented to the evaluator.
|
|
6
6
|
*/
|
|
7
7
|
import type { PromptModule } from '@modular-prompt/core';
|
|
8
|
-
import type { EvaluationContext } from '
|
|
8
|
+
import type { EvaluationContext } from './types.js';
|
|
9
9
|
export declare const baseEvaluationModule: PromptModule<EvaluationContext>;
|
|
10
|
-
//# sourceMappingURL=base-module.d.ts.map
|
|
10
|
+
//# sourceMappingURL=base-evaluation-module.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"base-evaluation-module.d.ts","sourceRoot":"","sources":["../src/base-evaluation-module.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,YAAY,EAA4B,MAAM,sBAAsB,CAAC;AACnF,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;AAEpD,eAAO,MAAM,oBAAoB,EAAE,YAAY,CAAC,iBAAiB,CAuGhE,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"base-evaluation-module.js","sourceRoot":"","sources":["../src/base-evaluation-module.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAKH,MAAM,CAAC,MAAM,oBAAoB,GAAoC;IACnE,aAAa,EAAE,GAAsB,EAAE,CAAC,CAAC;QACvC,UAAU,EAAE,EAAE;QACd,MAAM,EAAE,EAAE;QACV,IAAI,EAAE,EAAE;KACT,CAAC;IAEF,SAAS,EAAE;QACT,0CAA0C;QAC1C,yDAAyD;KAC1D;IAED,KAAK,EAAE;QACL,2CAA2C;QAC3C,2DAA2D;QAC3D,iDAAiD;QACjD,yCAAyC;KAC1C;IAED,YAAY,EAAE;QACZ;YACE,IAAI,EAAE,YAAY;YAClB,KAAK,EAAE,eAAe;YACtB,KAAK,EAAE;gBACL,gEAAgE;gBAChE,+BAA+B;gBAC/B,6CAA6C;gBAC7C,sDAAsD;aACvD;SACF;KACF;IAED,SAAS,EAAE;QACT;YACE,IAAI,EAAE,YAAY;YAClB,KAAK,EAAE,aAAa;YACpB,KAAK,EAAE;gBACL,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,UAAU;aACxB;SACF;QACD;YACE,IAAI,EAAE,YAAY;YAClB,KAAK,EAAE,aAAa;YACpB,KAAK,EAAE;gBACL,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;oBACR,IAAI,EAAE,MAAM;oBACZ,OAAO,EAAE,GAAG,CAAC,MAAM;iBACJ,CAAA;aAClB;SACF;KACF;IAED,MAAM,EAAE;QACN,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE;YACrC,MAAM,MAAM,GAAG,GAAG,CAAC,WAAW,CAAC;YAC/B,MAAM,QAAQ,GAAqC,EAAE,CAAC;YAEtD,aAAa;YACb,QAAQ,CAAC,IAAI,CAAC;gBACZ,IAAI,EAAE,MAAM;gBACZ,OAAO,EAAE,OAAO,GAAG,GAAG,CAAC,EAAE;aAC1B,CAAC,CAAC;YAEH,gDAAgD;YAChD,IAAI,MAAM,CAAC,gBAAgB,EAAE,CAAC;gBAC5B,QAAQ,CAAC,IAAI,CAAC;oBACZ,IAAI,EAAE,MAAM;oBACZ,OAAO,EAAE,MAAM,CAAC,gBAAgB;iBACjC,CAAC,CAAC;YACL,CAAC;iBAAM,CAAC;gBACN,QAAQ,CAAC,IAAI,CAAC;oBACZ,IAAI,EAAE,MAAM;oBACZ,OAAO,EAAE,MAAM,CAAC,OAAO;iBACxB,CAAC,CAAC;YACL,CAAC;YAED,OAAO,QAAQ,CAAC;QAClB,CAAC,CAAC;KACH;IAED,MAAM,EAAE;QACN;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;gBACP,IAAI,EAAE,QAAQ;gBACd,UAAU,EAAE;oBACV,KAAK,EAAE;wBACL,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,sBAAsB;qBACpC;oBACD,SAAS,EAAE;wBACT,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,0BAA0B;qBACxC;oBACD,OAAO,EAAE;wBACP,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,+BAA+B;qBAC7C;iBACF;gBACD,QAAQ,EAAE,CAAC,OAAO,EAAE,WAAW,CAAC;aACjC;SACF;KACF;CACF,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"args.d.ts","sourceRoot":"","sources":["../../src/cli/args.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,OAAO,KAAK,EAAE,yBAAyB,EAAE,MAAM,aAAa,CAAC;AAE7D,wBAAgB,SAAS,IAAI,yBAAyB,CA6BrD"}
|
|
@@ -15,6 +15,7 @@ export function parseArgs() {
|
|
|
15
15
|
.option('--repeat <count>', 'Number of repetitions', '1')
|
|
16
16
|
.option('--evaluate', 'Enable AI-based evaluation of outputs', false)
|
|
17
17
|
.option('--evaluators <names>', 'Comma-separated evaluator names (default: all)')
|
|
18
|
+
.option('--dry-run', 'Display execution plan without running the experiment', false)
|
|
18
19
|
.parse();
|
|
19
20
|
const config = program.args[0];
|
|
20
21
|
const options = program.opts();
|
|
@@ -26,6 +27,7 @@ export function parseArgs() {
|
|
|
26
27
|
repeatCount: parseInt(options.repeat, 10),
|
|
27
28
|
enableEvaluation: options.evaluate,
|
|
28
29
|
evaluatorFilter: options.evaluators?.split(',').map((s) => s.trim()),
|
|
30
|
+
dryRun: options.dryRun,
|
|
29
31
|
};
|
|
30
32
|
}
|
|
31
33
|
//# sourceMappingURL=args.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"args.js","sourceRoot":"","sources":["../../src/cli/args.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AAG/B,MAAM,UAAU,SAAS;IACvB,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;IAE9B,OAAO;SACJ,IAAI,CAAC,oBAAoB,CAAC;SAC1B,WAAW,CAAC,2CAA2C,CAAC;SACxD,QAAQ,CAAC,UAAU,EAAE,oDAAoD,CAAC;SAC1E,MAAM,CAAC,oBAAoB,EAAE,uBAAuB,CAAC;SACrD,MAAM,CAAC,oBAAoB,EAAE,oDAAoD,CAAC;SAClF,MAAM,CAAC,mBAAmB,EAAE,qDAAqD,CAAC;SAClF,MAAM,CAAC,kBAAkB,EAAE,uBAAuB,EAAE,GAAG,CAAC;SACxD,MAAM,CAAC,YAAY,EAAE,uCAAuC,EAAE,KAAK,CAAC;SACpE,MAAM,CAAC,sBAAsB,EAAE,gDAAgD,CAAC;SAChF,MAAM,CAAC,WAAW,EAAE,uDAAuD,EAAE,KAAK,CAAC;SACnF,KAAK,EAAE,CAAC;IAEX,MAAM,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAC/B,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;IAE/B,OAAO;QACL,UAAU,EAAE,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,MAAM,CAAC;QAC1C,cAAc,EAAE,OAAO,CAAC,QAAQ;QAChC,WAAW,EAAE,OAAO,CAAC,KAAK;QAC1B,YAAY,EAAE,OAAO,CAAC,OAAO,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACtE,WAAW,EAAE,QAAQ,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;QACzC,gBAAgB,EAAE,OAAO,CAAC,QAAQ;QAClC,eAAe,EAAE,OAAO,CAAC,UAAU,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5E,MAAM,EAAE,OAAO,CAAC,MAAM;KACvB,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dynamic-loader.d.ts","sourceRoot":"","sources":["../../src/config/dynamic-loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAKH,OAAO,KAAK,EACV,kBAAkB,EAClB,aAAa,EACb,eAAe,EACf,gBAAgB,EACjB,MAAM,aAAa,CAAC;AAIrB;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,IAAI,EAAE,MAAM,GAAG,QAAQ,CAAC;IACxB,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,eAAe,CAAC,EAAE,eAAe,CAAC;CACnC;AAED;;;;;;GAMG;AACH,wBAAsB,cAAc,CAClC,IAAI,EAAE,kBAAkB,EAAE,EAC1B,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,eAAe,EAAE,CAAC,CAqE5B;AAED;;GAEG;AACH,MAAM,MAAM,eAAe,GACvB;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAEzD;;;;;;GAMG;AACH,wBAAsB,WAAW,CAC/B,IAAI,EAAE,eAAe,EAAE,EACvB,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAsB7B"}
|
|
@@ -6,7 +6,8 @@
|
|
|
6
6
|
import { merge } from '@modular-prompt/core';
|
|
7
7
|
import { pathToFileURL } from 'url';
|
|
8
8
|
import { resolve } from 'path';
|
|
9
|
-
import { baseEvaluationModule } from '../
|
|
9
|
+
import { baseEvaluationModule } from '../base-evaluation-module.js';
|
|
10
|
+
import { getBuiltinEvaluator } from '../evaluators/index.js';
|
|
10
11
|
/**
|
|
11
12
|
* Load evaluators from references
|
|
12
13
|
*
|
|
@@ -17,43 +18,17 @@ import { baseEvaluationModule } from '../evaluators/base-module.js';
|
|
|
17
18
|
export async function loadEvaluators(refs, basePath) {
|
|
18
19
|
const evaluators = [];
|
|
19
20
|
for (const ref of refs) {
|
|
21
|
+
let evaluator;
|
|
20
22
|
if ('path' in ref) {
|
|
21
23
|
// External file
|
|
22
24
|
const filePath = resolve(basePath, ref.path);
|
|
23
25
|
const fileUrl = pathToFileURL(filePath).href;
|
|
24
26
|
const imported = await import(fileUrl);
|
|
25
|
-
|
|
27
|
+
evaluator = imported.default;
|
|
26
28
|
if (!evaluator) {
|
|
27
29
|
console.warn(`⚠️ No default export in ${ref.path}`);
|
|
28
30
|
continue;
|
|
29
31
|
}
|
|
30
|
-
// Detect type by checking properties
|
|
31
|
-
if ('evaluate' in evaluator && typeof evaluator.evaluate === 'function') {
|
|
32
|
-
// Code evaluator
|
|
33
|
-
evaluators.push({
|
|
34
|
-
name: ref.name,
|
|
35
|
-
description: ref.description || evaluator.description || '',
|
|
36
|
-
type: 'code',
|
|
37
|
-
codeEvaluator: evaluator,
|
|
38
|
-
});
|
|
39
|
-
}
|
|
40
|
-
else if ('module' in evaluator) {
|
|
41
|
-
// Prompt evaluator - merge with base module
|
|
42
|
-
const mergedModule = merge(baseEvaluationModule, evaluator.module);
|
|
43
|
-
evaluators.push({
|
|
44
|
-
name: ref.name,
|
|
45
|
-
description: ref.description || evaluator.description || '',
|
|
46
|
-
type: 'prompt',
|
|
47
|
-
promptEvaluator: {
|
|
48
|
-
name: evaluator.name,
|
|
49
|
-
description: evaluator.description,
|
|
50
|
-
module: mergedModule,
|
|
51
|
-
},
|
|
52
|
-
});
|
|
53
|
-
}
|
|
54
|
-
else {
|
|
55
|
-
console.warn(`⚠️ Unknown evaluator type in ${ref.path}`);
|
|
56
|
-
}
|
|
57
32
|
}
|
|
58
33
|
else if ('prompt' in ref) {
|
|
59
34
|
// Inline prompt definition - merge with base module
|
|
@@ -68,6 +43,42 @@ export async function loadEvaluators(refs, basePath) {
|
|
|
68
43
|
module: mergedModule,
|
|
69
44
|
},
|
|
70
45
|
});
|
|
46
|
+
continue;
|
|
47
|
+
}
|
|
48
|
+
else {
|
|
49
|
+
// Builtin evaluator (name only)
|
|
50
|
+
evaluator = getBuiltinEvaluator(ref.name);
|
|
51
|
+
if (!evaluator) {
|
|
52
|
+
console.warn(`⚠️ Builtin evaluator not found: ${ref.name}`);
|
|
53
|
+
continue;
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
// Detect type by checking properties
|
|
57
|
+
if ('evaluate' in evaluator && typeof evaluator.evaluate === 'function') {
|
|
58
|
+
// Code evaluator
|
|
59
|
+
evaluators.push({
|
|
60
|
+
name: ref.name,
|
|
61
|
+
description: ref.description || evaluator.description || '',
|
|
62
|
+
type: 'code',
|
|
63
|
+
codeEvaluator: evaluator,
|
|
64
|
+
});
|
|
65
|
+
}
|
|
66
|
+
else if ('module' in evaluator) {
|
|
67
|
+
// Prompt evaluator - merge with base module
|
|
68
|
+
const mergedModule = merge(baseEvaluationModule, evaluator.module);
|
|
69
|
+
evaluators.push({
|
|
70
|
+
name: ref.name,
|
|
71
|
+
description: ref.description || evaluator.description || '',
|
|
72
|
+
type: 'prompt',
|
|
73
|
+
promptEvaluator: {
|
|
74
|
+
name: evaluator.name,
|
|
75
|
+
description: evaluator.description,
|
|
76
|
+
module: mergedModule,
|
|
77
|
+
},
|
|
78
|
+
});
|
|
79
|
+
}
|
|
80
|
+
else {
|
|
81
|
+
console.warn(`⚠️ Unknown evaluator type: ${ref.name}`);
|
|
71
82
|
}
|
|
72
83
|
}
|
|
73
84
|
return evaluators;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dynamic-loader.js","sourceRoot":"","sources":["../../src/config/dynamic-loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,sBAAsB,CAAC;AAC7C,OAAO,EAAE,aAAa,EAAE,MAAM,KAAK,CAAC;AACpC,OAAO,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AAO/B,OAAO,EAAE,oBAAoB,EAAE,MAAM,8BAA8B,CAAC;AACpE,OAAO,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAC;AAa7D;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,IAA0B,EAC1B,QAAgB;IAEhB,MAAM,UAAU,GAAsB,EAAE,CAAC;IAEzC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,IAAI,SAAsD,CAAC;QAE3D,IAAI,MAAM,IAAI,GAAG,EAAE,CAAC;YAClB,gBAAgB;YAChB,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;YAC7C,MAAM,OAAO,GAAG,aAAa,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC;YAC7C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,CAAC;YACvC,SAAS,GAAG,QAAQ,CAAC,OAAO,CAAC;YAE7B,IAAI,CAAC,SAAS,EAAE,CAAC;gBACf,OAAO,CAAC,IAAI,CAAC,4BAA4B,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;gBACrD,SAAS;YACX,CAAC;QACH,CAAC;aAAM,IAAI,QAAQ,IAAI,GAAG,EAAE,CAAC;YAC3B,oDAAoD;YACpD,MAAM,YAAY,GAAG,KAAK,CAAC,oBAAoB,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;YAC7D,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,EAAE;gBAClC,IAAI,EAAE,QAAQ;gBACd,eAAe,EAAE;oBACf,IAAI,EAAE,GAAG,CAAC,IAAI;oBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,EAAE;oBAClC,MAAM,EAAE,YAAY;iBACrB;aACF,CAAC,CAAC;YACH,SAAS;QACX,CAAC;aAAM,CAAC;YACN,gCAAgC;YAChC,SAAS,GAAG,mBAAmB,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YAE1C,IAAI,CAAC,SAAS,EAAE,CAAC;gBACf,OAAO,CAAC,IAAI,CAAC,oCAAoC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;gBAC7D,SAAS;YACX,CAAC;QACH,CAAC;QAED,qCAAqC;QACrC,IAAI,UAAU,IAAI,SAAS,IAAI,OAAO,SAAS,CAAC,QAAQ,KAAK,UAAU,EAAE,CAAC;YACxE,iBAAiB;YACjB,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,SAAS,CAAC,WAAW,IAAI,EAAE;gBAC3D,IAAI,EAAE,MAAM;gBACZ,aAAa,EAAE,SAA0B;aAC1C,CAAC,CAAC;QACL,CAAC;aAAM,IAAI,QAAQ,IAAI,SAAS,EAAE,CAAC;YACjC,4CAA4C;YAC5C,MAAM,YAAY,GAAG,KAAK,CAAC,oBAAoB,EAAE,SAAS,CAAC,MAAM,CAAC,CAAC;YACnE,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,SAAS,CAAC,WAAW,IAAI,EAAE;gBAC3D,IAAI,EAAE,QAAQ;gBACd,eAAe,EAAE;oBACf,IAAI,EAAE,SAAS,CAAC,IAAI;oBACpB,WAAW,EAAE,SAAS,CAAC,WAAW;oBAClC,MAAM,EAAE,YAAY;iBACrB;aACF,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,IAAI,CAAC,+BAA+B,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QAC1D,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC;AAQD;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,IAAuB,EACvB,QAAgB;IAEhB,MAAM,OAAO,GAAuB,EAAE,CAAC;IAEvC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;QAC7C,MAAM,OAAO,GAAG,aAAa,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC;QAC7C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,CAAC;QACvC,MAAM,MAAM,GAAG,QAAQ,CAAC,OAAO,CAAC;QAEhC,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,OAAO,CAAC,IAAI,CAAC,4BAA4B,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;YACrD,SAAS;QACX,CAAC;QAED,OAAO,CAAC,IAAI,CAAC;YACX,IAAI,EAAE,GAAG,CAAC,IAAI;YACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,MAAM,CAAC,WAAW,IAAI,EAAE;YACxD,OAAO,EAAE,MAAM,CAAC,OAAO;SACxB,CAAC,CAAC;IACL,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"loader.d.ts","sourceRoot":"","sources":["../../src/config/loader.ts"],"names":[],"mappings":"AAAA;;GAEG;AAMH,OAAO,EAAE,SAAS,EAA0B,MAAM,wBAAwB,CAAC;AAC3E,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AAC3D,OAAO,KAAK,EAAE,kBAAkB,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAEhE,MAAM,WAAW,YAAY;IAC3B,YAAY,EAAE,GAAG,CAAC;IAClB,OAAO,EAAE,eAAe,EAAE,CAAC;IAC3B,SAAS,EAAE,QAAQ,EAAE,CAAC;IACtB,UAAU,EAAE,kBAAkB,EAAE,CAAC;IACjC,SAAS,EAAE,SAAS,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;CACnB;AAGD,MAAM,MAAM,gBAAgB,GAAG,YAAY,CAAC;AAsB5C;;;;;GAKG;AACH,wBAAsB,oBAAoB,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC,CAsGpF"}
|
|
@@ -67,9 +67,6 @@ export async function loadExperimentConfig(configPath) {
|
|
|
67
67
|
drivers: config.drivers,
|
|
68
68
|
evaluation: config.evaluation,
|
|
69
69
|
credentials: config.credentials,
|
|
70
|
-
selection: config.selection,
|
|
71
|
-
server: config.server,
|
|
72
|
-
logging: config.logging,
|
|
73
70
|
};
|
|
74
71
|
// Resolve paths in driver configurations relative to config file
|
|
75
72
|
if (serverConfig.drivers) {
|
|
@@ -104,6 +101,8 @@ export async function loadExperimentConfig(configPath) {
|
|
|
104
101
|
}
|
|
105
102
|
}
|
|
106
103
|
// Initialize AIService
|
|
104
|
+
// Note: AIService is used only as a driver factory.
|
|
105
|
+
// Model selection is explicit in experiment configuration, not capability-based.
|
|
107
106
|
const aiServiceConfig = {
|
|
108
107
|
models: serverConfig.models,
|
|
109
108
|
drivers: serverConfig.drivers || {},
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"loader.js","sourceRoot":"","sources":["../../src/config/loader.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAClC,OAAO,EAAE,KAAK,IAAI,SAAS,EAAE,MAAM,MAAM,CAAC;AAC1C,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AACjD,OAAO,EAAE,UAAU,EAAE,MAAM,MAAM,CAAC;AAClC,OAAO,EAAE,SAAS,EAA0B,MAAM,wBAAwB,CAAC;AAgB3E;;;;;;GAMG;AACH,SAAS,iBAAiB,CAAC,SAAiB,EAAE,IAAY;IACxD,sDAAsD;IACtD,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QACzB,OAAO,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,OAAO,CAAC,GAAG,CAAC,IAAI,IAAI,GAAG,CAAC,CAAC;IACpD,CAAC;IACD,oCAAoC;IACpC,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QACzB,OAAO,IAAI,CAAC;IACd,CAAC;IACD,kDAAkD;IAClD,OAAO,OAAO,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC;AAClC,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB,CAAC,UAAkB;IAC3D,0CAA0C;IAC1C,MAAM,SAAS,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;IACtC,MAAM,GAAG,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;IAEhC,iCAAiC;IACjC,IAAI,MAAW,CAAC;IAEhB,IAAI,GAAG,KAAK,OAAO,IAAI,GAAG,KAAK,MAAM,EAAE,CAAC;QACtC,cAAc;QACd,MAAM,OAAO,GAAG,YAAY,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC;QAClD,MAAM,GAAG,SAAS,CAAC,OAAO,CAAC,CAAC;IAC9B,CAAC;SAAM,IAAI,GAAG,KAAK,KAAK,IAAI,GAAG,KAAK,KAAK,IAAI,GAAG,KAAK,MAAM,IAAI,GAAG,KAAK,MAAM,EAAE,CAAC;QAC9E,0DAA0D;QAC1D,MAAM,IAAI,GAAG,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,EAAE;YACvC,cAAc,EAAE,IAAI,EAAG,mCAAmC;YAC1D,KAAK,EAAE,IAAI,EAAY,wCAAwC;YAC/D,YAAY,EAAE,KAAK,EAAI,0BAA0B;SAClD,CAAC,CAAC;QAEH,MAAM,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;QAEvC,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,0BAA0B,UAAU,EAAE,CAAC,CAAC;QAC1D,CAAC;IACH,CAAC;SAAM,CAAC;QACN,MAAM,IAAI,KAAK,CAAC,qCAAqC,GAAG,4CAA4C,CAAC,CAAC;IACxG,CAAC;IAED,qBAAqB;IACrB,MAAM,OAAO,GAAsB,MAAM,CAAC,OAAO,IAAI,EAAE,CAAC;IACxD,MAAM,SAAS,GAAe,MAAM,CAAC,SAAS,IAAI,EAAE,CAAC;IACrD,MAAM,UAAU,GAAyB,MAAM,CAAC,UAAU,IAAI,EAAE,CAAC;IAEjE,oDAAoD;IACpD,MAAM,YAAY,GAAG;QACnB,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,OAAO,EAAE,MAAM,CAAC,OAAO;QACvB,UAAU,EAAE,MAAM,CAAC,UAAU;QAC7B,WAAW,EAAE,MAAM,CAAC,WAAW;KAChC,CAAC;IAEF,iEAAiE;IACjE,IAAI,YAAY,CAAC,OAAO,EAAE,CAAC;QACzB,KAAK,MAAM,UAAU,IAAI,YAAY,CAAC,OAAO,EAAE,CAAC;YAC9C,MAAM,YAAY,GAAG,YAAY,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;YAEtD,8CAA8C;YAC9C,IAAI,YAAY,CAAC,eAAe,EAAE,CAAC;gBACjC,YAAY,CAAC,eAAe,GAAG,iBAAiB,CAAC,SAAS,EAAE,YAAY,CAAC,eAAe,CAAC,CAAC;YAC5F,CAAC;QACH,CAAC;IACH,CAAC;IAED,kDAAkD;IAClD,IAAI,YAAY,CAAC,WAAW,EAAE,4BAA4B,EAAE,CAAC;QAC3D,MAAM,YAAY,GAAG,iBAAiB,CAAC,SAAS,EAAE,YAAY,CAAC,WAAW,CAAC,4BAA4B,CAAC,CAAC;QACzG,OAAO,CAAC,GAAG,CAAC,8BAA8B,GAAG,YAAY,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC,0CAA0C,YAAY,EAAE,CAAC,CAAC;IACxE,CAAC;IAED,aAAa;IACb,IAAI,CAAC,YAAY,CAAC,MAAM,IAAI,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1E,MAAM,IAAI,KAAK,CAAC,uCAAuC,CAAC,CAAC;IAC3D,CAAC;IAED,mCAAmC;IACnC,MAAM,UAAU,GAAG,IAAI,GAAG,CAAS,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC;IAErE,qCAAqC;IACrC,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,IAAI,QAAQ,CAAC,MAAM,EAAE,CAAC;YACpB,KAAK,MAAM,SAAS,IAAI,QAAQ,CAAC,MAAM,EAAE,CAAC;gBACxC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,CAAC;oBAC/B,MAAM,IAAI,KAAK,CAAC,eAAe,QAAQ,CAAC,IAAI,+BAA+B,SAAS,GAAG,CAAC,CAAC;gBAC3F,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,uBAAuB;IACvB,oDAAoD;IACpD,iFAAiF;IACjF,MAAM,eAAe,GAAsB;QACzC,MAAM,EAAE,YAAY,CAAC,MAAM;QAC3B,OAAO,EAAE,YAAY,CAAC,OAAO,IAAI,EAAE;QACnC,cAAc,EAAE;YACd,WAAW,EAAE,GAAG;YAChB,SAAS,EAAE,IAAI;SAChB;KACF,CAAC;IAEF,MAAM,SAAS,GAAG,IAAI,SAAS,CAAC,eAAe,CAAC,CAAC;IAEjD,OAAO;QACL,YAAY;QACZ,OAAO;QACP,SAAS;QACT,UAAU;QACV,SAAS;QACT,SAAS;KACV,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Built-in evaluators
|
|
3
|
+
*/
|
|
4
|
+
import type { CodeEvaluator, PromptEvaluator } from '../types.js';
|
|
5
|
+
type BuiltinEvaluator = CodeEvaluator | PromptEvaluator;
|
|
6
|
+
export declare const builtinEvaluators: Record<string, BuiltinEvaluator>;
|
|
7
|
+
/**
|
|
8
|
+
* Get builtin evaluator by name
|
|
9
|
+
*/
|
|
10
|
+
export declare function getBuiltinEvaluator(name: string): BuiltinEvaluator | undefined;
|
|
11
|
+
export {};
|
|
12
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/evaluators/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,OAAO,KAAK,EAAE,aAAa,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAElE,KAAK,gBAAgB,GAAG,aAAa,GAAG,eAAe,CAAC;AAExD,eAAO,MAAM,iBAAiB,EAAE,MAAM,CAAC,MAAM,EAAE,gBAAgB,CAG9D,CAAC;AAEF;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,gBAAgB,GAAG,SAAS,CAE9E"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Built-in evaluators
|
|
3
|
+
*/
|
|
4
|
+
import structuredOutputPresence from './structured-output-presence.js';
|
|
5
|
+
import llmRequirementFulfillment from './llm-requirement-fulfillment.js';
|
|
6
|
+
export const builtinEvaluators = {
|
|
7
|
+
'structured-output-presence': structuredOutputPresence,
|
|
8
|
+
'llm-requirement-fulfillment': llmRequirementFulfillment,
|
|
9
|
+
};
|
|
10
|
+
/**
|
|
11
|
+
* Get builtin evaluator by name
|
|
12
|
+
*/
|
|
13
|
+
export function getBuiltinEvaluator(name) {
|
|
14
|
+
return builtinEvaluators[name];
|
|
15
|
+
}
|
|
16
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/evaluators/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,wBAAwB,MAAM,iCAAiC,CAAC;AACvE,OAAO,yBAAyB,MAAM,kCAAkC,CAAC;AAKzE,MAAM,CAAC,MAAM,iBAAiB,GAAqC;IACjE,4BAA4B,EAAE,wBAAwB;IACtD,6BAA6B,EAAE,yBAAyB;CACzD,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,mBAAmB,CAAC,IAAY;IAC9C,OAAO,iBAAiB,CAAC,IAAI,CAAC,CAAC;AACjC,CAAC"}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* LLM Requirement Fulfillment Evaluator
|
|
3
3
|
*
|
|
4
|
-
*
|
|
4
|
+
* Uses LLM to evaluate whether the output meets the functional requirements
|
|
5
5
|
*/
|
|
6
6
|
import type { PromptModule } from '@modular-prompt/core';
|
|
7
7
|
import type { EvaluationContext } from '../types.js';
|
|
@@ -11,4 +11,4 @@ declare const _default: {
|
|
|
11
11
|
module: PromptModule<EvaluationContext>;
|
|
12
12
|
};
|
|
13
13
|
export default _default;
|
|
14
|
-
//# sourceMappingURL=
|
|
14
|
+
//# sourceMappingURL=llm-requirement-fulfillment.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"llm-requirement-fulfillment.d.ts","sourceRoot":"","sources":["../../src/evaluators/llm-requirement-fulfillment.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,KAAK,EAAmB,iBAAiB,EAAE,MAAM,aAAa,CAAC;;;;;;AA0FtE,wBAI4B"}
|
package/dist/{src/evaluators/functional-correctness.js → evaluators/llm-requirement-fulfillment.js}
RENAMED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* LLM Requirement Fulfillment Evaluator
|
|
3
3
|
*
|
|
4
|
-
*
|
|
4
|
+
* Uses LLM to evaluate whether the output meets the functional requirements
|
|
5
5
|
*/
|
|
6
|
-
const
|
|
6
|
+
const llmRequirementFulfillmentModule = {
|
|
7
7
|
createContext: () => ({
|
|
8
8
|
moduleName: '',
|
|
9
9
|
prompt: '',
|
|
@@ -88,8 +88,8 @@ const functionalCorrectnessModule = {
|
|
|
88
88
|
],
|
|
89
89
|
};
|
|
90
90
|
export default {
|
|
91
|
-
name: '
|
|
92
|
-
description: '
|
|
93
|
-
module:
|
|
91
|
+
name: 'LLM Requirement Fulfillment',
|
|
92
|
+
description: 'Overall requirement fulfillment score based on LLM evaluation. Compares prompt and output to evaluate requirement fulfillment, parameter correctness, completeness, and logical consistency.',
|
|
93
|
+
module: llmRequirementFulfillmentModule,
|
|
94
94
|
};
|
|
95
|
-
//# sourceMappingURL=
|
|
95
|
+
//# sourceMappingURL=llm-requirement-fulfillment.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"llm-requirement-fulfillment.js","sourceRoot":"","sources":["../../src/evaluators/llm-requirement-fulfillment.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAKH,MAAM,+BAA+B,GAAoC;IACvE,aAAa,EAAE,GAAsB,EAAE,CAAC,CAAC;QACvC,UAAU,EAAE,EAAE;QACd,MAAM,EAAE,EAAE;QACV,IAAI,EAAE,EAAE;KACT,CAAC;IAEF,SAAS,EAAE;QACT,uEAAuE;KACxE;IAED,YAAY,EAAE;QACZ,6CAA6C;QAC7C;YACE,IAAI,EAAE,YAAY;YAClB,KAAK,EAAE,qBAAqB;YAC5B,KAAK,EAAE;gBACL,qFAAqF;gBACrF,gFAAgF;gBAChF,uFAAuF;gBACvF,gFAAgF;aACjF;SACF;QACD;YACE,IAAI,EAAE,YAAY;YAClB,KAAK,EAAE,SAAS;YAChB,KAAK,EAAE;gBACL,2CAA2C;gBAC3C,iDAAiD;gBACjD,0CAA0C;aAC3C;SACF;KACF;IAED,MAAM,EAAE;QACN;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;gBACP,IAAI,EAAE,QAAQ;gBACd,UAAU,EAAE;oBACV,KAAK,EAAE;wBACL,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,sBAAsB;qBACpC;oBACD,SAAS,EAAE;wBACT,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,uBAAuB;qBACrC;oBACD,OAAO,EAAE;wBACP,IAAI,EAAE,QAAQ;wBACd,UAAU,EAAE;4BACV,sBAAsB,EAAE;gCACtB,IAAI,EAAE,QAAQ;gCACd,UAAU,EAAE;oCACV,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;oCACzB,SAAS,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;iCAC9B;6BACF;4BACD,oBAAoB,EAAE;gCACpB,IAAI,EAAE,QAAQ;gCACd,UAAU,EAAE;oCACV,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;oCACzB,SAAS,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;iCAC9B;6BACF;4BACD,qBAAqB,EAAE;gCACrB,IAAI,EAAE,QAAQ;gCACd,UAAU,EAAE;oCACV,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;oCACzB,SAAS,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;iCAC9B;6BACF;4BACD,kBAAkB,EAAE;gCAClB,IAAI,EAAE,QAAQ;gCACd,UAAU,EAAE;oCACV,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;oCACzB,SAAS,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;iCAC9B;6BACF;yBACF;qBACF;iBACF;gBACD,QAAQ,EAAE,CAAC,OAAO,EAAE,WAAW,EAAE,SAAS,CAAC;aAC5C;SACF;KACF;CACF,CAAC;AAEF,eAAe;IACb,IAAI,EAAE,6BAA6B;IACnC,WAAW,EAAE,8LAA8L;IAC3M,MAAM,EAAE,+BAA+B;CACd,CAAC"}
|
package/dist/{src/evaluators/json-validator.d.ts → evaluators/structured-output-presence.d.ts}
RENAMED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* Structured Output Presence Evaluator
|
|
3
3
|
*
|
|
4
|
-
*
|
|
4
|
+
* Checks if structuredOutput exists and is a valid object
|
|
5
5
|
*/
|
|
6
6
|
import type { EvaluationContext, EvaluationResult } from '../types.js';
|
|
7
7
|
declare const _default: {
|
|
@@ -10,4 +10,4 @@ declare const _default: {
|
|
|
10
10
|
evaluate(context: EvaluationContext): Promise<EvaluationResult>;
|
|
11
11
|
};
|
|
12
12
|
export default _default;
|
|
13
|
-
//# sourceMappingURL=
|
|
13
|
+
//# sourceMappingURL=structured-output-presence.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"structured-output-presence.d.ts","sourceRoot":"","sources":["../../src/evaluators/structured-output-presence.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAiB,iBAAiB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;;;;sBAM5D,iBAAiB,GAAG,OAAO,CAAC,gBAAgB,CAAC;;AAJvE,wBAiD0B"}
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* Structured Output Presence Evaluator
|
|
3
3
|
*
|
|
4
|
-
*
|
|
4
|
+
* Checks if structuredOutput exists and is a valid object
|
|
5
5
|
*/
|
|
6
6
|
export default {
|
|
7
|
-
name: '
|
|
8
|
-
description: '
|
|
7
|
+
name: 'Structured Output Presence',
|
|
8
|
+
description: 'Measures structured output presence rate (percentage of runs with valid structuredOutput). Checks if structuredOutput exists and is an object type for each run.',
|
|
9
9
|
async evaluate(context) {
|
|
10
10
|
const errors = [];
|
|
11
11
|
let validCount = 0;
|
|
@@ -33,7 +33,7 @@ export default {
|
|
|
33
33
|
? (validCount / context.runs.length) * 10
|
|
34
34
|
: 0;
|
|
35
35
|
return {
|
|
36
|
-
evaluator: '
|
|
36
|
+
evaluator: 'structured-output-presence',
|
|
37
37
|
moduleName: context.moduleName,
|
|
38
38
|
score,
|
|
39
39
|
reasoning: errors.length > 0
|
|
@@ -48,4 +48,4 @@ export default {
|
|
|
48
48
|
};
|
|
49
49
|
},
|
|
50
50
|
};
|
|
51
|
-
//# sourceMappingURL=
|
|
51
|
+
//# sourceMappingURL=structured-output-presence.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"structured-output-presence.js","sourceRoot":"","sources":["../../src/evaluators/structured-output-presence.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAIH,eAAe;IACb,IAAI,EAAE,4BAA4B;IAClC,WAAW,EAAE,kKAAkK;IAE/K,KAAK,CAAC,QAAQ,CAAC,OAA0B;QACvC,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,MAAM,UAAU,GAA2D,EAAE,CAAC;QAE9E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7C,MAAM,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC5B,MAAM,EAAE,gBAAgB,EAAE,GAAG,GAAG,CAAC,WAAW,CAAC;YAE7C,IAAI,CAAC,gBAAgB,EAAE,CAAC;gBACtB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,wBAAwB,CAAC,CAAC;gBAClD,UAAU,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,sBAAsB,EAAE,CAAC,CAAC;gBAC7E,SAAS;YACX,CAAC;YAED,wBAAwB;YACxB,IAAI,OAAO,gBAAgB,KAAK,QAAQ,IAAI,gBAAgB,KAAK,IAAI,EAAE,CAAC;gBACtE,UAAU,EAAE,CAAC;gBACb,UAAU,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;YAC/C,CAAC;iBAAM,CAAC;gBACN,MAAM,KAAK,GAAG,wBAAwB,CAAC;gBACvC,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,KAAK,EAAE,CAAC,CAAC;gBACtC,UAAU,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;YACvD,CAAC;QACH,CAAC;QAED,MAAM,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC;YACnC,CAAC,CAAC,CAAC,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE;YACzC,CAAC,CAAC,CAAC,CAAC;QAEN,OAAO;YACL,SAAS,EAAE,4BAA4B;YACvC,UAAU,EAAE,OAAO,CAAC,UAAU;YAC9B,KAAK;YACL,SAAS,EAAE,MAAM,CAAC,MAAM,GAAG,CAAC;gBAC1B,CAAC,CAAC,GAAG,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,2BAA2B,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;gBACpF,CAAC,CAAC,OAAO,UAAU,oCAAoC;YACzD,OAAO,EAAE;gBACP,UAAU;gBACV,UAAU,EAAE,OAAO,CAAC,IAAI,CAAC,MAAM;gBAC/B,MAAM;gBACN,IAAI,EAAE,UAAU;aACjB;SACF,CAAC;IACJ,CAAC;CACsB,CAAC"}
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
export * from './types.js';
|
|
7
7
|
export { loadExperimentConfig } from './config/loader.js';
|
|
8
8
|
export { loadModules, loadEvaluators } from './config/dynamic-loader.js';
|
|
9
|
-
export { baseEvaluationModule } from './
|
|
9
|
+
export { baseEvaluationModule } from './base-evaluation-module.js';
|
|
10
10
|
export { DriverManager } from './runner/driver-manager.js';
|
|
11
11
|
export { ExperimentRunner } from './runner/experiment.js';
|
|
12
12
|
export { EvaluatorRunner } from './runner/evaluator.js';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,cAAc,YAAY,CAAC;AAG3B,OAAO,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AAC1D,OAAO,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAGzE,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AAGnE,OAAO,EAAE,aAAa,EAAE,MAAM,4BAA4B,CAAC;AAC3D,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAGxD,OAAO,EAAE,kBAAkB,EAAE,MAAM,0BAA0B,CAAC"}
|
|
@@ -9,7 +9,7 @@ export * from './types.js';
|
|
|
9
9
|
export { loadExperimentConfig } from './config/loader.js';
|
|
10
10
|
export { loadModules, loadEvaluators } from './config/dynamic-loader.js';
|
|
11
11
|
// Evaluators
|
|
12
|
-
export { baseEvaluationModule } from './
|
|
12
|
+
export { baseEvaluationModule } from './base-evaluation-module.js';
|
|
13
13
|
// Runners
|
|
14
14
|
export { DriverManager } from './runner/driver-manager.js';
|
|
15
15
|
export { ExperimentRunner } from './runner/experiment.js';
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,QAAQ;AACR,cAAc,YAAY,CAAC;AAE3B,wBAAwB;AACxB,OAAO,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AAC1D,OAAO,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAEzE,aAAa;AACb,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AAEnE,UAAU;AACV,OAAO,EAAE,aAAa,EAAE,MAAM,4BAA4B,CAAC;AAC3D,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAExD,YAAY;AACZ,OAAO,EAAE,kBAAkB,EAAE,MAAM,0BAA0B,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"statistics.d.ts","sourceRoot":"","sources":["../../src/reporter/statistics.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAa,UAAU,EAAE,MAAM,aAAa,CAAC;AAEzD,qBAAa,kBAAkB;IACjB,OAAO,CAAC,OAAO;gBAAP,OAAO,EAAE,UAAU,EAAE;IAEzC;;OAEG;IACH,MAAM,IAAI,IAAI;IA2Bd;;OAEG;IACH,OAAO,CAAC,YAAY;IASpB;;OAEG;IACH,OAAO,CAAC,iBAAiB;CAwB1B"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"statistics.js","sourceRoot":"","sources":["../../src/reporter/statistics.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,MAAM,OAAO,kBAAkB;IACT;IAApB,YAAoB,OAAqB;QAArB,YAAO,GAAP,OAAO,CAAc;IAAG,CAAC;IAE7C;;OAEG;IACH,MAAM;QACJ,OAAO,CAAC,GAAG,EAAE,CAAC;QACd,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;QAC5B,OAAO,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC;QACrC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;QAC5B,OAAO,CAAC,GAAG,EAAE,CAAC;QAEd,KAAK,MAAM,MAAM,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YAClC,OAAO,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,QAAQ,MAAM,MAAM,CAAC,KAAK,OAAO,MAAM,CAAC,MAAM,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC;YACvF,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;YAE5B,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;YACvD,MAAM,WAAW,GAAG,CAAC,WAAW,CAAC,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC;YAEpE,OAAO,CAAC,GAAG,CAAC,iBAAiB,WAAW,CAAC,MAAM,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,KAAK,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAEtG,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC3B,IAAI,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC;gBAC/B,IAAI,CAAC,iBAAiB,CAAC,WAAW,CAAC,CAAC;YACtC,CAAC;YAED,OAAO,CAAC,GAAG,EAAE,CAAC;QAChB,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC9B,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,IAAiB;QACpC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QACvC,MAAM,GAAG,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC;QAC5D,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC;QAC/B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC;QAE/B,OAAO,CAAC,GAAG,CAAC,uBAAuB,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,WAAW,GAAG,WAAW,GAAG,IAAI,CAAC,CAAC;IACrF,CAAC;IAED;;OAEG;IACK,iBAAiB,CAAC,IAAiB;QACzC,2BAA2B;QAC3B,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE;YAC/B,MAAM,KAAK,GAAG,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;YAC7D,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QACxC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC;QAE3B,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC7B,OAAO;QACT,CAAC;QAED,MAAM,aAAa,GAAG,IAAI,GAAG,CAAC,WAAW,CAAC,CAAC;QAC3C,OAAO,CAAC,GAAG,CAAC,uBAAuB,aAAa,CAAC,IAAI,0BAA0B,WAAW,CAAC,MAAM,SAAS,CAAC,CAAC;QAE5G,IAAI,aAAa,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;YAC7B,OAAO,CAAC,GAAG,CAAC,6BAA6B,CAAC,CAAC;QAC7C,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC;YACjC,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,GAAG,EAAE,EAAE;gBAChD,MAAM,KAAK,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,KAAK,MAAM,CAAC,CAAC,MAAM,CAAC;gBAC3D,OAAO,CAAC,GAAG,CAAC,cAAc,GAAG,GAAG,CAAC,KAAK,KAAK,OAAO,MAAM,EAAE,CAAC,CAAC;YAC9D,CAAC,CAAC,CAAC;QACL,CAAC;IACH,CAAC;CACF"}
|
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
* --repeat <count> Number of repetitions (default: 1)
|
|
18
18
|
* --evaluate Enable evaluation phase
|
|
19
19
|
* --evaluators <names> Comma-separated evaluator names (default: all)
|
|
20
|
+
* --dry-run Display execution plan without running the experiment
|
|
20
21
|
*/
|
|
21
22
|
export {};
|
|
22
23
|
//# sourceMappingURL=run-comparison.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"run-comparison.d.ts","sourceRoot":"","sources":["../src/run-comparison.ts"],"names":[],"mappings":";AACA;;;;;;;;;;;;;;;;;;;GAmBG"}
|
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
* --repeat <count> Number of repetitions (default: 1)
|
|
18
18
|
* --evaluate Enable evaluation phase
|
|
19
19
|
* --evaluators <names> Comma-separated evaluator names (default: all)
|
|
20
|
+
* --dry-run Display execution plan without running the experiment
|
|
20
21
|
*/
|
|
21
22
|
import { parseArgs } from './cli/args.js';
|
|
22
23
|
import { loadExperimentConfig } from './config/loader.js';
|
|
@@ -39,6 +40,7 @@ console.log(`Evaluation: ${options.enableEvaluation ? 'enabled' : 'disabled'}`);
|
|
|
39
40
|
if (options.enableEvaluation) {
|
|
40
41
|
console.log(`Evaluators: ${options.evaluatorFilter?.join(', ') || 'all'}`);
|
|
41
42
|
}
|
|
43
|
+
console.log(`Dry run: ${options.dryRun ? 'enabled (plan only)' : 'disabled'}`);
|
|
42
44
|
console.log('='.repeat(80));
|
|
43
45
|
console.log();
|
|
44
46
|
// Load configuration
|
|
@@ -60,6 +62,12 @@ else {
|
|
|
60
62
|
console.log(`📋 Testing with ${modelEntries.length} model(s):`);
|
|
61
63
|
modelEntries.forEach(([name, spec]) => console.log(` - ${name}: ${spec.model} (${spec.provider})`));
|
|
62
64
|
}
|
|
65
|
+
// Warn about MLX resource usage
|
|
66
|
+
const hasMLX = modelEntries.some(([_, spec]) => spec.provider === 'mlx');
|
|
67
|
+
if (hasMLX) {
|
|
68
|
+
console.log();
|
|
69
|
+
console.log('⚠️ MLX models detected: Running multiple MLX models may consume significant system resources (CPU/Memory)');
|
|
70
|
+
}
|
|
63
71
|
console.log();
|
|
64
72
|
// Load test cases
|
|
65
73
|
const allTestCases = configTestCases;
|
|
@@ -124,6 +132,15 @@ if (options.enableEvaluation) {
|
|
|
124
132
|
console.log(`🔍 Evaluator model: ${modelName} (${modelSpec.provider}:${modelSpec.model})`);
|
|
125
133
|
console.log();
|
|
126
134
|
}
|
|
135
|
+
// Exit if dry run
|
|
136
|
+
if (options.dryRun) {
|
|
137
|
+
console.log('='.repeat(80));
|
|
138
|
+
console.log('✅ Configuration validated successfully');
|
|
139
|
+
console.log('📋 Execution plan displayed above');
|
|
140
|
+
console.log(' Remove --dry-run to execute the experiment');
|
|
141
|
+
console.log('='.repeat(80));
|
|
142
|
+
process.exit(0);
|
|
143
|
+
}
|
|
127
144
|
// Run experiment
|
|
128
145
|
const driverManager = new DriverManager();
|
|
129
146
|
const runner = new ExperimentRunner(aiService, driverManager, modules, testCases, models, options.repeatCount, evaluators, evaluatorModel);
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"run-comparison.js","sourceRoot":"","sources":["../src/run-comparison.ts"],"names":[],"mappings":";AACA;;;;;;;;;;;;;;;;;;;GAmBG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,eAAe,CAAC;AAC1C,OAAO,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AAC1D,OAAO,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AACzE,OAAO,EAAE,aAAa,EAAE,MAAM,4BAA4B,CAAC;AAC3D,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,kBAAkB,EAAE,MAAM,0BAA0B,CAAC;AAE9D,sBAAsB;AACtB,MAAM,OAAO,GAAG,SAAS,EAAE,CAAC;AAE5B,iBAAiB;AACjB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;AAC5B,OAAO,CAAC,GAAG,CAAC,8BAA8B,CAAC,CAAC;AAC5C,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;AAC5B,OAAO,CAAC,GAAG,CAAC,WAAW,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC;AAC7C,OAAO,CAAC,GAAG,CAAC,qBAAqB,OAAO,CAAC,cAAc,IAAI,KAAK,EAAE,CAAC,CAAC;AACpE,OAAO,CAAC,GAAG,CAAC,iBAAiB,OAAO,CAAC,WAAW,IAAI,oBAAoB,EAAE,CAAC,CAAC;AAC5E,OAAO,CAAC,GAAG,CAAC,YAAY,OAAO,CAAC,YAAY,EAAE,IAAI,CAAC,IAAI,CAAC,IAAI,KAAK,EAAE,CAAC,CAAC;AACrE,OAAO,CAAC,GAAG,CAAC,WAAW,OAAO,CAAC,WAAW,UAAU,CAAC,CAAC;AACtD,OAAO,CAAC,GAAG,CAAC,eAAe,OAAO,CAAC,gBAAgB,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC;AAChF,IAAI,OAAO,CAAC,gBAAgB,EAAE,CAAC;IAC7B,OAAO,CAAC,GAAG,CAAC,eAAe,OAAO,CAAC,eAAe,EAAE,IAAI,CAAC,IAAI,CAAC,IAAI,KAAK,EAAE,CAAC,CAAC;AAC7E,CAAC;AACD,OAAO,CAAC,GAAG,CAAC,YAAY,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC;AAC/E,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;AAC5B,OAAO,CAAC,GAAG,EAAE,CAAC;AAEd,qBAAqB;AACrB,MAAM,EACJ,YAAY,EACZ,OAAO,EAAE,aAAa,EACtB,SAAS,EAAE,eAAe,EAC1B,UAAU,EAAE,gBAAgB,EAC5B,SAAS,EACT,SAAS,EACV,GAAG,MAAM,oBAAoB,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;AAEnD,8CAA8C;AAC9C,MAAM,MAAM,GAAG,YAAY,CAAC,MAAM,CAAC;AAEnC,uCAAuC;AACvC,MAAM,YAAY,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,CAAgB,EAAE,EAAE,CAC9E,IAAI,CAAC,OAAO,KAAK,KAAK,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,KAAK,MAAM,CAAC,CAC/D,CAAC;AAEF,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;IACxB,MAAM,eAAe,GAAG,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,CAAgB,EAAE,EAAE,CACvE,IAAI,CAAC,QAAQ,KAAK,OAAO,CAAC,WAAW,CACtC,CAAC;IACF,IAAI,eAAe,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACjC,OAAO,CAAC,KAAK,CAAC,gDAAgD,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC;QACrF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IACD,OAAO,CAAC,GAAG,CAAC,mBAAmB,eAAe,CAAC,MAAM,0BAA0B,OAAO,CAAC,WAAW,IAAI,CAAC,CAAC;IACxG,eAAe,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,IAAI,CAAgB,EAAE,EAAE,CACtD,OAAO,CAAC,GAAG,CAAC,OAAO,IAAI,KAAK,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,QAAQ,GAAG,CAAC,CAC7D,CAAC;AACJ,CAAC;KAAM,CAAC;IACN,OAAO,CAAC,GAAG,CAAC,mBAAmB,YAAY,CAAC,MAAM,YAAY,CAAC,CAAC;IAChE,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,IAAI,CAAgB,EAAE,EAAE,CACnD,OAAO,CAAC,GAAG,CAAC,OAAO,IAAI,KAAK,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,QAAQ,GAAG,CAAC,CAC7D,CAAC;AACJ,CAAC;AAED,gCAAgC;AAChC,MAAM,MAAM,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,CAAgB,EAAE,EAAE,CAAC,IAAI,CAAC,QAAQ,KAAK,KAAK,CAAC,CAAC;AACxF,IAAI,MAAM,EAAE,CAAC;IACX,OAAO,CAAC,GAAG,EAAE,CAAC;IACd,OAAO,CAAC,GAAG,CAAC,4GAA4G,CAAC,CAAC;AAC5H,CAAC;AACD,OAAO,CAAC,GAAG,EAAE,CAAC;AAEd,kBAAkB;AAClB,MAAM,YAAY,GAAG,eAAe,CAAC;AACrC,MAAM,SAAS,GAAG,OAAO,CAAC,cAAc;IACtC,CAAC,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,EAAO,EAAE,EAAE,CAAC,EAAE,CAAC,IAAI,KAAK,OAAO,CAAC,cAAc,CAAC;IACtE,CAAC,CAAC,YAAY,CAAC;AAEjB,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;IAC3B,OAAO,CAAC,KAAK,CAAC,wBAAwB,OAAO,CAAC,cAAc,CAAC,CAAC,CAAC,cAAc,OAAO,CAAC,cAAc,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAC9G,OAAO,CAAC,KAAK,CAAC,yCAAyC,CAAC,CAAC;IACzD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC;AAED,OAAO,CAAC,GAAG,CAAC,cAAc,SAAS,CAAC,MAAM,eAAe,CAAC,CAAC;AAC3D,OAAO,CAAC,GAAG,EAAE,CAAC;AAEd,wCAAwC;AACxC,MAAM,UAAU,GAAG,MAAM,WAAW,CAAC,aAAa,EAAE,SAAS,CAAC,CAAC;AAC/D,MAAM,OAAO,GAAG,OAAO,CAAC,YAAY;IAClC,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,YAAa,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IAChE,CAAC,CAAC,UAAU,CAAC;AAEf,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;IACzB,OAAO,CAAC,KAAK,CAAC,sBAAsB,CAAC,CAAC;IACtC,OAAO,CAAC,KAAK,CAAC,sCAAsC,CAAC,CAAC;IACtD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC;AAED,OAAO,CAAC,GAAG,CAAC,cAAc,OAAO,CAAC,MAAM,aAAa,CAAC,CAAC;AACvD,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC;AACrE,OAAO,CAAC,GAAG,EAAE,CAAC;AAEd,8DAA8D;AAC9D,IAAI,UAAU,CAAC;AACf,IAAI,cAAc,CAAC;AACnB,IAAI,OAAO,CAAC,gBAAgB,EAAE,CAAC;IAC7B,8CAA8C;IAC9C,MAAM,aAAa,GAAG,MAAM,cAAc,CAAC,gBAAgB,EAAE,SAAS,CAAC,CAAC;IACxE,UAAU,GAAG,OAAO,CAAC,eAAe;QAClC,CAAC,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,eAAgB,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QACtE,CAAC,CAAC,aAAa,CAAC;IAElB,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC5B,OAAO,CAAC,KAAK,CAAC,uBAAuB,CAAC,CAAC;QACvC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,8CAA8C;IAC9C,IAAI,CAAC,YAAY,CAAC,UAAU,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,OAAO,EAAE,CAAC;QACjE,OAAO,CAAC,KAAK,CAAC,+CAA+C,CAAC,CAAC;QAC/D,OAAO,CAAC,KAAK,CAAC,uDAAuD,CAAC,CAAC;QACvE,OAAO,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;QAChC,OAAO,CAAC,KAAK,CAAC,oBAAoB,CAAC,CAAC;QACpC,OAAO,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;QAC1C,OAAO,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;QAChD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,MAAM,gBAAgB,GAAG,YAAY,CAAC,UAAU,CAAC;IAEjD,mCAAmC;IACnC,MAAM,SAAS,GAAG,gBAAgB,CAAC,KAAK,CAAC;IACzC,MAAM,SAAS,GAAG,YAAY,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;IAEjD,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,OAAO,KAAK,KAAK,EAAE,CAAC;QAC9C,OAAO,CAAC,KAAK,CAAC,4CAA4C,SAAS,EAAE,CAAC,CAAC;QACvE,OAAO,CAAC,KAAK,CAAC,yEAAyE,CAAC,CAAC;QACzF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,cAAc,GAAG,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC;IAEtD,OAAO,CAAC,GAAG,CAAC,8BAA8B,UAAU,CAAC,MAAM,gBAAgB,CAAC,CAAC;IAC7E,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC;IACpF,OAAO,CAAC,GAAG,CAAC,uBAAuB,SAAS,KAAK,SAAS,CAAC,QAAQ,IAAI,SAAS,CAAC,KAAK,GAAG,CAAC,CAAC;IAC3F,OAAO,CAAC,GAAG,EAAE,CAAC;AAChB,CAAC;AAED,kBAAkB;AAClB,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;IACnB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC5B,OAAO,CAAC,GAAG,CAAC,wCAAwC,CAAC,CAAC;IACtD,OAAO,CAAC,GAAG,CAAC,mCAAmC,CAAC,CAAC;IACjD,OAAO,CAAC,GAAG,CAAC,+CAA+C,CAAC,CAAC;IAC7D,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC5B,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC;AAED,iBAAiB;AACjB,MAAM,aAAa,GAAG,IAAI,aAAa,EAAE,CAAC;AAC1C,MAAM,MAAM,GAAG,IAAI,gBAAgB,CACjC,SAAS,EACT,aAAa,EACb,OAAO,EACP,SAAS,EACT,MAAM,EACN,OAAO,CAAC,WAAW,EACnB,UAAU,EACV,cAAc,CACf,CAAC;AAEF,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,GAAG,EAAE,CAAC;AAEnC,qBAAqB;AACrB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;AAC5B,OAAO,CAAC,GAAG,CAAC,wBAAwB,CAAC,CAAC;AACtC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;AAE5B,kBAAkB;AAClB,MAAM,aAAa,CAAC,OAAO,EAAE,CAAC;AAE9B,iCAAiC;AACjC,IAAI,OAAO,CAAC,WAAW,GAAG,CAAC,EAAE,CAAC;IAC5B,MAAM,QAAQ,GAAG,IAAI,kBAAkB,CAAC,OAAO,CAAC,CAAC;IACjD,QAAQ,CAAC,MAAM,EAAE,CAAC;AACpB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"driver-manager.d.ts","sourceRoot":"","sources":["../../src/runner/driver-manager.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,wBAAwB,CAAC;AAEnE,qBAAa,aAAa;IACxB,OAAO,CAAC,KAAK,CAA0B;IAEvC;;;;;;;;;;OAUG;IACG,WAAW,CAAC,SAAS,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,SAAS,GAAG,OAAO,CAAC,GAAG,CAAC;IAY9F;;;;OAIG;IACG,KAAK,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAkB7C;;OAEG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAiB/B"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"driver-manager.js","sourceRoot":"","sources":["../../src/runner/driver-manager.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,MAAM,OAAO,aAAa;IAChB,KAAK,GAAG,IAAI,GAAG,EAAe,CAAC;IAEvC;;;;;;;;;;OAUG;IACH,KAAK,CAAC,WAAW,CAAC,SAAoB,EAAE,SAAiB,EAAE,SAAoB;QAC7E,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,CAAC;YAC9B,OAAO,CAAC,GAAG,CAAC,8BAA8B,SAAS,EAAE,CAAC,CAAC;YACvD,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QACnC,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,8BAA8B,SAAS,KAAK,SAAS,CAAC,QAAQ,IAAI,SAAS,CAAC,KAAK,GAAG,CAAC,CAAC;QAClG,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;QACvD,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;QAClC,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,KAAK,CAAC,SAAiB;QAC3B,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QAEzC,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,OAAO;QACT,CAAC;QAED,IAAI,CAAC;YACH,IAAI,OAAO,MAAM,CAAC,KAAK,KAAK,UAAU,EAAE,CAAC;gBACvC,MAAM,MAAM,CAAC,KAAK,EAAE,CAAC;gBACrB,OAAO,CAAC,GAAG,CAAC,uBAAuB,SAAS,EAAE,CAAC,CAAC;YAClD,CAAC;YACD,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAC/B,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,GAAG,CAAC,iCAAiC,SAAS,KAAK,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QACvH,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,OAAO,CAAC,GAAG,EAAE,CAAC;QACd,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC;QAEjC,KAAK,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,EAAE,CAAC;YACjD,IAAI,CAAC;gBACH,IAAI,MAAM,IAAI,OAAO,MAAM,CAAC,KAAK,KAAK,UAAU,EAAE,CAAC;oBACjD,MAAM,MAAM,CAAC,KAAK,EAAE,CAAC;oBACrB,OAAO,CAAC,GAAG,CAAC,qBAAqB,GAAG,EAAE,CAAC,CAAC;gBAC1C,CAAC;YACH,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO,CAAC,GAAG,CAAC,sCAAsC,GAAG,KAAK,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;YACtH,CAAC;QACH,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,qBAAqB,CAAC,CAAC;IACrC,CAAC;CACF"}
|
|
@@ -26,7 +26,8 @@ export declare class EvaluatorRunner {
|
|
|
26
26
|
* Display evaluation results
|
|
27
27
|
*
|
|
28
28
|
* @param results - Evaluation results to display
|
|
29
|
+
* @param evaluators - Loaded evaluators (optional, for showing descriptions)
|
|
29
30
|
*/
|
|
30
|
-
displayResults(results: EvaluationResult[]): void;
|
|
31
|
+
displayResults(results: EvaluationResult[], evaluators?: LoadedEvaluator[]): void;
|
|
31
32
|
}
|
|
32
33
|
//# sourceMappingURL=evaluator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../../src/runner/evaluator.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,wBAAwB,CAAC;AACnE,OAAO,KAAK,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AACvE,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAEnE,qBAAa,eAAe;IAExB,OAAO,CAAC,SAAS;IACjB,OAAO,CAAC,cAAc;gBADd,SAAS,EAAE,SAAS,EACpB,cAAc,EAAE,SAAS;IAGnC;;;;;;OAMG;IACG,QAAQ,CACZ,SAAS,EAAE,eAAe,EAC1B,OAAO,EAAE,iBAAiB,GACzB,OAAO,CAAC,gBAAgB,CAAC;IAuB5B;;OAEG;YACW,kBAAkB;IAiEhC;;;;;OAKG;IACH,cAAc,CAAC,OAAO,EAAE,gBAAgB,EAAE,EAAE,UAAU,CAAC,EAAE,eAAe,EAAE,GAAG,IAAI;CAmDlF"}
|
|
@@ -102,8 +102,9 @@ export class EvaluatorRunner {
|
|
|
102
102
|
* Display evaluation results
|
|
103
103
|
*
|
|
104
104
|
* @param results - Evaluation results to display
|
|
105
|
+
* @param evaluators - Loaded evaluators (optional, for showing descriptions)
|
|
105
106
|
*/
|
|
106
|
-
displayResults(results) {
|
|
107
|
+
displayResults(results, evaluators) {
|
|
107
108
|
console.log();
|
|
108
109
|
console.log('='.repeat(80));
|
|
109
110
|
console.log('📊 Evaluation Results');
|
|
@@ -122,6 +123,13 @@ export class EvaluatorRunner {
|
|
|
122
123
|
console.log('─'.repeat(80));
|
|
123
124
|
for (const result of moduleResults) {
|
|
124
125
|
console.log(` 🔍 ${result.evaluator}`);
|
|
126
|
+
// Show description if evaluators are provided
|
|
127
|
+
if (evaluators) {
|
|
128
|
+
const evaluator = evaluators.find(e => e.name === result.evaluator);
|
|
129
|
+
if (evaluator?.description) {
|
|
130
|
+
console.log(` ${evaluator.description}`);
|
|
131
|
+
}
|
|
132
|
+
}
|
|
125
133
|
if (result.error) {
|
|
126
134
|
console.log(` ❌ Error: ${result.error}`);
|
|
127
135
|
}
|