@modular-prompt/experiment 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +347 -0
- package/dist/src/cli/args.d.ts +6 -0
- package/dist/src/cli/args.d.ts.map +1 -0
- package/dist/src/cli/args.js +31 -0
- package/dist/src/cli/args.js.map +1 -0
- package/dist/src/config/dynamic-loader.d.ts +41 -0
- package/dist/src/config/dynamic-loader.d.ts.map +1 -0
- package/dist/src/config/dynamic-loader.js +101 -0
- package/dist/src/config/dynamic-loader.js.map +1 -0
- package/dist/src/config/loader.d.ts +23 -0
- package/dist/src/config/loader.d.ts.map +1 -0
- package/dist/src/config/loader.js +125 -0
- package/dist/src/config/loader.js.map +1 -0
- package/dist/src/evaluators/base-module.d.ts +10 -0
- package/dist/src/evaluators/base-module.d.ts.map +1 -0
- package/dist/src/evaluators/base-module.js +103 -0
- package/dist/src/evaluators/base-module.js.map +1 -0
- package/dist/src/evaluators/functional-correctness.d.ts +14 -0
- package/dist/src/evaluators/functional-correctness.d.ts.map +1 -0
- package/dist/src/evaluators/functional-correctness.js +95 -0
- package/dist/src/evaluators/functional-correctness.js.map +1 -0
- package/dist/src/evaluators/json-validator.d.ts +13 -0
- package/dist/src/evaluators/json-validator.d.ts.map +1 -0
- package/dist/src/evaluators/json-validator.js +51 -0
- package/dist/src/evaluators/json-validator.js.map +1 -0
- package/dist/src/index.d.ts +14 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +19 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/reporter/statistics.d.ts +21 -0
- package/dist/src/reporter/statistics.d.ts.map +1 -0
- package/dist/src/reporter/statistics.js +68 -0
- package/dist/src/reporter/statistics.js.map +1 -0
- package/dist/src/run-comparison.d.ts +22 -0
- package/dist/src/run-comparison.d.ts.map +1 -0
- package/dist/src/run-comparison.js +142 -0
- package/dist/src/run-comparison.js.map +1 -0
- package/dist/src/runner/driver-manager.d.ts +30 -0
- package/dist/src/runner/driver-manager.d.ts.map +1 -0
- package/dist/src/runner/driver-manager.js +68 -0
- package/dist/src/runner/driver-manager.js.map +1 -0
- package/dist/src/runner/evaluator.d.ts +32 -0
- package/dist/src/runner/evaluator.d.ts.map +1 -0
- package/dist/src/runner/evaluator.js +146 -0
- package/dist/src/runner/evaluator.js.map +1 -0
- package/dist/src/runner/experiment.d.ts +40 -0
- package/dist/src/runner/experiment.d.ts.map +1 -0
- package/dist/src/runner/experiment.js +214 -0
- package/dist/src/runner/experiment.js.map +1 -0
- package/dist/src/types.d.ts +112 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/types.js +5 -0
- package/dist/src/types.js.map +1 -0
- package/dist/tsconfig.tsbuildinfo +1 -0
- package/examples/experiment.yaml +70 -0
- package/package.json +70 -0
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Configuration loader
|
|
3
|
+
*/
|
|
4
|
+
import { readFileSync } from 'fs';
|
|
5
|
+
import { parse as parseYaml } from 'yaml';
|
|
6
|
+
import { resolve, dirname, extname } from 'path';
|
|
7
|
+
import { createJiti } from 'jiti';
|
|
8
|
+
import { AIService } from '@modular-prompt/driver';
|
|
9
|
+
/**
 * Resolve a path found in the config file to a usable filesystem path.
 *
 * Rules, applied in order:
 *  1. `~` on its own, or a path beginning with `~/` (or `~\`), is expanded
 *     using the HOME environment variable (USERPROFILE as a fallback).
 *     When neither variable is set the path is returned unchanged.
 *  2. A path beginning with `/` is returned as is.
 *  3. Anything else is resolved against the config file directory. This
 *     includes names that merely start with a tilde, such as `~backup/key.json`.
 *
 * @param configDir - Config file directory
 * @param path - Path to resolve (can be relative, absolute or home-relative)
 * @returns Resolved path
 */
function resolveConfigPath(configDir, path) {
    // Only a leading "~" path segment means "home directory". A bare
    // startsWith('~') check would also rewrite file names like "~backup".
    const isHomeRelative = path === '~' || path.startsWith('~/') || path.startsWith('~\\');
    if (isHomeRelative) {
        const home = process.env.HOME || process.env.USERPROFILE;
        // Swap just the leading "~"; the rest of the path is kept verbatim.
        return home ? home + path.slice(1) : path;
    }
    // If path is absolute, return as is
    if (path.startsWith('/')) {
        return path;
    }
    // Otherwise, resolve relative to config directory
    return resolve(configDir, path);
}
|
|
28
|
+
/**
 * Load experiment configuration
 *
 * Reads the config file (YAML is parsed directly; TS/JS files are imported
 * through jiti), splits it into its components, validates model references
 * and constructs the AIService used to run the experiment.
 *
 * Side effects:
 *  - `credentialsPath` on each driver entry is rewritten in place to its
 *    resolved form.
 *  - When `credentials.googleApplicationCredentials` is set, the resolved
 *    path is written to `process.env.GOOGLE_APPLICATION_CREDENTIALS` and
 *    logged to the console.
 *
 * @param configPath - Path to config file (YAML or TypeScript/JavaScript)
 * @returns LoadedConfig with all configuration: `serverConfig`, `modules`,
 *          `testCases`, `evaluators`, `aiService` and `configDir`
 * @throws Error when the extension is unsupported, the file yields no config
 *         object, no models are configured, or a test case references a
 *         model that is not configured
 */
export async function loadExperimentConfig(configPath) {
    // Get config directory and file extension
    const configDir = dirname(configPath);
    const ext = extname(configPath);
    // Load config based on file type
    let config;
    if (ext === '.yaml' || ext === '.yml') {
        // YAML format
        const content = readFileSync(configPath, 'utf-8');
        config = parseYaml(content);
        // An empty document parses to null, and a scalar or list document is
        // not a config mapping. Fail here with a clear message instead of a
        // TypeError on the property reads below.
        if (config === null || typeof config !== 'object' || Array.isArray(config)) {
            throw new Error(`❌ Config file ${configPath} must contain a YAML mapping`);
        }
    }
    else if (ext === '.ts' || ext === '.js' || ext === '.mjs' || ext === '.cjs') {
        // TypeScript/JavaScript format (auto-transpile with jiti)
        const jiti = createJiti(import.meta.url, {
            interopDefault: true, // Automatically get default export
            cache: true, // Enable caching for better performance
            requireCache: false, // Don't use require cache
        });
        // Pass an absolute path so a relative configPath is interpreted against
        // the working directory (as readFileSync does in the YAML branch)
        // rather than against this module's own location.
        config = await jiti.import(resolve(configPath));
        if (!config) {
            throw new Error(`❌ No default export in ${configPath}`);
        }
    }
    else {
        throw new Error(`❌ Unsupported config file format: ${ext}. Use .yaml, .yml, .ts, .js, .mjs, or .cjs`);
    }
    // Extract components
    const modules = config.modules || [];
    const testCases = config.testCases || [];
    const evaluators = config.evaluators || [];
    // Server config (models, drivers, evaluation, etc.)
    const serverConfig = {
        models: config.models,
        drivers: config.drivers,
        evaluation: config.evaluation,
        credentials: config.credentials,
        selection: config.selection,
        server: config.server,
        logging: config.logging,
    };
    // Resolve paths in driver configurations relative to config file
    if (serverConfig.drivers) {
        for (const driverConfig of Object.values(serverConfig.drivers)) {
            // A driver key with no body (e.g. `openai:` in YAML) is null; skip it.
            // Resolve credentialsPath for vertexai driver
            if (driverConfig?.credentialsPath) {
                driverConfig.credentialsPath = resolveConfigPath(configDir, driverConfig.credentialsPath);
            }
        }
    }
    // Setup driver environment variables if specified
    if (serverConfig.credentials?.googleApplicationCredentials) {
        const resolvedPath = resolveConfigPath(configDir, serverConfig.credentials.googleApplicationCredentials);
        process.env.GOOGLE_APPLICATION_CREDENTIALS = resolvedPath;
        console.log(`Setting GOOGLE_APPLICATION_CREDENTIALS=${resolvedPath}`);
    }
    // Validation
    if (!serverConfig.models || Object.keys(serverConfig.models).length === 0) {
        throw new Error('❌ No models configured in config file');
    }
    // Get model names from object keys
    const modelNames = new Set(Object.keys(serverConfig.models));
    // Validate testCase model references
    for (const testCase of testCases) {
        if (testCase.models) {
            for (const modelName of testCase.models) {
                if (!modelNames.has(modelName)) {
                    throw new Error(`❌ TestCase '${testCase.name}' references unknown model '${modelName}'`);
                }
            }
        }
    }
    // Initialize AIService with the configured models/drivers and fixed
    // default generation options.
    const aiServiceConfig = {
        models: serverConfig.models,
        drivers: serverConfig.drivers || {},
        defaultOptions: {
            temperature: 0.7,
            maxTokens: 2048,
        },
    };
    const aiService = new AIService(aiServiceConfig);
    return {
        serverConfig,
        modules,
        testCases,
        evaluators,
        aiService,
        configDir
    };
}
|
|
125
|
+
//# sourceMappingURL=loader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"loader.js","sourceRoot":"","sources":["../../../src/config/loader.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAClC,OAAO,EAAE,KAAK,IAAI,SAAS,EAAE,MAAM,MAAM,CAAC;AAC1C,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AACjD,OAAO,EAAE,UAAU,EAAE,MAAM,MAAM,CAAC;AAClC,OAAO,EAAE,SAAS,EAA0B,MAAM,wBAAwB,CAAC;AAgB3E;;;;;;GAMG;AACH,SAAS,iBAAiB,CAAC,SAAiB,EAAE,IAAY;IACxD,sDAAsD;IACtD,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QACzB,OAAO,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,OAAO,CAAC,GAAG,CAAC,IAAI,IAAI,GAAG,CAAC,CAAC;IACpD,CAAC;IACD,oCAAoC;IACpC,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QACzB,OAAO,IAAI,CAAC;IACd,CAAC;IACD,kDAAkD;IAClD,OAAO,OAAO,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC;AAClC,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB,CAAC,UAAkB;IAC3D,0CAA0C;IAC1C,MAAM,SAAS,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;IACtC,MAAM,GAAG,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;IAEhC,iCAAiC;IACjC,IAAI,MAAW,CAAC;IAEhB,IAAI,GAAG,KAAK,OAAO,IAAI,GAAG,KAAK,MAAM,EAAE,CAAC;QACtC,cAAc;QACd,MAAM,OAAO,GAAG,YAAY,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC;QAClD,MAAM,GAAG,SAAS,CAAC,OAAO,CAAC,CAAC;IAC9B,CAAC;SAAM,IAAI,GAAG,KAAK,KAAK,IAAI,GAAG,KAAK,KAAK,IAAI,GAAG,KAAK,MAAM,IAAI,GAAG,KAAK,MAAM,EAAE,CAAC;QAC9E,0DAA0D;QAC1D,MAAM,IAAI,GAAG,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,EAAE;YACvC,cAAc,EAAE,IAAI,EAAG,mCAAmC;YAC1D,KAAK,EAAE,IAAI,EAAY,wCAAwC;YAC/D,YAAY,EAAE,KAAK,EAAI,0BAA0B;SAClD,CAAC,CAAC;QAEH,MAAM,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;QAEvC,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,MAAM,IAAI,KAAK,CAAC,0BAA0B,UAAU,EAAE,CAAC,CAAC;QAC1D,CAAC;IACH,CAAC;SAAM,CAAC;QACN,MAAM,IAAI,KAAK,CAAC,qCAAqC,GAAG,4CAA4C,CAAC,CAAC;IACxG,CAAC;IAED,qBAAqB;IACrB,MAAM,OAAO,GAAsB,MAAM,CAAC,OAAO,IAAI,EAAE,CAAC;IACxD,MAAM,SAAS,GAAe,MAAM,CAAC,SAAS,IAAI,EAAE,CAAC;IACrD,MAAM,UAAU,GAAyB,MAAM,CAAC,UAAU,IAAI,EAAE,CAAC;IAEjE,oDAAoD;IACpD,MAAM,YAAY,GAAG;QACnB,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,OAAO,EAAE,MAAM,CAAC,OAAO;QACvB,UAAU,EAAE,MAAM,CAAC,UAAU;QAC7B,WAAW,EAAE,MAAM,CAAC,WAAW;QAC/B,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,MAAM,EAA
E,MAAM,CAAC,MAAM;QACrB,OAAO,EAAE,MAAM,CAAC,OAAO;KACxB,CAAC;IAEF,iEAAiE;IACjE,IAAI,YAAY,CAAC,OAAO,EAAE,CAAC;QACzB,KAAK,MAAM,UAAU,IAAI,YAAY,CAAC,OAAO,EAAE,CAAC;YAC9C,MAAM,YAAY,GAAG,YAAY,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;YAEtD,8CAA8C;YAC9C,IAAI,YAAY,CAAC,eAAe,EAAE,CAAC;gBACjC,YAAY,CAAC,eAAe,GAAG,iBAAiB,CAAC,SAAS,EAAE,YAAY,CAAC,eAAe,CAAC,CAAC;YAC5F,CAAC;QACH,CAAC;IACH,CAAC;IAED,kDAAkD;IAClD,IAAI,YAAY,CAAC,WAAW,EAAE,4BAA4B,EAAE,CAAC;QAC3D,MAAM,YAAY,GAAG,iBAAiB,CAAC,SAAS,EAAE,YAAY,CAAC,WAAW,CAAC,4BAA4B,CAAC,CAAC;QACzG,OAAO,CAAC,GAAG,CAAC,8BAA8B,GAAG,YAAY,CAAC;QAC1D,OAAO,CAAC,GAAG,CAAC,0CAA0C,YAAY,EAAE,CAAC,CAAC;IACxE,CAAC;IAED,aAAa;IACb,IAAI,CAAC,YAAY,CAAC,MAAM,IAAI,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1E,MAAM,IAAI,KAAK,CAAC,uCAAuC,CAAC,CAAC;IAC3D,CAAC;IAED,mCAAmC;IACnC,MAAM,UAAU,GAAG,IAAI,GAAG,CAAS,MAAM,CAAC,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC;IAErE,qCAAqC;IACrC,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,IAAI,QAAQ,CAAC,MAAM,EAAE,CAAC;YACpB,KAAK,MAAM,SAAS,IAAI,QAAQ,CAAC,MAAM,EAAE,CAAC;gBACxC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,CAAC;oBAC/B,MAAM,IAAI,KAAK,CAAC,eAAe,QAAQ,CAAC,IAAI,+BAA+B,SAAS,GAAG,CAAC,CAAC;gBAC3F,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,uBAAuB;IACvB,MAAM,eAAe,GAAsB;QACzC,MAAM,EAAE,YAAY,CAAC,MAAM;QAC3B,OAAO,EAAE,YAAY,CAAC,OAAO,IAAI,EAAE;QACnC,cAAc,EAAE;YACd,WAAW,EAAE,GAAG;YAChB,SAAS,EAAE,IAAI;SAChB;KACF,CAAC;IAEF,MAAM,SAAS,GAAG,IAAI,SAAS,CAAC,eAAe,CAAC,CAAC;IAEjD,OAAO;QACL,YAAY;QACZ,OAAO;QACP,SAAS;QACT,UAAU;QACV,SAAS;QACT,SAAS;KACV,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Base evaluation prompt module
|
|
3
|
+
*
|
|
4
|
+
* This module provides the foundation for all evaluation prompts.
|
|
5
|
+
* It defines how test data is presented to the evaluator.
|
|
6
|
+
*/
|
|
7
|
+
import type { PromptModule } from '@modular-prompt/core';
|
|
8
|
+
import type { EvaluationContext } from '../types.js';
|
|
9
|
+
export declare const baseEvaluationModule: PromptModule<EvaluationContext>;
|
|
10
|
+
//# sourceMappingURL=base-module.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"base-module.d.ts","sourceRoot":"","sources":["../../../src/evaluators/base-module.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,YAAY,EAA4B,MAAM,sBAAsB,CAAC;AACnF,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AAErD,eAAO,MAAM,oBAAoB,EAAE,YAAY,CAAC,iBAAiB,CAuGhE,CAAC"}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Base evaluation prompt module
|
|
3
|
+
*
|
|
4
|
+
* This module provides the foundation for all evaluation prompts.
|
|
5
|
+
* It defines how test data is presented to the evaluator.
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Prompt module shared by evaluation prompts.
 *
 * Sections:
 *  - `createContext` returns a fresh, empty evaluation context.
 *  - `objective`, `terms` and `instructions` are static text.
 *  - `materials` exposes the module name and the prompt text from the context.
 *  - `inputs` expands every run into a "Run N" label followed by its output.
 *  - `schema` is the JSON schema of the expected evaluation result.
 */
export const baseEvaluationModule = {
    createContext: () => {
        return { moduleName: '', prompt: '', runs: [] };
    },
    objective: [
        '- Evaluate the output of a prompt module',
        '- Provide detailed assessment with scores and reasoning',
    ],
    terms: [
        '- Module: A prompt variation being tested',
        '- Prompt: The compiled prompt used to generate the output',
        '- Query Result: The output generated by the LLM',
        '- Run: A single execution of the prompt',
    ],
    instructions: [
        {
            type: 'subsection',
            title: 'Output Format',
            items: [
                'Return evaluation in JSON format with the following structure:',
                '- score: Overall score (0-10)',
                '- reasoning: Clear explanation of the score',
                '- details: Object with additional evaluation metrics',
            ],
        },
    ],
    materials: [
        {
            type: 'subsection',
            title: 'Module Name',
            items: [({ moduleName }) => moduleName],
        },
        {
            type: 'subsection',
            title: 'Prompt Used',
            items: [({ prompt }) => ({ type: 'text', content: prompt })],
        },
    ],
    inputs: [
        ({ runs }) => runs.flatMap(({ queryResult }, position) => {
            // Label first, so each output can be tied back to its run number.
            const label = { type: 'text', content: `Run ${position + 1}` };
            // Structured output wins over raw text content when it is present.
            const output = queryResult.structuredOutput
                ? { type: 'json', content: queryResult.structuredOutput }
                : { type: 'text', content: queryResult.content };
            return [label, output];
        }),
    ],
    schema: [
        {
            type: 'json',
            content: {
                type: 'object',
                properties: {
                    score: { type: 'number', description: 'Overall score (0-10)' },
                    reasoning: { type: 'string', description: 'Explanation of the score' },
                    details: { type: 'object', description: 'Additional evaluation metrics' },
                },
                required: ['score', 'reasoning'],
            },
        },
    ],
};
|
|
103
|
+
//# sourceMappingURL=base-module.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"base-module.js","sourceRoot":"","sources":["../../../src/evaluators/base-module.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAKH,MAAM,CAAC,MAAM,oBAAoB,GAAoC;IACnE,aAAa,EAAE,GAAsB,EAAE,CAAC,CAAC;QACvC,UAAU,EAAE,EAAE;QACd,MAAM,EAAE,EAAE;QACV,IAAI,EAAE,EAAE;KACT,CAAC;IAEF,SAAS,EAAE;QACT,0CAA0C;QAC1C,yDAAyD;KAC1D;IAED,KAAK,EAAE;QACL,2CAA2C;QAC3C,2DAA2D;QAC3D,iDAAiD;QACjD,yCAAyC;KAC1C;IAED,YAAY,EAAE;QACZ;YACE,IAAI,EAAE,YAAY;YAClB,KAAK,EAAE,eAAe;YACtB,KAAK,EAAE;gBACL,gEAAgE;gBAChE,+BAA+B;gBAC/B,6CAA6C;gBAC7C,sDAAsD;aACvD;SACF;KACF;IAED,SAAS,EAAE;QACT;YACE,IAAI,EAAE,YAAY;YAClB,KAAK,EAAE,aAAa;YACpB,KAAK,EAAE;gBACL,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,UAAU;aACxB;SACF;QACD;YACE,IAAI,EAAE,YAAY;YAClB,KAAK,EAAE,aAAa;YACpB,KAAK,EAAE;gBACL,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;oBACR,IAAI,EAAE,MAAM;oBACZ,OAAO,EAAE,GAAG,CAAC,MAAM;iBACJ,CAAA;aAClB;SACF;KACF;IAED,MAAM,EAAE;QACN,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE;YACrC,MAAM,MAAM,GAAG,GAAG,CAAC,WAAW,CAAC;YAC/B,MAAM,QAAQ,GAAqC,EAAE,CAAC;YAEtD,aAAa;YACb,QAAQ,CAAC,IAAI,CAAC;gBACZ,IAAI,EAAE,MAAM;gBACZ,OAAO,EAAE,OAAO,GAAG,GAAG,CAAC,EAAE;aAC1B,CAAC,CAAC;YAEH,gDAAgD;YAChD,IAAI,MAAM,CAAC,gBAAgB,EAAE,CAAC;gBAC5B,QAAQ,CAAC,IAAI,CAAC;oBACZ,IAAI,EAAE,MAAM;oBACZ,OAAO,EAAE,MAAM,CAAC,gBAAgB;iBACjC,CAAC,CAAC;YACL,CAAC;iBAAM,CAAC;gBACN,QAAQ,CAAC,IAAI,CAAC;oBACZ,IAAI,EAAE,MAAM;oBACZ,OAAO,EAAE,MAAM,CAAC,OAAO;iBACxB,CAAC,CAAC;YACL,CAAC;YAED,OAAO,QAAQ,CAAC;QAClB,CAAC,CAAC;KACH;IAED,MAAM,EAAE;QACN;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;gBACP,IAAI,EAAE,QAAQ;gBACd,UAAU,EAAE;oBACV,KAAK,EAAE;wBACL,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,sBAAsB;qBACpC;oBACD,SAAS,EAAE;wBACT,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,0BAA0B;qBACxC;oBACD,OAAO,EAAE;wBACP,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,+BAA+B;qBAC7C;iBACF;gBACD,QAAQ,EAAE,CAAC,OAAO,EAAE,WAAW,CAAC;aACjC;SACF;KACF;CACF,CAAC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Functional Correctness Evaluator
|
|
3
|
+
*
|
|
4
|
+
* Evaluates whether the output meets the functional requirements
|
|
5
|
+
*/
|
|
6
|
+
import type { PromptModule } from '@modular-prompt/core';
|
|
7
|
+
import type { EvaluationContext } from '../types.js';
|
|
8
|
+
declare const _default: {
|
|
9
|
+
name: string;
|
|
10
|
+
description: string;
|
|
11
|
+
module: PromptModule<EvaluationContext>;
|
|
12
|
+
};
|
|
13
|
+
export default _default;
|
|
14
|
+
//# sourceMappingURL=functional-correctness.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"functional-correctness.d.ts","sourceRoot":"","sources":["../../../src/evaluators/functional-correctness.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,KAAK,EAAmB,iBAAiB,EAAE,MAAM,aAAa,CAAC;;;;;;AA0FtE,wBAI4B"}
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Functional Correctness Evaluator
|
|
3
|
+
*
|
|
4
|
+
* Evaluates whether the output meets the functional requirements
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Build the schema fragment used for every per-criterion entry under
 * `details`: an object with a numeric `score` and a string `reasoning`.
 * Each call returns new objects, so the entries never share references.
 */
const criterionSchema = () => ({
    type: 'object',
    properties: {
        score: { type: 'number' },
        reasoning: { type: 'string' },
    },
});
/**
 * Prompt module for the functional-correctness evaluator: static objective
 * and instruction text plus the JSON schema of the expected result, which
 * carries an overall score, a summary and one scored entry per criterion.
 */
const functionalCorrectnessModule = {
    createContext: () => {
        return { moduleName: '', prompt: '', runs: [] };
    },
    objective: [
        '- Assess whether the output correctly fulfills the given requirements',
    ],
    instructions: [
        '- Evaluate based on the following criteria:',
        {
            type: 'subsection',
            title: 'Evaluation Criteria',
            items: [
                '1. **Requirement Fulfillment**: Does it satisfy the intent described in the prompt?',
                '2. **Parameter Correctness**: Are all required parameters present and correct?',
                '3. **Parameter Completeness**: Are optional parameters appropriately used or omitted?',
                '4. **Logical Consistency**: Is the output logically consistent with the facts?',
            ],
        },
        {
            type: 'subsection',
            title: 'Scoring',
            items: [
                '- Assign scores (0-10) for each criterion',
                '- Calculate overall score based on all criteria',
                '- Provide clear reasoning for each score',
            ],
        },
    ],
    schema: [
        {
            type: 'json',
            content: {
                type: 'object',
                properties: {
                    score: { type: 'number', description: 'Overall score (0-10)' },
                    reasoning: { type: 'string', description: 'Summary of evaluation' },
                    details: {
                        type: 'object',
                        // One scored entry per criterion listed in the instructions.
                        properties: {
                            requirementFulfillment: criterionSchema(),
                            parameterCorrectness: criterionSchema(),
                            parameterCompleteness: criterionSchema(),
                            logicalConsistency: criterionSchema(),
                        },
                    },
                },
                required: ['score', 'reasoning', 'details'],
            },
        },
    ],
};
export default {
    name: 'Functional Correctness',
    description: 'Evaluates whether the output meets the functional requirements',
    module: functionalCorrectnessModule,
};
|
|
95
|
+
//# sourceMappingURL=functional-correctness.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"functional-correctness.js","sourceRoot":"","sources":["../../../src/evaluators/functional-correctness.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAKH,MAAM,2BAA2B,GAAoC;IACnE,aAAa,EAAE,GAAsB,EAAE,CAAC,CAAC;QACvC,UAAU,EAAE,EAAE;QACd,MAAM,EAAE,EAAE;QACV,IAAI,EAAE,EAAE;KACT,CAAC;IAEF,SAAS,EAAE;QACT,uEAAuE;KACxE;IAED,YAAY,EAAE;QACZ,6CAA6C;QAC7C;YACE,IAAI,EAAE,YAAY;YAClB,KAAK,EAAE,qBAAqB;YAC5B,KAAK,EAAE;gBACL,qFAAqF;gBACrF,gFAAgF;gBAChF,uFAAuF;gBACvF,gFAAgF;aACjF;SACF;QACD;YACE,IAAI,EAAE,YAAY;YAClB,KAAK,EAAE,SAAS;YAChB,KAAK,EAAE;gBACL,2CAA2C;gBAC3C,iDAAiD;gBACjD,0CAA0C;aAC3C;SACF;KACF;IAED,MAAM,EAAE;QACN;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;gBACP,IAAI,EAAE,QAAQ;gBACd,UAAU,EAAE;oBACV,KAAK,EAAE;wBACL,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,sBAAsB;qBACpC;oBACD,SAAS,EAAE;wBACT,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,uBAAuB;qBACrC;oBACD,OAAO,EAAE;wBACP,IAAI,EAAE,QAAQ;wBACd,UAAU,EAAE;4BACV,sBAAsB,EAAE;gCACtB,IAAI,EAAE,QAAQ;gCACd,UAAU,EAAE;oCACV,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;oCACzB,SAAS,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;iCAC9B;6BACF;4BACD,oBAAoB,EAAE;gCACpB,IAAI,EAAE,QAAQ;gCACd,UAAU,EAAE;oCACV,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;oCACzB,SAAS,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;iCAC9B;6BACF;4BACD,qBAAqB,EAAE;gCACrB,IAAI,EAAE,QAAQ;gCACd,UAAU,EAAE;oCACV,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;oCACzB,SAAS,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;iCAC9B;6BACF;4BACD,kBAAkB,EAAE;gCAClB,IAAI,EAAE,QAAQ;gCACd,UAAU,EAAE;oCACV,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;oCACzB,SAAS,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE;iCAC9B;6BACF;yBACF;qBACF;iBACF;gBACD,QAAQ,EAAE,CAAC,OAAO,EAAE,WAAW,EAAE,SAAS,CAAC;aAC5C;SACF;KACF;CACF,CAAC;AAEF,eAAe;IACb,IAAI,EAAE,wBAAwB;IAC9B,WAAW,EAAE,gEAAgE;IAC7E,MAAM,EAAE,2BAA2B;CACV,CAAC"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JSON Validator Evaluator
|
|
3
|
+
*
|
|
4
|
+
* Validates JSON structure in structured output
|
|
5
|
+
*/
|
|
6
|
+
import type { EvaluationContext, EvaluationResult } from '../types.js';
|
|
7
|
+
declare const _default: {
|
|
8
|
+
name: string;
|
|
9
|
+
description: string;
|
|
10
|
+
evaluate(context: EvaluationContext): Promise<EvaluationResult>;
|
|
11
|
+
};
|
|
12
|
+
export default _default;
|
|
13
|
+
//# sourceMappingURL=json-validator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"json-validator.d.ts","sourceRoot":"","sources":["../../../src/evaluators/json-validator.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAiB,iBAAiB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;;;;sBAM5D,iBAAiB,GAAG,OAAO,CAAC,gBAAgB,CAAC;;AAJvE,wBAiD0B"}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JSON Validator Evaluator
|
|
3
|
+
*
|
|
4
|
+
* Validates JSON structure in structured output
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Code evaluator that checks each run for a structured (JSON) output.
 *
 * A run counts as valid when `queryResult.structuredOutput` is a non-null
 * object. A missing or falsy structured output is reported as
 * "No structured output"; any other non-object value as
 * "Invalid JSON structure".
 */
const jsonValidator = {
    name: 'JSON Validator',
    description: 'Validates JSON structure in output',
    /**
     * Score the runs in `context` by the share that carry valid structured output.
     *
     * @param context - Evaluation context; `runs` and `moduleName` are read
     * @returns Evaluation result with `score` = (valid / total) * 10, or 0
     *          when there are no runs, a `reasoning` summary, and per-run
     *          `details`
     */
    async evaluate(context) {
        const totalCount = context.runs.length;
        const errors = [];
        const runDetails = [];
        let validCount = 0;
        context.runs.forEach((run, index) => {
            const runNumber = index + 1;
            // A run without a queryResult is recorded as invalid rather than
            // aborting the whole evaluation with a TypeError.
            const structuredOutput = run.queryResult?.structuredOutput;
            if (!structuredOutput) {
                errors.push(`Run ${runNumber}: No structured output`);
                runDetails.push({ run: runNumber, valid: false, error: 'No structured output' });
                return;
            }
            // Basic JSON validation
            if (typeof structuredOutput === 'object' && structuredOutput !== null) {
                validCount++;
                runDetails.push({ run: runNumber, valid: true });
                return;
            }
            const error = 'Invalid JSON structure';
            errors.push(`Run ${runNumber}: ${error}`);
            runDetails.push({ run: runNumber, valid: false, error });
        });
        const score = totalCount > 0 ? (validCount / totalCount) * 10 : 0;
        let reasoning;
        if (totalCount === 0) {
            // Score is 0 here, so do not claim that "all outputs" were valid.
            reasoning = 'No runs to evaluate';
        }
        else if (errors.length > 0) {
            reasoning = `${validCount}/${totalCount} valid outputs. Issues: ${errors.join('; ')}`;
        }
        else {
            reasoning = `All ${validCount} outputs have valid JSON structure`;
        }
        return {
            evaluator: 'json-validator',
            moduleName: context.moduleName,
            score,
            reasoning,
            details: {
                validCount,
                totalCount,
                errors,
                runs: runDetails,
            },
        };
    },
};
export default jsonValidator;
|
|
51
|
+
//# sourceMappingURL=json-validator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"json-validator.js","sourceRoot":"","sources":["../../../src/evaluators/json-validator.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAIH,eAAe;IACb,IAAI,EAAE,gBAAgB;IACtB,WAAW,EAAE,oCAAoC;IAEjD,KAAK,CAAC,QAAQ,CAAC,OAA0B;QACvC,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,MAAM,UAAU,GAA2D,EAAE,CAAC;QAE9E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7C,MAAM,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC5B,MAAM,EAAE,gBAAgB,EAAE,GAAG,GAAG,CAAC,WAAW,CAAC;YAE7C,IAAI,CAAC,gBAAgB,EAAE,CAAC;gBACtB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,wBAAwB,CAAC,CAAC;gBAClD,UAAU,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,sBAAsB,EAAE,CAAC,CAAC;gBAC7E,SAAS;YACX,CAAC;YAED,wBAAwB;YACxB,IAAI,OAAO,gBAAgB,KAAK,QAAQ,IAAI,gBAAgB,KAAK,IAAI,EAAE,CAAC;gBACtE,UAAU,EAAE,CAAC;gBACb,UAAU,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;YAC/C,CAAC;iBAAM,CAAC;gBACN,MAAM,KAAK,GAAG,wBAAwB,CAAC;gBACvC,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,KAAK,EAAE,CAAC,CAAC;gBACtC,UAAU,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC,GAAG,CAAC,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;YACvD,CAAC;QACH,CAAC;QAED,MAAM,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC;YACnC,CAAC,CAAC,CAAC,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE;YACzC,CAAC,CAAC,CAAC,CAAC;QAEN,OAAO;YACL,SAAS,EAAE,gBAAgB;YAC3B,UAAU,EAAE,OAAO,CAAC,UAAU;YAC9B,KAAK;YACL,SAAS,EAAE,MAAM,CAAC,MAAM,GAAG,CAAC;gBAC1B,CAAC,CAAC,GAAG,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,2BAA2B,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;gBACpF,CAAC,CAAC,OAAO,UAAU,oCAAoC;YACzD,OAAO,EAAE;gBACP,UAAU;gBACV,UAAU,EAAE,OAAO,CAAC,IAAI,CAAC,MAAM;gBAC/B,MAAM;gBACN,IAAI,EAAE,UAAU;aACjB;SACF,CAAC;IACJ,CAAC;CACsB,CAAC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @modular-prompt/experiment
|
|
3
|
+
*
|
|
4
|
+
* Experiment framework for comparing and evaluating prompt modules
|
|
5
|
+
*/
|
|
6
|
+
export * from './types.js';
|
|
7
|
+
export { loadExperimentConfig } from './config/loader.js';
|
|
8
|
+
export { loadModules, loadEvaluators } from './config/dynamic-loader.js';
|
|
9
|
+
export { baseEvaluationModule } from './evaluators/base-module.js';
|
|
10
|
+
export { DriverManager } from './runner/driver-manager.js';
|
|
11
|
+
export { ExperimentRunner } from './runner/experiment.js';
|
|
12
|
+
export { EvaluatorRunner } from './runner/evaluator.js';
|
|
13
|
+
export { StatisticsReporter } from './reporter/statistics.js';
|
|
14
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,cAAc,YAAY,CAAC;AAG3B,OAAO,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AAC1D,OAAO,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAGzE,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AAGnE,OAAO,EAAE,aAAa,EAAE,MAAM,4BAA4B,CAAC;AAC3D,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAGxD,OAAO,EAAE,kBAAkB,EAAE,MAAM,0BAA0B,CAAC"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @modular-prompt/experiment
|
|
3
|
+
*
|
|
4
|
+
* Experiment framework for comparing and evaluating prompt modules
|
|
5
|
+
*/
|
|
6
|
+
// Types
|
|
7
|
+
export * from './types.js';
|
|
8
|
+
// Configuration loaders
|
|
9
|
+
export { loadExperimentConfig } from './config/loader.js';
|
|
10
|
+
export { loadModules, loadEvaluators } from './config/dynamic-loader.js';
|
|
11
|
+
// Evaluators
|
|
12
|
+
export { baseEvaluationModule } from './evaluators/base-module.js';
|
|
13
|
+
// Runners
|
|
14
|
+
export { DriverManager } from './runner/driver-manager.js';
|
|
15
|
+
export { ExperimentRunner } from './runner/experiment.js';
|
|
16
|
+
export { EvaluatorRunner } from './runner/evaluator.js';
|
|
17
|
+
// Reporters
|
|
18
|
+
export { StatisticsReporter } from './reporter/statistics.js';
|
|
19
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,QAAQ;AACR,cAAc,YAAY,CAAC;AAE3B,wBAAwB;AACxB,OAAO,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AAC1D,OAAO,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAEzE,aAAa;AACb,OAAO,EAAE,oBAAoB,EAAE,MAAM,6BAA6B,CAAC;AAEnE,UAAU;AACV,OAAO,EAAE,aAAa,EAAE,MAAM,4BAA4B,CAAC;AAC3D,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAExD,YAAY;AACZ,OAAO,EAAE,kBAAkB,EAAE,MAAM,0BAA0B,CAAC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Statistics reporter
|
|
3
|
+
*/
|
|
4
|
+
import type { TestResult } from '../types.js';
|
|
5
|
+
export declare class StatisticsReporter {
|
|
6
|
+
private results;
|
|
7
|
+
constructor(results: TestResult[]);
|
|
8
|
+
/**
|
|
9
|
+
* Generate and display statistics report
|
|
10
|
+
*/
|
|
11
|
+
report(): void;
|
|
12
|
+
/**
|
|
13
|
+
* Report timing statistics
|
|
14
|
+
*/
|
|
15
|
+
private reportTiming;
|
|
16
|
+
/**
|
|
17
|
+
* Report output consistency
|
|
18
|
+
*/
|
|
19
|
+
private reportConsistency;
|
|
20
|
+
}
|
|
21
|
+
//# sourceMappingURL=statistics.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"statistics.d.ts","sourceRoot":"","sources":["../../../src/reporter/statistics.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAa,UAAU,EAAE,MAAM,aAAa,CAAC;AAEzD,qBAAa,kBAAkB;IACjB,OAAO,CAAC,OAAO;gBAAP,OAAO,EAAE,UAAU,EAAE;IAEzC;;OAEG;IACH,MAAM,IAAI,IAAI;IA2Bd;;OAEG;IACH,OAAO,CAAC,YAAY;IASpB;;OAEG;IACH,OAAO,CAAC,iBAAiB;CAwB1B"}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Statistics reporter
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Prints a per-result statistics summary (success rate, timing and output
 * consistency) to the console.
 */
export class StatisticsReporter {
    results;
    /**
     * @param results - Test results to summarize; each is read for
     *                  `testCase`, `model`, `module` and `runs`
     */
    constructor(results) {
        this.results = results;
    }
    /**
     * Generate and display statistics report
     */
    report() {
        console.log();
        console.log('='.repeat(80));
        console.log('📊 Statistics Summary');
        console.log('='.repeat(80));
        console.log();
        for (const result of this.results) {
            console.log(`${result.testCase} - ${result.model} - [${result.module.toUpperCase()}]`);
            console.log('─'.repeat(80));
            const successRuns = result.runs.filter(r => r.success);
            const totalRuns = result.runs.length;
            // Guard the division so a result without runs prints 0.0%, not NaN%.
            const successRate = totalRuns > 0 ? (successRuns.length / totalRuns) * 100 : 0;
            console.log(`Success rate: ${successRuns.length}/${totalRuns} (${successRate.toFixed(1)}%)`);
            if (successRuns.length > 0) {
                this.reportTiming(successRuns);
                this.reportConsistency(successRuns);
            }
            console.log();
        }
        console.log('='.repeat(80));
    }
    /**
     * Report timing statistics
     *
     * @param runs - Non-empty list of runs; `elapsed` is read from each
     */
    reportTiming(runs) {
        const times = runs.map(r => r.elapsed);
        const avg = times.reduce((a, b) => a + b, 0) / times.length;
        const min = Math.min(...times);
        const max = Math.max(...times);
        console.log(`Execution time: avg=${avg.toFixed(0)}ms, min=${min}ms, max=${max}ms`);
    }
    /**
     * Report output consistency
     *
     * Compares the text of the first fenced json block found in each run's
     * `content`. Runs with no such block, or with non-string content, are
     * left out of the comparison. Prints nothing if no run has a block.
     *
     * @param runs - Runs to compare; `content` is read from each
     */
    reportConsistency(runs) {
        // Occurrences per distinct JSON text. Map iteration keeps first-seen
        // order, which is the order the variants are printed in.
        const occurrences = new Map();
        let extracted = 0;
        for (const { content } of runs) {
            // Content may be absent (for example on structured-only results).
            if (typeof content !== 'string') {
                continue;
            }
            // Extract JSON from output
            const match = content.match(/```json\s*\n([\s\S]*?)\n```/);
            if (!match) {
                continue;
            }
            const json = match[1].trim();
            occurrences.set(json, (occurrences.get(json) ?? 0) + 1);
            extracted++;
        }
        if (extracted === 0) {
            return;
        }
        console.log(`Output consistency: ${occurrences.size} unique output(s) from ${extracted} run(s)`);
        if (occurrences.size === 1) {
            console.log('✅ All outputs are identical');
            return;
        }
        console.log('⚠️ Outputs vary:');
        let variant = 0;
        for (const [output, count] of occurrences) {
            variant++;
            console.log(` Variant ${variant} (${count}x): ${output}`);
        }
    }
}
|
|
68
|
+
//# sourceMappingURL=statistics.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"statistics.js","sourceRoot":"","sources":["../../../src/reporter/statistics.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,MAAM,OAAO,kBAAkB;IACT;IAApB,YAAoB,OAAqB;QAArB,YAAO,GAAP,OAAO,CAAc;IAAG,CAAC;IAE7C;;OAEG;IACH,MAAM;QACJ,OAAO,CAAC,GAAG,EAAE,CAAC;QACd,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;QAC5B,OAAO,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC;QACrC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;QAC5B,OAAO,CAAC,GAAG,EAAE,CAAC;QAEd,KAAK,MAAM,MAAM,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YAClC,OAAO,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,QAAQ,MAAM,MAAM,CAAC,KAAK,OAAO,MAAM,CAAC,MAAM,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC;YACvF,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;YAE5B,MAAM,WAAW,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;YACvD,MAAM,WAAW,GAAG,CAAC,WAAW,CAAC,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC;YAEpE,OAAO,CAAC,GAAG,CAAC,iBAAiB,WAAW,CAAC,MAAM,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,KAAK,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAEtG,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC3B,IAAI,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC;gBAC/B,IAAI,CAAC,iBAAiB,CAAC,WAAW,CAAC,CAAC;YACtC,CAAC;YAED,OAAO,CAAC,GAAG,EAAE,CAAC;QAChB,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC9B,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,IAAiB;QACpC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;QACvC,MAAM,GAAG,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC;QAC5D,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC;QAC/B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC;QAE/B,OAAO,CAAC,GAAG,CAAC,uBAAuB,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,WAAW,GAAG,WAAW,GAAG,IAAI,CAAC,CAAC;IACrF,CAAC;IAED;;OAEG;IACK,iBAAiB,CAAC,IAAiB;QACzC,2BAA2B;QAC3B,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE;YAC/B,MAAM,KAAK,GAAG,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;YAC7D,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CA
AC,CAAC,CAAC,IAAI,CAAC;QACxC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC;QAE3B,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC7B,OAAO;QACT,CAAC;QAED,MAAM,aAAa,GAAG,IAAI,GAAG,CAAC,WAAW,CAAC,CAAC;QAC3C,OAAO,CAAC,GAAG,CAAC,uBAAuB,aAAa,CAAC,IAAI,0BAA0B,WAAW,CAAC,MAAM,SAAS,CAAC,CAAC;QAE5G,IAAI,aAAa,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC;YAC7B,OAAO,CAAC,GAAG,CAAC,6BAA6B,CAAC,CAAC;QAC7C,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC;YACjC,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,GAAG,EAAE,EAAE;gBAChD,MAAM,KAAK,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,KAAK,MAAM,CAAC,CAAC,MAAM,CAAC;gBAC3D,OAAO,CAAC,GAAG,CAAC,cAAc,GAAG,GAAG,CAAC,KAAK,KAAK,OAAO,MAAM,EAAE,CAAC,CAAC;YAC9D,CAAC,CAAC,CAAC;QACL,CAAC;IACH,CAAC;CACF"}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Module Comparison Experiment
|
|
4
|
+
*
|
|
5
|
+
* Compares the performance and output quality of multiple prompt modules.
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* moduler-experiment <config> [options]
|
|
9
|
+
*
|
|
10
|
+
* Arguments:
|
|
11
|
+
* <config> Config file path (YAML, TypeScript, or JavaScript)
|
|
12
|
+
*
|
|
13
|
+
* Options:
|
|
14
|
+
* --test-case <name> Test case name filter
|
|
15
|
+
* --model <provider> Model provider filter (mlx, vertexai, googlegenai)
|
|
16
|
+
* --modules <names> Comma-separated module names (default: all)
|
|
17
|
+
* --repeat <count> Number of repetitions (default: 1)
|
|
18
|
+
* --evaluate Enable evaluation phase
|
|
19
|
+
* --evaluators <names> Comma-separated evaluator names (default: all)
|
|
20
|
+
*/
|
|
21
|
+
export {};
|
|
22
|
+
//# sourceMappingURL=run-comparison.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"run-comparison.d.ts","sourceRoot":"","sources":["../../src/run-comparison.ts"],"names":[],"mappings":";AACA;;;;;;;;;;;;;;;;;;GAkBG"}
|