@modular-prompt/experiment 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -369
- package/dist/config/dynamic-loader.js +2 -2
- package/dist/config/dynamic-loader.js.map +1 -1
- package/dist/runner/experiment.d.ts +8 -4
- package/dist/runner/experiment.d.ts.map +1 -1
- package/dist/runner/experiment.js +112 -106
- package/dist/runner/experiment.js.map +1 -1
- package/dist/types.d.ts +1 -1
- package/dist/types.d.ts.map +1 -1
- package/examples/tools-experiment.yaml +50 -37
- package/examples/tools-test-module.mjs +18 -24
- package/examples/tools-test-module.ts +19 -23
- package/package.json +8 -5
- package/skills/experiment/SKILL.md +313 -0
package/README.md
CHANGED
|
@@ -1,412 +1,79 @@
|
|
|
1
1
|
# @modular-prompt/experiment
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
プロンプトモジュールの比較・評価フレームワーク。
|
|
4
4
|
|
|
5
|
-
##
|
|
6
|
-
|
|
7
|
-
This framework provides tools to compare and evaluate different prompt module variations under identical conditions. It integrates with the `@modular-prompt/core` system to test multiple prompt variations and evaluate their output quality.
|
|
8
|
-
|
|
9
|
-
### Use Cases
|
|
10
|
-
|
|
11
|
-
- **Prompt Engineering**: Validate the effectiveness of new prompt structures
|
|
12
|
-
- **Module Separation**: Verify that modularized prompts produce equivalent outputs
|
|
13
|
-
- **Quality Evaluation**: Assess output stability and consistency through repeated executions
|
|
14
|
-
- **Multi-Model Testing**: Test across different LLM providers (MLX, VertexAI, GoogleGenAI, etc.)
|
|
15
|
-
|
|
16
|
-
## Features
|
|
17
|
-
|
|
18
|
-
- ✅ **Dynamic Module Loading**: Load prompt modules from external files or inline definitions
|
|
19
|
-
- ✅ **Flexible Evaluators**: Support both code-based and AI-based evaluation
|
|
20
|
-
- ✅ **Statistical Analysis**: Analyze success rates, execution times, and output consistency
|
|
21
|
-
- ✅ **Prompt Diff Detection**: Automatically detect differences between module outputs
|
|
22
|
-
- ✅ **Driver Caching**: Reuse drivers for improved memory efficiency
|
|
23
|
-
- ✅ **Detailed Logging**: Comprehensive logging of all executions
|
|
24
|
-
|
|
25
|
-
## Installation
|
|
5
|
+
## インストール
|
|
26
6
|
|
|
27
7
|
```bash
|
|
28
|
-
|
|
8
|
+
npm install @modular-prompt/experiment
|
|
29
9
|
```
|
|
30
10
|
|
|
31
|
-
##
|
|
11
|
+
## 概要
|
|
32
12
|
|
|
33
|
-
|
|
13
|
+
複数のプロンプトモジュールを同一条件下で比較・評価する。YAML設定で実験を定義し、CLIで実行。
|
|
34
14
|
|
|
35
|
-
|
|
15
|
+
- **プロンプト比較**: 異なるプロンプト構造の効果を定量的に比較
|
|
16
|
+
- **マルチモデルテスト**: 異なるLLMプロバイダーでの動作比較
|
|
17
|
+
- **品質評価**: 繰り返し実行による安定性・一貫性の評価
|
|
18
|
+
- **柔軟な評価器**: コードベース・AIベースの評価をサポート
|
|
36
19
|
|
|
37
|
-
|
|
20
|
+
## クイックスタート
|
|
38
21
|
|
|
39
|
-
|
|
22
|
+
### 1. 設定ファイルを作成
|
|
40
23
|
|
|
41
24
|
```yaml
|
|
25
|
+
# experiment.yaml
|
|
42
26
|
models:
|
|
43
|
-
|
|
44
|
-
provider:
|
|
45
|
-
model:
|
|
46
|
-
capabilities: ["tools", "fast"]
|
|
47
|
-
enabled: true
|
|
27
|
+
gpt4o:
|
|
28
|
+
provider: openai
|
|
29
|
+
model: gpt-4o
|
|
48
30
|
|
|
49
31
|
drivers:
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
location: us-central1
|
|
53
|
-
# Paths are resolved relative to this config file
|
|
54
|
-
# Can use ~/ for home directory or absolute paths
|
|
55
|
-
credentialsPath: ./credentials.json
|
|
32
|
+
openai:
|
|
33
|
+
apiKey: ${OPENAI_API_KEY}
|
|
56
34
|
|
|
57
35
|
modules:
|
|
58
36
|
- name: my-module
|
|
59
37
|
path: ./my-module.ts
|
|
60
|
-
description: My custom prompt module
|
|
61
38
|
|
|
62
39
|
testCases:
|
|
63
|
-
- name:
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
query: user question
|
|
67
|
-
context: additional information
|
|
68
|
-
models: # Optional: specify which models to test (uses all enabled if not specified)
|
|
69
|
-
- gemini-fast
|
|
70
|
-
|
|
71
|
-
evaluators:
|
|
72
|
-
# Built-in evaluators (name only)
|
|
73
|
-
- name: structured-output-presence
|
|
74
|
-
- name: llm-requirement-fulfillment
|
|
75
|
-
# Or external evaluator (with path)
|
|
76
|
-
- name: custom-validator
|
|
77
|
-
path: ./evaluators/custom-validator.ts
|
|
78
|
-
# Or inline prompt evaluator
|
|
79
|
-
- name: quality-check
|
|
80
|
-
prompt:
|
|
81
|
-
objective:
|
|
82
|
-
- Evaluate output quality
|
|
83
|
-
instructions:
|
|
84
|
-
- Check clarity and accuracy
|
|
85
|
-
|
|
86
|
-
evaluation:
|
|
87
|
-
enabled: true
|
|
88
|
-
model: gemini-fast # Reference by model name
|
|
89
|
-
```
|
|
90
|
-
|
|
91
|
-
#### Option B: TypeScript Configuration (For dynamic configurations)
|
|
92
|
-
|
|
93
|
-
Create `examples/experiment.ts`:
|
|
94
|
-
|
|
95
|
-
```typescript
|
|
96
|
-
export default {
|
|
97
|
-
models: {
|
|
98
|
-
'gemini-fast': {
|
|
99
|
-
provider: 'vertexai',
|
|
100
|
-
model: 'gemini-2.0-flash-exp',
|
|
101
|
-
capabilities: ['tools', 'fast'],
|
|
102
|
-
enabled: true,
|
|
103
|
-
},
|
|
104
|
-
},
|
|
105
|
-
drivers: {
|
|
106
|
-
vertexai: {
|
|
107
|
-
projectId: 'your-project-id',
|
|
108
|
-
location: 'us-central1',
|
|
109
|
-
credentialsPath: './credentials.json',
|
|
110
|
-
},
|
|
111
|
-
},
|
|
112
|
-
modules: [
|
|
113
|
-
{
|
|
114
|
-
name: 'my-module',
|
|
115
|
-
path: './my-module.ts',
|
|
116
|
-
description: 'My custom prompt module',
|
|
117
|
-
},
|
|
118
|
-
],
|
|
119
|
-
testCases: [
|
|
120
|
-
{
|
|
121
|
-
name: 'Basic Test',
|
|
122
|
-
description: 'Test basic functionality',
|
|
123
|
-
input: { // Structured context object
|
|
124
|
-
query: 'user question',
|
|
125
|
-
options: { temperature: 0.7 },
|
|
126
|
-
},
|
|
127
|
-
models: ['gemini-fast'], // Optional
|
|
128
|
-
},
|
|
129
|
-
],
|
|
130
|
-
evaluators: [
|
|
131
|
-
// Built-in evaluators (name only)
|
|
132
|
-
{ name: 'structured-output-presence' },
|
|
133
|
-
{ name: 'llm-requirement-fulfillment' },
|
|
134
|
-
// Or external evaluator (with path)
|
|
135
|
-
{
|
|
136
|
-
name: 'custom-validator',
|
|
137
|
-
path: './evaluators/custom-validator.ts',
|
|
138
|
-
},
|
|
139
|
-
],
|
|
140
|
-
evaluation: {
|
|
141
|
-
enabled: true,
|
|
142
|
-
model: 'gemini-fast', // Reference by model name
|
|
143
|
-
},
|
|
144
|
-
};
|
|
145
|
-
```
|
|
146
|
-
|
|
147
|
-
**TypeScript Support**: TypeScript configuration files are automatically transpiled using [jiti](https://github.com/unjs/jiti). You can use TypeScript syntax directly without pre-compilation. Type annotations are stripped automatically, and the file is executed as JavaScript.
|
|
148
|
-
|
|
149
|
-
**Important**: All file paths in the configuration (modules, evaluators, credentials) are resolved relative to the config file location.
|
|
150
|
-
|
|
151
|
-
### 2. Run Experiment
|
|
152
|
-
|
|
153
|
-
```bash
|
|
154
|
-
# Validate configuration and display execution plan (recommended first step)
|
|
155
|
-
npx modular-experiment examples/experiment.yaml --dry-run
|
|
156
|
-
|
|
157
|
-
# Run with YAML config
|
|
158
|
-
npx modular-experiment examples/experiment.yaml
|
|
159
|
-
|
|
160
|
-
# Run with TypeScript config
|
|
161
|
-
npx modular-experiment examples/experiment.ts
|
|
162
|
-
|
|
163
|
-
# Run specific module
|
|
164
|
-
npx modular-experiment examples/experiment.yaml --modules my-module
|
|
165
|
-
|
|
166
|
-
# Run with evaluation
|
|
167
|
-
npx modular-experiment examples/experiment.yaml --evaluate
|
|
168
|
-
|
|
169
|
-
# Run multiple times for statistics
|
|
170
|
-
npx modular-experiment examples/experiment.yaml --repeat 10
|
|
171
|
-
|
|
172
|
-
# Run with detailed logging to JSONL file
|
|
173
|
-
npx modular-experiment examples/experiment.yaml --log-file experiment.jsonl
|
|
174
|
-
|
|
175
|
-
# Run with verbose output (show internal operations)
|
|
176
|
-
npx modular-experiment examples/experiment.yaml --verbose
|
|
177
|
-
|
|
178
|
-
# Combine options
|
|
179
|
-
npx modular-experiment examples/experiment.yaml --evaluate --log-file experiment.jsonl --verbose
|
|
180
|
-
```
|
|
181
|
-
|
|
182
|
-
## Configuration
|
|
183
|
-
|
|
184
|
-
### Module Definition
|
|
185
|
-
|
|
186
|
-
Modules can be defined inline or loaded from external files:
|
|
187
|
-
|
|
188
|
-
```typescript
|
|
189
|
-
// External file
|
|
190
|
-
export const modules: ModuleReference[] = [
|
|
191
|
-
{
|
|
192
|
-
name: 'my-module',
|
|
193
|
-
path: './modules/my-module.ts',
|
|
194
|
-
description: 'Description',
|
|
195
|
-
},
|
|
196
|
-
];
|
|
197
|
-
```
|
|
198
|
-
|
|
199
|
-
A module file should export a default object with:
|
|
200
|
-
|
|
201
|
-
```typescript
|
|
202
|
-
import { compile } from '@modular-prompt/core';
|
|
203
|
-
import { myPromptModule } from './prompts.js';
|
|
40
|
+
- name: 基本テスト
|
|
41
|
+
input:
|
|
42
|
+
query: "TypeScriptについて説明して"
|
|
204
43
|
|
|
205
|
-
|
|
206
|
-
name: 'My Module',
|
|
207
|
-
description: 'Module description',
|
|
208
|
-
compile: (context: any) => compile(myPromptModule, context),
|
|
209
|
-
};
|
|
44
|
+
evaluators: []
|
|
210
45
|
```
|
|
211
46
|
|
|
212
|
-
###
|
|
213
|
-
|
|
214
|
-
Two types of evaluators are supported:
|
|
215
|
-
|
|
216
|
-
#### 1. Code Evaluator
|
|
217
|
-
|
|
218
|
-
Programmatic validation (e.g., JSON structure validation):
|
|
219
|
-
|
|
220
|
-
```typescript
|
|
221
|
-
import type { CodeEvaluator, EvaluationContext, EvaluationResult } from '@modular-prompt/experiment';
|
|
222
|
-
|
|
223
|
-
export default {
|
|
224
|
-
name: 'JSON Validator',
|
|
225
|
-
description: 'Validates JSON structure in output',
|
|
226
|
-
|
|
227
|
-
async evaluate(context: EvaluationContext): Promise<EvaluationResult> {
|
|
228
|
-
// Validation logic
|
|
229
|
-
return {
|
|
230
|
-
evaluator: 'json-validator',
|
|
231
|
-
moduleName: context.moduleName,
|
|
232
|
-
score: 10,
|
|
233
|
-
reasoning: 'Valid JSON structure',
|
|
234
|
-
};
|
|
235
|
-
},
|
|
236
|
-
} satisfies CodeEvaluator;
|
|
237
|
-
```
|
|
238
|
-
|
|
239
|
-
#### 2. Prompt Evaluator
|
|
240
|
-
|
|
241
|
-
AI-based evaluation using LLM:
|
|
47
|
+
### 2. モジュールファイルを作成
|
|
242
48
|
|
|
243
49
|
```typescript
|
|
244
|
-
|
|
50
|
+
// my-module.ts
|
|
245
51
|
import type { PromptModule } from '@modular-prompt/core';
|
|
246
52
|
|
|
247
|
-
const
|
|
248
|
-
|
|
249
|
-
moduleName: '',
|
|
250
|
-
prompt: '',
|
|
251
|
-
runs: [],
|
|
252
|
-
}),
|
|
253
|
-
|
|
254
|
-
objective: [
|
|
255
|
-
'- Assess output quality',
|
|
256
|
-
],
|
|
257
|
-
|
|
53
|
+
const module: PromptModule<{ query: string }> = {
|
|
54
|
+
objective: ['ユーザーの質問に回答する'],
|
|
258
55
|
instructions: [
|
|
259
|
-
|
|
56
|
+
(ctx) => `質問: ${ctx.query}`,
|
|
260
57
|
],
|
|
261
58
|
};
|
|
262
59
|
|
|
263
|
-
export default
|
|
264
|
-
name: 'Quality Evaluator',
|
|
265
|
-
description: 'Evaluates output quality',
|
|
266
|
-
module: evaluationModule,
|
|
267
|
-
} satisfies PromptEvaluator;
|
|
60
|
+
export default module;
|
|
268
61
|
```
|
|
269
62
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
## Built-in Evaluators
|
|
273
|
-
|
|
274
|
-
The framework includes built-in evaluators that can be referenced by name only (no path required):
|
|
275
|
-
|
|
276
|
-
### structured-output-presence
|
|
277
|
-
|
|
278
|
-
- **Type**: Code Evaluator
|
|
279
|
-
- **What it measures**: Checks if `structuredOutput` exists and is a valid object
|
|
280
|
-
- **Evaluation logic**:
|
|
281
|
-
- Verifies presence of `structuredOutput` in query result
|
|
282
|
-
- Confirms it's a non-null object type
|
|
283
|
-
- **Score**: `(validCount / totalRuns) * 10`
|
|
284
|
-
- **Use case**: Verify that the model returns structured JSON output (essential for structured output workflows)
|
|
285
|
-
- **Usage**:
|
|
286
|
-
```yaml
|
|
287
|
-
evaluators:
|
|
288
|
-
- name: "structured-output-presence"
|
|
289
|
-
```
|
|
290
|
-
|
|
291
|
-
### llm-requirement-fulfillment
|
|
292
|
-
|
|
293
|
-
- **Type**: Prompt Evaluator (uses LLM for evaluation)
|
|
294
|
-
- **What it measures**: Uses LLM to comprehensively evaluate whether output meets functional requirements
|
|
295
|
-
- **Evaluation criteria**:
|
|
296
|
-
1. **Requirement Fulfillment**: Does it satisfy the intent described in the prompt?
|
|
297
|
-
2. **Parameter Correctness**: Are all required parameters present and correct?
|
|
298
|
-
3. **Parameter Completeness**: Are optional parameters appropriately used or omitted?
|
|
299
|
-
4. **Logical Consistency**: Is the output logically consistent with the facts?
|
|
300
|
-
- **Score**: 0-10 overall score with detailed sub-scores for each criterion
|
|
301
|
-
- **Use case**: Comprehensive quality assessment of output (requires evaluation model to be configured)
|
|
302
|
-
- **Usage**:
|
|
303
|
-
```yaml
|
|
304
|
-
evaluators:
|
|
305
|
-
- name: "llm-requirement-fulfillment"
|
|
306
|
-
|
|
307
|
-
evaluation:
|
|
308
|
-
enabled: true
|
|
309
|
-
model: "gemini-fast" # Model used for evaluation
|
|
310
|
-
```
|
|
311
|
-
|
|
312
|
-
**Note**: `llm-requirement-fulfillment` requires an evaluation model to be configured in the `evaluation` section.
|
|
313
|
-
|
|
314
|
-
## Architecture
|
|
315
|
-
|
|
316
|
-
```
|
|
317
|
-
┌─────────────────────────────────────────┐
|
|
318
|
-
│ run-comparison.ts (CLI Entry Point) │
|
|
319
|
-
└─────────────────────────────────────────┘
|
|
320
|
-
│
|
|
321
|
-
┌─────────┼─────────┐
|
|
322
|
-
▼ ▼ ▼
|
|
323
|
-
┌────────┐ ┌────────┐ ┌────────┐
|
|
324
|
-
│ Config │ │ Runner │ │Reporter│
|
|
325
|
-
│ Loader │ │ │ │ │
|
|
326
|
-
└────────┘ └────────┘ └────────┘
|
|
327
|
-
│ │
|
|
328
|
-
▼ ▼
|
|
329
|
-
┌────────┐ ┌────────┐
|
|
330
|
-
│Dynamic │ │Driver │
|
|
331
|
-
│Loader │ │Manager │
|
|
332
|
-
└────────┘ └────────┘
|
|
333
|
-
```
|
|
334
|
-
|
|
335
|
-
### Components
|
|
336
|
-
|
|
337
|
-
| Component | Responsibility |
|
|
338
|
-
|-----------|----------------|
|
|
339
|
-
| `config/loader.ts` | Load YAML configuration |
|
|
340
|
-
| `config/dynamic-loader.ts` | Dynamic module/evaluator loading |
|
|
341
|
-
| `runner/experiment.ts` | Orchestrate experiment execution |
|
|
342
|
-
| `runner/evaluator.ts` | Execute evaluations |
|
|
343
|
-
| `runner/driver-manager.ts` | Cache and manage AI drivers |
|
|
344
|
-
| `reporter/statistics.ts` | Generate statistical reports |
|
|
345
|
-
| `base-evaluation-module.ts` | Base evaluation prompt module |
|
|
346
|
-
| `evaluators/index.ts` | Built-in evaluator registry |
|
|
347
|
-
|
|
348
|
-
## Examples
|
|
349
|
-
|
|
350
|
-
See `examples/experiment.yaml` for a complete configuration template with:
|
|
351
|
-
- Model definitions (MLX, Vertex AI, Google GenAI)
|
|
352
|
-
- Driver configurations with credential paths
|
|
353
|
-
- Evaluation settings
|
|
354
|
-
- Empty sections for modules, test cases, and evaluators (ready for your content)
|
|
355
|
-
|
|
356
|
-
## API
|
|
357
|
-
|
|
358
|
-
### Programmatic Usage
|
|
359
|
-
|
|
360
|
-
```typescript
|
|
361
|
-
import {
|
|
362
|
-
loadExperimentConfig,
|
|
363
|
-
loadModules,
|
|
364
|
-
loadEvaluators,
|
|
365
|
-
ExperimentRunner,
|
|
366
|
-
DriverManager,
|
|
367
|
-
} from '@modular-prompt/experiment';
|
|
368
|
-
|
|
369
|
-
const { serverConfig, aiService } = loadExperimentConfig('config.yaml');
|
|
370
|
-
const modules = await loadModules(moduleRefs, basePath);
|
|
371
|
-
const evaluators = await loadEvaluators(evaluatorRefs, basePath);
|
|
372
|
-
|
|
373
|
-
const driverManager = new DriverManager();
|
|
374
|
-
const runner = new ExperimentRunner(
|
|
375
|
-
aiService,
|
|
376
|
-
driverManager,
|
|
377
|
-
modules,
|
|
378
|
-
testCases,
|
|
379
|
-
models,
|
|
380
|
-
repeatCount,
|
|
381
|
-
evaluators,
|
|
382
|
-
evaluatorModel
|
|
383
|
-
);
|
|
384
|
-
|
|
385
|
-
const results = await runner.run();
|
|
386
|
-
await driverManager.cleanup();
|
|
387
|
-
```
|
|
388
|
-
|
|
389
|
-
## CLI Options
|
|
63
|
+
### 3. 実行
|
|
390
64
|
|
|
65
|
+
```bash
|
|
66
|
+
npx modular-experiment experiment.yaml --dry-run # 確認
|
|
67
|
+
npx modular-experiment experiment.yaml # 実行
|
|
68
|
+
npx modular-experiment experiment.yaml --evaluate # 評価付き
|
|
69
|
+
npx modular-experiment experiment.yaml --repeat 10 # 複数回実行
|
|
391
70
|
```
|
|
392
|
-
Usage: modular-experiment <config> [options]
|
|
393
71
|
|
|
394
|
-
|
|
395
|
-
<config> Config file path (YAML or TypeScript)
|
|
72
|
+
設定ファイルの詳細、評価器の書き方、プログラマティックAPIについては `skills/experiment/SKILL.md` を参照。
|
|
396
73
|
|
|
397
|
-
|
|
398
|
-
--test-case <name> Test case name filter
|
|
399
|
-
--model <provider> Model provider filter
|
|
400
|
-
--modules <names> Comma-separated module names (default: all)
|
|
401
|
-
--repeat <count> Number of repetitions (default: 1)
|
|
402
|
-
--evaluate Enable evaluation phase
|
|
403
|
-
--evaluators <names> Comma-separated evaluator names (default: all)
|
|
404
|
-
--dry-run Display execution plan without running the experiment
|
|
405
|
-
--log-file <path> Log file path for JSONL output (detailed logs)
|
|
406
|
-
--verbose Enable verbose output (show detailed internal operations)
|
|
407
|
-
```
|
|
74
|
+
## Skills (for Claude Code)
|
|
408
75
|
|
|
409
|
-
|
|
76
|
+
This package includes `skills/experiment/SKILL.md`. It can be used as a Claude Code skill to guide experiment framework usage.
|
|
410
77
|
|
|
411
78
|
## License
|
|
412
79
|
|
|
@@ -105,8 +105,8 @@ export async function loadModules(refs, basePath) {
|
|
|
105
105
|
}
|
|
106
106
|
modules.push({
|
|
107
107
|
name: ref.name,
|
|
108
|
-
description: ref.description ||
|
|
109
|
-
|
|
108
|
+
description: ref.description || '',
|
|
109
|
+
module: module,
|
|
110
110
|
});
|
|
111
111
|
}
|
|
112
112
|
return modules;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"dynamic-loader.js","sourceRoot":"","sources":["../../src/config/dynamic-loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,sBAAsB,CAAC;AAC7C,OAAO,EAAE,aAAa,EAAE,MAAM,KAAK,CAAC;AACpC,OAAO,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AAO/B,OAAO,EAAE,oBAAoB,EAAE,MAAM,8BAA8B,CAAC;AACpE,OAAO,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAC;AAC7D,OAAO,EAAE,MAAM,IAAI,UAAU,EAAE,MAAM,cAAc,CAAC;AAEpD,MAAM,MAAM,GAAG,UAAU,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAC;AAapD;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,IAA0B,EAC1B,QAAgB;IAEhB,MAAM,UAAU,GAAsB,EAAE,CAAC;IAEzC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,IAAI,SAAsD,CAAC;QAE3D,IAAI,MAAM,IAAI,GAAG,EAAE,CAAC;YAClB,gBAAgB;YAChB,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;YAC7C,MAAM,OAAO,GAAG,aAAa,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC;YAC7C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,CAAC;YACvC,SAAS,GAAG,QAAQ,CAAC,OAAO,CAAC;YAE7B,IAAI,CAAC,SAAS,EAAE,CAAC;gBACf,MAAM,CAAC,IAAI,CAAC,wBAAwB,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;gBAChD,SAAS;YACX,CAAC;QACH,CAAC;aAAM,IAAI,QAAQ,IAAI,GAAG,EAAE,CAAC;YAC3B,oDAAoD;YACpD,MAAM,YAAY,GAAG,KAAK,CAAC,oBAAoB,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;YAC7D,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,EAAE;gBAClC,IAAI,EAAE,QAAQ;gBACd,eAAe,EAAE;oBACf,IAAI,EAAE,GAAG,CAAC,IAAI;oBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,EAAE;oBAClC,MAAM,EAAE,YAAY;iBACrB;aACF,CAAC,CAAC;YACH,SAAS;QACX,CAAC;aAAM,CAAC;YACN,gCAAgC;YAChC,SAAS,GAAG,mBAAmB,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YAE1C,IAAI,CAAC,SAAS,EAAE,CAAC;gBACf,MAAM,CAAC,IAAI,CAAC,gCAAgC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;gBACxD,SAAS;YACX,CAAC;QACH,CAAC;QAED,qCAAqC;QACrC,IAAI,UAAU,IAAI,SAAS,IAAI,OAAO,SAAS,CAAC,QAAQ,KAAK,UAAU,EAAE,CAAC;YACxE,iBAAiB;YACjB,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,SAAS,CAAC,WAAW,IAAI,EAAE;gBAC3D,IAAI,EAAE,MAAM;gBACZ,aAAa,EAAE,SAA0B;aAC1C,CAAC,CAAC;QACL,CAAC;aAAM,IAAI,QAAQ,IAAI,SAAS,EAAE,CAAC;YACjC,4CAA4C;YAC5C,MAAM,YAAY,GAAG,KAAK,CAAC,oBAAoB,EAAE,SAAS,CAAC,MAAM,CAAC,CAAC;YACnE,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,SAAS,CAAC,WAAW,IAAI,EAAE;gBAC3D,IAAI,EAAE,QAAQ;gBACd,eAAe,EAAE;oBACf,IAAI,EAAE,SAAS,CAAC,IAAI;oBACpB,WAAW,EAAE,SAAS,CAAC,WAAW;oBAClC,MAAM,EAAE,YAAY;iBACrB;aACF,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,IAAI,CAAC,2BAA2B,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QACrD,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC;AAQD;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,IAAuB,EACvB,QAAgB;IAEhB,MAAM,OAAO,GAAuB,EAAE,CAAC;IAEvC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;QAC7C,MAAM,OAAO,GAAG,aAAa,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC;QAC7C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,CAAC;QACvC,MAAM,MAAM,GAAG,QAAQ,CAAC,OAAO,CAAC;QAEhC,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,MAAM,CAAC,IAAI,CAAC,wBAAwB,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;YAChD,SAAS;QACX,CAAC;QAED,OAAO,CAAC,IAAI,CAAC;YACX,IAAI,EAAE,GAAG,CAAC,IAAI;YACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,
|
|
1
|
+
{"version":3,"file":"dynamic-loader.js","sourceRoot":"","sources":["../../src/config/dynamic-loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,sBAAsB,CAAC;AAC7C,OAAO,EAAE,aAAa,EAAE,MAAM,KAAK,CAAC;AACpC,OAAO,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AAO/B,OAAO,EAAE,oBAAoB,EAAE,MAAM,8BAA8B,CAAC;AACpE,OAAO,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAC;AAC7D,OAAO,EAAE,MAAM,IAAI,UAAU,EAAE,MAAM,cAAc,CAAC;AAEpD,MAAM,MAAM,GAAG,UAAU,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAC;AAapD;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,IAA0B,EAC1B,QAAgB;IAEhB,MAAM,UAAU,GAAsB,EAAE,CAAC;IAEzC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,IAAI,SAAsD,CAAC;QAE3D,IAAI,MAAM,IAAI,GAAG,EAAE,CAAC;YAClB,gBAAgB;YAChB,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;YAC7C,MAAM,OAAO,GAAG,aAAa,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC;YAC7C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,CAAC;YACvC,SAAS,GAAG,QAAQ,CAAC,OAAO,CAAC;YAE7B,IAAI,CAAC,SAAS,EAAE,CAAC;gBACf,MAAM,CAAC,IAAI,CAAC,wBAAwB,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;gBAChD,SAAS;YACX,CAAC;QACH,CAAC;aAAM,IAAI,QAAQ,IAAI,GAAG,EAAE,CAAC;YAC3B,oDAAoD;YACpD,MAAM,YAAY,GAAG,KAAK,CAAC,oBAAoB,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;YAC7D,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,EAAE;gBAClC,IAAI,EAAE,QAAQ;gBACd,eAAe,EAAE;oBACf,IAAI,EAAE,GAAG,CAAC,IAAI;oBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,EAAE;oBAClC,MAAM,EAAE,YAAY;iBACrB;aACF,CAAC,CAAC;YACH,SAAS;QACX,CAAC;aAAM,CAAC;YACN,gCAAgC;YAChC,SAAS,GAAG,mBAAmB,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YAE1C,IAAI,CAAC,SAAS,EAAE,CAAC;gBACf,MAAM,CAAC,IAAI,CAAC,gCAAgC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;gBACxD,SAAS;YACX,CAAC;QACH,CAAC;QAED,qCAAqC;QACrC,IAAI,UAAU,IAAI,SAAS,IAAI,OAAO,SAAS,CAAC,QAAQ,KAAK,UAAU,EAAE,CAAC;YACxE,iBAAiB;YACjB,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,SAAS,CAAC,WAAW,IAAI,EAAE;gBAC3D,IAAI,EAAE,MAAM;gBACZ,aAAa,EAAE,SAA0B;aAC1C,CAAC,CAAC;QACL,CAAC;aAAM,IAAI,QAAQ,IAAI,SAAS,EAAE,CAAC;YACjC,4CAA4C;YAC5C,MAAM,YAAY,GAAG,KAAK,CAAC,oBAAoB,EAAE,SAAS,CAAC,MAAM,CAAC,CAAC;YACnE,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,SAAS,CAAC,WAAW,IAAI,EAAE;gBAC3D,IAAI,EAAE,QAAQ;gBACd,eAAe,EAAE;oBACf,IAAI,EAAE,SAAS,CAAC,IAAI;oBACpB,WAAW,EAAE,SAAS,CAAC,WAAW;oBAClC,MAAM,EAAE,YAAY;iBACrB;aACF,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,IAAI,CAAC,2BAA2B,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QACrD,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC;AAQD;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,IAAuB,EACvB,QAAgB;IAEhB,MAAM,OAAO,GAAuB,EAAE,CAAC;IAEvC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;QAC7C,MAAM,OAAO,GAAG,aAAa,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC;QAC7C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,CAAC;QACvC,MAAM,MAAM,GAAG,QAAQ,CAAC,OAAO,CAAC;QAEhC,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,MAAM,CAAC,IAAI,CAAC,wBAAwB,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;YAChD,SAAS;QACX,CAAC;QAED,OAAO,CAAC,IAAI,CAAC;YACX,IAAI,EAAE,GAAG,CAAC,IAAI;YACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,EAAE;YAClC,MAAM,EAAE,MAAM;SACf,CAAC,CAAC;IACL,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
|
@@ -24,6 +24,14 @@ export declare class ExperimentRunner {
|
|
|
24
24
|
* @returns Array of TestResult
|
|
25
25
|
*/
|
|
26
26
|
run(): Promise<TestResult[]>;
|
|
27
|
+
/**
|
|
28
|
+
* Build test plan: expand all testCase × model × module combinations
|
|
29
|
+
*/
|
|
30
|
+
private buildTestPlan;
|
|
31
|
+
/**
|
|
32
|
+
* Execute test plan grouped by model
|
|
33
|
+
*/
|
|
34
|
+
private executePlan;
|
|
27
35
|
/**
|
|
28
36
|
* Run module test with multiple repetitions
|
|
29
37
|
*/
|
|
@@ -32,9 +40,5 @@ export declare class ExperimentRunner {
|
|
|
32
40
|
* Run evaluation phase
|
|
33
41
|
*/
|
|
34
42
|
private runEvaluationPhase;
|
|
35
|
-
/**
|
|
36
|
-
* Compare prompts across modules
|
|
37
|
-
*/
|
|
38
|
-
private comparePrompts;
|
|
39
43
|
}
|
|
40
44
|
//# sourceMappingURL=experiment.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"experiment.d.ts","sourceRoot":"","sources":["../../src/runner/experiment.ts"],"names":[],"mappings":"AAAA;;GAEG;
|
|
1
|
+
{"version":3,"file":"experiment.d.ts","sourceRoot":"","sources":["../../src/runner/experiment.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,KAAK,EAAE,SAAS,EAAe,SAAS,EAAY,MAAM,wBAAwB,CAAC;AAE1F,OAAO,KAAK,EAAE,gBAAgB,EAAE,UAAU,EAAE,QAAQ,EAAuC,MAAM,aAAa,CAAC;AAC/G,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACzD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAenE,qBAAa,gBAAgB;IAEzB,OAAO,CAAC,SAAS;IACjB,OAAO,CAAC,aAAa;IACrB,OAAO,CAAC,OAAO;IACf,OAAO,CAAC,SAAS;IACjB,OAAO,CAAC,MAAM;IACd,OAAO,CAAC,WAAW;IACnB,OAAO,CAAC,UAAU,CAAC;IACnB,OAAO,CAAC,cAAc,CAAC;gBAPf,SAAS,EAAE,SAAS,EACpB,aAAa,EAAE,aAAa,EAC5B,OAAO,EAAE,gBAAgB,EAAE,EAC3B,SAAS,EAAE,QAAQ,EAAE,EACrB,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,EACjC,WAAW,EAAE,MAAM,EACnB,UAAU,CAAC,EAAE,eAAe,EAAE,YAAA,EAC9B,cAAc,CAAC,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,SAAS,CAAA;KAAE,YAAA;IAG5D;;;;OAIG;IACG,GAAG,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;IAmBlC;;OAEG;IACH,OAAO,CAAC,aAAa;IAyCrB;;OAEG;YACW,WAAW;IAmFzB;;OAEG;YACW,aAAa;IAuE3B;;OAEG;YACW,kBAAkB;CA2BjC"}
|