@modular-prompt/experiment 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -4
- package/dist/config/dynamic-loader.js +2 -2
- package/dist/config/dynamic-loader.js.map +1 -1
- package/dist/runner/experiment.d.ts +8 -4
- package/dist/runner/experiment.d.ts.map +1 -1
- package/dist/runner/experiment.js +112 -106
- package/dist/runner/experiment.js.map +1 -1
- package/dist/types.d.ts +1 -1
- package/dist/types.d.ts.map +1 -1
- package/examples/tools-experiment.yaml +41 -52
- package/examples/tools-test-module.mjs +18 -24
- package/examples/tools-test-module.ts +19 -23
- package/package.json +5 -4
- package/skills/experiment/SKILL.md +35 -14
package/README.md
CHANGED
|
@@ -48,12 +48,16 @@ evaluators: []
|
|
|
48
48
|
|
|
49
49
|
```typescript
|
|
50
50
|
// my-module.ts
|
|
51
|
-
import {
|
|
51
|
+
import type { PromptModule } from '@modular-prompt/core';
|
|
52
52
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
53
|
+
const module: PromptModule<{ query: string }> = {
|
|
54
|
+
objective: ['ユーザーの質問に回答する'],
|
|
55
|
+
instructions: [
|
|
56
|
+
(ctx) => `質問: ${ctx.query}`,
|
|
57
|
+
],
|
|
56
58
|
};
|
|
59
|
+
|
|
60
|
+
export default module;
|
|
57
61
|
```
|
|
58
62
|
|
|
59
63
|
### 3. 実行
|
|
@@ -105,8 +105,8 @@ export async function loadModules(refs, basePath) {
|
|
|
105
105
|
}
|
|
106
106
|
modules.push({
|
|
107
107
|
name: ref.name,
|
|
108
|
-
description: ref.description ||
|
|
109
|
-
|
|
108
|
+
description: ref.description || '',
|
|
109
|
+
module: module,
|
|
110
110
|
});
|
|
111
111
|
}
|
|
112
112
|
return modules;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"dynamic-loader.js","sourceRoot":"","sources":["../../src/config/dynamic-loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,sBAAsB,CAAC;AAC7C,OAAO,EAAE,aAAa,EAAE,MAAM,KAAK,CAAC;AACpC,OAAO,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AAO/B,OAAO,EAAE,oBAAoB,EAAE,MAAM,8BAA8B,CAAC;AACpE,OAAO,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAC;AAC7D,OAAO,EAAE,MAAM,IAAI,UAAU,EAAE,MAAM,cAAc,CAAC;AAEpD,MAAM,MAAM,GAAG,UAAU,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAC;AAapD;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,IAA0B,EAC1B,QAAgB;IAEhB,MAAM,UAAU,GAAsB,EAAE,CAAC;IAEzC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,IAAI,SAAsD,CAAC;QAE3D,IAAI,MAAM,IAAI,GAAG,EAAE,CAAC;YAClB,gBAAgB;YAChB,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;YAC7C,MAAM,OAAO,GAAG,aAAa,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC;YAC7C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,CAAC;YACvC,SAAS,GAAG,QAAQ,CAAC,OAAO,CAAC;YAE7B,IAAI,CAAC,SAAS,EAAE,CAAC;gBACf,MAAM,CAAC,IAAI,CAAC,wBAAwB,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;gBAChD,SAAS;YACX,CAAC;QACH,CAAC;aAAM,IAAI,QAAQ,IAAI,GAAG,EAAE,CAAC;YAC3B,oDAAoD;YACpD,MAAM,YAAY,GAAG,KAAK,CAAC,oBAAoB,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;YAC7D,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,EAAE;gBAClC,IAAI,EAAE,QAAQ;gBACd,eAAe,EAAE;oBACf,IAAI,EAAE,GAAG,CAAC,IAAI;oBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,EAAE;oBAClC,MAAM,EAAE,YAAY;iBACrB;aACF,CAAC,CAAC;YACH,SAAS;QACX,CAAC;aAAM,CAAC;YACN,gCAAgC;YAChC,SAAS,GAAG,mBAAmB,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YAE1C,IAAI,CAAC,SAAS,EAAE,CAAC;gBACf,MAAM,CAAC,IAAI,CAAC,gCAAgC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;gBACxD,SAAS;YACX,CAAC;QACH,CAAC;QAED,qCAAqC;QACrC,IAAI,UAAU,IAAI,SAAS,IAAI,OAAO,SAAS,CAAC,QAAQ,KAAK,UAAU,EAAE,CAAC;YACxE,iBAAiB;YACjB,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,SAAS,CAAC,WAAW,IAAI,EAAE;gBAC3D,IAAI,EAAE,MAAM;gBACZ,aAAa,EAAE,SAA0B;aAC1C,CAAC,CAAC;QACL,CAAC;aAAM,IAAI,QAAQ,IAAI,SAAS,EAAE,CAAC;YACjC,4CAA4C;YAC5C,MAAM,YAAY,GAAG,KAAK,CAAC,oBAAoB,EAAE,SAAS,CAAC,MAAM,CAAC,CAAC;YACnE,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,SAAS,CAAC,WAAW,IAAI,EAAE;gBAC3D,IAAI,EAAE,QAAQ;gBACd,eAAe,EAAE;oBACf,IAAI,EAAE,SAAS,CAAC,IAAI;oBACpB,WAAW,EAAE,SAAS,CAAC,WAAW;oBAClC,MAAM,EAAE,YAAY;iBACrB;aACF,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,IAAI,CAAC,2BAA2B,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QACrD,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC;AAQD;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,IAAuB,EACvB,QAAgB;IAEhB,MAAM,OAAO,GAAuB,EAAE,CAAC;IAEvC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;QAC7C,MAAM,OAAO,GAAG,aAAa,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC;QAC7C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,CAAC;QACvC,MAAM,MAAM,GAAG,QAAQ,CAAC,OAAO,CAAC;QAEhC,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,MAAM,CAAC,IAAI,CAAC,wBAAwB,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;YAChD,SAAS;QACX,CAAC;QAED,OAAO,CAAC,IAAI,CAAC;YACX,IAAI,EAAE,GAAG,CAAC,IAAI;YACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,
|
|
1
|
+
{"version":3,"file":"dynamic-loader.js","sourceRoot":"","sources":["../../src/config/dynamic-loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,sBAAsB,CAAC;AAC7C,OAAO,EAAE,aAAa,EAAE,MAAM,KAAK,CAAC;AACpC,OAAO,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AAO/B,OAAO,EAAE,oBAAoB,EAAE,MAAM,8BAA8B,CAAC;AACpE,OAAO,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAC;AAC7D,OAAO,EAAE,MAAM,IAAI,UAAU,EAAE,MAAM,cAAc,CAAC;AAEpD,MAAM,MAAM,GAAG,UAAU,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAC;AAapD;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,IAA0B,EAC1B,QAAgB;IAEhB,MAAM,UAAU,GAAsB,EAAE,CAAC;IAEzC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,IAAI,SAAsD,CAAC;QAE3D,IAAI,MAAM,IAAI,GAAG,EAAE,CAAC;YAClB,gBAAgB;YAChB,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;YAC7C,MAAM,OAAO,GAAG,aAAa,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC;YAC7C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,CAAC;YACvC,SAAS,GAAG,QAAQ,CAAC,OAAO,CAAC;YAE7B,IAAI,CAAC,SAAS,EAAE,CAAC;gBACf,MAAM,CAAC,IAAI,CAAC,wBAAwB,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;gBAChD,SAAS;YACX,CAAC;QACH,CAAC;aAAM,IAAI,QAAQ,IAAI,GAAG,EAAE,CAAC;YAC3B,oDAAoD;YACpD,MAAM,YAAY,GAAG,KAAK,CAAC,oBAAoB,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;YAC7D,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,EAAE;gBAClC,IAAI,EAAE,QAAQ;gBACd,eAAe,EAAE;oBACf,IAAI,EAAE,GAAG,CAAC,IAAI;oBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,EAAE;oBAClC,MAAM,EAAE,YAAY;iBACrB;aACF,CAAC,CAAC;YACH,SAAS;QACX,CAAC;aAAM,CAAC;YACN,gCAAgC;YAChC,SAAS,GAAG,mBAAmB,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YAE1C,IAAI,CAAC,SAAS,EAAE,CAAC;gBACf,MAAM,CAAC,IAAI,CAAC,gCAAgC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;gBACxD,SAAS;YACX,CAAC;QACH,CAAC;QAED,qCAAqC;QACrC,IAAI,UAAU,IAAI,SAAS,IAAI,OAAO,SAAS,CAAC,QAAQ,KAAK,UAAU,EAAE,CAAC;YACxE,iBAAiB;YACjB,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,SAAS,CAAC,WAAW,IAAI,EAAE;gBAC3D,IAAI,EAAE,MAAM;gBACZ,aAAa,EAAE,SAA0B;aAC1C,CAAC,CAAC;QACL,CAAC;aAAM,IAAI,QAAQ,IAAI,SAAS,EAAE,CAAC;YACjC,4CAA4C;YAC5C,MAAM,YAAY,GAAG,KAAK,CAAC,oBAAoB,EAAE,SAAS,CAAC,MAAM,CAAC,CAAC;YACnE,UAAU,CAAC,IAAI,CAAC;gBACd,IAAI,EAAE,GAAG,CAAC,IAAI;gBACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,SAAS,CAAC,WAAW,IAAI,EAAE;gBAC3D,IAAI,EAAE,QAAQ;gBACd,eAAe,EAAE;oBACf,IAAI,EAAE,SAAS,CAAC,IAAI;oBACpB,WAAW,EAAE,SAAS,CAAC,WAAW;oBAClC,MAAM,EAAE,YAAY;iBACrB;aACF,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,IAAI,CAAC,2BAA2B,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QACrD,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC;AAQD;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,IAAuB,EACvB,QAAgB;IAEhB,MAAM,OAAO,GAAuB,EAAE,CAAC;IAEvC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;QAC7C,MAAM,OAAO,GAAG,aAAa,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC;QAC7C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,OAAO,CAAC,CAAC;QACvC,MAAM,MAAM,GAAG,QAAQ,CAAC,OAAO,CAAC;QAEhC,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,MAAM,CAAC,IAAI,CAAC,wBAAwB,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;YAChD,SAAS;QACX,CAAC;QAED,OAAO,CAAC,IAAI,CAAC;YACX,IAAI,EAAE,GAAG,CAAC,IAAI;YACd,WAAW,EAAE,GAAG,CAAC,WAAW,IAAI,EAAE;YAClC,MAAM,EAAE,MAAM;SACf,CAAC,CAAC;IACL,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
|
@@ -24,6 +24,14 @@ export declare class ExperimentRunner {
|
|
|
24
24
|
* @returns Array of TestResult
|
|
25
25
|
*/
|
|
26
26
|
run(): Promise<TestResult[]>;
|
|
27
|
+
/**
|
|
28
|
+
* Build test plan: expand all testCase × model × module combinations
|
|
29
|
+
*/
|
|
30
|
+
private buildTestPlan;
|
|
31
|
+
/**
|
|
32
|
+
* Execute test plan grouped by model
|
|
33
|
+
*/
|
|
34
|
+
private executePlan;
|
|
27
35
|
/**
|
|
28
36
|
* Run module test with multiple repetitions
|
|
29
37
|
*/
|
|
@@ -32,9 +40,5 @@ export declare class ExperimentRunner {
|
|
|
32
40
|
* Run evaluation phase
|
|
33
41
|
*/
|
|
34
42
|
private runEvaluationPhase;
|
|
35
|
-
/**
|
|
36
|
-
* Compare prompts across modules
|
|
37
|
-
*/
|
|
38
|
-
private comparePrompts;
|
|
39
43
|
}
|
|
40
44
|
//# sourceMappingURL=experiment.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"experiment.d.ts","sourceRoot":"","sources":["../../src/runner/experiment.ts"],"names":[],"mappings":"AAAA;;GAEG;
|
|
1
|
+
{"version":3,"file":"experiment.d.ts","sourceRoot":"","sources":["../../src/runner/experiment.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,KAAK,EAAE,SAAS,EAAe,SAAS,EAAY,MAAM,wBAAwB,CAAC;AAE1F,OAAO,KAAK,EAAE,gBAAgB,EAAE,UAAU,EAAE,QAAQ,EAAuC,MAAM,aAAa,CAAC;AAC/G,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACzD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,6BAA6B,CAAC;AAenE,qBAAa,gBAAgB;IAEzB,OAAO,CAAC,SAAS;IACjB,OAAO,CAAC,aAAa;IACrB,OAAO,CAAC,OAAO;IACf,OAAO,CAAC,SAAS;IACjB,OAAO,CAAC,MAAM;IACd,OAAO,CAAC,WAAW;IACnB,OAAO,CAAC,UAAU,CAAC;IACnB,OAAO,CAAC,cAAc,CAAC;gBAPf,SAAS,EAAE,SAAS,EACpB,aAAa,EAAE,aAAa,EAC5B,OAAO,EAAE,gBAAgB,EAAE,EAC3B,SAAS,EAAE,QAAQ,EAAE,EACrB,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,EACjC,WAAW,EAAE,MAAM,EACnB,UAAU,CAAC,EAAE,eAAe,EAAE,YAAA,EAC9B,cAAc,CAAC,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,SAAS,CAAA;KAAE,YAAA;IAG5D;;;;OAIG;IACG,GAAG,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;IAmBlC;;OAEG;IACH,OAAO,CAAC,aAAa;IAyCrB;;OAEG;YACW,WAAW;IAmFzB;;OAEG;YACW,aAAa;IAuE3B;;OAEG;YACW,kBAAkB;CA2BjC"}
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Experiment runner - orchestrates the entire experiment
|
|
3
3
|
*/
|
|
4
|
+
import { compile } from '@modular-prompt/core';
|
|
4
5
|
import { formatCompletionPrompt } from '@modular-prompt/driver';
|
|
6
|
+
import { defaultProcess } from '@modular-prompt/process';
|
|
5
7
|
import { EvaluatorRunner } from './evaluator.js';
|
|
6
8
|
import { logger as baseLogger } from '../logger.js';
|
|
7
9
|
const logger = baseLogger.context('runner');
|
|
@@ -30,35 +32,28 @@ export class ExperimentRunner {
|
|
|
30
32
|
* @returns Array of TestResult
|
|
31
33
|
*/
|
|
32
34
|
async run() {
|
|
33
|
-
|
|
34
|
-
const
|
|
35
|
-
|
|
36
|
-
|
|
35
|
+
// Phase 1: テスト計画の生成
|
|
36
|
+
const plan = this.buildTestPlan();
|
|
37
|
+
if (plan.length === 0) {
|
|
38
|
+
console.log('No test plan items generated.');
|
|
39
|
+
return [];
|
|
40
|
+
}
|
|
41
|
+
// Phase 2: モデルごとにグループ化して実行
|
|
42
|
+
const { results, evaluationContexts } = await this.executePlan(plan);
|
|
43
|
+
// Phase 3: 評価フェーズ
|
|
44
|
+
if (this.evaluators && this.evaluators.length > 0 && this.evaluatorModel) {
|
|
45
|
+
await this.runEvaluationPhase(evaluationContexts);
|
|
46
|
+
}
|
|
47
|
+
return results;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Build test plan: expand all testCase × model × module combinations
|
|
51
|
+
*/
|
|
52
|
+
buildTestPlan() {
|
|
53
|
+
const plan = [];
|
|
54
|
+
let order = 0;
|
|
37
55
|
for (const testCase of this.testCases) {
|
|
38
|
-
|
|
39
|
-
console.log(`Test Case: ${testCase.name}`);
|
|
40
|
-
if (testCase.description) {
|
|
41
|
-
console.log(`Description: ${testCase.description}`);
|
|
42
|
-
}
|
|
43
|
-
console.log('─'.repeat(80));
|
|
44
|
-
console.log();
|
|
45
|
-
// Compile all modules with testCase.input as context
|
|
46
|
-
const compiledModules = this.modules.map(module => {
|
|
47
|
-
logger.verbose(`Compiling prompt for module: ${module.name}`);
|
|
48
|
-
const compiled = module.compile(testCase.input);
|
|
49
|
-
const prompt = formatCompletionPrompt(compiled);
|
|
50
|
-
logger.verbose(`Prompt length for ${module.name}: ${prompt.length} chars`);
|
|
51
|
-
return {
|
|
52
|
-
name: module.name,
|
|
53
|
-
compiled,
|
|
54
|
-
prompt,
|
|
55
|
-
};
|
|
56
|
-
});
|
|
57
|
-
// Compare prompts if multiple modules
|
|
58
|
-
if (compiledModules.length > 1) {
|
|
59
|
-
this.comparePrompts(compiledModules);
|
|
60
|
-
}
|
|
61
|
-
// Determine which models to test with this testCase
|
|
56
|
+
// テストケースで使うモデルを決定
|
|
62
57
|
const modelsToTest = testCase.models
|
|
63
58
|
? testCase.models.map(name => {
|
|
64
59
|
const spec = this.models[name];
|
|
@@ -71,36 +66,62 @@ export class ExperimentRunner {
|
|
|
71
66
|
: Object.entries(this.models)
|
|
72
67
|
.filter(([_, spec]) => !spec.disabled)
|
|
73
68
|
.map(([name, spec]) => ({ name, spec }));
|
|
74
|
-
if (modelsToTest.length === 0) {
|
|
75
|
-
console.log('⚠️ No models to test for this test case, skipping');
|
|
76
|
-
console.log();
|
|
77
|
-
continue;
|
|
78
|
-
}
|
|
79
|
-
// Test with each model
|
|
80
69
|
for (const { name: modelName, spec: modelSpec } of modelsToTest) {
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
70
|
+
for (const module of this.modules) {
|
|
71
|
+
// compile for logging/evaluation purposes
|
|
72
|
+
const compiled = compile(module.module, testCase.input);
|
|
73
|
+
const prompt = formatCompletionPrompt(compiled);
|
|
74
|
+
plan.push({
|
|
75
|
+
order: order++,
|
|
76
|
+
testCase,
|
|
77
|
+
modelName,
|
|
78
|
+
modelSpec,
|
|
79
|
+
module,
|
|
80
|
+
prompt,
|
|
81
|
+
});
|
|
89
82
|
}
|
|
90
|
-
|
|
91
|
-
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
logger.info(`Test plan: ${plan.length} items (${this.testCases.length} test cases × models × ${this.modules.length} modules)`);
|
|
86
|
+
return plan;
|
|
87
|
+
}
|
|
88
|
+
/**
|
|
89
|
+
* Execute test plan grouped by model
|
|
90
|
+
*/
|
|
91
|
+
async executePlan(plan) {
|
|
92
|
+
const allResults = [];
|
|
93
|
+
const allEvalContexts = [];
|
|
94
|
+
// モデルごとにグループ化(出現順を維持)
|
|
95
|
+
const modelGroups = new Map();
|
|
96
|
+
for (const item of plan) {
|
|
97
|
+
const group = modelGroups.get(item.modelName);
|
|
98
|
+
if (group) {
|
|
99
|
+
group.push(item);
|
|
100
|
+
}
|
|
101
|
+
else {
|
|
102
|
+
modelGroups.set(item.modelName, [item]);
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
// モデルごとに実行
|
|
106
|
+
for (const [modelName, items] of modelGroups) {
|
|
107
|
+
const modelSpec = items[0].modelSpec;
|
|
108
|
+
console.log('='.repeat(80));
|
|
109
|
+
console.log(`🤖 Model: ${modelName} (${modelSpec.provider}:${modelSpec.model})`);
|
|
110
|
+
console.log('='.repeat(80));
|
|
111
|
+
logger.info(`Creating driver for ${modelName} (${modelSpec.provider}:${modelSpec.model})`);
|
|
112
|
+
const driver = await this.driverManager.getOrCreate(this.aiService, modelName, modelSpec);
|
|
113
|
+
for (const item of items) {
|
|
114
|
+
console.log(` ── ${item.testCase.name} ──`);
|
|
115
|
+
if (item.testCase.description) {
|
|
116
|
+
console.log(` ${item.testCase.description}`);
|
|
92
117
|
}
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
for (const { name, compiled, prompt } of compiledModules) {
|
|
99
|
-
const runs = await this.runModuleTest(name, compiled, driver, testCase);
|
|
100
|
-
allResults.push({
|
|
101
|
-
testCase: testCase.name,
|
|
118
|
+
const runs = await this.runModuleTest(item.module.name, item.module.module, driver, item.testCase);
|
|
119
|
+
allResults.push({
|
|
120
|
+
order: item.order,
|
|
121
|
+
result: {
|
|
122
|
+
testCase: item.testCase.name,
|
|
102
123
|
model: modelName,
|
|
103
|
-
module: name,
|
|
124
|
+
module: item.module.name,
|
|
104
125
|
runs: runs.map(r => ({
|
|
105
126
|
success: r.success,
|
|
106
127
|
elapsed: r.elapsed,
|
|
@@ -109,44 +130,61 @@ export class ExperimentRunner {
|
|
|
109
130
|
finishReason: r.queryResult?.finishReason,
|
|
110
131
|
error: r.error,
|
|
111
132
|
})),
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
133
|
+
},
|
|
134
|
+
});
|
|
135
|
+
// Collect for evaluation
|
|
136
|
+
const successfulRuns = runs.filter(r => r.success);
|
|
137
|
+
if (successfulRuns.length > 0) {
|
|
138
|
+
allEvalContexts.push({
|
|
139
|
+
order: item.order,
|
|
140
|
+
context: {
|
|
141
|
+
moduleName: item.module.name,
|
|
142
|
+
prompt: item.prompt,
|
|
119
143
|
runs: successfulRuns.map(r => ({ queryResult: r.queryResult })),
|
|
120
|
-
}
|
|
121
|
-
}
|
|
144
|
+
},
|
|
145
|
+
});
|
|
122
146
|
}
|
|
123
147
|
}
|
|
148
|
+
// モデルの全テスト完了後にドライバーをクローズ
|
|
149
|
+
logger.info(`Closing driver: ${modelName}`);
|
|
150
|
+
await this.driverManager.close(modelName);
|
|
151
|
+
console.log();
|
|
124
152
|
}
|
|
125
|
-
//
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
153
|
+
// Retire: 元の定義順にソートして返す
|
|
154
|
+
allResults.sort((a, b) => a.order - b.order);
|
|
155
|
+
allEvalContexts.sort((a, b) => a.order - b.order);
|
|
156
|
+
return {
|
|
157
|
+
results: allResults.map(r => r.result),
|
|
158
|
+
evaluationContexts: allEvalContexts.map(e => e.context),
|
|
159
|
+
};
|
|
130
160
|
}
|
|
131
161
|
/**
|
|
132
162
|
* Run module test with multiple repetitions
|
|
133
163
|
*/
|
|
134
|
-
async runModuleTest(moduleName,
|
|
164
|
+
async runModuleTest(moduleName, module, driver, testCase) {
|
|
135
165
|
logger.verbose(`Running ${this.repeatCount} time(s) for module: ${moduleName}`);
|
|
136
166
|
const runs = [];
|
|
137
167
|
for (let i = 0; i < this.repeatCount; i++) {
|
|
138
168
|
logger.verbose(`Run ${i + 1}/${this.repeatCount} for module: ${moduleName}`);
|
|
139
169
|
const startTime = Date.now();
|
|
140
170
|
try {
|
|
141
|
-
const
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
171
|
+
const workflowResult = await defaultProcess(driver, module, testCase.input, {
|
|
172
|
+
queryOptions: {
|
|
173
|
+
temperature: 0.7,
|
|
174
|
+
maxTokens: 2048,
|
|
175
|
+
...testCase.queryOptions,
|
|
176
|
+
},
|
|
145
177
|
});
|
|
146
178
|
const elapsed = Date.now() - startTime;
|
|
179
|
+
// Convert workflow result to QueryResult-like structure
|
|
180
|
+
const result = {
|
|
181
|
+
content: workflowResult.output,
|
|
182
|
+
toolCalls: workflowResult.metadata?.toolCalls,
|
|
183
|
+
finishReason: workflowResult.metadata?.finishReason,
|
|
184
|
+
usage: workflowResult.metadata?.usage,
|
|
185
|
+
};
|
|
147
186
|
logger.verbose(`Module ${moduleName} run ${i + 1}: Success (${elapsed}ms)`);
|
|
148
187
|
// Display result summary (思考ブロックはプレビューから除外)
|
|
149
|
-
// パターン: <think>...</think> または 先頭から</think>まで(テンプレートが<think>を付与する場合)
|
|
150
188
|
const displayContent = result.content
|
|
151
189
|
.replace(/<think>[\s\S]*?<\/think>\s*/g, '')
|
|
152
190
|
.replace(/^[\s\S]*?<\/think>\s*/g, '');
|
|
@@ -204,37 +242,5 @@ export class ExperimentRunner {
|
|
|
204
242
|
// Display all evaluation results
|
|
205
243
|
evaluatorRunner.displayResults(allEvaluations, this.evaluators);
|
|
206
244
|
}
|
|
207
|
-
/**
|
|
208
|
-
* Compare prompts across modules
|
|
209
|
-
*/
|
|
210
|
-
comparePrompts(compiledModules) {
|
|
211
|
-
console.log('📊 Prompt Comparison:');
|
|
212
|
-
for (let i = 0; i < compiledModules.length; i++) {
|
|
213
|
-
const module1 = compiledModules[i];
|
|
214
|
-
for (let j = i + 1; j < compiledModules.length; j++) {
|
|
215
|
-
const module2 = compiledModules[j];
|
|
216
|
-
if (module1.prompt === module2.prompt) {
|
|
217
|
-
console.log(` ✅ [${module1.name}] and [${module2.name}] are identical`);
|
|
218
|
-
}
|
|
219
|
-
else {
|
|
220
|
-
console.log(` ⚠️ [${module1.name}] and [${module2.name}] differ`);
|
|
221
|
-
logger.verbose(`Prompt comparison details:`);
|
|
222
|
-
logger.verbose(` ${module1.name}: ${module1.prompt.length} chars`);
|
|
223
|
-
logger.verbose(` ${module2.name}: ${module2.prompt.length} chars`);
|
|
224
|
-
logger.verbose(` Diff: ${module2.prompt.length - module1.prompt.length} chars`);
|
|
225
|
-
// Find first difference (verbose only)
|
|
226
|
-
for (let k = 0; k < Math.max(module1.prompt.length, module2.prompt.length); k++) {
|
|
227
|
-
if (module1.prompt[k] !== module2.prompt[k]) {
|
|
228
|
-
logger.verbose(` First diff at position ${k}:`);
|
|
229
|
-
logger.verbose(` ${module1.name}: ${JSON.stringify(module1.prompt.substring(k, k + 50))}`);
|
|
230
|
-
logger.verbose(` ${module2.name}: ${JSON.stringify(module2.prompt.substring(k, k + 50))}`);
|
|
231
|
-
break;
|
|
232
|
-
}
|
|
233
|
-
}
|
|
234
|
-
}
|
|
235
|
-
}
|
|
236
|
-
}
|
|
237
|
-
console.log();
|
|
238
|
-
}
|
|
239
245
|
}
|
|
240
246
|
//# sourceMappingURL=experiment.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"experiment.js","sourceRoot":"","sources":["../../src/runner/experiment.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,sBAAsB,EAAE,MAAM,wBAAwB,CAAC;
|
|
1
|
+
{"version":3,"file":"experiment.js","sourceRoot":"","sources":["../../src/runner/experiment.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,sBAAsB,CAAC;AAE/C,OAAO,EAAE,sBAAsB,EAAE,MAAM,wBAAwB,CAAC;AAEhE,OAAO,EAAE,cAAc,EAAE,MAAM,yBAAyB,CAAC;AAIzD,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,MAAM,IAAI,UAAU,EAAE,MAAM,cAAc,CAAC;AAEpD,MAAM,MAAM,GAAG,UAAU,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;AAW5C,MAAM,OAAO,gBAAgB;IAEjB;IACA;IACA;IACA;IACA;IACA;IACA;IACA;IARV,YACU,SAAoB,EACpB,aAA4B,EAC5B,OAA2B,EAC3B,SAAqB,EACrB,MAAiC,EACjC,WAAmB,EACnB,UAA8B,EAC9B,cAAkD;QAPlD,cAAS,GAAT,SAAS,CAAW;QACpB,kBAAa,GAAb,aAAa,CAAe;QAC5B,YAAO,GAAP,OAAO,CAAoB;QAC3B,cAAS,GAAT,SAAS,CAAY;QACrB,WAAM,GAAN,MAAM,CAA2B;QACjC,gBAAW,GAAX,WAAW,CAAQ;QACnB,eAAU,GAAV,UAAU,CAAoB;QAC9B,mBAAc,GAAd,cAAc,CAAoC;IACzD,CAAC;IAEJ;;;;OAIG;IACH,KAAK,CAAC,GAAG;QACP,oBAAoB;QACpB,MAAM,IAAI,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC;QAClC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACtB,OAAO,CAAC,GAAG,CAAC,+BAA+B,CAAC,CAAC;YAC7C,OAAO,EAAE,CAAC;QACZ,CAAC;QAED,2BAA2B;QAC3B,MAAM,EAAE,OAAO,EAAE,kBAAkB,EAAE,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QAErE,kBAAkB;QAClB,IAAI,IAAI,CAAC,UAAU,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,IAAI,IAAI,CAAC,cAAc,EAAE,CAAC;YACzE,MAAM,IAAI,CAAC,kBAAkB,CAAC,kBAAkB,CAAC,CAAC;QACpD,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACK,aAAa;QACnB,MAAM,IAAI,GAAmB,EAAE,CAAC;QAChC,IAAI,KAAK,GAAG,CAAC,CAAC;QAEd,KAAK,MAAM,QAAQ,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;YACtC,kBAAkB;YAClB,MAAM,YAAY,GAA6C,QAAQ,CAAC,MAAM;gBAC5E,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE;oBACzB,MAAM,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;oBAC/B,IAAI,CAAC,IAAI,EAAE,CAAC;wBACV,OAAO,CAAC,IAAI,CAAC,cAAc,IAAI,wCAAwC,CAAC,CAAC;wBACzE,OAAO,IAAI,CAAC;oBACd,CAAC;oBACD,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;gBACxB,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAA6C;gBAChE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC;qBACxB,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC;qBACrC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,IAAI,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;YAE/C,KAAK,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,YAAY,EAAE,CAAC;gBAChE,KAAK,MAAM,MAAM,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;oBAClC,0CAA0C;oBAC1C,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,QAAQ,CAAC,KAAK,CAAC,CAAC;oBACxD,MAAM,MAAM,GAAG,sBAAsB,CAAC,QAAQ,CAAC,CAAC;oBAEhD,IAAI,CAAC,IAAI,CAAC;wBACR,KAAK,EAAE,KAAK,EAAE;wBACd,QAAQ;wBACR,SAAS;wBACT,SAAS;wBACT,MAAM;wBACN,MAAM;qBACP,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;QACH,CAAC;QAED,MAAM,CAAC,IAAI,CAAC,cAAc,IAAI,CAAC,MAAM,WAAW,IAAI,CAAC,SAAS,CAAC,MAAM,0BAA0B,IAAI,CAAC,OAAO,CAAC,MAAM,WAAW,CAAC,CAAC;QAC/H,OAAO,IAAI,CAAC;IACd,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,WAAW,CAAC,IAAoB;QAI5C,MAAM,UAAU,GAAiD,EAAE,CAAC;QACpE,MAAM,eAAe,GAAyD,EAAE,CAAC;QAEjF,sBAAsB;QACtB,MAAM,WAAW,GAAG,IAAI,GAAG,EAA0B,CAAC;QACtD,KAAK,MAAM,IAAI,IAAI,IAAI,EAAE,CAAC;YACxB,MAAM,KAAK,GAAG,WAAW,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YAC9C,IAAI,KAAK,EAAE,CAAC;gBACV,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACnB,CAAC;iBAAM,CAAC;gBACN,WAAW,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC;YAC1C,CAAC;QACH,CAAC;QAED,WAAW;QACX,KAAK,MAAM,CAAC,SAAS,EAAE,KAAK,CAAC,IAAI,WAAW,EAAE,CAAC;YAC7C,MAAM,SAAS,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YACrC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;YAC5B,OAAO,CAAC,GAAG,CAAC,aAAa,SAAS,KAAK,SAAS,CAAC,QAAQ,IAAI,SAAS,CAAC,KAAK,GAAG,CAAC,CAAC;YACjF,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;YAE5B,MAAM,CAAC,IAAI,CAAC,uBAAuB,SAAS,KAAK,SAAS,CAAC,QAAQ,IAAI,SAAS,CAAC,KAAK,GAAG,CAAC,CAAC;YAC3F,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,WAAW,CAAC,IAAI,CAAC,SAAS,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC;YAE1F,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;gBACzB,OAAO,CAAC,GAAG,CAAC,QAAQ,IAAI,CAAC,QAAQ,CAAC,IAAI,KAAK,CAAC,CAAC;gBAC7C,IAAI,IAAI,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;oBAC9B,OAAO,CAAC,GAAG,CAAC,QAAQ,IAAI,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC,CAAC;gBACnD,CAAC;gBAED,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,IAAI,CAAC,QAAQ,CAAC,CAAC;gBAEnG,UAAU,CAAC,IAAI,CAAC;oBACd,KAAK,EAAE,IAAI,CAAC,KAAK;oBACjB,MAAM,EAAE;wBACN,QAAQ,EAAE,IAAI,CAAC,QAAQ,CAAC,IAAI;wBAC5B,KAAK,EAAE,SAAS;wBAChB,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC,IAAI;wBACxB,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;4BACnB,OAAO,EAAE,CAAC,CAAC,OAAO;4BAClB,OAAO,EAAE,CAAC,CAAC,OAAO;4BAClB,OAAO,EAAE,CAAC,CAAC,WAAW,EAAE,OAAO,IAAI,EAAE;4BACrC,SAAS,EAAE,CAAC,CAAC,WAAW,EAAE,SAAS;4BACnC,YAAY,EAAE,CAAC,CAAC,WAAW,EAAE,YAAY;4BACzC,KAAK,EAAE,CAAC,CAAC,KAAK;yBACf,CAAC,CAAC;qBACJ;iBACF,CAAC,CAAC;gBAEH,yBAAyB;gBACzB,MAAM,cAAc,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;gBACnD,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC9B,eAAe,CAAC,IAAI,CAAC;wBACnB,KAAK,EAAE,IAAI,CAAC,KAAK;wBACjB,OAAO,EAAE;4BACP,UAAU,EAAE,IAAI,CAAC,MAAM,CAAC,IAAI;4BAC5B,MAAM,EAAE,IAAI,CAAC,MAAM;4BACnB,IAAI,EAAE,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,WAAY,EAAE,CAAC,CAAC;yBACjE;qBACF,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;YAED,yBAAyB;YACzB,MAAM,CAAC,IAAI,CAAC,mBAAmB,SAAS,EAAE,CAAC,CAAC;YAC5C,MAAM,IAAI,CAAC,aAAa,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;YAC1C,OAAO,CAAC,GAAG,EAAE,CAAC;QAChB,CAAC;QAED,wBAAwB;QACxB,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QAC7C,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QAElD,OAAO;YACL,OAAO,EAAE,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;YACtC,kBAAkB,EAAE,eAAe,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC;SACxD,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,aAAa,CACzB,UAAkB,EAClB,MAAyB,EACzB,MAAgB,EAChB,QAAkB;QAElB,MAAM,CAAC,OAAO,CAAC,WAAW,IAAI,CAAC,WAAW,wBAAwB,UAAU,EAAE,CAAC,CAAC;QAEhF,MAAM,IAAI,GAA4F,EAAE,CAAC;QAEzG,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,WAAW,gBAAgB,UAAU,EAAE,CAAC,CAAC;YAE7E,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAC7B,IAAI,CAAC;gBACH,MAAM,cAAc,GAAG,MAAM,cAAc,CAAC,MAAM,EAAE,MAAM,EAAE,QAAQ,CAAC,KAAK,EAAE;oBAC1E,YAAY,EAAE;wBACZ,WAAW,EAAE,GAAG;wBAChB,SAAS,EAAE,IAAI;wBACf,GAAG,QAAQ,CAAC,YAAY;qBACzB;iBACF,CAAC,CAAC;gBACH,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;gBAEvC,wDAAwD;gBACxD,MAAM,MAAM,GAAgB;oBAC1B,OAAO,EAAE,cAAc,CAAC,MAAM;oBAC9B,SAAS,EAAE,cAAc,CAAC,QAAQ,EAAE,SAAgB;oBACpD,YAAY,EAAE,cAAc,CAAC,QAAQ,EAAE,YAAmB;oBAC1D,KAAK,EAAE,cAAc,CAAC,QAAQ,EAAE,KAAY;iBAC7C,CAAC;gBAEF,MAAM,CAAC,OAAO,CAAC,UAAU,UAAU,QAAQ,CAAC,GAAG,CAAC,cAAc,OAAO,KAAK,CAAC,CAAC;gBAE5E,4CAA4C;gBAC5C,MAAM,cAAc,GAAG,MAAM,CAAC,OAAO;qBAClC,OAAO,CAAC,8BAA8B,EAAE,EAAE,CAAC;qBAC3C,OAAO,CAAC,wBAAwB,EAAE,EAAE,CAAC,CAAC;gBACzC,MAAM,cAAc,GAAG,cAAc,CAAC,MAAM,GAAG,GAAG;oBAChD,CAAC,CAAC,cAAc,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,KAAK;oBAC1C,CAAC,CAAC,cAAc,CAAC;gBACnB,OAAO,CAAC,GAAG,CAAC,SAAS,UAAU,SAAS,CAAC,GAAG,CAAC,KAAK,OAAO,oBAAoB,MAAM,CAAC,YAAY,IAAI,SAAS,EAAE,CAAC,CAAC;gBACjH,IAAI,MAAM,CAAC,SAAS,IAAI,MAAM,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACpD,KAAK,MAAM,EAAE,IAAI,MAAM,CAAC,SAAS,EAAE,CAAC;wBAClC,OAAO,CAAC,GAAG,CAAC,sBAAsB,EAAE,CAAC,IAAI,IAAI,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;oBAChF,CAAC;gBACH,CAAC;gBACD,IAAI,cAAc,CAAC,IAAI,EAAE,EAAE,CAAC;oBAC1B,OAAO,CAAC,GAAG,CAAC,YAAY,cAAc,EAAE,CAAC,CAAC;gBAC5C,CAAC;gBAED,IAAI,CAAC,IAAI,CAAC;oBACR,OAAO,EAAE,IAAI;oBACb,OAAO;oBACP,WAAW,EAAE,MAAM;iBACpB,CAAC,CAAC;YACL,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;gBACvC,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBAC5E,MAAM,CAAC,KAAK,CAAC,UAAU,UAAU,QAAQ,CAAC,GAAG,CAAC,YAAY,OAAO,QAAQ,YAAY,EAAE,CAAC,CAAC;gBACzF,IAAI,CAAC,IAAI,CAAC;oBACR,OAAO,EAAE,KAAK;oBACd,OAAO;oBACP,KAAK,EAAE,YAAY;iBACpB,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,kBAAkB,CAC9B,kBAAuC;QAEvC,OAAO,CAAC,GAAG,EAAE,CAAC;QACd,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;QAC5B,OAAO,CAAC,GAAG,CAAC,qBAAqB,CAAC,CAAC;QACnC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;QAC5B,OAAO,CAAC,GAAG,EAAE,CAAC;QAEd,MAAM,eAAe,GAAG,IAAI,eAAe,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,cAAe,CAAC,IAAI,CAAC,CAAC;QACvF,MAAM,cAAc,GAAuB,EAAE,CAAC;QAE9C,2CAA2C;QAC3C,KAAK,MAAM,OAAO,IAAI,kBAAkB,EAAE,CAAC;YACzC,OAAO,CAAC,GAAG,CAAC,kBAAkB,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC;YACpD,OAAO,CAAC,GAAG,EAAE,CAAC;YAEd,KAAK,MAAM,SAAS,IAAI,IAAI,CAAC,UAAW,EAAE,CAAC;gBACzC,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,QAAQ,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;gBAClE,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAC9B,CAAC;QACH,CAAC;QAED,iCAAiC;QACjC,eAAe,CAAC,cAAc,CAAC,cAAc,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;IAClE,CAAC;CAEF"}
|
package/dist/types.d.ts
CHANGED
|
@@ -54,7 +54,7 @@ export interface ExperimentOptions {
|
|
|
54
54
|
export interface ModuleDefinition {
|
|
55
55
|
name: string;
|
|
56
56
|
description: string;
|
|
57
|
-
|
|
57
|
+
module: PromptModule<any>;
|
|
58
58
|
}
|
|
59
59
|
/**
|
|
60
60
|
* Evaluation context (common for both code and prompt evaluators)
|
package/dist/types.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,KAAK,EAAE,WAAW,EAAE,YAAY,EAAE,QAAQ,EAAE,MAAM,wBAAwB,CAAC;AAElF;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,qBAAqB;IACrB,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,+CAA+C;IAC/C,KAAK,EAAE,GAAG,CAAC;IACX,iGAAiG;IACjG,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC;IAClB,kEAAkE;IAClE,YAAY,CAAC,EAAE,OAAO,CAAC,YAAY,CAAC,CAAC;CACtC;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,OAAO,EAAE,OAAO,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,QAAQ,EAAE,CAAC;IACvB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,UAAU;IACzB,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,SAAS,EAAE,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;IACxB,WAAW,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACzD,OAAO,KAAK,EAAE,WAAW,EAAE,YAAY,EAAE,QAAQ,EAAE,MAAM,wBAAwB,CAAC;AAElF;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,qBAAqB;IACrB,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,+CAA+C;IAC/C,KAAK,EAAE,GAAG,CAAC;IACX,iGAAiG;IACjG,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC;IAClB,kEAAkE;IAClE,YAAY,CAAC,EAAE,OAAO,CAAC,YAAY,CAAC,CAAC;CACtC;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,OAAO,EAAE,OAAO,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,QAAQ,EAAE,CAAC;IACvB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,UAAU;IACzB,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,SAAS,EAAE,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;IACxB,WAAW,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,YAAY,CAAC,GAAG,CAAC,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,KAAK,CAAC;QACV,WAAW,EAAE,WAAW,CAAC;KAC1B,CAAC,CAAC;CACJ;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAC9B,GAAG,CAAC,EAAE,GAAG,CAAC;IACV,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb;;;;;;;;;OASG;IACH,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,CAAC,OAAO,EAAE,iBAAiB,KAAK,OAAO,CAAC,gBAAgB,CAAC,CAAC;CACrE;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb;;;;;;;;;OASG;IACH,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,YAAY,CAAC,iBAAiB,CAAC,CAAC;CACzC;AAED;;GAEG;AACH,MAAM,MAAM,kBAAkB,GAC1B;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,CAAA;CAAE,GACpD;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,YAAY,CAAC,iBAAiB,CAAC,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,CAAA;CAAE,GAC/E;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAE3C;;GAEG;AACH,MAAM,WAAW,yBAA0B,SAAQ,iBAAiB;IAClE,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAC3B,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,OAAO,CAAC;CACnB"}
|
|
@@ -7,16 +7,25 @@ models:
|
|
|
7
7
|
provider: "mlx"
|
|
8
8
|
capabilities: ["local", "tools"]
|
|
9
9
|
priority: 10
|
|
10
|
+
disabled: true
|
|
11
|
+
qwen3.5-27b:
|
|
12
|
+
model: "mlx-community/Qwen3.5-27B-4bit"
|
|
13
|
+
provider: "mlx"
|
|
14
|
+
capabilities: ["local", "tools"]
|
|
15
|
+
priority: 10
|
|
16
|
+
# disabled: true
|
|
10
17
|
gemma3-12b:
|
|
11
18
|
model: "mlx-community/gemma-3-12b-it-qat-4bit"
|
|
12
19
|
provider: "mlx"
|
|
13
20
|
capabilities: ["local", "tools"]
|
|
14
21
|
priority: 20
|
|
22
|
+
disabled: true
|
|
15
23
|
lfm2.5-jp:
|
|
16
24
|
model: LiquidAI/LFM2.5-1.2B-JP-MLX-8bit
|
|
17
25
|
provider: "mlx"
|
|
18
26
|
capabilities: ["local", "fast", "japanese"]
|
|
19
27
|
priority: 20
|
|
28
|
+
disabled: true
|
|
20
29
|
lfm2.5-instruct:
|
|
21
30
|
model: LiquidAI/LFM2.5-1.2B-Instruct-MLX-8bit
|
|
22
31
|
provider: "mlx"
|
|
@@ -26,80 +35,60 @@ models:
|
|
|
26
35
|
lfm2.5-thinking:
|
|
27
36
|
model: LiquidAI/LFM2.5-1.2B-Thinking-MLX-8bit
|
|
28
37
|
provider: "mlx"
|
|
29
|
-
capabilities: ["local", "
|
|
38
|
+
capabilities: ["local", "thinking", "japanese"]
|
|
30
39
|
priority: 20
|
|
31
40
|
disabled: true
|
|
32
41
|
|
|
33
42
|
drivers:
|
|
34
43
|
mlx: {}
|
|
35
44
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
45
|
+
# 共通定義
|
|
46
|
+
_shared:
|
|
47
|
+
tools: &tools_def
|
|
48
|
+
- name: get_weather
|
|
49
|
+
description: "指定された場所の現在の天気を取得する"
|
|
50
|
+
parameters:
|
|
51
|
+
type: object
|
|
52
|
+
properties:
|
|
53
|
+
location:
|
|
54
|
+
type: string
|
|
55
|
+
description: "天気を取得する場所(都市名)"
|
|
56
|
+
required:
|
|
57
|
+
- location
|
|
58
|
+
queryOptions:
|
|
59
|
+
weather: &tool_weather
|
|
49
60
|
temperature: 0.3
|
|
50
61
|
maxTokens: 512
|
|
51
|
-
tools:
|
|
52
|
-
- name: get_weather
|
|
53
|
-
description: "指定された場所の現在の天気を取得する"
|
|
54
|
-
parameters:
|
|
55
|
-
type: object
|
|
56
|
-
properties:
|
|
57
|
-
location:
|
|
58
|
-
type: string
|
|
59
|
-
description: "天気を取得する場所(都市名)"
|
|
60
|
-
required:
|
|
61
|
-
- location
|
|
62
|
+
tools: *tools_def
|
|
62
63
|
toolChoice: auto
|
|
63
|
-
|
|
64
|
-
- name: "[gemma3] ツール不要の質問"
|
|
65
|
-
description: "ツールを呼び出さずにテキストで回答することを期待"
|
|
66
|
-
models: ["gemma3-12b"]
|
|
67
|
-
input:
|
|
68
|
-
question: "1 + 1 は何ですか?"
|
|
69
|
-
queryOptions: &tool_math
|
|
64
|
+
math: &tool_math
|
|
70
65
|
temperature: 0.3
|
|
71
66
|
maxTokens: 1024
|
|
72
67
|
tools: *tools_def
|
|
73
68
|
toolChoice: auto
|
|
74
69
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
# input:
|
|
80
|
-
# question: "東京の天気を教えてください。"
|
|
81
|
-
# queryOptions: *tool_weather
|
|
82
|
-
|
|
83
|
-
# - name: "[qwen3] ツール不要の質問"
|
|
84
|
-
# description: "ツールを呼び出さずにテキストで回答することを期待"
|
|
85
|
-
# models: ["qwen3-4b"]
|
|
86
|
-
# input:
|
|
87
|
-
# question: "1 + 1 は何ですか?"
|
|
88
|
-
# queryOptions: *tool_math
|
|
70
|
+
modules:
|
|
71
|
+
- name: tools-test
|
|
72
|
+
path: ./tools-test-module.mjs
|
|
73
|
+
description: "ツール呼び出し実験用"
|
|
89
74
|
|
|
90
|
-
|
|
91
|
-
- name: "
|
|
75
|
+
testCases:
|
|
76
|
+
- name: "天気ツール呼び出し"
|
|
92
77
|
description: "get_weatherツールを呼び出すことを期待"
|
|
93
|
-
models: ["lfm2.5-instruct"]
|
|
94
78
|
input:
|
|
95
79
|
question: "東京の天気を教えてください。"
|
|
96
80
|
queryOptions: *tool_weather
|
|
97
81
|
|
|
98
|
-
- name: "
|
|
82
|
+
- name: "ツール不要の質問"
|
|
99
83
|
description: "ツールを呼び出さずにテキストで回答することを期待"
|
|
100
|
-
models: ["lfm2.5-instruct"]
|
|
101
84
|
input:
|
|
102
85
|
question: "1 + 1 は何ですか?"
|
|
103
86
|
queryOptions: *tool_math
|
|
104
87
|
|
|
105
|
-
evaluators:
|
|
88
|
+
evaluators:
|
|
89
|
+
- name: llm-requirement-fulfillment
|
|
90
|
+
|
|
91
|
+
evaluation:
|
|
92
|
+
enabled: true
|
|
93
|
+
provider: mlx
|
|
94
|
+
model: lfm2.5-thinking
|
|
@@ -2,28 +2,22 @@
|
|
|
2
2
|
* Tools実験用モジュール
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
{
|
|
22
|
-
type: 'message',
|
|
23
|
-
role: 'user',
|
|
24
|
-
content: context.question || 'Hello',
|
|
25
|
-
},
|
|
26
|
-
],
|
|
27
|
-
});
|
|
28
|
-
},
|
|
5
|
+
const module = {
|
|
6
|
+
objective: [
|
|
7
|
+
'- あなたは利用者からの質問に答えるアシスタントです。',
|
|
8
|
+
],
|
|
9
|
+
instructions: [
|
|
10
|
+
'- 質問の内容に応じて、適切なツールを使ってください。',
|
|
11
|
+
' - ツールの結果が返ってくるまで、推測で答えないでください。',
|
|
12
|
+
'- 必要がない場合は通常の応答を返します。',
|
|
13
|
+
],
|
|
14
|
+
messages: [
|
|
15
|
+
(ctx) => ({
|
|
16
|
+
type: 'message',
|
|
17
|
+
role: 'user',
|
|
18
|
+
content: ctx.question || 'Hello',
|
|
19
|
+
}),
|
|
20
|
+
],
|
|
29
21
|
};
|
|
22
|
+
|
|
23
|
+
export default module;
|
|
@@ -2,28 +2,24 @@
|
|
|
2
2
|
* Tools実験用モジュール
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
|
-
import {
|
|
5
|
+
import type { PromptModule } from '@modular-prompt/core';
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
role: 'user' as const,
|
|
24
|
-
content: context.question || 'Hello',
|
|
25
|
-
},
|
|
26
|
-
],
|
|
27
|
-
});
|
|
28
|
-
},
|
|
7
|
+
const module: PromptModule<{ question?: string }> = {
|
|
8
|
+
objective: [
|
|
9
|
+
'あなたはツールを使って質問に答えるアシスタントです。',
|
|
10
|
+
'必要に応じてツールを呼び出してください。',
|
|
11
|
+
],
|
|
12
|
+
instructions: [
|
|
13
|
+
'質問の内容に応じて、適切なツールを使ってください。',
|
|
14
|
+
'ツールの結果が返ってくるまで、推測で答えないでください。',
|
|
15
|
+
],
|
|
16
|
+
messages: [
|
|
17
|
+
(ctx) => ({
|
|
18
|
+
type: 'message' as const,
|
|
19
|
+
role: 'user' as const,
|
|
20
|
+
content: ctx.question || 'Hello',
|
|
21
|
+
}),
|
|
22
|
+
],
|
|
29
23
|
};
|
|
24
|
+
|
|
25
|
+
export default module;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@modular-prompt/experiment",
|
|
3
|
-
"version": "0.3.
|
|
3
|
+
"version": "0.3.3",
|
|
4
4
|
"description": "Experiment framework for comparing and evaluating prompt modules",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -25,9 +25,10 @@
|
|
|
25
25
|
"jiti": "^2.4.2",
|
|
26
26
|
"yaml": "^2.3.4",
|
|
27
27
|
"zod": "^3.22.4",
|
|
28
|
-
"@modular-prompt/
|
|
29
|
-
"@modular-prompt/
|
|
30
|
-
"@modular-prompt/
|
|
28
|
+
"@modular-prompt/process": "0.1.25",
|
|
29
|
+
"@modular-prompt/utils": "0.2.4",
|
|
30
|
+
"@modular-prompt/driver": "0.9.0",
|
|
31
|
+
"@modular-prompt/core": "0.1.13"
|
|
31
32
|
},
|
|
32
33
|
"devDependencies": {
|
|
33
34
|
"@eslint/js": "^9.34.0",
|
|
@@ -133,20 +133,23 @@ evaluation:
|
|
|
133
133
|
|
|
134
134
|
## モジュール定義
|
|
135
135
|
|
|
136
|
-
|
|
136
|
+
テスト対象のモジュールファイルでは、PromptModule を直接 default export する:
|
|
137
137
|
|
|
138
138
|
```typescript
|
|
139
|
-
import {
|
|
140
|
-
import { myPromptModule } from './prompts.js';
|
|
139
|
+
import type { PromptModule } from '@modular-prompt/core';
|
|
141
140
|
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
141
|
+
const module: PromptModule<{ query: string }> = {
|
|
142
|
+
objective: ['ユーザーの質問に回答する'],
|
|
143
|
+
instructions: [
|
|
144
|
+
'- 正確で分かりやすい説明を心がける',
|
|
145
|
+
(ctx) => `質問: ${ctx.query}`,
|
|
146
|
+
],
|
|
146
147
|
};
|
|
148
|
+
|
|
149
|
+
export default module;
|
|
147
150
|
```
|
|
148
151
|
|
|
149
|
-
`
|
|
152
|
+
テストケースの `input` は実行時にコンテキストとして注入される。runner 内部で `defaultProcess` を使用してコンパイル・実行が行われる。
|
|
150
153
|
|
|
151
154
|
## 評価器
|
|
152
155
|
|
|
@@ -283,10 +286,28 @@ await driverManager.cleanup();
|
|
|
283
286
|
|
|
284
287
|
## 実験フロー
|
|
285
288
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
289
|
+
実験は3つのフェーズに分けて実行される:
|
|
290
|
+
|
|
291
|
+
### Phase 1: テスト計画の生成 (buildTestPlan)
|
|
292
|
+
- テストケース × モデル × モジュール の全組み合わせを展開
|
|
293
|
+
- 各組み合わせに順序番号(order)を付与して計画リストを作成
|
|
294
|
+
- コンパイル済みプロンプトを事前生成(ログ・評価用)
|
|
295
|
+
|
|
296
|
+
### Phase 2: 実行フェーズ (executePlan)
|
|
297
|
+
- **モデルごとにグループ化して実行**(モデル切り替えコストの最小化)
|
|
298
|
+
- 各モデルグループで:
|
|
299
|
+
- ドライバーを作成
|
|
300
|
+
- テストケース × モジュール の組み合わせを実行(`defaultProcess` を使用)
|
|
301
|
+
- モデルのテスト完了後にドライバーをクローズ
|
|
302
|
+
- **実行完了後、元の定義順にソート** (retire)
|
|
303
|
+
|
|
304
|
+
### Phase 3: 評価フェーズ (runEvaluationPhase)
|
|
305
|
+
- 評価器が有効な場合のみ実行
|
|
306
|
+
- 各モジュールの出力を評価器で採点
|
|
307
|
+
- 評価結果を表示
|
|
308
|
+
|
|
309
|
+
### 設計上の特徴
|
|
310
|
+
|
|
311
|
+
**アウトオブオーダー実行**: モデルごとにグループ化して実行することで、ローカルLLM(MLX等)のモデル切り替えコストを削減。実行後は元の定義順にソートして結果を返す。
|
|
291
312
|
|
|
292
|
-
DriverManager
|
|
313
|
+
**ドライバーキャッシング**: DriverManagerがモデル名をキーにドライバーをキャッシュ。同じモデルであればドライバーを再利用し、異なるモデルに切り替わると前のドライバーをcloseする。
|