@nahisaho/katashiro-evaluation 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/BenchmarkSuite.d.ts +63 -0
- package/dist/BenchmarkSuite.d.ts.map +1 -0
- package/dist/BenchmarkSuite.js +152 -0
- package/dist/BenchmarkSuite.js.map +1 -0
- package/dist/DatasetManager.d.ts +68 -0
- package/dist/DatasetManager.d.ts.map +1 -0
- package/dist/DatasetManager.js +161 -0
- package/dist/DatasetManager.js.map +1 -0
- package/dist/ExperimentRunner.d.ts +51 -0
- package/dist/ExperimentRunner.d.ts.map +1 -0
- package/dist/ExperimentRunner.js +170 -0
- package/dist/ExperimentRunner.js.map +1 -0
- package/dist/evaluators/CompositeEvaluator.d.ts +66 -0
- package/dist/evaluators/CompositeEvaluator.d.ts.map +1 -0
- package/dist/evaluators/CompositeEvaluator.js +122 -0
- package/dist/evaluators/CompositeEvaluator.js.map +1 -0
- package/dist/evaluators/HeuristicEvaluator.d.ts +82 -0
- package/dist/evaluators/HeuristicEvaluator.d.ts.map +1 -0
- package/dist/evaluators/HeuristicEvaluator.js +233 -0
- package/dist/evaluators/HeuristicEvaluator.js.map +1 -0
- package/dist/evaluators/LLMJudgeEvaluator.d.ts +93 -0
- package/dist/evaluators/LLMJudgeEvaluator.d.ts.map +1 -0
- package/dist/evaluators/LLMJudgeEvaluator.js +296 -0
- package/dist/evaluators/LLMJudgeEvaluator.js.map +1 -0
- package/dist/evaluators/RAGASEvaluators.d.ts +128 -0
- package/dist/evaluators/RAGASEvaluators.d.ts.map +1 -0
- package/dist/evaluators/RAGASEvaluators.js +521 -0
- package/dist/evaluators/RAGASEvaluators.js.map +1 -0
- package/dist/evaluators/index.d.ts +13 -0
- package/dist/evaluators/index.d.ts.map +1 -0
- package/dist/evaluators/index.js +12 -0
- package/dist/evaluators/index.js.map +1 -0
- package/dist/index.d.ts +20 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +24 -0
- package/dist/index.js.map +1 -0
- package/dist/reporting/EvaluationReporter.d.ts +135 -0
- package/dist/reporting/EvaluationReporter.d.ts.map +1 -0
- package/dist/reporting/EvaluationReporter.js +285 -0
- package/dist/reporting/EvaluationReporter.js.map +1 -0
- package/dist/reporting/index.d.ts +8 -0
- package/dist/reporting/index.d.ts.map +1 -0
- package/dist/reporting/index.js +8 -0
- package/dist/reporting/index.js.map +1 -0
- package/dist/reporting/templates.d.ts +91 -0
- package/dist/reporting/templates.d.ts.map +1 -0
- package/dist/reporting/templates.js +150 -0
- package/dist/reporting/templates.js.map +1 -0
- package/dist/types.d.ts +408 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +8 -0
- package/dist/types.js.map +1 -0
- package/package.json +47 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"templates.d.ts","sourceRoot":"","sources":["../../src/reporting/templates.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AACpD,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAEjE;;GAEG;AACH,MAAM,MAAM,gBAAgB,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,EAAE,QAAQ,EAAE,IAAI,GAAG,IAAI,KAAK,MAAM,CAAC;AAE7E;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC,KAAK,EAAE,iBAAiB,CAAC;IACzB,aAAa,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,qBAAqB;IACpC,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,gBAAgB,CAAC;IACzB,MAAM,EAAE,OAAO,CAAC;CACjB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,MAAM,EAAE,gBAAgB,CAAC,kBAAkB,CAAC,CAAC;IAC7C,OAAO,EAAE,gBAAgB,CAAC,mBAAmB,CAAC,CAAC;IAC/C,SAAS,EAAE,gBAAgB,CAAC,qBAAqB,CAAC,CAAC;IACnD,MAAM,EAAE,gBAAgB,CAAC;QAAE,SAAS,EAAE,MAAM,CAAC;QAAC,UAAU,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;CACtE;AAED;;GAEG;AACH,eAAO,MAAM,qBAAqB,EAAE,gBAAgB,CAAC,kBAAkB,CAoBtE,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,sBAAsB,EAAE,gBAAgB,CAAC,mBAAmB,CAiCxE,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,wBAAwB,EAAE,gBAAgB,CAAC,qBAAqB,CAM5E,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,qBAAqB,EAAE,gBAAgB,CAAC;IACnD,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB,CAqBA,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,gBAAgB,EAAE,kBAK9B,CAAC;AAEF;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,SAAK,GAAG,MAAM,CAIrE;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAChC,KAAK,EAAE,MAAM,EACb,UAAU;;;CAA2B,GACpC,MAAM,CAQR;AAED;;GAEG;AACH,wBAAgB,uBAAuB,CACrC,QAAQ,EAAE,iBAAiB,EAC3B,OAAO,EAAE,iBAAiB,EAC1B,QAAQ,GAAE,IAAI,GAAG,IAAW,GAC3B,MAAM,CA4BR;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAClC,OAAO,EAAE,gBAAgB,EAAE,EAC3B,OAAO,SAAK,GACX,MAAM,CA6BR"}
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown Templates for Evaluation Reports
|
|
3
|
+
*
|
|
4
|
+
* レポートテンプレートのカスタマイズ用モジュール
|
|
5
|
+
*
|
|
6
|
+
* @requirement REQ-EVAL-103
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Default header template.
 *
 * Renders the report title as a level-1 Markdown heading, followed by the
 * optional description and the optional dataset name, each separated from
 * the previous section by one blank line.
 *
 * @param {object} data - Header data; reads `title`, `description` and `datasetName`.
 * @param {string} language - `'ja'` selects the Japanese dataset label; any other value selects English.
 * @returns {string} Markdown text for the header.
 */
export const defaultHeaderTemplate = (data, language) => {
  const { title, description, datasetName } = data;
  const sections = [`# ${title}`];
  if (description) {
    sections.push(description);
  }
  if (datasetName) {
    const label = language === 'ja' ? 'データセット' : 'Dataset';
    sections.push(`**${label}**: ${datasetName}`);
  }
  // Sections are separated by exactly one blank line.
  return sections.join('\n\n');
};
|
|
23
|
+
/**
 * Default summary template.
 *
 * Renders a "Summary" section: a text progress bar for the pass rate with
 * its percentage, followed by a two-column Markdown table listing the total
 * count, passed / failed counts and the average score.
 *
 * @param {object} data - Summary data; reads `data.stats` (`passRate`, `total`, `passed`, `failed`, `avgScore`).
 * @param {string} language - `'ja'` selects Japanese labels; any other value selects English.
 * @returns {string} Markdown text for the summary section.
 */
export const defaultSummaryTemplate = (data, language) => {
  const { stats } = data;
  const labels = language === 'ja'
    ? {
        heading: '## 📊 サマリー',
        columns: '| 指標 | 値 |',
        total: '総件数',
        passFail: '合格 / 不合格',
        avgScore: '平均スコア',
      }
    : {
        heading: '## 📊 Summary',
        columns: '| Metric | Value |',
        total: 'Total',
        passFail: 'Passed / Failed',
        avgScore: 'Avg Score',
      };
  // Build the progress bar first, then the percentage shown next to it.
  const progressBar = generateProgressBar(stats.passRate);
  const percent = (stats.passRate * 100).toFixed(1);
  return [
    labels.heading,
    '',
    `${progressBar} ${percent}%`,
    '',
    labels.columns,
    '|------|-----|',
    `| ${labels.total} | ${stats.total} |`,
    `| ${labels.passFail} | ${stats.passed} / ${stats.failed} |`,
    `| ${labels.avgScore} | ${stats.avgScore.toFixed(3)} |`,
  ].join('\n');
};
|
|
47
|
+
/**
 * Default result-row template.
 *
 * Renders one evaluation result as a single Markdown table row:
 * index, evaluator name, raw score, normalized score and a pass/fail icon.
 *
 * @param {object} data - Row data; reads `index`, `result` (`evaluator`, `score`, `normalizedScore`) and `passed`.
 * @returns {string} One Markdown table row.
 */
export const defaultResultRowTemplate = (data) => {
  const { index, result, passed } = data;
  const statusIcon = passed ? '✅' : '❌';
  // The evaluator name is free-form text: a literal "|" would open an extra
  // table cell and a line break would end the row, so neutralize both.
  const evaluatorCell = String(result.evaluator)
    .replace(/\r?\n/g, ' ')
    .replace(/\|/g, '\\|');
  return `| ${index} | ${evaluatorCell} | ${result.score.toFixed(3)} | ${result.normalizedScore.toFixed(3)} | ${statusIcon} |`;
};
|
|
55
|
+
/**
 * Default footer template.
 *
 * Renders a horizontal rule, the generation timestamp, the run duration
 * (only when `durationMs` is not `undefined`) and a fixed attribution line.
 *
 * @param {object} data - Footer data; reads `timestamp` and `durationMs`.
 * @param {string} language - `'ja'` selects Japanese labels; any other value selects English.
 * @returns {string} Markdown text for the footer.
 */
export const defaultFooterTemplate = (data, language) => {
  const japanese = language === 'ja';
  const { timestamp, durationMs } = data;
  const details = [
    japanese ? `_生成日時: ${timestamp}_` : `_Generated at: ${timestamp}_`,
  ];
  // A duration of 0 is still printed; only `undefined` suppresses the line.
  if (durationMs !== undefined) {
    details.push(japanese ? `_実行時間: ${durationMs}ms_` : `_Duration: ${durationMs}ms_`);
  }
  return ['---', '', ...details, '', '_Powered by KATASHIRO Evaluation Framework_'].join('\n');
};
|
|
72
|
+
/**
 * Default template collection.
 *
 * Bundles the four default templates under the keys `header`, `summary`,
 * `resultRow` and `footer`.
 */
export const defaultTemplates = {
  header: defaultHeaderTemplate,
  summary: defaultSummaryTemplate,
  resultRow: defaultResultRowTemplate,
  footer: defaultFooterTemplate,
};
|
|
81
|
+
/**
 * Generate a text progress bar such as `[██████░░░░]`.
 *
 * The ratio is clamped to the range 0..1 before rendering, so out-of-range
 * values produce a fully empty or fully filled bar instead of a negative
 * `String.prototype.repeat` count (which throws a RangeError). A ratio that
 * is not a number renders as an empty bar.
 *
 * @param {number} ratio - Completed fraction; expected in the range 0..1.
 * @param {number} [width=20] - Total number of cells in the bar.
 * @returns {string} The bar wrapped in square brackets.
 */
export function generateProgressBar(ratio, width = 20) {
  const value = Number(ratio);
  const clamped = Number.isNaN(value) ? 0 : Math.min(Math.max(value, 0), 1);
  const filled = Math.round(clamped * width);
  const empty = width - filled;
  return `[${'█'.repeat(filled)}${'░'.repeat(empty)}]`;
}
|
|
89
|
+
/**
 * Generate a Markdown score badge (shields.io static badge image).
 *
 * The score is shown as a whole-number percentage and the badge color is
 * chosen from the thresholds: `brightgreen` at or above `good`, `yellow` at
 * or above `fair`, otherwise `red`.
 *
 * NOTE(review): the badge prefix was lost in this copy of the file (the
 * template literals began at `.toFixed(0)}`). It is reconstructed here from
 * the source-map column offsets, which leave room for exactly
 * `![Score](https://img.shields.io/badge/Score-` — confirm the label casing
 * against src/reporting/templates.ts.
 *
 * @param {number} score - Score as a fraction (0.85 renders as 85%).
 * @param {{good?: number, fair?: number}} [thresholds] - Color thresholds; a missing key falls back to 0.8 / 0.5.
 * @returns {string} Markdown image markup for the badge.
 */
export function generateScoreBadge(score, thresholds = { good: 0.8, fair: 0.5 }) {
  // Tolerate partially specified thresholds instead of comparing against undefined.
  const { good = 0.8, fair = 0.5 } = thresholds || {};
  const percent = (score * 100).toFixed(0);
  let color = 'red';
  if (score >= good) {
    color = 'brightgreen';
  } else if (score >= fair) {
    color = 'yellow';
  }
  // "%25" is the URL-encoded percent sign shown after the number.
  return `![Score](https://img.shields.io/badge/Score-${percent}%25-${color})`;
}
|
|
103
|
+
/**
 * Generate a Markdown comparison table between a baseline and a current run.
 *
 * The table has one row for the pass rate (shown as percentages, with the
 * difference in percentage points) and one row for the average score. Each
 * difference carries a trend icon: 📈 for an increase, 📉 for a decrease and
 * ➡️ for no change.
 *
 * @param {object} baseline - Baseline statistics; reads `passRate` and `avgScore`.
 * @param {object} current - Current statistics; reads `passRate` and `avgScore`.
 * @param {string} [language='en'] - `'ja'` selects Japanese labels; any other value selects English.
 * @returns {string} Markdown text for the comparison section.
 */
export function generateComparisonTable(baseline, current, language = 'en') {
  const isJa = language === 'ja';

  // Format a signed difference with three decimals, an optional unit and a
  // trend icon. The trend is decided on the ROUNDED value so floating-point
  // noise (e.g. 0.1 + 0.2 - 0.3) is reported as "no change" rather than as
  // "+0.000 📈". The unit is placed directly after the number, before the icon.
  const formatDiff = (diff, unit = '') => {
    const rounded = diff.toFixed(3);
    const value = Number(rounded);
    if (value > 0) {
      return `+${rounded}${unit} 📈`;
    }
    if (value < 0) {
      return `${rounded}${unit} 📉`;
    }
    return `0.000${unit} ➡️`;
  };

  const passRateDiff = (current.passRate - baseline.passRate) * 100;
  const avgScoreDiff = current.avgScore - baseline.avgScore;

  return [
    isJa ? '## 📈 比較' : '## 📈 Comparison',
    '',
    isJa
      ? '| 指標 | ベースライン | 今回 | 差分 |'
      : '| Metric | Baseline | Current | Diff |',
    '|------|------------|-------|------|',
    `| ${isJa ? '合格率' : 'Pass Rate'} | ${(baseline.passRate * 100).toFixed(1)}% | ${(current.passRate * 100).toFixed(1)}% | ${formatDiff(passRateDiff, '%')} |`,
    `| ${isJa ? '平均スコア' : 'Avg Score'} | ${baseline.avgScore.toFixed(3)} | ${current.avgScore.toFixed(3)} | ${formatDiff(avgScoreDiff)} |`,
  ].join('\n');
}
|
|
126
|
+
/**
 * Generate a text-based score distribution chart ("heatmap").
 *
 * Normalized scores are grouped into `buckets` equal-width ranges and
 * rendered, highest range first, as horizontal bars inside a fenced code
 * block. The fullest bucket gets a bar of 20 cells; the others are scaled
 * relative to it.
 *
 * Scores below the lowest range are counted in the first bucket and scores
 * above the highest range in the last bucket. Scores that are not a number
 * are skipped.
 *
 * @param {Array<object>} results - Evaluation results; reads `normalizedScore` from each.
 * @param {number} [buckets=10] - Number of ranges.
 * @returns {string} Markdown fenced block containing the chart.
 */
export function generateScoreHeatmap(results, buckets = 10) {
  const BAR_WIDTH = 20;
  const lines = [];

  // Count how many scores fall into each range.
  const distribution = new Array(buckets).fill(0);
  for (const result of results) {
    const scaled = Math.floor(result.normalizedScore * buckets);
    if (Number.isNaN(scaled)) {
      continue; // a missing or NaN score cannot be placed in any range
    }
    // Clamp on both sides so the index always stays inside the array.
    const bucket = Math.min(Math.max(scaled, 0), buckets - 1);
    distribution[bucket]++;
  }

  const max = Math.max(...distribution);
  lines.push('```');
  lines.push('Score Distribution:');
  lines.push('');
  for (let i = buckets - 1; i >= 0; i--) {
    const label = `${((i / buckets) * 100).toFixed(0).padStart(3)}%-${(((i + 1) / buckets) * 100).toFixed(0).padStart(3)}%`;
    const count = distribution[i];
    // With no counted scores, max is 0; avoid dividing by it.
    const barLength = max > 0 ? Math.round((count / max) * BAR_WIDTH) : 0;
    lines.push(`${label} | ${'█'.repeat(barLength)} (${count})`);
  }
  lines.push('```');
  return lines.join('\n');
}
|
|
150
|
+
//# sourceMappingURL=templates.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"templates.js","sourceRoot":"","sources":["../../src/reporting/templates.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AA8CH;;GAEG;AACH,MAAM,CAAC,MAAM,qBAAqB,GAAyC,CACzE,IAAI,EACJ,QAAQ,EACR,EAAE;IACF,MAAM,KAAK,GAAa,CAAC,KAAK,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC;IAE5C,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;QACrB,KAAK,CAAC,IAAI,CAAC,EAAE,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;IACnC,CAAC;IAED,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;QACrB,KAAK,CAAC,IAAI,CACR,EAAE,EACF,QAAQ,KAAK,IAAI;YACf,CAAC,CAAC,eAAe,IAAI,CAAC,WAAW,EAAE;YACnC,CAAC,CAAC,gBAAgB,IAAI,CAAC,WAAW,EAAE,CACvC,CAAC;IACJ,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC,CAAC;AAEF;;GAEG;AACH,MAAM,CAAC,MAAM,sBAAsB,GAA0C,CAC3E,IAAI,EACJ,QAAQ,EACR,EAAE;IACF,MAAM,EAAE,KAAK,EAAE,GAAG,IAAI,CAAC;IACvB,MAAM,IAAI,GAAG,QAAQ,KAAK,IAAI,CAAC;IAC/B,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC;IAClD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,YAAY;IACZ,MAAM,WAAW,GAAG,mBAAmB,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IACxD,KAAK,CAAC,IAAI,CAAC,GAAG,WAAW,IAAI,CAAC,KAAK,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;IACnE,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC;IACvD,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;IAC7B,KAAK,CAAC,IAAI,CACR,IAAI,CAAC,CAAC,CAAC,WAAW,KAAK,CAAC,KAAK,IAAI,CAAC,CAAC,CAAC,aAAa,KAAK,CAAC,KAAK,IAAI,CACjE,CAAC;IACF,KAAK,CAAC,IAAI,CACR,IAAI;QACF,CAAC,CAAC,gBAAgB,KAAK,CAAC,MAAM,MAAM,KAAK,CAAC,MAAM,IAAI;QACpD,CAAC,CAAC,uBAAuB,KAAK,CAAC,MAAM,MAAM,KAAK,CAAC,MAAM,IAAI,CAC9D,CAAC;IACF,KAAK,CAAC,IAAI,CACR,IAAI;QACF,CAAC,CAAC,aAAa,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;QAC5C,CAAC,CAAC,iBAAiB,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CACnD,CAAC;IAEF,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC,CAAC;AAEF;;GAEG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAA4C,CAC/E,IAAI,EACJ,EAAE;IACF,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,IAAI,CAAC;IACvC,MAAM,UAAU,GAAG,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GA
AG,CAAC;IACtC,OAAO,KAAK,KAAK,MAAM,MAAM,CAAC,SAAS,MAAM,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,eAAe,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,UAAU,IAAI,CAAC;AAClI,CAAC,CAAC;AAEF;;GAEG;AACH,MAAM,CAAC,MAAM,qBAAqB,GAG7B,CAAC,IAAI,EAAE,QAAQ,EAAE,EAAE;IACtB,MAAM,IAAI,GAAG,QAAQ,KAAK,IAAI,CAAC;IAC/B,MAAM,KAAK,GAAG,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IAE1B,KAAK,CAAC,IAAI,CACR,IAAI;QACF,CAAC,CAAC,UAAU,IAAI,CAAC,SAAS,GAAG;QAC7B,CAAC,CAAC,kBAAkB,IAAI,CAAC,SAAS,GAAG,CACxC,CAAC;IAEF,IAAI,IAAI,CAAC,UAAU,KAAK,SAAS,EAAE,CAAC;QAClC,KAAK,CAAC,IAAI,CACR,IAAI;YACF,CAAC,CAAC,UAAU,IAAI,CAAC,UAAU,KAAK;YAChC,CAAC,CAAC,cAAc,IAAI,CAAC,UAAU,KAAK,CACvC,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,EAAE,EAAE,6CAA6C,CAAC,CAAC;IAE9D,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC,CAAC;AAEF;;GAEG;AACH,MAAM,CAAC,MAAM,gBAAgB,GAAuB;IAClD,MAAM,EAAE,qBAAqB;IAC7B,OAAO,EAAE,sBAAsB;IAC/B,SAAS,EAAE,wBAAwB;IACnC,MAAM,EAAE,qBAAqB;CAC9B,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,mBAAmB,CAAC,KAAa,EAAE,KAAK,GAAG,EAAE;IAC3D,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,KAAK,CAAC,CAAC;IACzC,MAAM,KAAK,GAAG,KAAK,GAAG,MAAM,CAAC;IAC7B,OAAO,IAAI,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC;AACvD,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAChC,KAAa,EACb,UAAU,GAAG,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,EAAE,GAAG,EAAE;IAErC,IAAI,KAAK,IAAI,UAAU,CAAC,IAAI,EAAE,CAAC;QAC7B,OAAO,+CAA+C,CAAC,KAAK,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,kBAAkB,CAAC;IACnG,CAAC;SAAM,IAAI,KAAK,IAAI,UAAU,CAAC,IAAI,EAAE,CAAC;QACpC,OAAO,+CAA+C,CAAC,KAAK,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,aAAa,CAAC;IAC9F,CAAC;SAAM,CAAC;QACN,OAAO,+CAA+C,CAAC,KAAK,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,UAAU,CAAC;IAC3F,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,uBAAuB,CACrC,QAA2B,EAC3B,OAA0B,EAC1B,WAAwB,IAAI;IAE5B,MAAM,IAAI,GAAG,QAAQ,KAAK,IAAI,CAAC;IAC/B,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC;IACnD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,KAAK,CAAC,IAAI,CACR,IAAI;QACF,CAAC,CAAC,2BAA2B;QAC7B,CAAC,CAAC,wC
AAwC,CAC7C,CAAC;IACF,KAAK,CAAC,IAAI,CAAC,sCAAsC,CAAC,CAAC;IAEnD,MAAM,UAAU,GAAG,CAAC,IAAY,EAAE,EAAE;QAClC,IAAI,IAAI,GAAG,CAAC;YAAE,OAAO,IAAI,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC;QAC9C,IAAI,IAAI,GAAG,CAAC;YAAE,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC;QAC7C,OAAO,UAAU,CAAC;IACpB,CAAC,CAAC;IAEF,KAAK,CAAC,IAAI,CACR,KAAK,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,WAAW,MAAM,CAAC,QAAQ,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,QAAQ,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,UAAU,CAAC,CAAC,OAAO,CAAC,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAC,GAAG,GAAG,CAAC,KAAK,CACxL,CAAC;IACF,KAAK,CAAC,IAAI,CACR,KAAK,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,WAAW,MAAM,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,UAAU,CAAC,OAAO,CAAC,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAC/J,CAAC;IAEF,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,oBAAoB,CAClC,OAA2B,EAC3B,OAAO,GAAG,EAAE;IAEZ,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,WAAW;IACX,MAAM,YAAY,GAAG,IAAI,KAAK,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAChD,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CACrB,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,eAAe,GAAG,OAAO,CAAC,EAC5C,OAAO,GAAG,CAAC,CACZ,CAAC;QACF,YAAY,CAAC,MAAM,CAAC,EAAE,CAAC;IACzB,CAAC;IAED,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,YAAY,CAAC,CAAC;IAEtC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAClB,KAAK,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;IAClC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,KAAK,IAAI,CAAC,GAAG,OAAO,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,GAAG,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,OAAO,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC;QACxH,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC;QACjE,MAAM,KAAK,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;QAC9B,KAAK,CAAC,IAAI,CAAC,GAA
G,KAAK,MAAM,GAAG,KAAK,KAAK,GAAG,CAAC,CAAC;IAC7C,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAElB,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC"}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,408 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation Types
|
|
3
|
+
*
|
|
4
|
+
* @requirement REQ-EVAL-001, REQ-EVAL-002, REQ-EVAL-003, REQ-EVAL-004, REQ-EVAL-005
|
|
5
|
+
* @design DES-KATASHIRO-003-EVAL §3
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Evaluation input.
 */
export interface EvaluationInput {
    /** Output to be evaluated. */
    output: string;
    /** Input (context). */
    input?: string;
    /** Expected output (for comparison). */
    expected?: string;
    /** Additional context. */
    context?: Record<string, unknown>;
}
|
|
20
|
+
/**
 * Evaluation result.
 */
export interface EvaluationResult {
    /** Evaluator name. */
    evaluator: string;
    /** Score. */
    score: number;
    /** Normalized score (0-1). */
    normalizedScore: number;
    /** Pass / fail. */
    passed?: boolean;
    /** Reasoning behind the score. */
    reasoning: string;
    /** Raw scores (when evaluated over multiple attempts). */
    rawScores?: Array<number>;
    /** Metadata. */
    metadata?: EvaluationMetadata;
}
|
|
39
|
+
/**
 * Evaluation metadata.
 */
export interface EvaluationMetadata {
    /** Optimal length (LengthEvaluator). */
    optimalLength?: number;
    /** Keywords that were found (KeywordEvaluator). */
    foundKeywords?: Array<string>;
    /** Keywords that were missing (KeywordEvaluator). */
    missingKeywords?: Array<string>;
    /** Fields that were missing (JsonStructureEvaluator). */
    missingFields?: Array<string>;
    /** Component scores (CompositeEvaluator). */
    componentScores?: {
        evaluator: string;
        score: number;
    }[];
    /** Any other metadata. */
    [key: string]: unknown;
}
|
|
59
|
+
/**
 * Evaluator interface.
 */
export interface Evaluator {
    /** Evaluator name. */
    readonly name: string;
    /** Run the evaluation. */
    evaluate(input: EvaluationInput): Promise<EvaluationResult>;
}
|
|
68
|
+
/**
 * Evaluation scale.
 */
export interface EvaluationScale {
    min: number;
    max: number;
    descriptions?: Record<number, string>;
}
|
|
76
|
+
/**
 * Dataset.
 */
export interface Dataset {
    /** Dataset ID. */
    id: string;
    /** Name. */
    name: string;
    /** Description. */
    description?: string;
    /** Number of data items. */
    size: number;
    /** Tags. */
    tags?: Array<string>;
    /** Creation date/time. */
    createdAt: string;
    /** Last-update date/time. */
    updatedAt: string;
}
|
|
95
|
+
/**
 * Dataset item.
 */
export interface DatasetItem {
    /** Item ID. */
    id: string;
    /** Input. */
    input: string;
    /** Expected output. */
    expected?: string;
    /** Metadata. */
    metadata?: Record<string, unknown>;
}
|
|
108
|
+
/**
 * Experiment configuration.
 */
export interface ExperimentConfig {
    /** Experiment name. */
    name: string;
    /** Description. */
    description?: string;
    /** Dataset ID. */
    datasetId: string;
    /** Evaluators to use. */
    evaluators: Array<string>;
    /** Tags. */
    tags?: Array<string>;
}
|
|
123
|
+
/**
 * Experiment result.
 */
export interface ExperimentResult {
    /** Experiment ID. */
    id: string;
    /** Experiment name. */
    name: string;
    /** Execution date/time. */
    timestamp: string;
    /** Dataset that was used. */
    datasetId: string;
    /** Summary of the evaluation results. */
    summary: ExperimentSummary;
    /** Detailed results. */
    details: Array<ExperimentDetailResult>;
    /** Execution time (milliseconds). */
    durationMs: number;
    /** Metadata. */
    metadata?: Record<string, unknown>;
}
|
|
144
|
+
/**
 * Experiment result summary.
 */
export interface ExperimentSummary {
    /** Average score per evaluator. */
    averageScores: Record<string, number>;
    /** Standard deviation per evaluator. */
    stdDevs: Record<string, number>;
    /** Overall average score. */
    overallScore: number;
    /** Number of evaluated items. */
    totalItems: number;
    /** Number of successes. */
    successCount: number;
    /** Number of errors. */
    errorCount: number;
}
|
|
161
|
+
/**
 * Detailed experiment result.
 */
export interface ExperimentDetailResult {
    /** Item ID. */
    itemId: string;
    /** Input. */
    input: string;
    /** Output. */
    output: string;
    /** Expected output. */
    expected?: string;
    /** Evaluation results. */
    evaluations: Array<EvaluationResult>;
    /** Success flag. */
    success: boolean;
    /** Error message. */
    error?: string;
}
|
|
180
|
+
/**
 * A/B test configuration.
 */
export interface ABTestConfig {
    /** Test name. */
    name: string;
    /** Description. */
    description?: string;
    /** Configuration of variant A. */
    variantA: VariantConfig;
    /** Configuration of variant B. */
    variantB: VariantConfig;
    /** Dataset ID. */
    datasetId: string;
    /** Evaluators. */
    evaluators: Array<string>;
    /** Statistical significance level. */
    significanceLevel?: number;
}
|
|
199
|
+
/**
 * Variant configuration.
 */
export interface VariantConfig {
    /** Variant name. */
    name: string;
    /** Generator function. */
    generator: (input: string) => Promise<string>;
}
|
|
208
|
+
/**
 * A/B test result.
 */
export interface ABTestResult {
    /** Test ID. */
    id: string;
    /** Test name. */
    name: string;
    /** Execution date/time. */
    timestamp: string;
    /** Result of variant A. */
    variantA: VariantResult;
    /** Result of variant B. */
    variantB: VariantResult;
    /** Statistical analysis result. */
    analysis: ABAnalysis;
    /** Winner. */
    winner: 'A' | 'B' | 'tie';
    /** Conclusion. */
    conclusion: string;
}
|
|
229
|
+
/**
 * Variant result.
 */
export interface VariantResult {
    /** Variant name. */
    name: string;
    /** Average score per evaluator. */
    averageScores: Record<string, number>;
    /** Standard deviation per evaluator. */
    stdDevs: Record<string, number>;
    /** Sample size. */
    sampleSize: number;
}
|
|
242
|
+
/**
 * A/B test variant type.
 */
export type ABTestVariant = 'A' | 'B';
|
|
246
|
+
/**
 * A/B analysis result.
 */
export interface ABAnalysis {
    /** t-test result per evaluator. */
    tTests: Record<string, {
        tStatistic: number;
        pValue: number;
        significant: boolean;
        effectSize: number;
    }>;
    /** Overall significance. */
    overallSignificant: boolean;
    /** Confidence intervals. */
    confidenceIntervals: Record<string, {
        lower: number;
        upper: number;
    }>;
}
|
|
265
|
+
/**
 * Benchmark configuration.
 */
export interface BenchmarkConfig {
    /** Benchmark name. */
    name: string;
    /** Number of iterations. */
    iterations?: number;
    /** Number of warm-up iterations. */
    warmupIterations?: number;
    /** Timeout (milliseconds). */
    timeout?: number;
}
|
|
278
|
+
/**
 * Benchmark result.
 */
export interface BenchmarkResult {
    /** Benchmark name. */
    name: string;
    /** Mean execution time (milliseconds). */
    meanMs: number;
    /** Standard deviation. */
    stdDevMs: number;
    /** Minimum execution time. */
    minMs: number;
    /** Maximum execution time. */
    maxMs: number;
    /** Percentiles. */
    percentiles: {
        p50: number;
        p90: number;
        p99: number;
    };
    /** Number of iterations. */
    iterations: number;
    /** Execution date/time. */
    timestamp: string;
}
|
|
303
|
+
/**
 * Evaluation criterion definition.
 *
 * @requirement REQ-EVAL-101
 */
export interface EvaluationCriteria {
    /** Criterion name. */
    name: string;
    /** Description. */
    description: string;
    /** Score range (default: 1-5). */
    scale?: {
        min: number;
        max: number;
    };
    /** Description of each score (optional). */
    rubric?: Record<number, string>;
    /** Weight (for composite evaluation). */
    weight?: number;
}
|
|
322
|
+
/**
 * LLMJudge evaluator configuration.
 *
 * @requirement REQ-EVAL-101
 */
export interface LLMJudgeEvaluatorConfig {
    /** Evaluator name. */
    name?: string;
    /** List of evaluation criteria. */
    criteria: Array<EvaluationCriteria>;
    /** Evaluation scale (default: 1-5). */
    scale?: {
        min: number;
        max: number;
    };
    /** System prompt (for customization). */
    systemPrompt?: string;
    /** Evaluation prompt template. */
    evaluationPromptTemplate?: string;
    /** Number of retries (on parse failure). */
    maxRetries?: number;
    /** Temperature parameter. */
    temperature?: number;
    /** Force the evaluation result to be output as JSON. */
    forceJsonOutput?: boolean;
}
|
|
347
|
+
/**
 * LLMJudge evaluation result.
 *
 * @requirement REQ-EVAL-101
 */
export interface LLMJudgeResult extends EvaluationResult {
    /** Score for each criterion. */
    criteriaScores: Record<string, {
        score: number;
        reasoning: string;
    }>;
    /** Raw LLM output. */
    rawLLMOutput?: string;
    /** Number of tokens used. */
    tokenUsage?: {
        promptTokens: number;
        completionTokens: number;
        totalTokens: number;
    };
}
|
|
366
|
+
/**
 * RAG evaluation input.
 *
 * @requirement REQ-EVAL-102
 */
export interface RAGEvaluationInput extends EvaluationInput {
    /** User query (question). */
    query: string;
    /** Retrieved contexts. */
    retrievedContexts: Array<string>;
    /** Generated answer. */
    generatedAnswer: string;
    /** Ground truth (optional). */
    groundTruth?: string;
}
|
|
380
|
+
/**
 * RAGAS evaluation result.
 *
 * @requirement REQ-EVAL-102
 */
export interface RAGASEvaluationResult extends EvaluationResult {
    /** Score of each metric. */
    metrics: {
        /** Faithfulness: whether the answer is based on the context. */
        faithfulness?: number;
        /** Context Relevancy: relevance of the retrieved contexts. */
        contextRelevancy?: number;
        /** Answer Relevancy: whether the answer is relevant to the query. */
        answerRelevancy?: number;
        /** Context Recall: whether the contexts cover the ground truth. */
        contextRecall?: number;
        /** Context Precision: precision of the relevant contexts. */
        contextPrecision?: number;
    };
    /** Detailed analysis. */
    analysis?: {
        /** Statements extracted from the answer. */
        statements?: Array<string>;
        /** Statements that are supported by the context. */
        supportedStatements?: Array<string>;
        /** Statements that are not supported. */
        unsupportedStatements?: Array<string>;
    };
}
|
|
408
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,aAAa;IACb,MAAM,EAAE,MAAM,CAAC;IACf,iBAAiB;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,gBAAgB;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,eAAe;IACf,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACnC;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,WAAW;IACX,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU;IACV,KAAK,EAAE,MAAM,CAAC;IACd,kBAAkB;IAClB,eAAe,EAAE,MAAM,CAAC;IACxB,SAAS;IACT,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,SAAS;IACT,SAAS,EAAE,MAAM,CAAC;IAClB,mBAAmB;IACnB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC;IACrB,YAAY;IACZ,QAAQ,CAAC,EAAE,kBAAkB,CAAC;CAC/B;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,2BAA2B;IAC3B,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gCAAgC;IAChC,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IACzB,gCAAgC;IAChC,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAC3B,sCAAsC;IACtC,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IACzB,qCAAqC;IACrC,eAAe,CAAC,EAAE,KAAK,CAAC;QAAE,SAAS,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC9D,gBAAgB;IAChB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,WAAW;IACX,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,WAAW;IACX,QAAQ,CAAC,KAAK,EAAE,eAAe,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAAC;CAC7D;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,GAAG,EAAE,MAAM,CAAC;IACZ,GAAG,EAAE,MAAM,CAAC;IACZ,YAAY,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACvC;AAED;;GAEG;AACH,MAAM,WAAW,OAAO;IACtB,eAAe;IACf,EAAE,EAAE,MAAM,CAAC;IACX,SAAS;IACT,IAAI,EAAE,MAAM,CAAC;IACb,SAAS;IACT,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,YAAY;IACZ,IAAI,EAAE,MAAM,CAAC;IACb,SAAS;IACT,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,WAAW;IACX,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW;IACX,SAAS,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,aAAa;IACb,EAAE,EAAE,MAAM,CAAC;IACX,SAAS;IACT,KAAK,EAAE,MAAM,CAAC;IACd,WAAW;IACX,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,YAAY;IACZ,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACpC;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,UAAU;IACV,IAAI,EAAE,MAAM,CAAC;IACb,SAAS;IACT,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,eAAe;IACf,SAAS,EAAE
,MAAM,CAAC;IAClB,YAAY;IACZ,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,SAAS;IACT,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;CACjB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,WAAW;IACX,EAAE,EAAE,MAAM,CAAC;IACX,UAAU;IACV,IAAI,EAAE,MAAM,CAAC;IACb,WAAW;IACX,SAAS,EAAE,MAAM,CAAC;IAClB,eAAe;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,eAAe;IACf,OAAO,EAAE,iBAAiB,CAAC;IAC3B,WAAW;IACX,OAAO,EAAE,sBAAsB,EAAE,CAAC;IAClC,gBAAgB;IAChB,UAAU,EAAE,MAAM,CAAC;IACnB,YAAY;IACZ,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACpC;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,kBAAkB;IAClB,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACtC,iBAAiB;IACjB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAChC,eAAe;IACf,YAAY,EAAE,MAAM,CAAC;IACrB,WAAW;IACX,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW;IACX,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY;IACZ,UAAU,EAAE,MAAM,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,sBAAsB;IACrC,aAAa;IACb,MAAM,EAAE,MAAM,CAAC;IACf,SAAS;IACT,KAAK,EAAE,MAAM,CAAC;IACd,SAAS;IACT,MAAM,EAAE,MAAM,CAAC;IACf,WAAW;IACX,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,WAAW;IACX,WAAW,EAAE,gBAAgB,EAAE,CAAC;IAChC,YAAY;IACZ,OAAO,EAAE,OAAO,CAAC;IACjB,eAAe;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,WAAW;IACX,IAAI,EAAE,MAAM,CAAC;IACb,SAAS;IACT,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,eAAe;IACf,QAAQ,EAAE,aAAa,CAAC;IACxB,eAAe;IACf,QAAQ,EAAE,aAAa,CAAC;IACxB,eAAe;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU;IACV,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,cAAc;IACd,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,aAAa;IACb,IAAI,EAAE,MAAM,CAAC;IACb,WAAW;IACX,SAAS,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,OAAO,CAAC,MAAM,CAAC,CAAC;CAC/C;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,YAAY;IACZ,EAAE,EAAE,MAAM,CAAC;IACX,WAAW;IACX,IAAI,EAAE,MAAM,CAAC;IACb,WAAW;IACX,SAAS,EAAE,MAAM,CAAC;IAClB,eAAe;IACf,QAAQ,EAAE,aAAa,CAAC;IACxB,eAAe;IACf,QAAQ,EAAE,aAAa,CAAC;IACxB,aAAa;IACb,QAAQ,EAAE,UAAU,CAAC;IACrB,SAAS;IACT,MAAM,EAAE,GAAG,GAAG,GAAG,GAAG,KAAK,CAAC;IAC1B,SAAS;IACT,UAAU,EAAE,MAAM,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,aAAa;IACb,IAAI,EAAE,MAAM,CAAC;IACb,kBAAkB;IAClB,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,MA
AM,CAAC,CAAC;IACtC,iBAAiB;IACjB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAChC,YAAY;IACZ,UAAU,EAAE,MAAM,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,MAAM,aAAa,GAAG,GAAG,GAAG,GAAG,CAAC;AAEtC;;GAEG;AACH,MAAM,WAAW,UAAU;IACzB,kBAAkB;IAClB,MAAM,EAAE,MAAM,CACZ,MAAM,EACN;QACE,UAAU,EAAE,MAAM,CAAC;QACnB,MAAM,EAAE,MAAM,CAAC;QACf,WAAW,EAAE,OAAO,CAAC;QACrB,UAAU,EAAE,MAAM,CAAC;KACpB,CACF,CAAC;IACF,aAAa;IACb,kBAAkB,EAAE,OAAO,CAAC;IAC5B,WAAW;IACX,mBAAmB,EAAE,MAAM,CACzB,MAAM,EACN;QACE,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;KACf,CACF,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,cAAc;IACd,IAAI,EAAE,MAAM,CAAC;IACb,WAAW;IACX,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,gBAAgB;IAChB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,kBAAkB;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,cAAc;IACd,IAAI,EAAE,MAAM,CAAC;IACb,kBAAkB;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,WAAW;IACX,QAAQ,EAAE,MAAM,CAAC;IACjB,aAAa;IACb,KAAK,EAAE,MAAM,CAAC;IACd,aAAa;IACb,KAAK,EAAE,MAAM,CAAC;IACd,cAAc;IACd,WAAW,EAAE;QACX,GAAG,EAAE,MAAM,CAAC;QACZ,GAAG,EAAE,MAAM,CAAC;QACZ,GAAG,EAAE,MAAM,CAAC;KACb,CAAC;IACF,WAAW;IACX,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW;IACX,SAAS,EAAE,MAAM,CAAC;CACnB;AAMD;;;GAGG;AACH,MAAM,WAAW,kBAAkB;IACjC,UAAU;IACV,IAAI,EAAE,MAAM,CAAC;IACb,SAAS;IACT,WAAW,EAAE,MAAM,CAAC;IACpB,yBAAyB;IACzB,KAAK,CAAC,EAAE;QACN,GAAG,EAAE,MAAM,CAAC;QACZ,GAAG,EAAE,MAAM,CAAC;KACb,CAAC;IACF,qBAAqB;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAChC,gBAAgB;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED;;;GAGG;AACH,MAAM,WAAW,uBAAuB;IACtC,WAAW;IACX,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,cAAc;IACd,QAAQ,EAAE,kBAAkB,EAAE,CAAC;IAC/B,yBAAyB;IACzB,KAAK,CAAC,EAAE;QACN,GAAG,EAAE,MAAM,CAAC;QACZ,GAAG,EAAE,MAAM,CAAC;KACb,CAAC;IACF,yBAAyB;IACzB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,oBAAoB;IACpB,wBAAwB,CAAC,EAAE,MAAM,CAAC;IAClC,qBAAqB;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,cAAc;IACd,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,qBAAqB;IACrB,eAAe,CAAC,EAAE,OAAO,CAAC;CAC3B;AAED;;;GAGG;AACH,MAAM,WAAW,cAAe,SAAQ,gBAAgB;IACtD,gBAAgB;IAChB,cAAc,EAAE,MAAM,CAAC,MAAM,EAAE;QAC7B,KAAK,EAAE,MAAM,CAA
C;QACd,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC,CAAC;IACH,eAAe;IACf,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,cAAc;IACd,UAAU,CAAC,EAAE;QACX,YAAY,EAAE,MAAM,CAAC;QACrB,gBAAgB,EAAE,MAAM,CAAC;QACzB,WAAW,EAAE,MAAM,CAAC;KACrB,CAAC;CACH;AAMD;;;GAGG;AACH,MAAM,WAAW,kBAAmB,SAAQ,eAAe;IACzD,kBAAkB;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,kBAAkB;IAClB,iBAAiB,EAAE,MAAM,EAAE,CAAC;IAC5B,cAAc;IACd,eAAe,EAAE,MAAM,CAAC;IACxB,wBAAwB;IACxB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;;GAGG;AACH,MAAM,WAAW,qBAAsB,SAAQ,gBAAgB;IAC7D,iBAAiB;IACjB,OAAO,EAAE;QACP,sCAAsC;QACtC,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,sCAAsC;QACtC,gBAAgB,CAAC,EAAE,MAAM,CAAC;QAC1B,uCAAuC;QACvC,eAAe,CAAC,EAAE,MAAM,CAAC;QACzB,iDAAiD;QACjD,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,qCAAqC;QACrC,gBAAgB,CAAC,EAAE,MAAM,CAAC;KAC3B,CAAC;IACF,YAAY;IACZ,QAAQ,CAAC,EAAE;QACT,uBAAuB;QACvB,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;QACtB,8BAA8B;QAC9B,mBAAmB,CAAC,EAAE,MAAM,EAAE,CAAC;QAC/B,wBAAwB;QACxB,qBAAqB,CAAC,EAAE,MAAM,EAAE,CAAC;KAClC,CAAC;CACH"}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;;;;GAKG"}
|
package/package.json
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@nahisaho/katashiro-evaluation",
|
|
3
|
+
"version": "2.0.0",
|
|
4
|
+
"description": "KATASHIRO Evaluation - 品質評価・A/Bテスト・実験管理",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.js",
|
|
7
|
+
"types": "./dist/index.d.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"types": "./dist/index.d.ts",
|
|
11
|
+
"import": "./dist/index.js",
|
|
12
|
+
"default": "./dist/index.js"
|
|
13
|
+
}
|
|
14
|
+
},
|
|
15
|
+
"files": [
|
|
16
|
+
"dist"
|
|
17
|
+
],
|
|
18
|
+
"keywords": [
|
|
19
|
+
"katashiro",
|
|
20
|
+
"evaluation",
|
|
21
|
+
"testing",
|
|
22
|
+
"benchmark",
|
|
23
|
+
"llm-as-judge"
|
|
24
|
+
],
|
|
25
|
+
"author": "nahisaho",
|
|
26
|
+
"license": "MIT",
|
|
27
|
+
"repository": {
|
|
28
|
+
"type": "git",
|
|
29
|
+
"url": "https://github.com/nahisaho/katashiro.git",
|
|
30
|
+
"directory": "packages/evaluation"
|
|
31
|
+
},
|
|
32
|
+
"publishConfig": {
|
|
33
|
+
"access": "public"
|
|
34
|
+
},
|
|
35
|
+
"dependencies": {
|
|
36
|
+
"@nahisaho/katashiro-core": "2.0.0"
|
|
37
|
+
},
|
|
38
|
+
"devDependencies": {
|
|
39
|
+
"typescript": "^5.3.0"
|
|
40
|
+
},
|
|
41
|
+
"scripts": {
|
|
42
|
+
"build": "tsc",
|
|
43
|
+
"clean": "rm -rf dist",
|
|
44
|
+
"test": "vitest run",
|
|
45
|
+
"test:watch": "vitest"
|
|
46
|
+
}
|
|
47
|
+
}
|