@vfarcic/dot-ai 0.111.0 → 0.113.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/ai-provider-factory.d.ts +0 -10
- package/dist/core/ai-provider-factory.d.ts.map +1 -1
- package/dist/core/ai-provider-factory.js +14 -24
- package/dist/core/ai-provider.interface.d.ts +28 -1
- package/dist/core/ai-provider.interface.d.ts.map +1 -1
- package/dist/core/capabilities.d.ts +1 -1
- package/dist/core/capabilities.d.ts.map +1 -1
- package/dist/core/capabilities.js +7 -4
- package/dist/core/capability-scan-workflow.js +2 -2
- package/dist/core/embedding-service.d.ts +35 -2
- package/dist/core/embedding-service.d.ts.map +1 -1
- package/dist/core/embedding-service.js +228 -15
- package/dist/core/model-config.d.ts +23 -0
- package/dist/core/model-config.d.ts.map +1 -0
- package/dist/core/model-config.js +28 -0
- package/dist/core/platform-operations.d.ts.map +1 -1
- package/dist/core/platform-operations.js +3 -5
- package/dist/core/platform-utils.d.ts +13 -2
- package/dist/core/platform-utils.d.ts.map +1 -1
- package/dist/core/platform-utils.js +91 -9
- package/dist/core/providers/anthropic-provider.d.ts +6 -1
- package/dist/core/providers/anthropic-provider.d.ts.map +1 -1
- package/dist/core/providers/anthropic-provider.js +99 -27
- package/dist/core/providers/provider-debug-utils.d.ts +53 -20
- package/dist/core/providers/provider-debug-utils.d.ts.map +1 -1
- package/dist/core/providers/provider-debug-utils.js +106 -51
- package/dist/core/providers/vercel-provider.d.ts +6 -1
- package/dist/core/providers/vercel-provider.d.ts.map +1 -1
- package/dist/core/providers/vercel-provider.js +212 -130
- package/dist/core/schema.d.ts +1 -101
- package/dist/core/schema.d.ts.map +1 -1
- package/dist/core/schema.js +20 -154
- package/dist/core/unified-creation-session.d.ts.map +1 -1
- package/dist/core/unified-creation-session.js +15 -7
- package/dist/evaluation/dataset-analyzer.d.ts +118 -0
- package/dist/evaluation/dataset-analyzer.d.ts.map +1 -0
- package/dist/evaluation/dataset-analyzer.js +234 -0
- package/dist/evaluation/datasets/loader.d.ts +42 -0
- package/dist/evaluation/datasets/loader.d.ts.map +1 -0
- package/dist/evaluation/datasets/loader.js +104 -0
- package/dist/evaluation/eval-runner.d.ts +9 -0
- package/dist/evaluation/eval-runner.d.ts.map +1 -0
- package/dist/evaluation/eval-runner.js +399 -0
- package/dist/evaluation/evaluators/base-comparative.d.ts +94 -0
- package/dist/evaluation/evaluators/base-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/base-comparative.js +187 -0
- package/dist/evaluation/evaluators/base.d.ts +47 -0
- package/dist/evaluation/evaluators/base.d.ts.map +1 -0
- package/dist/evaluation/evaluators/base.js +10 -0
- package/dist/evaluation/evaluators/capability-comparative.d.ts +32 -0
- package/dist/evaluation/evaluators/capability-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/capability-comparative.js +104 -0
- package/dist/evaluation/evaluators/pattern-comparative.d.ts +31 -0
- package/dist/evaluation/evaluators/pattern-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/pattern-comparative.js +97 -0
- package/dist/evaluation/evaluators/policy-comparative.d.ts +31 -0
- package/dist/evaluation/evaluators/policy-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/policy-comparative.js +97 -0
- package/dist/evaluation/evaluators/recommendation-comparative.d.ts +25 -0
- package/dist/evaluation/evaluators/recommendation-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/recommendation-comparative.js +55 -0
- package/dist/evaluation/evaluators/remediation-comparative.d.ts +25 -0
- package/dist/evaluation/evaluators/remediation-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/remediation-comparative.js +54 -0
- package/dist/evaluation/platform-synthesizer.d.ts +54 -0
- package/dist/evaluation/platform-synthesizer.d.ts.map +1 -0
- package/dist/evaluation/platform-synthesizer.js +368 -0
- package/dist/evaluation/run-platform-synthesis.d.ts +9 -0
- package/dist/evaluation/run-platform-synthesis.d.ts.map +1 -0
- package/dist/evaluation/run-platform-synthesis.js +45 -0
- package/dist/interfaces/mcp.d.ts.map +1 -1
- package/dist/interfaces/mcp.js +23 -29
- package/dist/interfaces/rest-api.d.ts.map +1 -1
- package/dist/tools/answer-question.d.ts +2 -0
- package/dist/tools/answer-question.d.ts.map +1 -1
- package/dist/tools/answer-question.js +18 -11
- package/dist/tools/generate-manifests.d.ts +2 -0
- package/dist/tools/generate-manifests.d.ts.map +1 -1
- package/dist/tools/generate-manifests.js +11 -12
- package/dist/tools/organizational-data.d.ts +1 -0
- package/dist/tools/organizational-data.d.ts.map +1 -1
- package/dist/tools/organizational-data.js +2 -1
- package/dist/tools/recommend.d.ts +1 -0
- package/dist/tools/recommend.d.ts.map +1 -1
- package/dist/tools/recommend.js +13 -21
- package/dist/tools/remediate.d.ts +3 -0
- package/dist/tools/remediate.d.ts.map +1 -1
- package/dist/tools/remediate.js +35 -14
- package/dist/tools/test-docs.d.ts +1 -0
- package/dist/tools/test-docs.d.ts.map +1 -1
- package/dist/tools/test-docs.js +4 -2
- package/dist/tools/version.d.ts +5 -1
- package/dist/tools/version.d.ts.map +1 -1
- package/dist/tools/version.js +23 -8
- package/package.json +19 -1
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
#!/usr/bin/env npx tsx
|
|
2
|
+
"use strict";
|
|
3
|
+
/**
|
|
4
|
+
* Evaluation Runner for Multi-Model Comparative Analysis
|
|
5
|
+
*
|
|
6
|
+
* Runs comparative evaluation on available datasets from multiple models
|
|
7
|
+
* Automatically detects and evaluates both remediation and recommendation datasets
|
|
8
|
+
*/
|
|
9
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
12
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
13
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
14
|
+
}
|
|
15
|
+
Object.defineProperty(o, k2, desc);
|
|
16
|
+
}) : (function(o, m, k, k2) {
|
|
17
|
+
if (k2 === undefined) k2 = k;
|
|
18
|
+
o[k2] = m[k];
|
|
19
|
+
}));
|
|
20
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
21
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
22
|
+
}) : function(o, v) {
|
|
23
|
+
o["default"] = v;
|
|
24
|
+
});
|
|
25
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
26
|
+
var ownKeys = function(o) {
|
|
27
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
28
|
+
var ar = [];
|
|
29
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
30
|
+
return ar;
|
|
31
|
+
};
|
|
32
|
+
return ownKeys(o);
|
|
33
|
+
};
|
|
34
|
+
return function (mod) {
|
|
35
|
+
if (mod && mod.__esModule) return mod;
|
|
36
|
+
var result = {};
|
|
37
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
38
|
+
__setModuleDefault(result, mod);
|
|
39
|
+
return result;
|
|
40
|
+
};
|
|
41
|
+
})();
|
|
42
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
43
|
+
const remediation_comparative_js_1 = require("./evaluators/remediation-comparative.js");
|
|
44
|
+
const recommendation_comparative_js_1 = require("./evaluators/recommendation-comparative.js");
|
|
45
|
+
const capability_comparative_js_1 = require("./evaluators/capability-comparative.js");
|
|
46
|
+
const pattern_comparative_js_1 = require("./evaluators/pattern-comparative.js");
|
|
47
|
+
const policy_comparative_js_1 = require("./evaluators/policy-comparative.js");
|
|
48
|
+
const promises_1 = require("fs/promises");
|
|
49
|
+
const child_process_1 = require("child_process");
|
|
50
|
+
const util_1 = require("util");
|
|
51
|
+
const execAsync = (0, util_1.promisify)(child_process_1.exec);
|
|
52
|
+
/**
 * Registry of the comparative evaluations this runner can execute, keyed by
 * evaluation type. Each entry provides:
 *   - evaluator: the evaluator class instantiated with the datasets directory,
 *   - prefix:    file-name prefix used to detect datasets of that type,
 *   - title:     heading/tool name written into the generated reports.
 */
const EVALUATOR_CONFIG = {
    remediation: { evaluator: remediation_comparative_js_1.RemediationComparativeEvaluator, prefix: 'remediate_', title: 'Remediation AI Model Comparison Report' },
    recommendation: { evaluator: recommendation_comparative_js_1.RecommendationComparativeEvaluator, prefix: 'recommend_', title: 'Recommendation AI Model Comparison Report' },
    capability: { evaluator: capability_comparative_js_1.CapabilityComparativeEvaluator, prefix: 'capability_', title: 'Capability AI Model Comparison Report' },
    pattern: { evaluator: pattern_comparative_js_1.PatternComparativeEvaluator, prefix: 'pattern_', title: 'Pattern AI Model Comparison Report' },
    policy: { evaluator: policy_comparative_js_1.PolicyComparativeEvaluator, prefix: 'policy_', title: 'Policy AI Model Comparison Report' }
};
|
|
79
|
+
/**
 * Render the Markdown comparison report for one evaluation type.
 *
 * @param {Array<object>} results - Per-scenario scores; the fields read here are
 *   `key`, `bestModel`, `score`, `modelCount`, `confidence`, `comment` and the
 *   optional `modelRankings` array of `{ rank, model, score }`.
 * @param {{availableModels: string[], totalDatasets: number}} stats - Dataset statistics.
 * @param {string} evaluationType - Key into EVALUATOR_CONFIG; selects the report title.
 * @param {object} [finalAssessment] - Optional cross-scenario assessment; only its
 *   `overall_assessment` property is used.
 * @returns {string} The complete Markdown document.
 */
function generateMarkdownReport(results, stats, evaluationType, finalAssessment) {
    const timestamp = new Date().toISOString();
    const overallAssessment = finalAssessment?.overall_assessment || null;
    // The assessment is produced by an AI model, so tolerate missing sub-fields
    // instead of crashing the whole report with a TypeError.
    const recommendations = overallAssessment?.production_recommendations ?? {};
    const primaryChoice = recommendations.primary ?? 'Not specified';
    const secondaryChoice = recommendations.secondary ?? 'Not specified';
    const avoidList = recommendations.avoid ?? [];
    const specializedUse = recommendations.specialized_use ?? {};
    // Collect every score each model received, then average (supplementary table only).
    const modelScores = new Map();
    for (const result of results) {
        for (const ranking of result.modelRankings ?? []) {
            if (!modelScores.has(ranking.model)) {
                modelScores.set(ranking.model, []);
            }
            modelScores.get(ranking.model).push(ranking.score);
        }
    }
    const modelAverages = new Map();
    modelScores.forEach((scores, model) => {
        const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
        modelAverages.set(model, Math.round(avg * 1000) / 1000);
    });
    const reportTitle = EVALUATOR_CONFIG[evaluationType].title;
    // Strip the "<type> comparative " lead-in for EVERY configured evaluation type,
    // not only remediation/recommendation.
    const typePrefixPattern = new RegExp(`(${Object.keys(EVALUATOR_CONFIG).join('|')}) comparative `);
    const winnerSection = overallAssessment
        ? ['', `**${overallAssessment.winner}**`, '', `${overallAssessment.rationale}`, ''].join('\n')
        : 'Overall assessment not available';
    const reliabilitySection = overallAssessment
        ? (overallAssessment.reliability_ranking ?? [])
            .map((ranking, index) => `${index + 1}. **${ranking.model}** (${Math.round(ranking.reliability_score * 100)}%) - ${ranking.reliability_notes}`)
            .join('\n')
        : 'Reliability rankings not available';
    const specializedSection = Object.keys(specializedUse).length > 0
        ? '\n**Specialized Use Cases:**\n' + Object.entries(specializedUse)
            .map(([useCase, model]) => `- **${useCase}**: ${model}`)
            .join('\n')
        : '';
    const productionSection = overallAssessment
        ? [
            '',
            `- **Primary Choice**: ${primaryChoice}`,
            `- **Secondary Option**: ${secondaryChoice}`,
            `- **Avoid for Production**: ${avoidList.length > 0 ? avoidList.join(', ') : 'None'}`,
            specializedSection,
            ''
        ].join('\n')
        : 'Production recommendations not available';
    const statsRows = Array.from(modelAverages.entries())
        .sort((a, b) => b[1] - a[1])
        .map(([model, avgScore]) => `| ${model} | ${avgScore} | See AI assessment above |`)
        .join('\n');
    const scenarioSections = results.map((result, index) => {
        const scenarioTitle = result.key.replace(/_/g, ' ').replace(typePrefixPattern, '').toUpperCase();
        const rankingsText = result.modelRankings
            ? result.modelRankings.map((rank) => `${rank.rank}. **${rank.model}** - ${rank.score}`).join('\n')
            : 'No detailed rankings available';
        return [
            `### ${index + 1}. ${scenarioTitle}`,
            '',
            `**Winner**: ${result.bestModel} (Score: ${result.score})`,
            `**Models Compared**: ${result.modelCount}`,
            `**Confidence**: ${result.confidence ? Math.round(result.confidence * 100) : 0}%`,
            '',
            '#### Rankings',
            rankingsText,
            '',
            '#### Analysis',
            `${result.comment}`,
            '',
            '---'
        ].join('\n');
    }).join('\n\n');
    const guideSection = overallAssessment
        ? [
            '',
            '### Key Insights',
            `${overallAssessment.key_insights}`,
            '',
            '### Recommended Selection Strategy',
            `- **For Production Use**: Choose ${primaryChoice}`,
            `- **For Secondary Option**: Consider ${secondaryChoice}`,
            avoidList.length > 0 ? `- **Avoid**: ${avoidList.join(', ')} (reliability concerns)` : '',
            '',
            '### Decision Framework',
            'The AI assessment prioritizes **reliability and consistency** over peak performance. Models that fail completely in any scenario are heavily penalized, ensuring production-ready recommendations.',
            ''
        ].join('\n')
        : 'AI model selection guide not available';
    return [
        `# ${reportTitle}`,
        '',
        `**Generated**: ${timestamp}`,
        `**Scenarios Analyzed**: ${results.length}`,
        `**Models Evaluated**: ${stats.availableModels.length}`,
        `**Total Datasets**: ${stats.totalDatasets}`,
        '',
        '## Executive Summary',
        '',
        '### š Overall Winner (AI Assessment)',
        winnerSection,
        '',
        '### š AI Reliability Rankings',
        '',
        reliabilitySection,
        '',
        '### š Production Recommendations',
        '',
        productionSection,
        '',
        '### š Supplementary Statistics (Reference Only)',
        '',
        '| Model | Avg Score | Notes |',
        '|-------|-----------|-------|',
        statsRows,
        '',
        '## Detailed Scenario Results',
        '',
        scenarioSections,
        '',
        '## AI Model Selection Guide',
        '',
        guideSection,
        '',
        '---',
        '',
        '## Report Attribution',
        '',
        'Report generated by DevOps AI Toolkit Comparative Evaluation System',
        ''
    ].join('\n');
}
|
|
187
|
+
/**
 * Load `model-metadata.json` from this script's directory and verify it is fresh.
 *
 * Prints guidance and terminates the process with exit code 1 when the file is
 * missing, cannot be read or parsed, has no usable `lastUpdated` date, or is
 * more than 30 days old.
 *
 * @returns {object} The parsed metadata object.
 */
function loadModelMetadata() {
    try {
        const fs = require('fs');
        const path = require('path');
        const metadataPath = path.join(__dirname, 'model-metadata.json');
        if (!fs.existsSync(metadataPath)) {
            console.error('ā Model metadata file not found');
            console.error('š Pricing and capabilities data required for cost analysis');
            console.error('');
            console.error('š To create model metadata, run:');
            console.error(' /update-model-metadata');
            console.error('');
            process.exit(1);
        }
        const metadata = JSON.parse(fs.readFileSync(metadataPath, 'utf8'));
        const lastUpdatedMs = new Date(metadata.lastUpdated).getTime();
        // An absent or malformed date yields NaN, and `NaN > limit` is false, so
        // without this guard undated metadata would be accepted as fresh.
        if (Number.isNaN(lastUpdatedMs)) {
            throw new Error('lastUpdated is missing or not a valid date: ' + String(metadata.lastUpdated));
        }
        // Reject metadata older than 30 days
        const metadataAge = Date.now() - lastUpdatedMs;
        const thirtyDays = 30 * 24 * 60 * 60 * 1000;
        if (metadataAge > thirtyDays) {
            console.error('ā Model metadata is over 30 days old (last updated: ' + metadata.lastUpdated + ')');
            console.error('š Pricing and capabilities data may be outdated, affecting cost analysis accuracy');
            console.error('');
            console.error('š To update model metadata, run:');
            console.error(' /update-model-metadata');
            console.error('');
            process.exit(1);
        }
        console.log('✅ Model metadata loaded (updated: ' + metadata.lastUpdated + ')');
        return metadata;
    }
    catch (error) {
        // Covers read/parse failures as well as the invalid-date error thrown above.
        console.error('ā Failed to load model metadata:', error instanceof Error ? error.message : String(error));
        console.error('š To create model metadata, run: /update-model-metadata');
        process.exit(1);
    }
}
|
|
223
|
+
/**
 * Assemble the machine-readable (JSON-serialisable) counterpart of the report.
 *
 * @param {Array<object>} results - Per-scenario evaluation scores, embedded as-is.
 * @param {{availableModels: string[], totalDatasets: number}} stats - Dataset statistics, embedded as `summary`.
 * @param {string} evaluationType - Key into EVALUATOR_CONFIG.
 * @param {{models: *}} modelMetadata - Loaded model metadata; only `.models` is embedded.
 * @param {object} [finalAssessment] - Cross-scenario assessment; any falsy value is stored as null.
 * @returns {object} Report object with `metadata`, `modelMetadata`, `overallAssessment`, `results`, `summary`.
 */
function generateJsonReport(results, stats, evaluationType, modelMetadata, finalAssessment) {
    const generatedAt = new Date().toISOString();
    const reportHeader = {
        reportType: 'comparative-evaluation',
        evaluationType,
        generated: generatedAt,
        scenariosAnalyzed: results.length,
        modelsEvaluated: stats.availableModels.length,
        totalDatasets: stats.totalDatasets,
        tool: EVALUATOR_CONFIG[evaluationType].title
    };
    return {
        metadata: reportHeader,
        modelMetadata: modelMetadata.models,
        overallAssessment: finalAssessment ? finalAssessment : null,
        results,
        summary: stats
    };
}
|
|
243
|
+
/**
 * Report, for every configured evaluation type, whether the datasets directory
 * contains at least one file whose name starts with that type's prefix.
 *
 * @param {string} datasetsDir - Directory to scan.
 * @param {string} [filterType] - When given, every other type is reported as false.
 * @returns {Promise<Object<string, boolean>>} Availability flag per evaluation type.
 *   All flags are false when the directory cannot be read.
 */
async function detectAvailableDatasets(datasetsDir, filterType) {
    let files;
    try {
        files = await promises_1.readdir(datasetsDir);
    }
    catch (error) {
        // Best-effort: an unreadable directory is treated as empty, but the reason
        // is surfaced so "missing" can be told apart from e.g. "permission denied".
        console.warn('Could not read datasets directory, assuming no datasets available:', error instanceof Error ? error.message : String(error));
        files = [];
    }
    return Object.fromEntries(Object.entries(EVALUATOR_CONFIG).map(([type, config]) => {
        const selected = !filterType || type === filterType;
        return [type, selected && files.some(file => file.startsWith(config.prefix))];
    }));
}
|
|
267
|
+
/**
 * Run one comparative evaluation end to end: print dataset statistics and
 * phases, evaluate every scenario, request the final assessment, and write the
 * Markdown and JSON reports under ./eval/analysis/individual.
 *
 * @param {string} evaluatorType - Key into EVALUATOR_CONFIG.
 * @param {string} datasetsDir - Directory handed to the evaluator constructor.
 * @param {object} modelMetadata - Loaded model metadata, embedded in the JSON report.
 * @returns {Promise<Array<object>>} The per-scenario evaluation results.
 */
async function runEvaluation(evaluatorType, datasetsDir, modelMetadata) {
    const EvaluatorClass = EVALUATOR_CONFIG[evaluatorType].evaluator;
    const evaluator = new EvaluatorClass(datasetsDir);
    // Capitalised type name, reused in every progress message below.
    const typeLabel = evaluatorType.charAt(0).toUpperCase() + evaluatorType.slice(1);
    console.log(`\nš¬ Starting ${typeLabel} Evaluation\n`);
    // Show dataset stats
    console.log('š Dataset Analysis:');
    const stats = evaluator.getDatasetStats();
    console.log(`- Total datasets: ${stats.totalDatasets}`);
    console.log(`- Available models: ${stats.availableModels.join(', ')}`);
    console.log(`- Scenarios with multiple models: ${stats.scenariosWithMultipleModels}`);
    console.log(`- Interaction types: ${stats.interactionTypes.join(', ')}`);
    console.log();
    // Show evaluation phases
    console.log('šÆ Evaluation Phases:');
    for (const phase of evaluator.getEvaluationPhases()) {
        console.log(`- ${phase.phase}: ${phase.description}`);
        console.log(` Models: ${phase.availableModels.join(', ')}`);
        console.log(` Scenarios: ${phase.scenarioCount}`);
        console.log();
    }
    // Run comparative evaluation on all scenarios
    console.log('š Running Comparative Evaluation...\n');
    const results = await evaluator.evaluateAllScenarios();
    console.log(`✅ ${typeLabel} Evaluation Complete! Analyzed ${results.length} scenarios\n`);
    // Conduct final assessment across all scenarios
    const finalAssessment = await evaluator.conductFinalAssessment(results);
    // Generate dual-format reports using final assessment
    const reportContent = generateMarkdownReport(results, stats, evaluatorType, finalAssessment);
    const jsonResults = generateJsonReport(results, stats, evaluatorType, modelMetadata, finalAssessment);
    // Both report paths derive from the single reportDir constant so they cannot drift apart.
    const reportDir = './eval/analysis/individual';
    const dateStamp = new Date().toISOString().split('T')[0];
    const markdownPath = `${reportDir}/${evaluatorType}-evaluation-${dateStamp}.md`;
    const jsonPath = `${reportDir}/${evaluatorType}-results-${dateStamp}.json`;
    // A recursive mkdir succeeds when the directory already exists, so no existence check is needed.
    await promises_1.mkdir(reportDir, { recursive: true });
    await promises_1.writeFile(markdownPath, reportContent);
    await promises_1.writeFile(jsonPath, JSON.stringify(jsonResults, null, 2));
    console.log(`š ${typeLabel} reports generated:`);
    console.log(` š Markdown: ${markdownPath}`);
    console.log(` š JSON: ${jsonPath}`);
    // Brief console summary
    console.log(`š ${typeLabel} Results:`);
    results.forEach((result, index) => {
        console.log(` ${index + 1}. ${result.key}: ${result.bestModel} (${result.score})`);
    });
    return results;
}
|
|
319
|
+
/**
 * CLI entry point.
 *
 * Cleans old debug files (keeping *.jsonl datasets), checks model-metadata
 * freshness, optionally restricts the run to the evaluation type given as the
 * first command-line argument, then runs every evaluation whose datasets are
 * present. Exits with code 1 on an invalid type, when no datasets are found,
 * or when an evaluation throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
    console.log('š¬ Starting Multi-Model Comparative Evaluation\n');
    // Clean old debug files but preserve evaluation datasets
    console.log('š§¹ Cleaning old debug files...');
    try {
        await execAsync('find ./tmp/debug-ai -type f ! -name \'*.jsonl\' -delete 2>/dev/null || true');
        await execAsync('mkdir -p ./tmp/debug-ai');
        console.log('✅ Debug files cleaned (datasets preserved)\n');
    }
    catch (error) {
        console.warn('ā ļø Could not clean debug files:', error instanceof Error ? error.message : String(error));
    }
    // Check model metadata freshness before starting any evaluation work
    const modelMetadata = loadModelMetadata();
    const datasetsDir = './eval/datasets';
    // Parse command line arguments for subset evaluation
    const args = process.argv.slice(2);
    let filterType = undefined;
    if (args.length > 0) {
        const requestedType = args[0];
        // Own-property check: a bare `in` would also accept inherited names such as "toString".
        if (Object.prototype.hasOwnProperty.call(EVALUATOR_CONFIG, requestedType)) {
            filterType = requestedType;
        }
        else {
            console.error(`ā Invalid evaluation type: "${requestedType}"`);
            console.error(`✅ Available types: ${Object.keys(EVALUATOR_CONFIG).join(', ')}`);
            process.exit(1);
        }
    }
    const availableDatasets = await detectAvailableDatasets(datasetsDir, filterType);
    console.log('š Dataset Detection:');
    for (const [type, available] of Object.entries(availableDatasets)) {
        console.log(`- ${type.charAt(0).toUpperCase() + type.slice(1)} datasets: ${available ? '✅' : 'ā'}`);
    }
    if (filterType) {
        console.log(`\nšÆ Running evaluation for: ${filterType}`);
    }
    const hasAnyDatasets = Object.values(availableDatasets).some(Boolean);
    if (!hasAnyDatasets) {
        if (filterType) {
            console.error(`ā No datasets found for type: ${filterType}`);
        }
        else {
            console.error('ā No evaluation datasets found. Please run integration tests first to generate datasets.');
        }
        process.exit(1);
    }
    try {
        const allResults = [];
        // detectAvailableDatasets marks every non-selected type as unavailable, so
        // this one loop covers both the filtered and the run-everything case.
        for (const [type, available] of Object.entries(availableDatasets)) {
            if (!available) {
                continue;
            }
            const results = await runEvaluation(type, datasetsDir, modelMetadata);
            allResults.push(...results);
        }
        console.log(`\nš All Evaluations Complete! Total scenarios analyzed: ${allResults.length}`);
        // Point at the directory runEvaluation actually writes its reports to.
        console.log(`š Check ./eval/analysis/individual/ for detailed analysis reports\n`);
    }
    catch (error) {
        console.error('ā Evaluation failed:', error);
        process.exit(1);
    }
}
|
|
396
|
+
// Run only when this file is executed directly, not when it is required as a module.
if (require.main === module) {
    main().catch((error) => {
        console.error(error);
        // Report the failure to the calling shell/CI instead of exiting with status 0.
        process.exitCode = 1;
    });
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Base Comparative Evaluator
|
|
3
|
+
*
|
|
4
|
+
* Shared functionality for comparing multiple AI models across scenarios
|
|
5
|
+
* Eliminates code duplication between remediation, recommendation, and capability evaluators
|
|
6
|
+
*/
|
|
7
|
+
import { EvaluationScore } from './base.js';
|
|
8
|
+
import { VercelProvider } from '../../core/providers/vercel-provider';
|
|
9
|
+
import { DatasetAnalyzer, ComparisonScenario } from '../dataset-analyzer.js';
|
|
10
|
+
/**
 * Declared shape of one comparative evaluation of a scenario.
 * NOTE(review): field meanings are inferred from their names only — confirm
 * against the evaluator prompts before relying on them.
 */
export interface ComparativeEvaluationResult {
    scenario_summary: string;
    models_compared: Array<string>;
    /** Score breakdown per string key (presumably the model name — verify against caller). */
    comparative_analysis: Record<string, {
        // Every individual score is optional; only the weighted total is required.
        quality_score?: number;
        efficiency_score?: number;
        performance_score?: number;
        communication_score?: number;
        accuracy_score?: number;
        completeness_score?: number;
        clarity_score?: number;
        consistency_score?: number;
        weighted_total: number;
        strengths: string;
        weaknesses: string;
    }>;
    /** Ranked entries; the explanation may arrive as either `rationale` or `reasoning`. */
    ranking: {
        rank: number;
        model: string;
        score: number;
        rationale?: string;
        reasoning?: string;
    }[];
    overall_insights?: string;
}
|
|
35
|
+
/**
 * An EvaluationScore extended with the per-model ranking of a comparative run.
 */
export interface ComparativeEvaluationScore extends EvaluationScore {
    /** One entry per ranked model. */
    modelRankings: {
        rank: number;
        model: string;
        score: number;
    }[];
    bestModel: string;
    modelCount: number;
}
|
|
44
|
+
/**
 * Abstract base for evaluators that compare several AI models on the same scenarios.
 * Subclasses supply the identifying strings and the phase breakdown.
 */
export declare abstract class BaseComparativeEvaluator {
    abstract readonly name: string;
    abstract readonly description: string;
    protected abstract readonly promptFileName: string;
    protected abstract readonly toolName: string;
    protected evaluatorModel: VercelProvider;
    protected datasetAnalyzer: DatasetAnalyzer;
    protected promptTemplate: string;
    constructor(datasetDir?: string);
    /**
     * Set up the evaluator; subclass constructors are required to call this.
     */
    protected initializePrompt(): void;
    /**
     * Comparatively evaluate every scenario for which several models
     * produced responses.
     */
    evaluateAllScenarios(): Promise<Array<ComparativeEvaluationScore>>;
    /**
     * Run the final assessment over all scenario results to pick an overall winner.
     */
    conductFinalAssessment(scenarioResults: Array<ComparativeEvaluationScore>): Promise<any>;
    /**
     * Compare all available models on one scenario.
     */
    evaluateScenario(scenario: ComparisonScenario): Promise<ComparativeEvaluationScore>;
    /**
     * Produce the evaluation prompt; subclasses may override for custom behavior.
     */
    protected buildEvaluationPrompt(scenario: ComparisonScenario, modelResponsesText: string, modelList: string): string;
    /**
     * Statistics describing the available datasets.
     */
    getDatasetStats(): {
        totalDatasets: number;
        availableModels: Array<string>;
        scenariosWithMultipleModels: number;
        interactionTypes: Array<string>;
    };
    /**
     * Breakdown of the evaluation phases available. Each subclass implements
     * this to provide its own domain-specific phase descriptions.
     */
    abstract getEvaluationPhases(): Array<{
        phase: string;
        description: string;
        availableModels: Array<string>;
        scenarioCount: number;
    }>;
}
|
|
94
|
+
//# sourceMappingURL=base-comparative.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"base-comparative.d.ts","sourceRoot":"","sources":["../../../src/evaluation/evaluators/base-comparative.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;AAKtE,OAAO,EAAE,eAAe,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAC;AAE7E,MAAM,WAAW,2BAA2B;IAC1C,gBAAgB,EAAE,MAAM,CAAC;IACzB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,oBAAoB,EAAE,MAAM,CAAC,MAAM,EAAE;QACnC,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;QAC1B,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAC3B,mBAAmB,CAAC,EAAE,MAAM,CAAC;QAC7B,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAC3B,cAAc,EAAE,MAAM,CAAC;QACvB,SAAS,EAAE,MAAM,CAAC;QAClB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;IACH,OAAO,EAAE,KAAK,CAAC;QACb,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED,MAAM,WAAW,0BAA2B,SAAQ,eAAe;IACjE,aAAa,EAAE,KAAK,CAAC;QACnB,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;KACf,CAAC,CAAC;IACH,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,8BAAsB,wBAAwB;IAC5C,QAAQ,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IACtC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IACnD,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAE7C,SAAS,CAAC,cAAc,EAAE,cAAc,CAAC;IACzC,SAAS,CAAC,eAAe,EAAE,eAAe,CAAC;IAC3C,SAAS,CAAC,cAAc,EAAE,MAAM,CAAC;gBAErB,UAAU,CAAC,EAAE,MAAM;IAe/B;;OAEG;IACH,SAAS,CAAC,gBAAgB;IAK1B;;;OAGG;IACG,oBAAoB,IAAI,OAAO,CAAC,0BAA0B,EAAE,CAAC;IAkBnE;;OAEG;IACG,sBAAsB,CAAC,eAAe,EAAE,0BAA0B,EAAE,GAAG,OAAO,CAAC,GAAG,CAAC;IA6CzF;;OAEG;IACG,gBAAgB,CAAC,QAAQ,EAAE,kBAAkB,GAAG,OAAO,CAAC,0BAA0B,CAAC;IAsFzF;;OAEG;IACH,SAAS,CAAC,qBAAqB,CAAC,QAAQ,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM;IAQpH;;OAEG;IACH,eAAe;;;;;;IAIf;;;OAGG;IACH,QAAQ,CAAC,mBAAmB,IAAI;QAC9B,KAAK,EAAE,MAAM,CAAC;QA
Cd,WAAW,EAAE,MAAM,CAAC;QACpB,eAAe,EAAE,MAAM,EAAE,CAAC;QAC1B,aAAa,EAAE,MAAM,CAAC;KACvB,EAAE;CACJ"}
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Base Comparative Evaluator
|
|
4
|
+
*
|
|
5
|
+
* Shared functionality for comparing multiple AI models across scenarios
|
|
6
|
+
* Eliminates code duplication between remediation, recommendation, and capability evaluators
|
|
7
|
+
*/
|
|
8
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
9
|
+
exports.BaseComparativeEvaluator = void 0;
|
|
10
|
+
const vercel_provider_1 = require("../../core/providers/vercel-provider");
|
|
11
|
+
const model_config_1 = require("../../core/model-config");
|
|
12
|
+
const platform_utils_1 = require("../../core/platform-utils");
|
|
13
|
+
const fs_1 = require("fs");
|
|
14
|
+
const path_1 = require("path");
|
|
15
|
+
const dataset_analyzer_js_1 = require("../dataset-analyzer.js");
|
|
16
|
+
/**
 * Fill `{placeholder}` tokens in a prompt template with literal values.
 *
 * Substitutions are applied in the given order and each one replaces only the
 * first occurrence of its placeholder. A replacer function is used instead of
 * a replacement string so that `$`-sequences in the inserted text (`$&`, `$'`,
 * `` $` ``, `$$`, ...) are copied verbatim rather than being interpreted as
 * replacement patterns by `String.prototype.replace`.
 *
 * @param {string} template - Template text containing placeholders.
 * @param {Array<[string, *]>} substitutions - Ordered `[placeholder, value]` pairs;
 *   each value is converted with `String(value)`.
 * @returns {string} The template with the substitutions applied.
 */
function fillPromptTemplate(template, substitutions) {
    let filled = template;
    for (const [placeholder, value] of substitutions) {
        filled = filled.replace(placeholder, () => String(value));
    }
    return filled;
}
/**
 * Base class for evaluators that compare several models' responses to the
 * same scenario by asking an evaluator model to rank them.
 *
 * The methods read `this.name`, `this.toolName` and `this.promptFileName`,
 * none of which are assigned in this class; they are expected to be supplied
 * by the concrete subclass.
 */
class BaseComparativeEvaluator {
    /** Provider used to send evaluation prompts (see constructor). */
    evaluatorModel;
    /** Source of grouped scenarios and dataset statistics. */
    datasetAnalyzer;
    /** Prompt template text; empty until `initializePrompt()` is called. */
    promptTemplate;
    /**
     * @param {string} [datasetDir] - Dataset directory; falls back to
     *   './eval/datasets' when omitted or empty.
     */
    constructor(datasetDir) {
        // Use Claude via VercelProvider as the evaluator (most reliable for complex comparative evaluation)
        this.evaluatorModel = new vercel_provider_1.VercelProvider({
            provider: 'anthropic',
            apiKey: process.env.ANTHROPIC_API_KEY,
            model: (0, model_config_1.getCurrentModel)('anthropic'),
            debugMode: process.env.DEBUG_DOT_AI === 'true'
        });
        this.datasetAnalyzer = new dataset_analyzer_js_1.DatasetAnalyzer(datasetDir || './eval/datasets');
        // Prompt template will be loaded by subclass
        this.promptTemplate = '';
    }
    /**
     * Initialize the evaluator - must be called by subclass constructor.
     *
     * Synchronously reads `<cwd>/src/evaluation/prompts/<promptFileName>` into
     * `this.promptTemplate`; a missing file makes `readFileSync` throw.
     */
    initializePrompt() {
        const promptPath = (0, path_1.join)(process.cwd(), 'src', 'evaluation', 'prompts', this.promptFileName);
        this.promptTemplate = (0, fs_1.readFileSync)(promptPath, 'utf8');
    }
    /**
     * Evaluate all available models for scenarios.
     *
     * Scenarios come from `datasetAnalyzer.groupByScenario(this.toolName)` and
     * are evaluated one at a time. A scenario whose evaluation throws is
     * logged and skipped, so the returned array may be shorter than the
     * scenario list.
     *
     * @returns {Promise<Array<object>>} One result per successfully evaluated scenario.
     */
    async evaluateAllScenarios() {
        const scenarios = this.datasetAnalyzer.groupByScenario(this.toolName);
        const results = [];
        console.log(`Found ${scenarios.length} scenarios with multiple models for comparative evaluation`);
        for (const scenario of scenarios) {
            try {
                const result = await this.evaluateScenario(scenario);
                results.push(result);
            }
            catch (error) {
                console.error(`Failed to evaluate scenario ${scenario.interaction_id}:`, error);
            }
        }
        return results;
    }
    /**
     * Conduct final assessment across all scenarios to determine overall winner.
     *
     * @param {Array<object>} scenarioResults - Results produced by `evaluateScenario`.
     * @returns {Promise<*>} The JSON extracted from the evaluator model's reply.
     * @throws {Error} When `scenarioResults` is empty; prompt-loading, model and
     *   parsing errors are logged and rethrown.
     */
    async conductFinalAssessment(scenarioResults) {
        if (scenarioResults.length === 0) {
            throw new Error('No scenario results provided for final assessment');
        }
        // Load the overall winner assessment prompt
        const promptPath = (0, path_1.join)(process.cwd(), 'src', 'evaluation', 'prompts', 'overall-winner-assessment.md');
        const overallWinnerTemplate = (0, fs_1.readFileSync)(promptPath, 'utf8');
        // Get all models that should have been tested (from first scenario)
        const allModels = scenarioResults[0]?.modelRankings?.map(r => r.model) || [];
        // Build the final assessment prompt with raw data. Values are inserted
        // literally: the serialized results contain free-form AI comments that
        // may include `$` sequences.
        const finalPrompt = fillPromptTemplate(overallWinnerTemplate, [
            ['{tool_type}', this.toolName],
            ['{total_scenarios}', scenarioResults.length.toString()],
            ['{expected_models}', JSON.stringify(allModels)],
            ['{scenario_results}', JSON.stringify(scenarioResults, null, 2)]
        ]);
        try {
            console.log(`\n🔍 Conducting final assessment across ${scenarioResults.length} scenarios for ${this.toolName}\n`);
            const response = await this.evaluatorModel.sendMessage(finalPrompt, `${this.name}-final-assessment`, {
                user_intent: `Final cross-scenario assessment for ${this.toolName}`,
                interaction_id: 'final-assessment'
            });
            // Extract JSON from AI response
            const finalAssessment = (0, platform_utils_1.extractJsonFromAIResponse)(response.content);
            console.log(`✅ Final Assessment Complete for ${this.toolName}`);
            console.log(`🏆 Overall Winner: ${finalAssessment.overall_assessment?.winner || 'Unknown'}`);
            return finalAssessment;
        }
        catch (error) {
            console.error(`Final assessment failed for ${this.toolName}:`, error);
            throw error;
        }
    }
    /**
     * Evaluate a single scenario comparing all available models.
     *
     * Builds a markdown section per model (metrics, reliability status and
     * response), sends the resulting prompt to the evaluator model and maps
     * the reply's `ranking` array to a score object. The first ranking entry
     * is treated as the best model.
     *
     * @param {object} scenario - Reads `interaction_id`, `issue` and `models`
     *   (each with `model`, `response`, `metadata` and `performance`).
     * @returns {Promise<object>} `{ key, score, comment, confidence, modelRankings,
     *   bestModel, modelCount }`. If the model call or parsing fails, a
     *   zero-score result with `bestModel: 'unknown'` is returned instead of throwing.
     */
    async evaluateScenario(scenario) {
        // Build model responses section for the prompt
        const modelResponsesText = scenario.models.map((modelResponse, index) => {
            // Build failure analysis context
            let reliabilityContext = '✅ Completed successfully';
            if (modelResponse.metadata.failure_analysis) {
                const failure = modelResponse.metadata.failure_analysis;
                reliabilityContext = `⚠️ **${failure.failure_type.toUpperCase()} FAILURE**: ${failure.failure_reason}`;
                if (failure.failure_type === 'timeout') {
                    // time_to_failure is divided by 1000 for seconds and 60000 for minutes
                    reliabilityContext += `\n- **Time to failure**: ${Math.round(failure.time_to_failure / 1000)}s (${Math.round(failure.time_to_failure / 60000)}min)`;
                    reliabilityContext += `\n- **Impact**: Model could not complete full workflow within time limit`;
                }
            }
            return `### Model ${index + 1}: ${modelResponse.model}

**Performance Metrics:**
- Duration: ${modelResponse.performance.duration_ms}ms
- Input Tokens: ${modelResponse.performance.input_tokens}
- Output Tokens: ${modelResponse.performance.output_tokens}
- Total Tokens: ${modelResponse.performance.total_tokens}
- Iterations: ${modelResponse.performance.iterations || 'N/A'}
- Tool Calls: ${modelResponse.performance.tool_calls_executed || 'N/A'}
- Cache Read: ${modelResponse.performance.cache_read_tokens || 0} tokens
- Cache Creation: ${modelResponse.performance.cache_creation_tokens || 0} tokens

**Reliability Status:**
${reliabilityContext}

**Response:**
${modelResponse.response}

---`;
        }).join('\n\n');
        // Joined with `", "` so the template can wrap the list in its own outer quotes
        const modelList = scenario.models.map(m => m.model).join('", "');
        // Generate the comparative evaluation prompt
        const evaluationPrompt = this.buildEvaluationPrompt(scenario, modelResponsesText, modelList);
        try {
            const response = await this.evaluatorModel.sendMessage(evaluationPrompt, `${this.name}-${scenario.interaction_id}`, {
                user_intent: `Comparative ${this.name} evaluation for ${scenario.interaction_id}`,
                interaction_id: scenario.interaction_id
            });
            // Extract JSON from AI response with robust parsing
            const evaluation = (0, platform_utils_1.extractJsonFromAIResponse)(response.content);
            // Convert to standard EvaluationScore format
            const rankings = evaluation.ranking || [];
            const bestModel = rankings.length > 0 ? rankings[0].model : scenario.models[0].model;
            const bestScore = rankings.length > 0 ? rankings[0].score : 0;
            return {
                key: `${this.name}_${scenario.interaction_id}`,
                score: bestScore,
                comment: evaluation.overall_insights || 'Comparative evaluation completed',
                confidence: 0.9, // High confidence for comparative evaluation
                modelRankings: rankings.map(r => ({
                    rank: r.rank,
                    model: r.model,
                    score: r.score
                })),
                bestModel,
                modelCount: scenario.models.length
            };
        }
        catch (error) {
            console.error(`Comparative evaluation failed for ${scenario.interaction_id}:`, error);
            return {
                key: `${this.name}_${scenario.interaction_id}`,
                score: 0,
                comment: `Evaluation error: ${error}`,
                confidence: 0,
                modelRankings: [],
                bestModel: 'unknown',
                modelCount: scenario.models.length
            };
        }
    }
    /**
     * Build the evaluation prompt - can be overridden by subclasses for custom behavior.
     *
     * Replaces the first `{issue}`, `{model_responses}`, `{model_list}` and
     * `{phase}` placeholder in `this.promptTemplate`, in that order. Values
     * are inserted literally, so `$` sequences in model output are preserved.
     *
     * @param {object} scenario - Supplies `issue` and `interaction_id` (used for `{phase}`).
     * @param {string} modelResponsesText - Pre-rendered per-model sections.
     * @param {string} modelList - Model names pre-joined by the caller.
     * @returns {string} The filled-in prompt.
     */
    buildEvaluationPrompt(scenario, modelResponsesText, modelList) {
        return fillPromptTemplate(this.promptTemplate, [
            ['{issue}', scenario.issue],
            ['{model_responses}', modelResponsesText],
            ['{model_list}', modelList],
            ['{phase}', scenario.interaction_id]
        ]);
    }
    /**
     * Get statistics about available datasets.
     *
     * @returns {*} Whatever `datasetAnalyzer.getDatasetStats(this.toolName)` returns.
     */
    getDatasetStats() {
        return this.datasetAnalyzer.getDatasetStats(this.toolName);
    }
}
|
|
187
|
+
exports.BaseComparativeEvaluator = BaseComparativeEvaluator;
|