@vfarcic/dot-ai 0.111.0 → 0.113.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. package/dist/core/ai-provider-factory.d.ts +0 -10
  2. package/dist/core/ai-provider-factory.d.ts.map +1 -1
  3. package/dist/core/ai-provider-factory.js +14 -24
  4. package/dist/core/ai-provider.interface.d.ts +28 -1
  5. package/dist/core/ai-provider.interface.d.ts.map +1 -1
  6. package/dist/core/capabilities.d.ts +1 -1
  7. package/dist/core/capabilities.d.ts.map +1 -1
  8. package/dist/core/capabilities.js +7 -4
  9. package/dist/core/capability-scan-workflow.js +2 -2
  10. package/dist/core/embedding-service.d.ts +35 -2
  11. package/dist/core/embedding-service.d.ts.map +1 -1
  12. package/dist/core/embedding-service.js +228 -15
  13. package/dist/core/model-config.d.ts +23 -0
  14. package/dist/core/model-config.d.ts.map +1 -0
  15. package/dist/core/model-config.js +28 -0
  16. package/dist/core/platform-operations.d.ts.map +1 -1
  17. package/dist/core/platform-operations.js +3 -5
  18. package/dist/core/platform-utils.d.ts +13 -2
  19. package/dist/core/platform-utils.d.ts.map +1 -1
  20. package/dist/core/platform-utils.js +91 -9
  21. package/dist/core/providers/anthropic-provider.d.ts +6 -1
  22. package/dist/core/providers/anthropic-provider.d.ts.map +1 -1
  23. package/dist/core/providers/anthropic-provider.js +99 -27
  24. package/dist/core/providers/provider-debug-utils.d.ts +53 -20
  25. package/dist/core/providers/provider-debug-utils.d.ts.map +1 -1
  26. package/dist/core/providers/provider-debug-utils.js +106 -51
  27. package/dist/core/providers/vercel-provider.d.ts +6 -1
  28. package/dist/core/providers/vercel-provider.d.ts.map +1 -1
  29. package/dist/core/providers/vercel-provider.js +212 -130
  30. package/dist/core/schema.d.ts +1 -101
  31. package/dist/core/schema.d.ts.map +1 -1
  32. package/dist/core/schema.js +20 -154
  33. package/dist/core/unified-creation-session.d.ts.map +1 -1
  34. package/dist/core/unified-creation-session.js +15 -7
  35. package/dist/evaluation/dataset-analyzer.d.ts +118 -0
  36. package/dist/evaluation/dataset-analyzer.d.ts.map +1 -0
  37. package/dist/evaluation/dataset-analyzer.js +234 -0
  38. package/dist/evaluation/datasets/loader.d.ts +42 -0
  39. package/dist/evaluation/datasets/loader.d.ts.map +1 -0
  40. package/dist/evaluation/datasets/loader.js +104 -0
  41. package/dist/evaluation/eval-runner.d.ts +9 -0
  42. package/dist/evaluation/eval-runner.d.ts.map +1 -0
  43. package/dist/evaluation/eval-runner.js +399 -0
  44. package/dist/evaluation/evaluators/base-comparative.d.ts +94 -0
  45. package/dist/evaluation/evaluators/base-comparative.d.ts.map +1 -0
  46. package/dist/evaluation/evaluators/base-comparative.js +187 -0
  47. package/dist/evaluation/evaluators/base.d.ts +47 -0
  48. package/dist/evaluation/evaluators/base.d.ts.map +1 -0
  49. package/dist/evaluation/evaluators/base.js +10 -0
  50. package/dist/evaluation/evaluators/capability-comparative.d.ts +32 -0
  51. package/dist/evaluation/evaluators/capability-comparative.d.ts.map +1 -0
  52. package/dist/evaluation/evaluators/capability-comparative.js +104 -0
  53. package/dist/evaluation/evaluators/pattern-comparative.d.ts +31 -0
  54. package/dist/evaluation/evaluators/pattern-comparative.d.ts.map +1 -0
  55. package/dist/evaluation/evaluators/pattern-comparative.js +97 -0
  56. package/dist/evaluation/evaluators/policy-comparative.d.ts +31 -0
  57. package/dist/evaluation/evaluators/policy-comparative.d.ts.map +1 -0
  58. package/dist/evaluation/evaluators/policy-comparative.js +97 -0
  59. package/dist/evaluation/evaluators/recommendation-comparative.d.ts +25 -0
  60. package/dist/evaluation/evaluators/recommendation-comparative.d.ts.map +1 -0
  61. package/dist/evaluation/evaluators/recommendation-comparative.js +55 -0
  62. package/dist/evaluation/evaluators/remediation-comparative.d.ts +25 -0
  63. package/dist/evaluation/evaluators/remediation-comparative.d.ts.map +1 -0
  64. package/dist/evaluation/evaluators/remediation-comparative.js +54 -0
  65. package/dist/evaluation/platform-synthesizer.d.ts +54 -0
  66. package/dist/evaluation/platform-synthesizer.d.ts.map +1 -0
  67. package/dist/evaluation/platform-synthesizer.js +368 -0
  68. package/dist/evaluation/run-platform-synthesis.d.ts +9 -0
  69. package/dist/evaluation/run-platform-synthesis.d.ts.map +1 -0
  70. package/dist/evaluation/run-platform-synthesis.js +45 -0
  71. package/dist/interfaces/mcp.d.ts.map +1 -1
  72. package/dist/interfaces/mcp.js +23 -29
  73. package/dist/interfaces/rest-api.d.ts.map +1 -1
  74. package/dist/tools/answer-question.d.ts +2 -0
  75. package/dist/tools/answer-question.d.ts.map +1 -1
  76. package/dist/tools/answer-question.js +18 -11
  77. package/dist/tools/generate-manifests.d.ts +2 -0
  78. package/dist/tools/generate-manifests.d.ts.map +1 -1
  79. package/dist/tools/generate-manifests.js +11 -12
  80. package/dist/tools/organizational-data.d.ts +1 -0
  81. package/dist/tools/organizational-data.d.ts.map +1 -1
  82. package/dist/tools/organizational-data.js +2 -1
  83. package/dist/tools/recommend.d.ts +1 -0
  84. package/dist/tools/recommend.d.ts.map +1 -1
  85. package/dist/tools/recommend.js +13 -21
  86. package/dist/tools/remediate.d.ts +3 -0
  87. package/dist/tools/remediate.d.ts.map +1 -1
  88. package/dist/tools/remediate.js +35 -14
  89. package/dist/tools/test-docs.d.ts +1 -0
  90. package/dist/tools/test-docs.d.ts.map +1 -1
  91. package/dist/tools/test-docs.js +4 -2
  92. package/dist/tools/version.d.ts +5 -1
  93. package/dist/tools/version.d.ts.map +1 -1
  94. package/dist/tools/version.js +23 -8
  95. package/package.json +19 -1
@@ -0,0 +1,399 @@
1
+ #!/usr/bin/env npx tsx
2
+ "use strict";
3
+ /**
4
+ * Evaluation Runner for Multi-Model Comparative Analysis
5
+ *
6
+ * Runs comparative evaluation on available datasets from multiple models
7
+ * Automatically detects and evaluates remediation, recommendation, capability, pattern, and policy datasets
8
+ */
9
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ var desc = Object.getOwnPropertyDescriptor(m, k);
12
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
13
+ desc = { enumerable: true, get: function() { return m[k]; } };
14
+ }
15
+ Object.defineProperty(o, k2, desc);
16
+ }) : (function(o, m, k, k2) {
17
+ if (k2 === undefined) k2 = k;
18
+ o[k2] = m[k];
19
+ }));
20
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
21
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
22
+ }) : function(o, v) {
23
+ o["default"] = v;
24
+ });
25
+ var __importStar = (this && this.__importStar) || (function () {
26
+ var ownKeys = function(o) {
27
+ ownKeys = Object.getOwnPropertyNames || function (o) {
28
+ var ar = [];
29
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
30
+ return ar;
31
+ };
32
+ return ownKeys(o);
33
+ };
34
+ return function (mod) {
35
+ if (mod && mod.__esModule) return mod;
36
+ var result = {};
37
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
38
+ __setModuleDefault(result, mod);
39
+ return result;
40
+ };
41
+ })();
42
+ Object.defineProperty(exports, "__esModule", { value: true });
43
+ const remediation_comparative_js_1 = require("./evaluators/remediation-comparative.js");
44
+ const recommendation_comparative_js_1 = require("./evaluators/recommendation-comparative.js");
45
+ const capability_comparative_js_1 = require("./evaluators/capability-comparative.js");
46
+ const pattern_comparative_js_1 = require("./evaluators/pattern-comparative.js");
47
+ const policy_comparative_js_1 = require("./evaluators/policy-comparative.js");
48
+ const promises_1 = require("fs/promises");
49
+ const child_process_1 = require("child_process");
50
+ const util_1 = require("util");
51
+ const execAsync = (0, util_1.promisify)(child_process_1.exec);
52
/**
 * Registry of comparative evaluators, keyed by evaluation type.
 *
 * Each entry carries the evaluator class to instantiate, the filename prefix
 * that identifies datasets of that type, and the title used for its reports.
 * Entry order is significant: it is the order in which evaluation types are
 * detected, listed, and run.
 */
const EVALUATOR_CONFIG = Object.fromEntries([
    ['remediation', remediation_comparative_js_1.RemediationComparativeEvaluator, 'remediate_', 'Remediation AI Model Comparison Report'],
    ['recommendation', recommendation_comparative_js_1.RecommendationComparativeEvaluator, 'recommend_', 'Recommendation AI Model Comparison Report'],
    ['capability', capability_comparative_js_1.CapabilityComparativeEvaluator, 'capability_', 'Capability AI Model Comparison Report'],
    ['pattern', pattern_comparative_js_1.PatternComparativeEvaluator, 'pattern_', 'Pattern AI Model Comparison Report'],
    ['policy', policy_comparative_js_1.PolicyComparativeEvaluator, 'policy_', 'Policy AI Model Comparison Report'],
].map(([type, evaluator, prefix, title]) => [type, { evaluator, prefix, title }]));
79
/**
 * Render the comparative evaluation results as a Markdown report.
 *
 * @param {Array<object>} results - Per-scenario results; each is read for
 *   `key`, `bestModel`, `score`, `modelCount`, `confidence`, `comment` and
 *   an optional `modelRankings` array of `{ rank, model, score }`.
 * @param {object} stats - Dataset statistics; `availableModels` and
 *   `totalDatasets` are read.
 * @param {string} evaluationType - A key of `EVALUATOR_CONFIG`; selects the title.
 * @param {object} [finalAssessment] - Optional cross-scenario assessment; its
 *   `overall_assessment` member drives the summary sections. When absent,
 *   each of those sections falls back to a "not available" message.
 * @returns {string} The complete Markdown document.
 */
function generateMarkdownReport(results, stats, evaluationType, finalAssessment) {
    const timestamp = new Date().toISOString();
    // Use final assessment if provided
    const overallAssessment = finalAssessment?.overall_assessment || null;
    // The assessment object is produced outside this function; tolerate missing
    // members so one absent field does not abort the whole report.
    const recommendations = overallAssessment?.production_recommendations ?? {};
    const avoidedModels = recommendations.avoid ?? [];
    const specializedUse = recommendations.specialized_use ?? {};
    const reliabilityRanking = overallAssessment?.reliability_ranking ?? [];
    // Collect every score each model received across scenarios.
    const modelScores = new Map();
    for (const result of results) {
        for (const ranking of result.modelRankings || []) {
            if (!modelScores.has(ranking.model)) {
                modelScores.set(ranking.model, []);
            }
            modelScores.get(ranking.model).push(ranking.score);
        }
    }
    // Average scores (rounded to 3 decimals) for the supplementary table.
    const modelAverages = new Map();
    modelScores.forEach((scores, model) => {
        const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
        modelAverages.set(model, Math.round(avg * 1000) / 1000);
    });
    const statisticsRows = Array.from(modelAverages.entries())
        .sort((a, b) => b[1] - a[1])
        .map(([model, avgScore]) => `| ${model} | ${avgScore} | See AI assessment above |`)
        .join('\n');
    // Strip the "<type> comparative " prefix for every registered evaluator type,
    // not only remediation/recommendation, so all report kinds get clean titles.
    const typePrefixPattern = new RegExp(`(${Object.keys(EVALUATOR_CONFIG).join('|')}) comparative `);
    const scenarioSections = results.map((result, index) => {
        const scenarioTitle = result.key.replace(/_/g, ' ').replace(typePrefixPattern, '').toUpperCase();
        return `### ${index + 1}. ${scenarioTitle}

**Winner**: ${result.bestModel} (Score: ${result.score})
**Models Compared**: ${result.modelCount}
**Confidence**: ${result.confidence ? Math.round(result.confidence * 100) : 0}%

#### Rankings
${result.modelRankings ? result.modelRankings.map((rank) => `${rank.rank}. **${rank.model}** - ${rank.score}`).join('\n') : 'No detailed rankings available'}

#### Analysis
${result.comment}

---`;
    }).join('\n\n');
    const reportTitle = EVALUATOR_CONFIG[evaluationType].title;
    return `# ${reportTitle}

**Generated**: ${timestamp}
**Scenarios Analyzed**: ${results.length}
**Models Evaluated**: ${stats.availableModels.length}
**Total Datasets**: ${stats.totalDatasets}

## Executive Summary

### šŸ† Overall Winner (AI Assessment)
${overallAssessment ? `
**${overallAssessment.winner}**

${overallAssessment.rationale}
` : 'Overall assessment not available'}

### šŸ“Š AI Reliability Rankings

${overallAssessment ? reliabilityRanking
        .map((ranking, index) => `${index + 1}. **${ranking.model}** (${Math.round(ranking.reliability_score * 100)}%) - ${ranking.reliability_notes}`)
        .join('\n') : 'Reliability rankings not available'}

### šŸ“‹ Production Recommendations

${overallAssessment ? `
- **Primary Choice**: ${recommendations.primary}
- **Secondary Option**: ${recommendations.secondary}
- **Avoid for Production**: ${avoidedModels.length > 0 ? avoidedModels.join(', ') : 'None'}
${Object.keys(specializedUse).length > 0 ?
        '\n**Specialized Use Cases:**\n' + Object.entries(specializedUse)
            .map(([useCase, model]) => `- **${useCase}**: ${model}`)
            .join('\n') : ''}
` : 'Production recommendations not available'}

### šŸ“Š Supplementary Statistics (Reference Only)

| Model | Avg Score | Notes |
|-------|-----------|-------|
${statisticsRows}

## Detailed Scenario Results

${scenarioSections}

## AI Model Selection Guide

${overallAssessment ? `
### Key Insights
${overallAssessment.key_insights}

### Recommended Selection Strategy
- **For Production Use**: Choose ${recommendations.primary}
- **For Secondary Option**: Consider ${recommendations.secondary}
${avoidedModels.length > 0 ?
        `- **Avoid**: ${avoidedModels.join(', ')} (reliability concerns)` : ''}

### Decision Framework
The AI assessment prioritizes **reliability and consistency** over peak performance. Models that fail completely in any scenario are heavily penalized, ensuring production-ready recommendations.
` : 'AI model selection guide not available'}

---

## Report Attribution

Report generated by DevOps AI Toolkit Comparative Evaluation System
`;
}
187
/**
 * Load `model-metadata.json` from this module's directory and verify it is
 * fresh enough to be used for cost analysis.
 *
 * The process is terminated with exit code 1 when the file is missing,
 * cannot be read or parsed, has a missing/invalid `lastUpdated` value, or
 * was last updated more than 30 days ago.
 *
 * @returns {object} The parsed metadata object.
 */
function loadModelMetadata() {
    try {
        const fs = require('fs');
        const path = require('path');
        const metadataPath = path.join(__dirname, 'model-metadata.json');
        if (!fs.existsSync(metadataPath)) {
            console.error('āŒ Model metadata file not found');
            console.error('šŸ“Š Pricing and capabilities data required for cost analysis');
            console.error('');
            console.error('šŸ”„ To create model metadata, run:');
            console.error(' /update-model-metadata');
            console.error('');
            process.exit(1);
        }
        const metadata = JSON.parse(fs.readFileSync(metadataPath, 'utf8'));
        const lastUpdatedMs = new Date(metadata.lastUpdated).getTime();
        // A missing or malformed timestamp yields NaN; every comparison with NaN
        // is false, so it must be rejected explicitly or the age check is bypassed.
        if (Number.isNaN(lastUpdatedMs)) {
            console.error(`āŒ Model metadata has a missing or invalid lastUpdated value: ${metadata.lastUpdated}`);
            console.error('');
            console.error('šŸ”„ To update model metadata, run:');
            console.error(' /update-model-metadata');
            console.error('');
            process.exit(1);
        }
        // Check if metadata is older than 30 days
        const metadataAge = Date.now() - lastUpdatedMs;
        const thirtyDays = 30 * 24 * 60 * 60 * 1000;
        if (metadataAge > thirtyDays) {
            console.error(`āŒ Model metadata is over 30 days old (last updated: ${metadata.lastUpdated})`);
            console.error('šŸ“Š Pricing and capabilities data may be outdated, affecting cost analysis accuracy');
            console.error('');
            console.error('šŸ”„ To update model metadata, run:');
            console.error(' /update-model-metadata');
            console.error('');
            process.exit(1);
        }
        console.log(`āœ… Model metadata loaded (updated: ${metadata.lastUpdated})`);
        return metadata;
    }
    catch (error) {
        console.error('āŒ Failed to load model metadata:', error instanceof Error ? error.message : String(error));
        console.error('šŸ”„ To create model metadata, run: /update-model-metadata');
        process.exit(1);
    }
}
223
/**
 * Assemble the machine-readable (JSON-serializable) counterpart of the report.
 *
 * @param {Array<object>} results - Per-scenario results, embedded as-is.
 * @param {object} stats - Dataset statistics, embedded as-is under `summary`.
 * @param {string} evaluationType - A key of `EVALUATOR_CONFIG`.
 * @param {object} modelMetadata - Loaded metadata; its `models` member is embedded.
 * @param {object} [finalAssessment] - Optional assessment; `null` when not given.
 * @returns {object} Report object with `metadata`, `modelMetadata`,
 *   `overallAssessment`, `results` and `summary`.
 */
function generateJsonReport(results, stats, evaluationType, modelMetadata, finalAssessment) {
    const generated = new Date().toISOString();
    const metadata = {
        reportType: 'comparative-evaluation',
        evaluationType,
        generated,
        scenariosAnalyzed: results.length,
        modelsEvaluated: stats.availableModels.length,
        totalDatasets: stats.totalDatasets,
        tool: EVALUATOR_CONFIG[evaluationType].title,
    };
    return {
        metadata,
        modelMetadata: modelMetadata.models,
        // Fall back to null so the key is always present in the serialized output.
        overallAssessment: finalAssessment || null,
        results,
        summary: stats,
    };
}
243
/**
 * Report, per evaluation type, whether the datasets directory contains at
 * least one file whose name starts with that type's dataset prefix.
 *
 * @param {string} datasetsDir - Directory to list.
 * @param {string} [filterType] - When given, every other type is reported as
 *   unavailable without inspecting the directory listing.
 * @returns {Promise<Object<string, boolean>>} Availability keyed by type. If
 *   the directory cannot be read, a warning is printed and every type is `false`.
 */
async function detectAvailableDatasets(datasetsDir, filterType) {
    try {
        const { readdir } = promises_1;
        const fileNames = await readdir(datasetsDir);
        const availability = {};
        Object.entries(EVALUATOR_CONFIG).forEach(([type, config]) => {
            const excludedByFilter = Boolean(filterType) && type !== filterType;
            availability[type] = excludedByFilter
                ? false
                : fileNames.some((name) => name.startsWith(config.prefix));
        });
        return availability;
    }
    catch {
        console.warn('Could not read datasets directory, assuming no datasets available');
        return Object.fromEntries(Object.keys(EVALUATOR_CONFIG).map((type) => [type, false]));
    }
}
267
/**
 * Run one evaluation type end to end: print dataset statistics and phases,
 * evaluate all scenarios, obtain the final assessment, write the Markdown and
 * JSON reports under ./eval/analysis/individual, and print a short summary.
 *
 * @param {string} evaluatorType - A key of `EVALUATOR_CONFIG`.
 * @param {string} datasetsDir - Directory handed to the evaluator constructor.
 * @param {object} modelMetadata - Metadata embedded into the JSON report.
 * @returns {Promise<Array<object>>} The per-scenario results.
 */
async function runEvaluation(evaluatorType, datasetsDir, modelMetadata) {
    const { evaluator: EvaluatorClass } = EVALUATOR_CONFIG[evaluatorType];
    const evaluator = new EvaluatorClass(datasetsDir);
    // Capitalized type name reused by every progress message below.
    const displayName = evaluatorType.charAt(0).toUpperCase() + evaluatorType.slice(1);
    console.log(`\nšŸ”¬ Starting ${displayName} Evaluation\n`);

    // Dataset overview
    console.log('šŸ“Š Dataset Analysis:');
    const stats = evaluator.getDatasetStats();
    console.log(`- Total datasets: ${stats.totalDatasets}`);
    console.log(`- Available models: ${stats.availableModels.join(', ')}`);
    console.log(`- Scenarios with multiple models: ${stats.scenariosWithMultipleModels}`);
    console.log(`- Interaction types: ${stats.interactionTypes.join(', ')}`);
    console.log();

    // Phase overview
    console.log('šŸŽÆ Evaluation Phases:');
    for (const phase of evaluator.getEvaluationPhases()) {
        console.log(`- ${phase.phase}: ${phase.description}`);
        console.log(` Models: ${phase.availableModels.join(', ')}`);
        console.log(` Scenarios: ${phase.scenarioCount}`);
        console.log();
    }

    // Per-scenario comparison, then the cross-scenario assessment
    console.log('šŸš€ Running Comparative Evaluation...\n');
    const results = await evaluator.evaluateAllScenarios();
    console.log(`āœ… ${displayName} Evaluation Complete! Analyzed ${results.length} scenarios\n`);
    const finalAssessment = await evaluator.conductFinalAssessment(results);

    // Both report formats are built from the same inputs.
    const reportContent = generateMarkdownReport(results, stats, evaluatorType, finalAssessment);
    const jsonResults = generateJsonReport(results, stats, evaluatorType, modelMetadata, finalAssessment);

    // Report files are stamped with the current date (YYYY-MM-DD).
    const reportDir = './eval/analysis/individual';
    const [dateStamp] = new Date().toISOString().split('T');
    const markdownPath = `${reportDir}/${evaluatorType}-evaluation-${dateStamp}.md`;
    const jsonPath = `${reportDir}/${evaluatorType}-results-${dateStamp}.json`;

    const fs = await Promise.resolve().then(() => __importStar(require('fs')));
    if (!fs.existsSync(reportDir)) {
        fs.mkdirSync(reportDir, { recursive: true });
    }
    fs.writeFileSync(markdownPath, reportContent);
    fs.writeFileSync(jsonPath, JSON.stringify(jsonResults, null, 2));
    console.log(`šŸ“Š ${displayName} reports generated:`);
    console.log(` šŸ“ Markdown: ${markdownPath}`);
    console.log(` šŸ“„ JSON: ${jsonPath}`);

    // Brief console summary
    console.log(`šŸ† ${displayName} Results:`);
    for (const [index, result] of results.entries()) {
        console.log(` ${index + 1}. ${result.key}: ${result.bestModel} (${result.score})`);
    }
    return results;
}
319
/**
 * CLI entry point for the comparative evaluation run.
 *
 * Cleans old debug files, verifies model metadata freshness, detects which
 * dataset types are present in ./eval/datasets, and runs every available
 * evaluation type. An optional first command-line argument restricts the run
 * to a single evaluation type.
 *
 * Exits the process with code 1 on an unknown evaluation type, when no
 * datasets are found, or when an evaluation throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
    console.log('šŸ”¬ Starting Multi-Model Comparative Evaluation\n');
    // Clean old debug files but preserve evaluation datasets
    console.log('🧹 Cleaning old debug files...');
    try {
        await execAsync('find ./tmp/debug-ai -type f ! -name \'*.jsonl\' -delete 2>/dev/null || true');
        await execAsync('mkdir -p ./tmp/debug-ai');
        console.log('āœ… Debug files cleaned (datasets preserved)\n');
    }
    catch (error) {
        // Cleanup is best-effort; a failure here must not stop the evaluation.
        console.warn('āš ļø Could not clean debug files:', error instanceof Error ? error.message : String(error));
    }
    // Check model metadata freshness before starting any evaluation work
    const modelMetadata = loadModelMetadata();
    const datasetsDir = './eval/datasets';
    // Parse command line arguments for subset evaluation
    const args = process.argv.slice(2);
    let filterType = undefined;
    if (args.length > 0) {
        const requestedType = args[0];
        // Own-property check: `in` would also accept inherited names such as
        // "constructor" or "toString" as if they were evaluation types.
        if (Object.prototype.hasOwnProperty.call(EVALUATOR_CONFIG, requestedType)) {
            filterType = requestedType;
        }
        else {
            console.error(`āŒ Invalid evaluation type: "${requestedType}"`);
            console.error(`āœ… Available types: ${Object.keys(EVALUATOR_CONFIG).join(', ')}`);
            process.exit(1);
        }
    }
    const availableDatasets = await detectAvailableDatasets(datasetsDir, filterType);
    console.log('šŸ” Dataset Detection:');
    for (const [type, available] of Object.entries(availableDatasets)) {
        console.log(`- ${type.charAt(0).toUpperCase() + type.slice(1)} datasets: ${available ? 'āœ…' : 'āŒ'}`);
    }
    if (filterType) {
        console.log(`\nšŸŽÆ Running evaluation for: ${filterType}`);
    }
    const hasAnyDatasets = Object.values(availableDatasets).some(Boolean);
    if (!hasAnyDatasets) {
        if (filterType) {
            console.error(`āŒ No datasets found for type: ${filterType}`);
        }
        else {
            console.error('āŒ No evaluation datasets found. Please run integration tests first to generate datasets.');
        }
        process.exit(1);
    }
    try {
        const allResults = [];
        // With a filter, detection marks every other type unavailable, so this
        // single loop covers both the filtered and the run-everything case.
        for (const [type, available] of Object.entries(availableDatasets)) {
            if (available) {
                const results = await runEvaluation(type, datasetsDir, modelMetadata);
                allResults.push(...results);
            }
        }
        console.log(`\nšŸŽ‰ All Evaluations Complete! Total scenarios analyzed: ${allResults.length}`);
        // Point at the directory runEvaluation actually writes its reports to.
        console.log(`šŸ“ Check ./eval/analysis/individual/ for detailed analysis reports\n`);
    }
    catch (error) {
        console.error('āŒ Evaluation failed:', error);
        process.exit(1);
    }
}
396
// Run if this file is executed directly
if (require.main === module) {
    main().catch((error) => {
        console.error(error);
        // Report the failure to the calling shell/CI instead of exiting with 0.
        process.exitCode = 1;
    });
}
@@ -0,0 +1,94 @@
1
+ /**
2
+ * Base Comparative Evaluator
3
+ *
4
+ * Shared functionality for comparing multiple AI models across scenarios
5
+ * Eliminates code duplication between remediation, recommendation, and capability evaluators
6
+ */
7
+ import { EvaluationScore } from './base.js';
8
+ import { VercelProvider } from '../../core/providers/vercel-provider';
9
+ import { DatasetAnalyzer, ComparisonScenario } from '../dataset-analyzer.js';
10
/**
 * Structured outcome of comparing several models on one scenario.
 * NOTE(review): presumably the parsed response of the evaluator model —
 * confirm against base-comparative.ts.
 */
export interface ComparativeEvaluationResult {
    scenario_summary: string;
    /** Names of the models included in this comparison. */
    models_compared: string[];
    /**
     * Per-model analysis, keyed by a string (presumably the model name — confirm).
     * Every individual criterion score is optional; only `weighted_total`,
     * `strengths` and `weaknesses` are required.
     */
    comparative_analysis: Record<string, {
        quality_score?: number;
        efficiency_score?: number;
        performance_score?: number;
        communication_score?: number;
        accuracy_score?: number;
        completeness_score?: number;
        clarity_score?: number;
        consistency_score?: number;
        weighted_total: number;
        strengths: string;
        weaknesses: string;
    }>;
    /**
     * Ranked list of models. `rationale` and `reasoning` are both optional.
     * NOTE(review): they look like alternate spellings of the same
     * explanation field — confirm which one producers actually emit.
     */
    ranking: Array<{
        rank: number;
        model: string;
        score: number;
        rationale?: string;
        reasoning?: string;
    }>;
    overall_insights?: string;
}
35
/**
 * An `EvaluationScore` extended with the multi-model comparison details
 * for one scenario.
 */
export interface ComparativeEvaluationScore extends EvaluationScore {
    /** Rank, model name and score for each compared model. */
    modelRankings: Array<{
        rank: number;
        model: string;
        score: number;
    }>;
    /** Name of the winning model for the scenario. */
    bestModel: string;
    /** Number of models that were compared. */
    modelCount: number;
}
44
/**
 * Abstract base for evaluators that compare several AI models on the same
 * scenarios. Subclasses supply the identifying metadata, the prompt file and
 * the tool name, and describe their evaluation phases.
 */
export declare abstract class BaseComparativeEvaluator {
    abstract readonly name: string;
    abstract readonly description: string;
    /** File name of the prompt template, resolved under src/evaluation/prompts. */
    protected abstract readonly promptFileName: string;
    /** Tool name used to group datasets into scenarios. */
    protected abstract readonly toolName: string;
    /** Provider used as the judge model (constructed for the 'anthropic' provider). */
    protected evaluatorModel: VercelProvider;
    /** Analyzer over the dataset directory (defaults to './eval/datasets'). */
    protected datasetAnalyzer: DatasetAnalyzer;
    /** Prompt template text; empty until `initializePrompt()` has run. */
    protected promptTemplate: string;
    /**
     * @param datasetDir Optional dataset directory; './eval/datasets' is used when omitted.
     */
    constructor(datasetDir?: string);
    /**
     * Initialize the evaluator - must be called by subclass constructor
     */
    protected initializePrompt(): void;
    /**
     * Evaluate all available models for scenarios
     * This method finds all scenarios with multiple model responses and evaluates them comparatively
     */
    evaluateAllScenarios(): Promise<ComparativeEvaluationScore[]>;
    /**
     * Conduct final assessment across all scenarios to determine overall winner
     */
    conductFinalAssessment(scenarioResults: ComparativeEvaluationScore[]): Promise<any>;
    /**
     * Evaluate a single scenario comparing all available models
     */
    evaluateScenario(scenario: ComparisonScenario): Promise<ComparativeEvaluationScore>;
    /**
     * Build the evaluation prompt - can be overridden by subclasses for custom behavior
     */
    protected buildEvaluationPrompt(scenario: ComparisonScenario, modelResponsesText: string, modelList: string): string;
    /**
     * Get statistics about available datasets
     */
    getDatasetStats(): {
        totalDatasets: number;
        availableModels: string[];
        scenariosWithMultipleModels: number;
        interactionTypes: string[];
    };
    /**
     * Get detailed breakdown of evaluation phases available
     * Must be implemented by subclasses to provide domain-specific phase descriptions
     */
    abstract getEvaluationPhases(): {
        phase: string;
        description: string;
        availableModels: string[];
        scenarioCount: number;
    }[];
}
94
+ //# sourceMappingURL=base-comparative.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"base-comparative.d.ts","sourceRoot":"","sources":["../../../src/evaluation/evaluators/base-comparative.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;AAKtE,OAAO,EAAE,eAAe,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAC;AAE7E,MAAM,WAAW,2BAA2B;IAC1C,gBAAgB,EAAE,MAAM,CAAC;IACzB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,oBAAoB,EAAE,MAAM,CAAC,MAAM,EAAE;QACnC,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;QAC1B,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAC3B,mBAAmB,CAAC,EAAE,MAAM,CAAC;QAC7B,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAC3B,cAAc,EAAE,MAAM,CAAC;QACvB,SAAS,EAAE,MAAM,CAAC;QAClB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;IACH,OAAO,EAAE,KAAK,CAAC;QACb,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED,MAAM,WAAW,0BAA2B,SAAQ,eAAe;IACjE,aAAa,EAAE,KAAK,CAAC;QACnB,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;KACf,CAAC,CAAC;IACH,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,8BAAsB,wBAAwB;IAC5C,QAAQ,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IACtC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IACnD,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAE7C,SAAS,CAAC,cAAc,EAAE,cAAc,CAAC;IACzC,SAAS,CAAC,eAAe,EAAE,eAAe,CAAC;IAC3C,SAAS,CAAC,cAAc,EAAE,MAAM,CAAC;gBAErB,UAAU,CAAC,EAAE,MAAM;IAe/B;;OAEG;IACH,SAAS,CAAC,gBAAgB;IAK1B;;;OAGG;IACG,oBAAoB,IAAI,OAAO,CAAC,0BAA0B,EAAE,CAAC;IAkBnE;;OAEG;IACG,sBAAsB,CAAC,eAAe,EAAE,0BAA0B,EAAE,GAAG,OAAO,CAAC,GAAG,CAAC;IA6CzF;;OAEG;IACG,gBAAgB,CAAC,QAAQ,EAAE,kBAAkB,GAAG,OAAO,CAAC,0BAA0B,CAAC;IAsFzF;;OAEG;IACH,SAAS,CAAC,qBAAqB,CAAC,QAAQ,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM;IAQpH;;OAEG;IACH,eAAe;;;;;;IAIf;;;OAGG;IACH,QAAQ,CAAC,mBAAmB,IAAI;QAC9B,KAAK,EAAE,MAAM,CAAC;
QACd,WAAW,EAAE,MAAM,CAAC;QACpB,eAAe,EAAE,MAAM,EAAE,CAAC;QAC1B,aAAa,EAAE,MAAM,CAAC;KACvB,EAAE;CACJ"}
@@ -0,0 +1,187 @@
1
+ "use strict";
2
+ /**
3
+ * Base Comparative Evaluator
4
+ *
5
+ * Shared functionality for comparing multiple AI models across scenarios
6
+ * Eliminates code duplication between remediation, recommendation, and capability evaluators
7
+ */
8
+ Object.defineProperty(exports, "__esModule", { value: true });
9
+ exports.BaseComparativeEvaluator = void 0;
10
+ const vercel_provider_1 = require("../../core/providers/vercel-provider");
11
+ const model_config_1 = require("../../core/model-config");
12
+ const platform_utils_1 = require("../../core/platform-utils");
13
+ const fs_1 = require("fs");
14
+ const path_1 = require("path");
15
+ const dataset_analyzer_js_1 = require("../dataset-analyzer.js");
16
/**
 * Shared base for evaluators that compare several AI models on the same scenario.
 *
 * The class reads `this.name`, `this.toolName` and `this.promptFileName` but never
 * assigns them; they are expected to be supplied by the concrete subclass.
 */
class BaseComparativeEvaluator {
    /** Provider that receives the evaluation prompts (a VercelProvider configured for Anthropic). */
    evaluatorModel;
    /** DatasetAnalyzer used to group stored model responses into scenarios. */
    datasetAnalyzer;
    /** Text of the comparative prompt; stays empty until initializePrompt() has run. */
    promptTemplate;
    /**
     * @param {string} [datasetDir] - Dataset directory handed to DatasetAnalyzer;
     *   falls back to './eval/datasets' when omitted or empty.
     */
    constructor(datasetDir) {
        // Use Claude via VercelProvider as the evaluator (most reliable for complex comparative evaluation)
        this.evaluatorModel = new vercel_provider_1.VercelProvider({
            provider: 'anthropic',
            apiKey: process.env.ANTHROPIC_API_KEY,
            model: (0, model_config_1.getCurrentModel)('anthropic'),
            debugMode: process.env.DEBUG_DOT_AI === 'true'
        });
        this.datasetAnalyzer = new dataset_analyzer_js_1.DatasetAnalyzer(datasetDir || './eval/datasets');
        // Prompt template will be loaded by subclass
        this.promptTemplate = '';
    }
    /**
     * Initialize the evaluator - must be called by subclass constructor.
     *
     * Synchronously reads `src/evaluation/prompts/<promptFileName>` relative to the
     * current working directory into `this.promptTemplate`; a missing file throws.
     */
    initializePrompt() {
        const promptPath = (0, path_1.join)(process.cwd(), 'src', 'evaluation', 'prompts', this.promptFileName);
        this.promptTemplate = (0, fs_1.readFileSync)(promptPath, 'utf8');
    }
    /**
     * Evaluate every scenario returned by `datasetAnalyzer.groupByScenario(this.toolName)`.
     *
     * Scenarios are processed one after another. A scenario whose evaluation throws
     * is logged and left out of the returned list instead of aborting the run.
     *
     * @returns {Promise<Array<object>>} One result object per scenario that was evaluated.
     */
    async evaluateAllScenarios() {
        const scenarios = this.datasetAnalyzer.groupByScenario(this.toolName);
        const results = [];
        console.log(`Found ${scenarios.length} scenarios with multiple models for comparative evaluation`);
        for (const scenario of scenarios) {
            try {
                const result = await this.evaluateScenario(scenario);
                results.push(result);
            }
            catch (error) {
                console.error(`Failed to evaluate scenario ${scenario.interaction_id}:`, error);
            }
        }
        return results;
    }
    /**
     * Conduct final assessment across all scenarios to determine overall winner.
     *
     * @param {Array<object>} scenarioResults - Results produced by evaluateScenario().
     * @returns {Promise<any>} The JSON object extracted from the evaluator's reply.
     * @throws {Error} When `scenarioResults` is empty; prompt-loading, provider and
     *   parsing errors are logged and rethrown.
     */
    async conductFinalAssessment(scenarioResults) {
        if (scenarioResults.length === 0) {
            throw new Error('No scenario results provided for final assessment');
        }
        // Load the overall winner assessment prompt
        const promptPath = (0, path_1.join)(process.cwd(), 'src', 'evaluation', 'prompts', 'overall-winner-assessment.md');
        const overallWinnerTemplate = (0, fs_1.readFileSync)(promptPath, 'utf8');
        // Get all models that should have been tested (from first scenario)
        const allModels = scenarioResults[0]?.modelRankings?.map(r => r.model) || [];
        // Build the final assessment prompt with raw data. The serialized results can
        // contain arbitrary model-written text, so they are inserted literally.
        const finalPrompt = fillPlaceholders(overallWinnerTemplate, {
            tool_type: this.toolName,
            total_scenarios: scenarioResults.length.toString(),
            expected_models: JSON.stringify(allModels),
            scenario_results: JSON.stringify(scenarioResults, null, 2)
        });
        try {
            console.log(`\n🔍 Conducting final assessment across ${scenarioResults.length} scenarios for ${this.toolName}\n`);
            const response = await this.evaluatorModel.sendMessage(finalPrompt, `${this.name}-final-assessment`, {
                user_intent: `Final cross-scenario assessment for ${this.toolName}`,
                interaction_id: 'final-assessment'
            });
            // Extract JSON from AI response
            const finalAssessment = (0, platform_utils_1.extractJsonFromAIResponse)(response.content);
            console.log(`✅ Final Assessment Complete for ${this.toolName}`);
            console.log(`🏆 Overall Winner: ${finalAssessment.overall_assessment?.winner || 'Unknown'}`);
            return finalAssessment;
        }
        catch (error) {
            console.error(`Final assessment failed for ${this.toolName}:`, error);
            throw error;
        }
    }
    /**
     * Evaluate a single scenario comparing all available models.
     *
     * @param {object} scenario - Scenario with `interaction_id` and a `models` array;
     *   each entry is read for `model`, `response`, `performance` and `metadata`.
     * @returns {Promise<object>} Score object with `key`, `score`, `comment`, `confidence`,
     *   `modelRankings`, `bestModel` and `modelCount`. When the provider call or the
     *   JSON extraction fails, a zero-score result with `bestModel: 'unknown'` is
     *   returned instead of throwing.
     */
    async evaluateScenario(scenario) {
        // Build model responses section for the prompt
        const modelResponsesText = scenario.models
            .map((modelResponse, index) => formatModelResponse(modelResponse, index))
            .join('\n\n');
        // Joined with '", "' so the template can wrap the list in its own outer quotes.
        const modelList = scenario.models.map(m => m.model).join('", "');
        // Generate the comparative evaluation prompt
        const evaluationPrompt = this.buildEvaluationPrompt(scenario, modelResponsesText, modelList);
        try {
            const response = await this.evaluatorModel.sendMessage(evaluationPrompt, `${this.name}-${scenario.interaction_id}`, {
                user_intent: `Comparative ${this.name} evaluation for ${scenario.interaction_id}`,
                interaction_id: scenario.interaction_id
            });
            // Extract JSON from AI response with robust parsing
            const evaluation = (0, platform_utils_1.extractJsonFromAIResponse)(response.content);
            // Convert to standard EvaluationScore format.
            // The first ranking entry is taken as the winner.
            const rankings = evaluation.ranking || [];
            const bestModel = rankings.length > 0 ? rankings[0].model : scenario.models[0].model;
            const bestScore = rankings.length > 0 ? rankings[0].score : 0;
            return {
                key: `${this.name}_${scenario.interaction_id}`,
                score: bestScore,
                comment: evaluation.overall_insights || 'Comparative evaluation completed',
                confidence: 0.9, // High confidence for comparative evaluation
                modelRankings: rankings.map(r => ({
                    rank: r.rank,
                    model: r.model,
                    score: r.score
                })),
                bestModel,
                modelCount: scenario.models.length
            };
        }
        catch (error) {
            console.error(`Comparative evaluation failed for ${scenario.interaction_id}:`, error);
            return {
                key: `${this.name}_${scenario.interaction_id}`,
                score: 0,
                comment: `Evaluation error: ${error}`,
                confidence: 0,
                modelRankings: [],
                bestModel: 'unknown',
                modelCount: scenario.models.length
            };
        }
    }
    /**
     * Build the evaluation prompt - can be overridden by subclasses for custom behavior.
     *
     * Fills `{issue}`, `{model_responses}`, `{model_list}` and `{phase}` in
     * `this.promptTemplate`. Values are inserted literally in a single pass, so `$`
     * sequences or placeholder-like text inside a model response cannot alter the prompt.
     *
     * @param {object} scenario - Supplies `issue` and `interaction_id` (used for `{phase}`).
     * @param {string} modelResponsesText - Pre-rendered per-model sections.
     * @param {string} modelList - Pre-joined list of model names.
     * @returns {string} The completed prompt.
     */
    buildEvaluationPrompt(scenario, modelResponsesText, modelList) {
        return fillPlaceholders(this.promptTemplate, {
            issue: scenario.issue,
            model_responses: modelResponsesText,
            model_list: modelList,
            phase: scenario.interaction_id
        });
    }
    /**
     * Get statistics about available datasets.
     *
     * @returns {object} Whatever `datasetAnalyzer.getDatasetStats(this.toolName)` returns.
     */
    getDatasetStats() {
        return this.datasetAnalyzer.getDatasetStats(this.toolName);
    }
}
/**
 * Replace every `{key}` in `template` whose key is an own property of `values`.
 *
 * A replacer function is used so the inserted text is taken literally (no `$&`, `$$`,
 * `$'` expansion) and the scan is single-pass, so inserted text is never re-examined.
 * Placeholders without a matching key are left untouched.
 */
function fillPlaceholders(template, values) {
    return template.replace(/\{([A-Za-z_]+)\}/g, (placeholder, key) => (
        Object.prototype.hasOwnProperty.call(values, key) ? String(values[key]) : placeholder
    ));
}
/**
 * Render the "Reliability Status" text for one model from its optional failure analysis.
 */
function describeReliability(failure) {
    if (!failure) {
        return '✅ Completed successfully';
    }
    let reliabilityContext = `⚠️ **${failure.failure_type.toUpperCase()} FAILURE**: ${failure.failure_reason}`;
    if (failure.failure_type === 'timeout') {
        // time_to_failure is divided by 1000 for seconds and 60000 for minutes below.
        reliabilityContext += `\n- **Time to failure**: ${Math.round(failure.time_to_failure / 1000)}s (${Math.round(failure.time_to_failure / 60000)}min)`;
        reliabilityContext += `\n- **Impact**: Model could not complete full workflow within time limit`;
    }
    return reliabilityContext;
}
/**
 * Render the markdown section describing one model's metrics, reliability and response.
 */
function formatModelResponse(modelResponse, index) {
    const reliabilityContext = describeReliability(modelResponse.metadata.failure_analysis);
    return `### Model ${index + 1}: ${modelResponse.model}

**Performance Metrics:**
- Duration: ${modelResponse.performance.duration_ms}ms
- Input Tokens: ${modelResponse.performance.input_tokens}
- Output Tokens: ${modelResponse.performance.output_tokens}
- Total Tokens: ${modelResponse.performance.total_tokens}
- Iterations: ${modelResponse.performance.iterations || 'N/A'}
- Tool Calls: ${modelResponse.performance.tool_calls_executed || 'N/A'}
- Cache Read: ${modelResponse.performance.cache_read_tokens || 0} tokens
- Cache Creation: ${modelResponse.performance.cache_creation_tokens || 0} tokens

**Reliability Status:**
${reliabilityContext}

**Response:**
${modelResponse.response}

---`;
}
187
+ exports.BaseComparativeEvaluator = BaseComparativeEvaluator;