@vfarcic/dot-ai 0.115.0 → 0.116.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -295,9 +295,8 @@ async function runEvaluation(evaluatorType, datasetsDir, modelMetadata) {
295
295
  const reportContent = generateMarkdownReport(results, stats, evaluatorType, finalAssessment);
296
296
  const jsonResults = generateJsonReport(results, stats, evaluatorType, modelMetadata, finalAssessment);
297
297
  // Save reports to files
298
- const dateStamp = new Date().toISOString().split('T')[0];
299
- const markdownPath = `./eval/analysis/individual/${evaluatorType}-evaluation-${dateStamp}.md`;
300
- const jsonPath = `./eval/analysis/individual/${evaluatorType}-results-${dateStamp}.json`;
298
+ const markdownPath = `./eval/analysis/individual/${evaluatorType}-evaluation.md`;
299
+ const jsonPath = `./eval/analysis/individual/${evaluatorType}-results.json`;
301
300
  const reportDir = './eval/analysis/individual';
302
301
  // Ensure report directory exists
303
302
  const fs = await Promise.resolve().then(() => __importStar(require('fs')));
@@ -328,6 +327,16 @@ async function main() {
328
327
  catch (error) {
329
328
  console.warn('⚠️ Could not clean debug files:', error instanceof Error ? error.message : String(error));
330
329
  }
330
+ // Clean old evaluation result files from eval/results
331
+ console.log('🧹 Cleaning old evaluation result files...');
332
+ try {
333
+ await execAsync('rm -f ./eval/results/*_comparative_evaluation_*.jsonl 2>/dev/null || true');
334
+ await execAsync('mkdir -p ./eval/results');
335
+ console.log('✅ Old evaluation results cleaned\n');
336
+ }
337
+ catch (error) {
338
+ console.warn('⚠️ Could not clean old evaluation results:', error instanceof Error ? error.message : String(error));
339
+ }
331
340
  // Check model metadata freshness before starting any evaluation work
332
341
  const modelMetadata = loadModelMetadata();
333
342
  const datasetsDir = './eval/datasets';
@@ -7,6 +7,7 @@
7
7
  import { EvaluationScore } from './base.js';
8
8
  import { VercelProvider } from '../../core/providers/vercel-provider';
9
9
  import { DatasetAnalyzer, ComparisonScenario } from '../dataset-analyzer.js';
10
+ import { type EvaluationMetadata } from '../metadata-loader.js';
10
11
  export interface ComparativeEvaluationResult {
11
12
  scenario_summary: string;
12
13
  models_compared: string[];
@@ -49,6 +50,7 @@ export declare abstract class BaseComparativeEvaluator {
49
50
  protected evaluatorModel: VercelProvider;
50
51
  protected datasetAnalyzer: DatasetAnalyzer;
51
52
  protected promptTemplate: string;
53
+ protected metadata: EvaluationMetadata;
52
54
  constructor(datasetDir?: string);
53
55
  /**
54
56
  * Initialize the evaluator - must be called by subclass constructor
@@ -1 +1 @@
1
- {"version":3,"file":"base-comparative.d.ts","sourceRoot":"","sources":["../../../src/evaluation/evaluators/base-comparative.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;AAKtE,OAAO,EAAE,eAAe,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAC;AAE7E,MAAM,WAAW,2BAA2B;IAC1C,gBAAgB,EAAE,MAAM,CAAC;IACzB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,oBAAoB,EAAE,MAAM,CAAC,MAAM,EAAE;QACnC,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;QAC1B,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAC3B,mBAAmB,CAAC,EAAE,MAAM,CAAC;QAC7B,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAC3B,cAAc,EAAE,MAAM,CAAC;QACvB,SAAS,EAAE,MAAM,CAAC;QAClB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;IACH,OAAO,EAAE,KAAK,CAAC;QACb,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED,MAAM,WAAW,0BAA2B,SAAQ,eAAe;IACjE,aAAa,EAAE,KAAK,CAAC;QACnB,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;KACf,CAAC,CAAC;IACH,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,8BAAsB,wBAAwB;IAC5C,QAAQ,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IACtC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IACnD,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAE7C,SAAS,CAAC,cAAc,EAAE,cAAc,CAAC;IACzC,SAAS,CAAC,eAAe,EAAE,eAAe,CAAC;IAC3C,SAAS,CAAC,cAAc,EAAE,MAAM,CAAC;gBAErB,UAAU,CAAC,EAAE,MAAM;IAe/B;;OAEG;IACH,SAAS,CAAC,gBAAgB;IAK1B;;;OAGG;IACG,oBAAoB,IAAI,OAAO,CAAC,0BAA0B,EAAE,CAAC;IAkBnE;;OAEG;IACG,sBAAsB,CAAC,eAAe,EAAE,0BAA0B,EAAE,GAAG,OAAO,CAAC,GAAG,CAAC;IA6CzF;;OAEG;IACG,gBAAgB,CAAC,QAAQ,EAAE,kBAAkB,GAAG,OAAO,CAAC,0BAA0B,CAAC;IAsFzF;;OAEG;IACH,SAAS,CAAC,qBAAqB,CAAC,QAAQ,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM;IAQpH;;OAEG;IACH,eAAe;;;;;;IAIf;;;OAGG;IACH,QAAQ,CAAC,mBAAmB,IAAI;QAC9B,KAAK,EAAE,MAAM,CAAC;
QACd,WAAW,EAAE,MAAM,CAAC;QACpB,eAAe,EAAE,MAAM,EAAE,CAAC;QAC1B,aAAa,EAAE,MAAM,CAAC;KACvB,EAAE;CACJ"}
1
+ {"version":3,"file":"base-comparative.d.ts","sourceRoot":"","sources":["../../../src/evaluation/evaluators/base-comparative.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;AAKtE,OAAO,EAAE,eAAe,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAC;AAC7E,OAAO,EAAsE,KAAK,kBAAkB,EAAE,MAAM,uBAAuB,CAAC;AAEpI,MAAM,WAAW,2BAA2B;IAC1C,gBAAgB,EAAE,MAAM,CAAC;IACzB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,oBAAoB,EAAE,MAAM,CAAC,MAAM,EAAE;QACnC,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;QAC1B,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAC3B,mBAAmB,CAAC,EAAE,MAAM,CAAC;QAC7B,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAC3B,cAAc,EAAE,MAAM,CAAC;QACvB,SAAS,EAAE,MAAM,CAAC;QAClB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;IACH,OAAO,EAAE,KAAK,CAAC;QACb,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED,MAAM,WAAW,0BAA2B,SAAQ,eAAe;IACjE,aAAa,EAAE,KAAK,CAAC;QACnB,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;KACf,CAAC,CAAC;IACH,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,8BAAsB,wBAAwB;IAC5C,QAAQ,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IACtC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IACnD,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAE7C,SAAS,CAAC,cAAc,EAAE,cAAc,CAAC;IACzC,SAAS,CAAC,eAAe,EAAE,eAAe,CAAC;IAC3C,SAAS,CAAC,cAAc,EAAE,MAAM,CAAC;IACjC,SAAS,CAAC,QAAQ,EAAE,kBAAkB,CAAC;gBAE3B,UAAU,CAAC,EAAE,MAAM;IAkB/B;;OAEG;IACH,SAAS,CAAC,gBAAgB;IAK1B;;;OAGG;IACG,oBAAoB,IAAI,OAAO,CAAC,0BAA0B,EAAE,CAAC;IAkBnE;;OAEG;IACG,sBAAsB,CAAC,eAAe,EAAE,0BAA0B,EAAE,GAAG,OAAO,CAAC,GAAG,CAAC;IA6CzF;;OAEG;IACG,gBAAgB,CAAC,QAAQ,EAAE,kBAAkB,GAAG,OAAO,CAAC,0BAA0B,CAAC;IAuFzF;;OAEG;IACH,SAAS,CAAC,qBAAqB,CAAC,QAAQ,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM;IA
gBpH;;OAEG;IACH,eAAe;;;;;;IAIf;;;OAGG;IACH,QAAQ,CAAC,mBAAmB,IAAI;QAC9B,KAAK,EAAE,MAAM,CAAC;QACd,WAAW,EAAE,MAAM,CAAC;QACpB,eAAe,EAAE,MAAM,EAAE,CAAC;QAC1B,aAAa,EAAE,MAAM,CAAC;KACvB,EAAE;CACJ"}
@@ -13,10 +13,12 @@ const platform_utils_1 = require("../../core/platform-utils");
13
13
  const fs_1 = require("fs");
14
14
  const path_1 = require("path");
15
15
  const dataset_analyzer_js_1 = require("../dataset-analyzer.js");
16
+ const metadata_loader_js_1 = require("../metadata-loader.js");
16
17
  class BaseComparativeEvaluator {
17
18
  evaluatorModel;
18
19
  datasetAnalyzer;
19
20
  promptTemplate;
21
+ metadata;
20
22
  constructor(datasetDir) {
21
23
  // Use Claude via VercelProvider as the evaluator (most reliable for complex comparative evaluation)
22
24
  this.evaluatorModel = new vercel_provider_1.VercelProvider({
@@ -28,6 +30,8 @@ class BaseComparativeEvaluator {
28
30
  this.datasetAnalyzer = new dataset_analyzer_js_1.DatasetAnalyzer(datasetDir || './eval/datasets');
29
31
  // Prompt template will be loaded by subclass
30
32
  this.promptTemplate = '';
33
+ // Load metadata
34
+ this.metadata = (0, metadata_loader_js_1.loadEvaluationMetadata)();
31
35
  }
32
36
  /**
33
37
  * Initialize the evaluator - must be called by subclass constructor
@@ -122,6 +126,7 @@ class BaseComparativeEvaluator {
122
126
  ${reliabilityContext}
123
127
 
124
128
  **Response:**
129
+
125
130
  ${modelResponse.response}
126
131
 
127
132
  ---`;
@@ -171,11 +176,18 @@ ${modelResponse.response}
171
176
  * Build the evaluation prompt - can be overridden by subclasses for custom behavior
172
177
  */
173
178
  buildEvaluationPrompt(scenario, modelResponsesText, modelList) {
179
+ // Build metadata context sections
180
+ const pricingContext = (0, metadata_loader_js_1.buildModelPricingContext)(this.metadata.models);
181
+ const toolContext = (0, metadata_loader_js_1.buildToolContext)(this.toolName, this.metadata.tools);
182
+ // Inject all data into prompt template via placeholders
174
183
  return this.promptTemplate
184
+ .replace('{pricing_context}', pricingContext)
185
+ .replace('{tool_context}', toolContext)
175
186
  .replace('{issue}', scenario.issue)
176
187
  .replace('{model_responses}', modelResponsesText)
177
188
  .replace('{model_list}', modelList)
178
- .replace('{phase}', scenario.interaction_id);
189
+ .replace('{phase}', scenario.interaction_id)
190
+ .replace('{scenario_name}', scenario.interaction_id);
179
191
  }
180
192
  /**
181
193
  * Get statistics about available datasets
@@ -0,0 +1,56 @@
1
+ import type { ModelPerformance } from './platform-synthesizer.js';
2
/**
 * Outcome of a single graph-generation attempt.
 */
export interface GraphGenerationResult {
    /** Whether the graph was generated. */
    success: boolean;
    /** Path of the generated image, when one was produced. */
    graphPath?: string;
    /** Failure description, when generation did not succeed. */
    error?: string;
}
7
/**
 * Produces the data visualizations used by platform synthesis reports.
 * Chart images come from the QuickChart.io API, so no native dependencies are required.
 */
export declare class GraphGenerator {
    private outputDir;
    private quickchartBaseUrl;
    constructor(outputDir?: string);
    /**
     * Generates every graph, or only the requested ones, for the platform report.
     *
     * @param modelPerformances Model performance data
     * @param graphNames Optional list of graph names to generate; all graphs are generated when omitted.
     *   Valid names: 'performance-tiers', 'cost-vs-quality', 'reliability-comparison',
     *   'tool-performance-heatmap', 'context-window-correlation'
     * @returns Generation results keyed by graph name
     */
    generateAllGraphs(modelPerformances: ModelPerformance[], graphNames?: string[]): Promise<Record<string, GraphGenerationResult>>;
    /** Graph 1 (performance tiers): grouped bar chart of score, reliability, and consistency. */
    private generatePerformanceTiersGraph;
    /** Graph 2 (cost vs quality): line chart of the input/output cost range per model. */
    private generateCostVsQualityGraph;
    /** Graph 3 (reliability comparison): bar chart of reliability scores. */
    private generateReliabilityComparisonGraph;
    /** Graph 4 (tool performance heatmap): model scores per tool. */
    private generateToolPerformanceHeatmap;
    /** Graph 5 (context window correlation): scatter plot of context window vs performance. */
    private generateContextWindowCorrelationGraph;
    /** Downloads a chart from the QuickChart.io API and saves it as a PNG. */
    private downloadChart;
    /** Cleans model names by stripping provider prefixes. */
    private cleanModelName;
    /** Returns a consistent color for each tool index (supports up to 10 tools). */
    private getToolColor;
}
56
+ //# sourceMappingURL=graph-generator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"graph-generator.d.ts","sourceRoot":"","sources":["../../src/evaluation/graph-generator.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,2BAA2B,CAAC;AAElE,MAAM,WAAW,qBAAqB;IACpC,OAAO,EAAE,OAAO,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;GAGG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,iBAAiB,CAAiC;gBAE9C,SAAS,SAAoC;IAIzD;;;;;;OAMG;IACG,iBAAiB,CACrB,iBAAiB,EAAE,gBAAgB,EAAE,EACrC,UAAU,CAAC,EAAE,MAAM,EAAE,GACpB,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,qBAAqB,CAAC,CAAC;IAkCjD;;OAEG;YACW,6BAA6B;IAiH3C;;OAEG;YACW,0BAA0B;IAsHxC;;OAEG;YACW,kCAAkC;IA4GhD;;OAEG;YACW,8BAA8B;IAqG5C;;OAEG;YACW,qCAAqC;IAyHnD;;OAEG;YACW,aAAa;IA8B3B;;OAEG;IACH,OAAO,CAAC,cAAc;IAQtB;;OAEG;IACH,OAAO,CAAC,YAAY;CAerB"}
@@ -0,0 +1,694 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.GraphGenerator = void 0;
37
+ const fs = __importStar(require("fs"));
38
+ const path = __importStar(require("path"));
39
+ const https = __importStar(require("https"));
40
+ /**
41
+ * GraphGenerator creates data visualizations for platform synthesis reports.
42
+ * Uses QuickChart.io API to generate chart images without requiring native dependencies.
43
+ */
44
+ class GraphGenerator {
45
+ outputDir;
46
+ quickchartBaseUrl = 'https://quickchart.io/chart';
47
+ constructor(outputDir = './eval/analysis/platform/graphs') {
48
+ this.outputDir = outputDir;
49
+ }
50
+ /**
51
+ * Generates all or specific graphs for the platform report
52
+ * @param modelPerformances Model performance data
53
+ * @param graphNames Optional array of specific graph names to generate. If not provided, generates all graphs.
54
+ * Valid names: 'performance-tiers', 'cost-vs-quality', 'reliability-comparison',
55
+ * 'tool-performance-heatmap', 'context-window-correlation'
56
+ */
57
+ async generateAllGraphs(modelPerformances, graphNames) {
58
+ // Ensure output directory exists
59
+ if (!fs.existsSync(this.outputDir)) {
60
+ fs.mkdirSync(this.outputDir, { recursive: true });
61
+ }
62
+ const results = {};
63
+ // Define all available graphs
64
+ const allGraphs = {
65
+ 'performance-tiers': () => this.generatePerformanceTiersGraph(modelPerformances),
66
+ 'cost-vs-quality': () => this.generateCostVsQualityGraph(modelPerformances),
67
+ 'reliability-comparison': () => this.generateReliabilityComparisonGraph(modelPerformances),
68
+ 'tool-performance-heatmap': () => this.generateToolPerformanceHeatmap(modelPerformances),
69
+ 'context-window-correlation': () => this.generateContextWindowCorrelationGraph(modelPerformances)
70
+ };
71
+ // If specific graphs requested, only generate those
72
+ const graphsToGenerate = graphNames && graphNames.length > 0
73
+ ? graphNames
74
+ : Object.keys(allGraphs);
75
+ // Generate requested graphs
76
+ for (const graphName of graphsToGenerate) {
77
+ if (allGraphs[graphName]) {
78
+ results[graphName] = await allGraphs[graphName]();
79
+ }
80
+ else {
81
+ console.warn(`⚠️ Unknown graph name: ${graphName}`);
82
+ }
83
+ }
84
+ return results;
85
+ }
86
+ /**
87
+ * Graph 1: Performance Tiers - Grouped bar chart showing score, reliability, and consistency
88
+ */
89
+ async generatePerformanceTiersGraph(modelPerformances) {
90
+ try {
91
+ // Sort by average score descending, take top 10 models
92
+ const topModels = modelPerformances
93
+ .sort((a, b) => b.averageScore - a.averageScore)
94
+ .slice(0, 10);
95
+ // Clean model names (remove "vercel_" prefix)
96
+ const labels = topModels.map(m => this.cleanModelName(m.modelId));
97
+ const scores = topModels.map(m => m.averageScore);
98
+ const reliability = topModels.map(m => m.reliabilityScore);
99
+ const consistency = topModels.map(m => m.consistencyAcrossTools);
100
+ const chartConfig = {
101
+ type: 'bar',
102
+ data: {
103
+ labels,
104
+ datasets: [
105
+ {
106
+ label: 'Overall Score',
107
+ data: scores,
108
+ backgroundColor: 'rgba(54, 162, 235, 0.9)',
109
+ borderColor: 'rgba(54, 162, 235, 1)',
110
+ borderWidth: 1
111
+ },
112
+ {
113
+ label: 'Reliability',
114
+ data: reliability,
115
+ backgroundColor: 'rgba(75, 192, 192, 0.9)',
116
+ borderColor: 'rgba(75, 192, 192, 1)',
117
+ borderWidth: 1
118
+ },
119
+ {
120
+ label: 'Consistency',
121
+ data: consistency,
122
+ backgroundColor: 'rgba(153, 102, 255, 0.9)',
123
+ borderColor: 'rgba(153, 102, 255, 1)',
124
+ borderWidth: 1
125
+ }
126
+ ]
127
+ },
128
+ options: {
129
+ plugins: {
130
+ datalabels: {
131
+ display: false
132
+ }
133
+ },
134
+ title: {
135
+ display: true,
136
+ text: 'Model Performance Tiers: Score, Reliability, and Consistency',
137
+ fontSize: 18,
138
+ fontColor: '#FFFFFF',
139
+ fontStyle: 'bold'
140
+ },
141
+ scales: {
142
+ yAxes: [{
143
+ ticks: {
144
+ beginAtZero: true,
145
+ max: 1.0,
146
+ stepSize: 0.1,
147
+ fontColor: '#FFFFFF',
148
+ fontSize: 12
149
+ },
150
+ scaleLabel: {
151
+ display: true,
152
+ labelString: 'Score (0-1)',
153
+ fontColor: '#FFFFFF',
154
+ fontSize: 14
155
+ },
156
+ gridLines: {
157
+ color: 'rgba(255, 255, 255, 0.2)',
158
+ zeroLineColor: 'rgba(255, 255, 255, 0.4)'
159
+ }
160
+ }],
161
+ xAxes: [{
162
+ ticks: {
163
+ autoSkip: false,
164
+ maxRotation: 45,
165
+ minRotation: 45,
166
+ fontColor: '#FFFFFF',
167
+ fontSize: 11
168
+ },
169
+ gridLines: {
170
+ color: 'rgba(255, 255, 255, 0.1)'
171
+ }
172
+ }]
173
+ },
174
+ legend: {
175
+ display: true,
176
+ position: 'top',
177
+ labels: {
178
+ fontColor: '#FFFFFF',
179
+ fontSize: 13
180
+ }
181
+ }
182
+ }
183
+ };
184
+ const outputPath = path.join(this.outputDir, 'performance-tiers.png');
185
+ await this.downloadChart(chartConfig, outputPath);
186
+ return {
187
+ success: true,
188
+ graphPath: outputPath
189
+ };
190
+ }
191
+ catch (error) {
192
+ return {
193
+ success: false,
194
+ error: `Failed to generate performance tiers graph: ${error}`
195
+ };
196
+ }
197
+ }
198
+ /**
199
+ * Graph 2: Cost vs Quality - Line chart showing input/output cost range per model
200
+ */
201
+ async generateCostVsQualityGraph(modelPerformances) {
202
+ try {
203
+ // Filter out models with no pricing data and sort by quality score descending
204
+ const modelsWithPricing = modelPerformances
205
+ .filter(m => m.pricing.input_cost_per_million_tokens > 0 || m.pricing.output_cost_per_million_tokens > 0)
206
+ .sort((a, b) => b.averageScore - a.averageScore);
207
+ // Create datasets: one for each model showing the cost range line
208
+ const datasets = modelsWithPricing.map((m, idx) => {
209
+ const inputCost = m.pricing.input_cost_per_million_tokens;
210
+ const outputCost = m.pricing.output_cost_per_million_tokens;
211
+ const color = this.getToolColor(idx);
212
+ // Line from input cost to output cost at the model's quality score
213
+ return {
214
+ label: this.cleanModelName(m.modelId),
215
+ data: [
216
+ { x: inputCost, y: m.averageScore },
217
+ { x: outputCost, y: m.averageScore }
218
+ ],
219
+ borderColor: color,
220
+ backgroundColor: color,
221
+ borderWidth: 3,
222
+ pointRadius: 5,
223
+ pointHoverRadius: 7,
224
+ fill: false,
225
+ showLine: true,
226
+ tension: 0
227
+ };
228
+ });
229
+ const chartConfig = {
230
+ type: 'line',
231
+ data: { datasets },
232
+ options: {
233
+ plugins: {
234
+ datalabels: {
235
+ display: false
236
+ }
237
+ },
238
+ title: {
239
+ display: true,
240
+ text: 'Cost vs Quality Analysis (line shows input → output cost range)',
241
+ fontSize: 18,
242
+ fontColor: '#FFFFFF',
243
+ fontStyle: 'bold'
244
+ },
245
+ scales: {
246
+ xAxes: [{
247
+ type: 'linear',
248
+ scaleLabel: {
249
+ display: true,
250
+ labelString: 'Cost per 1M Tokens in $ (Input ← → Output)',
251
+ fontColor: '#FFFFFF',
252
+ fontSize: 14
253
+ },
254
+ ticks: {
255
+ callback: function (value) {
256
+ return '$' + value;
257
+ },
258
+ fontColor: '#FFFFFF',
259
+ fontSize: 12
260
+ },
261
+ gridLines: {
262
+ color: 'rgba(255, 255, 255, 0.2)',
263
+ zeroLineColor: 'rgba(255, 255, 255, 0.4)'
264
+ }
265
+ }],
266
+ yAxes: [{
267
+ scaleLabel: {
268
+ display: true,
269
+ labelString: 'Overall Score',
270
+ fontColor: '#FFFFFF',
271
+ fontSize: 14
272
+ },
273
+ ticks: {
274
+ beginAtZero: false,
275
+ min: 0.3,
276
+ max: 1.0,
277
+ fontColor: '#FFFFFF',
278
+ fontSize: 12
279
+ },
280
+ gridLines: {
281
+ color: 'rgba(255, 255, 255, 0.2)',
282
+ zeroLineColor: 'rgba(255, 255, 255, 0.4)'
283
+ }
284
+ }]
285
+ },
286
+ legend: {
287
+ display: true,
288
+ position: 'right',
289
+ labels: {
290
+ fontColor: '#FFFFFF',
291
+ fontSize: 10,
292
+ boxWidth: 15,
293
+ usePointStyle: true
294
+ }
295
+ }
296
+ }
297
+ };
298
+ const outputPath = path.join(this.outputDir, 'cost-vs-quality.png');
299
+ await this.downloadChart(chartConfig, outputPath);
300
+ return {
301
+ success: true,
302
+ graphPath: outputPath
303
+ };
304
+ }
305
+ catch (error) {
306
+ return {
307
+ success: false,
308
+ error: `Failed to generate cost vs quality graph: ${error}`
309
+ };
310
+ }
311
+ }
312
+ /**
313
+ * Graph 3: Reliability Comparison - Bar chart with reliability scores
314
+ */
315
+ async generateReliabilityComparisonGraph(modelPerformances) {
316
+ try {
317
+ // Sort by reliability descending
318
+ const sortedModels = modelPerformances
319
+ .sort((a, b) => b.reliabilityScore - a.reliabilityScore);
320
+ const labels = sortedModels.map(m => this.cleanModelName(m.modelId));
321
+ const reliabilityScores = sortedModels.map(m => m.reliabilityScore);
322
+ // Create separate datasets for legend
323
+ const datasets = [
324
+ {
325
+ label: 'High Reliability (≥0.9)',
326
+ data: reliabilityScores.map(score => score >= 0.9 ? score : null),
327
+ backgroundColor: 'rgba(75, 192, 192, 0.8)',
328
+ borderWidth: 1
329
+ },
330
+ {
331
+ label: 'Medium Reliability (0.7-0.9)',
332
+ data: reliabilityScores.map(score => score >= 0.7 && score < 0.9 ? score : null),
333
+ backgroundColor: 'rgba(255, 206, 86, 0.8)',
334
+ borderWidth: 1
335
+ },
336
+ {
337
+ label: 'Low Reliability (<0.7)',
338
+ data: reliabilityScores.map(score => score < 0.7 ? score : null),
339
+ backgroundColor: 'rgba(255, 99, 132, 0.8)',
340
+ borderWidth: 1
341
+ }
342
+ ];
343
+ const chartConfig = {
344
+ type: 'horizontalBar',
345
+ data: {
346
+ labels,
347
+ datasets
348
+ },
349
+ options: {
350
+ plugins: {
351
+ datalabels: {
352
+ display: false
353
+ }
354
+ },
355
+ title: {
356
+ display: true,
357
+ text: 'Model Reliability Comparison',
358
+ fontSize: 18,
359
+ fontColor: '#FFFFFF',
360
+ fontStyle: 'bold'
361
+ },
362
+ scales: {
363
+ xAxes: [{
364
+ stacked: true,
365
+ ticks: {
366
+ beginAtZero: true,
367
+ max: 1.0,
368
+ stepSize: 0.1,
369
+ fontColor: '#FFFFFF',
370
+ fontSize: 12
371
+ },
372
+ scaleLabel: {
373
+ display: true,
374
+ labelString: 'Reliability Score (0-1)',
375
+ fontColor: '#FFFFFF',
376
+ fontSize: 14
377
+ },
378
+ gridLines: {
379
+ color: 'rgba(255, 255, 255, 0.2)',
380
+ zeroLineColor: 'rgba(255, 255, 255, 0.4)'
381
+ }
382
+ }],
383
+ yAxes: [{
384
+ stacked: true,
385
+ ticks: {
386
+ fontColor: '#FFFFFF',
387
+ fontSize: 11
388
+ },
389
+ gridLines: {
390
+ color: 'rgba(255, 255, 255, 0.1)'
391
+ }
392
+ }]
393
+ },
394
+ legend: {
395
+ display: true,
396
+ position: 'top',
397
+ labels: {
398
+ fontColor: '#FFFFFF',
399
+ fontSize: 12
400
+ }
401
+ }
402
+ }
403
+ };
404
+ const outputPath = path.join(this.outputDir, 'reliability-comparison.png');
405
+ await this.downloadChart(chartConfig, outputPath);
406
+ return {
407
+ success: true,
408
+ graphPath: outputPath
409
+ };
410
+ }
411
+ catch (error) {
412
+ return {
413
+ success: false,
414
+ error: `Failed to generate reliability comparison graph: ${error}`
415
+ };
416
+ }
417
+ }
418
+ /**
419
+ * Graph 4: Tool Performance Heatmap - Shows model scores per tool
420
+ */
421
+ async generateToolPerformanceHeatmap(modelPerformances) {
422
+ try {
423
+ // Get all unique tool names
424
+ const toolNames = new Set();
425
+ modelPerformances.forEach(m => {
426
+ Object.keys(m.toolScores).forEach(tool => toolNames.add(tool));
427
+ });
428
+ const tools = Array.from(toolNames).sort();
429
+ // Sort models by average score
430
+ const sortedModels = modelPerformances
431
+ .sort((a, b) => b.averageScore - a.averageScore)
432
+ .slice(0, 10); // Top 10 models
433
+ // Create matrix data
434
+ const labels = sortedModels.map(m => this.cleanModelName(m.modelId));
435
+ const datasets = tools.map((tool, idx) => ({
436
+ label: tool.charAt(0).toUpperCase() + tool.slice(1),
437
+ data: sortedModels.map(m => m.toolScores[tool] || 0),
438
+ backgroundColor: this.getToolColor(idx),
439
+ borderWidth: 1
440
+ }));
441
+ const chartConfig = {
442
+ type: 'horizontalBar',
443
+ data: {
444
+ labels,
445
+ datasets
446
+ },
447
+ options: {
448
+ plugins: {
449
+ datalabels: {
450
+ display: false
451
+ }
452
+ },
453
+ title: {
454
+ display: true,
455
+ text: 'Tool-Specific Performance Patterns',
456
+ fontSize: 18,
457
+ fontColor: '#FFFFFF',
458
+ fontStyle: 'bold'
459
+ },
460
+ scales: {
461
+ xAxes: [{
462
+ stacked: false,
463
+ ticks: {
464
+ beginAtZero: true,
465
+ max: 1.0,
466
+ stepSize: 0.2,
467
+ fontColor: '#FFFFFF',
468
+ fontSize: 12
469
+ },
470
+ scaleLabel: {
471
+ display: true,
472
+ labelString: 'Tool Score',
473
+ fontColor: '#FFFFFF',
474
+ fontSize: 14
475
+ },
476
+ gridLines: {
477
+ color: 'rgba(255, 255, 255, 0.2)',
478
+ zeroLineColor: 'rgba(255, 255, 255, 0.4)'
479
+ }
480
+ }],
481
+ yAxes: [{
482
+ stacked: false,
483
+ ticks: {
484
+ fontColor: '#FFFFFF',
485
+ fontSize: 11
486
+ },
487
+ gridLines: {
488
+ color: 'rgba(255, 255, 255, 0.1)'
489
+ }
490
+ }]
491
+ },
492
+ legend: {
493
+ display: true,
494
+ position: 'right',
495
+ labels: {
496
+ fontColor: '#FFFFFF',
497
+ fontSize: 12
498
+ }
499
+ }
500
+ }
501
+ };
502
+ const outputPath = path.join(this.outputDir, 'tool-performance-heatmap.png');
503
+ await this.downloadChart(chartConfig, outputPath);
504
+ return {
505
+ success: true,
506
+ graphPath: outputPath
507
+ };
508
+ }
509
+ catch (error) {
510
+ return {
511
+ success: false,
512
+ error: `Failed to generate tool performance heatmap: ${error}`
513
+ };
514
+ }
515
+ }
516
+ /**
517
+ * Graph 5: Context Window Correlation - Scatter plot showing context window vs performance
518
+ */
519
+ async generateContextWindowCorrelationGraph(modelPerformances) {
520
+ try {
521
+ const scatterData = modelPerformances.map((m) => ({
522
+ x: m.capabilities.context_window / 1000, // Convert to thousands for readability
523
+ y: m.averageScore,
524
+ r: 8,
525
+ label: this.cleanModelName(m.modelId)
526
+ }));
527
+ const chartConfig = {
528
+ type: 'scatter',
529
+ data: {
530
+ datasets: [{
531
+ label: 'Models',
532
+ data: scatterData,
533
+ backgroundColor: 'rgba(153, 102, 255, 0.7)',
534
+ borderColor: 'rgba(153, 102, 255, 1)',
535
+ borderWidth: 2,
536
+ pointRadius: 10
537
+ }]
538
+ },
539
+ options: {
540
+ layout: {
541
+ padding: {
542
+ right: 300,
543
+ left: 20,
544
+ top: 20,
545
+ bottom: 20
546
+ }
547
+ },
548
+ plugins: {
549
+ datalabels: {
550
+ display: true,
551
+ align: 'right',
552
+ offset: 12,
553
+ color: '#FFFFFF',
554
+ font: {
555
+ size: 20
556
+ },
557
+ formatter: (value) => value.label
558
+ }
559
+ },
560
+ title: {
561
+ display: true,
562
+ text: 'Context Window Size vs Performance',
563
+ fontSize: 18,
564
+ fontColor: '#FFFFFF',
565
+ fontStyle: 'bold'
566
+ },
567
+ scales: {
568
+ xAxes: [{
569
+ type: 'linear',
570
+ scaleLabel: {
571
+ display: true,
572
+ labelString: 'Context Window Size (K tokens)',
573
+ fontColor: '#FFFFFF',
574
+ fontSize: 14
575
+ },
576
+ ticks: {
577
+ callback: (value) => value + 'K',
578
+ fontColor: '#FFFFFF',
579
+ fontSize: 12
580
+ },
581
+ gridLines: {
582
+ color: 'rgba(255, 255, 255, 0.2)',
583
+ zeroLineColor: 'rgba(255, 255, 255, 0.4)'
584
+ }
585
+ }],
586
+ yAxes: [{
587
+ scaleLabel: {
588
+ display: true,
589
+ labelString: 'Overall Score',
590
+ fontColor: '#FFFFFF',
591
+ fontSize: 14
592
+ },
593
+ ticks: {
594
+ beginAtZero: false,
595
+ min: 0.3,
596
+ max: 1.0,
597
+ fontColor: '#FFFFFF',
598
+ fontSize: 12
599
+ },
600
+ gridLines: {
601
+ color: 'rgba(255, 255, 255, 0.2)',
602
+ zeroLineColor: 'rgba(255, 255, 255, 0.4)'
603
+ }
604
+ }]
605
+ },
606
+ legend: {
607
+ display: false
608
+ },
609
+ tooltips: {
610
+ backgroundColor: 'rgba(0, 0, 0, 0.8)',
611
+ titleFontColor: '#FFFFFF',
612
+ bodyFontColor: '#FFFFFF',
613
+ callbacks: {
614
+ label: (tooltipItem, data) => {
615
+ const dataset = data.datasets[tooltipItem.datasetIndex];
616
+ const point = dataset.data[tooltipItem.index];
617
+ return `${point.label}: ${point.y.toFixed(3)} (${Math.round(point.x)}K tokens)`;
618
+ }
619
+ }
620
+ }
621
+ }
622
+ };
623
+ const outputPath = path.join(this.outputDir, 'context-window-correlation.png');
624
+ await this.downloadChart(chartConfig, outputPath, 1400, 700);
625
+ return {
626
+ success: true,
627
+ graphPath: outputPath
628
+ };
629
+ }
630
+ catch (error) {
631
+ return {
632
+ success: false,
633
+ error: `Failed to generate context window correlation graph: ${error}`
634
+ };
635
+ }
636
+ }
637
+ /**
638
+ * Downloads a chart from QuickChart.io API and saves it as PNG
639
+ */
640
+ async downloadChart(chartConfig, outputPath, width = 1000, height = 600) {
641
+ return new Promise((resolve, reject) => {
642
+ const chartJson = JSON.stringify(chartConfig);
643
+ const url = `${this.quickchartBaseUrl}?c=${encodeURIComponent(chartJson)}&width=${width}&height=${height}&format=png&backgroundColor=black`;
644
+ https.get(url, (response) => {
645
+ if (response.statusCode !== 200) {
646
+ reject(new Error(`QuickChart API returned status ${response.statusCode}`));
647
+ return;
648
+ }
649
+ const fileStream = fs.createWriteStream(outputPath);
650
+ response.pipe(fileStream);
651
+ fileStream.on('finish', () => {
652
+ fileStream.close();
653
+ console.log(`✅ Graph saved: ${outputPath}`);
654
+ resolve();
655
+ });
656
+ fileStream.on('error', (err) => {
657
+ fs.unlink(outputPath, () => { }); // Clean up partial file
658
+ reject(err);
659
+ });
660
+ }).on('error', (err) => {
661
+ reject(err);
662
+ });
663
+ });
664
+ }
665
+ /**
666
+ * Cleans model names by removing provider prefixes
667
+ */
668
+ cleanModelName(modelId) {
669
+ // Remove "vercel_" prefix and timestamp suffix
670
+ return modelId
671
+ .replace(/^vercel_/, '')
672
+ .replace(/_\d{4}-\d{2}-\d{2}$/, '')
673
+ .replace(/_/g, '-');
674
+ }
675
+ /**
676
+ * Returns a consistent color for each tool index (supports up to 10 tools)
677
+ */
678
+ getToolColor(index) {
679
+ const colors = [
680
+ 'rgba(255, 99, 132, 0.8)', // Red
681
+ 'rgba(54, 162, 235, 0.8)', // Blue
682
+ 'rgba(255, 206, 86, 0.8)', // Yellow
683
+ 'rgba(75, 192, 192, 0.8)', // Green
684
+ 'rgba(153, 102, 255, 0.8)', // Purple
685
+ 'rgba(255, 159, 64, 0.8)', // Orange
686
+ 'rgba(199, 199, 199, 0.8)', // Grey
687
+ 'rgba(83, 102, 255, 0.8)', // Indigo
688
+ 'rgba(255, 99, 255, 0.8)', // Pink
689
+ 'rgba(99, 255, 132, 0.8)' // Light Green
690
+ ];
691
+ return colors[index % colors.length];
692
+ }
693
+ }
694
+ exports.GraphGenerator = GraphGenerator;
@@ -0,0 +1,39 @@
1
+ /**
2
+ * Shared Metadata Loader
3
+ *
4
+ * Provides consistent access to model and tool metadata across all evaluators
5
+ */
6
+ export interface ModelMetadata {
7
+ provider: string;
8
+ pricing: {
9
+ input_cost_per_million_tokens: number;
10
+ output_cost_per_million_tokens: number;
11
+ };
12
+ context_window: number;
13
+ supports_function_calling: boolean;
14
+ }
15
+ export interface ToolMetadata {
16
+ name: string;
17
+ description: string;
18
+ primaryFunction: string;
19
+ testTimeout: string;
20
+ successCriteria: string[];
21
+ modelRequirements: Record<string, string>;
22
+ }
23
+ export interface EvaluationMetadata {
24
+ models: Record<string, ModelMetadata>;
25
+ tools: Record<string, ToolMetadata>;
26
+ }
27
+ /**
28
+ * Load model and tool metadata from model-metadata.json
29
+ */
30
+ export declare function loadEvaluationMetadata(): EvaluationMetadata;
31
+ /**
32
+ * Build model pricing context for evaluation prompts
33
+ */
34
+ export declare function buildModelPricingContext(models: Record<string, ModelMetadata>): string;
35
+ /**
36
+ * Build tool context for evaluation prompts (tool-specific description and constraints)
37
+ */
38
+ export declare function buildToolContext(toolName: string, tools: Record<string, ToolMetadata>): string;
39
+ //# sourceMappingURL=metadata-loader.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metadata-loader.d.ts","sourceRoot":"","sources":["../../src/evaluation/metadata-loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAKH,MAAM,WAAW,aAAa;IAC5B,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE;QACP,6BAA6B,EAAE,MAAM,CAAC;QACtC,8BAA8B,EAAE,MAAM,CAAC;KACxC,CAAC;IACF,cAAc,EAAE,MAAM,CAAC;IACvB,yBAAyB,EAAE,OAAO,CAAC;CACpC;AAED,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,EAAE,MAAM,CAAC;IACxB,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,iBAAiB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAC3C;AAED,MAAM,WAAW,kBAAkB;IACjC,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,aAAa,CAAC,CAAC;IACtC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;CACrC;AAED;;GAEG;AACH,wBAAgB,sBAAsB,IAAI,kBAAkB,CAa3D;AAED;;GAEG;AACH,wBAAgB,wBAAwB,CAAC,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,aAAa,CAAC,GAAG,MAAM,CAkBtF;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC,GAAG,MAAM,CAqB9F"}
@@ -0,0 +1,74 @@
1
+ "use strict";
2
+ /**
3
+ * Shared Metadata Loader
4
+ *
5
+ * Provides consistent access to model and tool metadata across all evaluators
6
+ */
7
+ Object.defineProperty(exports, "__esModule", { value: true });
8
+ exports.loadEvaluationMetadata = loadEvaluationMetadata;
9
+ exports.buildModelPricingContext = buildModelPricingContext;
10
+ exports.buildToolContext = buildToolContext;
11
+ const fs_1 = require("fs");
12
+ const path_1 = require("path");
13
+ /**
14
+ * Load model and tool metadata from model-metadata.json
15
+ */
16
+ function loadEvaluationMetadata() {
17
+ try {
18
+ const metadataPath = (0, path_1.join)(process.cwd(), 'src', 'evaluation', 'model-metadata.json');
19
+ const metadata = JSON.parse((0, fs_1.readFileSync)(metadataPath, 'utf8'));
20
+ console.log(`✅ Loaded metadata for ${Object.keys(metadata.models || {}).length} models and ${Object.keys(metadata.tools || {}).length} tools`);
21
+ return {
22
+ models: metadata.models || {},
23
+ tools: metadata.tools || {}
24
+ };
25
+ }
26
+ catch (error) {
27
+ console.warn('⚠️ Failed to load evaluation metadata:', error);
28
+ return { models: {}, tools: {} };
29
+ }
30
+ }
31
+ /**
32
+ * Build model pricing context for evaluation prompts
33
+ */
34
+ function buildModelPricingContext(models) {
35
+ const modelIds = Object.keys(models);
36
+ if (modelIds.length === 0) {
37
+ return 'No pricing information available.';
38
+ }
39
+ const pricingLines = modelIds.map(modelId => {
40
+ const model = models[modelId];
41
+ const inputCost = model.pricing?.input_cost_per_million_tokens?.toFixed(2) || 'N/A';
42
+ const outputCost = model.pricing?.output_cost_per_million_tokens?.toFixed(2) || 'N/A';
43
+ const avgCost = model.pricing
44
+ ? ((model.pricing.input_cost_per_million_tokens + model.pricing.output_cost_per_million_tokens) / 2).toFixed(2)
45
+ : 'N/A';
46
+ const contextWindow = model.context_window ? `${(model.context_window / 1000).toFixed(0)}K` : 'N/A';
47
+ return `- **${modelId}** (${model.provider}): $${avgCost}/1M tokens ($${inputCost} input, $${outputCost} output) | Context: ${contextWindow} tokens`;
48
+ });
49
+ return `## Model Pricing Information\n\n${pricingLines.join('\n')}`;
50
+ }
51
+ /**
52
+ * Build tool context for evaluation prompts (tool-specific description and constraints)
53
+ */
54
+ function buildToolContext(toolName, tools) {
55
+ const tool = tools[toolName];
56
+ if (!tool) {
57
+ return `No metadata available for tool: ${toolName}`;
58
+ }
59
+ return `## Tool Being Evaluated: ${tool.name}
60
+
61
+ **Description**: ${tool.description}
62
+
63
+ **Primary Function**: ${tool.primaryFunction}
64
+
65
+ **Test Timeout Constraint**: ${tool.testTimeout}
66
+
67
+ **Success Criteria**:
68
+ ${tool.successCriteria.map((c) => `- ${c}`).join('\n')}
69
+
70
+ **Model Requirements**:
71
+ ${Object.entries(tool.modelRequirements).map(([key, value]) => `- **${key}**: ${value}`).join('\n')}
72
+
73
+ **IMPORTANT**: When analyzing model failures, consider whether the model exceeded the timeout constraint. Models that timeout should be noted as failing due to timeout constraints rather than quality issues.`;
74
+ }
@@ -35,7 +35,7 @@ export declare class PlatformSynthesizer {
35
35
  private aiProvider;
36
36
  private reportsDir;
37
37
  constructor(aiProvider: VercelProvider, reportsDir?: string);
38
- generatePlatformWideAnalysis(): Promise<string>;
38
+ generatePlatformWideAnalysis(graphsToGenerate?: string[], skipReport?: boolean): Promise<string>;
39
39
  private loadToolMetadata;
40
40
  private loadAllReports;
41
41
  private analyzeCrossToolPerformance;
@@ -49,6 +49,10 @@ export declare class PlatformSynthesizer {
49
49
  private generateProductionRecommendations;
50
50
  private calculateCostEstimate;
51
51
  private extractBaseModelId;
52
+ /**
53
+ * Generates graphs and replaces placeholders in the markdown report
54
+ */
55
+ private addGraphsToReport;
52
56
  saveSynthesisReport(markdownContent: string, outputPath?: string): Promise<void>;
53
57
  }
54
58
  //# sourceMappingURL=platform-synthesizer.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"platform-synthesizer.d.ts","sourceRoot":"","sources":["../../src/evaluation/platform-synthesizer.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;AAEtE,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACnC,YAAY,EAAE,MAAM,CAAC;IACrB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,gBAAgB,EAAE,MAAM,CAAC;IACzB,sBAAsB,EAAE,MAAM,CAAC;IAC/B,OAAO,EAAE;QACP,6BAA6B,EAAE,MAAM,CAAC;QACtC,8BAA8B,EAAE,MAAM,CAAC;KACxC,CAAC;IACF,YAAY,EAAE;QACZ,cAAc,EAAE,MAAM,CAAC;QACvB,yBAAyB,EAAE,OAAO,CAAC;KACpC,CAAC;CACH;AAED,MAAM,WAAW,cAAc;IAC7B,cAAc,EAAE,gBAAgB,EAAE,CAAC;IACnC,cAAc,EAAE,gBAAgB,EAAE,CAAC;IACnC,aAAa,EAAE,gBAAgB,EAAE,CAAC;IAClC,QAAQ,EAAE,gBAAgB,EAAE,CAAC;IAC7B,kBAAkB,EAAE,gBAAgB,EAAE,CAAC;CACxC;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,EAAE,eAAe,GAAG,aAAa,GAAG,YAAY,GAAG,UAAU,CAAC;IACtE,YAAY,EAAE,MAAM,CAAC;IACrB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,UAAU,CAAiB;IACnC,OAAO,CAAC,UAAU,CAAS;gBAEf,UAAU,EAAE,cAAc,EAAE,UAAU,SAA+B;IAK3E,4BAA4B,IAAI,OAAO,CAAC,MAAM,CAAC;IA8BrD,OAAO,CAAC,gBAAgB;YAaV,cAAc;YA0Bd,2BAA2B;IA8DzC,OAAO,CAAC,0BAA0B;IA2ElC,OAAO,CAAC,wBAAwB;IAiDhC,OAAO,CAAC,4BAA4B;YA0CtB,wBAAwB;IAoBtC,OAAO,CAAC,kBAAkB;IAe1B,OAAO,CAAC,oBAAoB;IAsB5B,OAAO,CAAC,yBAAyB;IAWjC,OAAO,CAAC,iCAAiC;IASzC,OAAO,CAAC,qBAAqB;IAQ7B,OAAO,CAAC,kBAAkB;IASpB,mBAAmB,CACvB,eAAe,EAAE,MAAM,EACvB,UAAU,SAAiD,GAC1D,OAAO,CAAC,IAAI,CAAC;CAWjB"}
1
+ {"version":3,"file":"platform-synthesizer.d.ts","sourceRoot":"","sources":["../../src/evaluation/platform-synthesizer.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;AAItE,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACnC,YAAY,EAAE,MAAM,CAAC;IACrB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,gBAAgB,EAAE,MAAM,CAAC;IACzB,sBAAsB,EAAE,MAAM,CAAC;IAC/B,OAAO,EAAE;QACP,6BAA6B,EAAE,MAAM,CAAC;QACtC,8BAA8B,EAAE,MAAM,CAAC;KACxC,CAAC;IACF,YAAY,EAAE;QACZ,cAAc,EAAE,MAAM,CAAC;QACvB,yBAAyB,EAAE,OAAO,CAAC;KACpC,CAAC;CACH;AAED,MAAM,WAAW,cAAc;IAC7B,cAAc,EAAE,gBAAgB,EAAE,CAAC;IACnC,cAAc,EAAE,gBAAgB,EAAE,CAAC;IACnC,aAAa,EAAE,gBAAgB,EAAE,CAAC;IAClC,QAAQ,EAAE,gBAAgB,EAAE,CAAC;IAC7B,kBAAkB,EAAE,gBAAgB,EAAE,CAAC;CACxC;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,EAAE,eAAe,GAAG,aAAa,GAAG,YAAY,GAAG,UAAU,CAAC;IACtE,YAAY,EAAE,MAAM,CAAC;IACrB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,UAAU,CAAiB;IACnC,OAAO,CAAC,UAAU,CAAS;gBAEf,UAAU,EAAE,cAAc,EAAE,UAAU,SAA+B;IAK3E,4BAA4B,CAAC,gBAAgB,CAAC,EAAE,MAAM,EAAE,EAAE,UAAU,UAAQ,GAAG,OAAO,CAAC,MAAM,CAAC;IA6CpG,OAAO,CAAC,gBAAgB;YAKV,cAAc;YAyBd,2BAA2B;IA8DzC,OAAO,CAAC,0BAA0B;IA2ElC,OAAO,CAAC,wBAAwB;IAiDhC,OAAO,CAAC,4BAA4B;YA0CtB,wBAAwB;IAoBtC,OAAO,CAAC,kBAAkB;IAe1B,OAAO,CAAC,oBAAoB;IAsB5B,OAAO,CAAC,yBAAyB;IAWjC,OAAO,CAAC,iCAAiC;IASzC,OAAO,CAAC,qBAAqB;IAQ7B,OAAO,CAAC,kBAAkB;IAS1B;;OAEG;YACW,iBAAiB;IA8CzB,mBAAmB,CACvB,eAAe,EAAE,MAAM,EACvB,UAAU,SAAiD,GAC1D,OAAO,CAAC,IAAI,CAAC;CAWjB"}
@@ -36,6 +36,8 @@ Object.defineProperty(exports, "__esModule", { value: true });
36
36
  exports.PlatformSynthesizer = void 0;
37
37
  const fs = __importStar(require("fs"));
38
38
  const path = __importStar(require("path"));
39
+ const graph_generator_js_1 = require("./graph-generator.js");
40
+ const metadata_loader_js_1 = require("./metadata-loader.js");
39
41
  class PlatformSynthesizer {
40
42
  aiProvider;
41
43
  reportsDir;
@@ -43,48 +45,48 @@ class PlatformSynthesizer {
43
45
  this.aiProvider = aiProvider;
44
46
  this.reportsDir = reportsDir;
45
47
  }
46
- async generatePlatformWideAnalysis() {
48
+ async generatePlatformWideAnalysis(graphsToGenerate, skipReport = false) {
47
49
  console.log('🔍 Loading all evaluation reports...');
48
50
  const allReports = await this.loadAllReports();
49
51
  console.log('🔧 Loading tool metadata...');
50
52
  const toolMetadata = this.loadToolMetadata();
51
53
  console.log('📊 Analyzing cross-tool performance patterns...');
52
54
  const crossToolAnalysis = await this.analyzeCrossToolPerformance(allReports);
53
- console.log('🎯 Generating decision matrices...');
54
- const decisionMatrices = this.generateDecisionMatrices(crossToolAnalysis.modelPerformances);
55
- console.log('💡 Creating usage recommendations...');
56
- const usageRecommendations = this.generateUsageRecommendations(crossToolAnalysis, decisionMatrices);
57
- console.log('🚀 Generating comprehensive AI-powered report...');
58
- const markdownReport = await this.generatePlatformInsights(crossToolAnalysis, decisionMatrices, usageRecommendations, toolMetadata);
59
- return markdownReport;
60
- }
61
- loadToolMetadata() {
62
- try {
63
- const metadataPath = path.join(process.cwd(), 'src', 'evaluation', 'model-metadata.json');
64
- const metadataContent = fs.readFileSync(metadataPath, 'utf8');
65
- const metadata = JSON.parse(metadataContent);
66
- console.log(`✅ Loaded tool metadata with ${Object.keys(metadata.tools || {}).length} tools`);
67
- return metadata.tools || {};
55
+ let markdownReport;
56
+ if (skipReport) {
57
+ console.log('⏭️ Skipping AI report generation...');
58
+ // Return empty string if we're only generating graphs
59
+ markdownReport = '';
68
60
  }
69
- catch (error) {
70
- console.warn('⚠️ Failed to load tool metadata, proceeding without it:', error);
71
- return {};
61
+ else {
62
+ console.log('🎯 Generating decision matrices...');
63
+ const decisionMatrices = this.generateDecisionMatrices(crossToolAnalysis.modelPerformances);
64
+ console.log('💡 Creating usage recommendations...');
65
+ const usageRecommendations = this.generateUsageRecommendations(crossToolAnalysis, decisionMatrices);
66
+ console.log('🚀 Generating comprehensive AI-powered report...');
67
+ markdownReport = await this.generatePlatformInsights(crossToolAnalysis, decisionMatrices, usageRecommendations, toolMetadata);
72
68
  }
69
+ console.log('📊 Generating data visualizations...');
70
+ const reportWithGraphs = await this.addGraphsToReport(markdownReport, crossToolAnalysis.modelPerformances, graphsToGenerate);
71
+ return reportWithGraphs;
72
+ }
73
+ loadToolMetadata() {
74
+ const metadata = (0, metadata_loader_js_1.loadEvaluationMetadata)();
75
+ return { tools: metadata.tools };
73
76
  }
74
77
  async loadAllReports() {
75
78
  const reports = {};
76
79
  // Load all JSON result files from the directory
77
80
  const reportFiles = fs.readdirSync(this.reportsDir)
78
- .filter(file => file.endsWith('-results-*.json') || file.includes('-results-'))
79
- .filter(file => file.endsWith('.json'));
81
+ .filter(file => file.endsWith('-results.json'));
80
82
  if (reportFiles.length === 0) {
81
83
  throw new Error(`No evaluation result files found in ${this.reportsDir}`);
82
84
  }
83
85
  for (const fileName of reportFiles) {
84
86
  const reportPath = path.join(this.reportsDir, fileName);
85
87
  const reportContent = JSON.parse(fs.readFileSync(reportPath, 'utf8'));
86
- // Extract tool type from filename (e.g., "capability-results-2025-10-15.json" -> "capability")
87
- const toolType = fileName.split('-results-')[0];
88
+ // Extract tool type from filename (e.g., "capability-results.json" -> "capability")
89
+ const toolType = fileName.split('-results.json')[0];
88
90
  reports[toolType] = reportContent;
89
91
  console.log(`✅ Loaded ${toolType} report: ${fileName}`);
90
92
  }
@@ -355,6 +357,46 @@ class PlatformSynthesizer {
355
357
  }
356
358
  return fullModelId;
357
359
  }
360
+ /**
361
+ * Generates graphs and replaces placeholders in the markdown report
362
+ */
363
+ async addGraphsToReport(markdownContent, modelPerformances, graphsToGenerate) {
364
+ const graphGenerator = new graph_generator_js_1.GraphGenerator('./eval/analysis/platform/graphs');
365
+ try {
366
+ // Generate all or specific graphs
367
+ const graphResults = await graphGenerator.generateAllGraphs(modelPerformances, graphsToGenerate);
368
+ // Replace placeholders with actual image markdown
369
+ let updatedMarkdown = markdownContent;
370
+ const graphMappings = {
371
+ '[GRAPH:performance-tiers]': '![Performance Tiers](./graphs/performance-tiers.png)',
372
+ '[GRAPH:cost-vs-quality]': '![Cost vs Quality](./graphs/cost-vs-quality.png)',
373
+ '[GRAPH:reliability-comparison]': '![Reliability Comparison](./graphs/reliability-comparison.png)',
374
+ '[GRAPH:tool-performance-heatmap]': '![Tool Performance Heatmap](./graphs/tool-performance-heatmap.png)',
375
+ '[GRAPH:context-window-correlation]': '![Context Window Correlation](./graphs/context-window-correlation.png)'
376
+ };
377
+ for (const [placeholder, imageMarkdown] of Object.entries(graphMappings)) {
378
+ updatedMarkdown = updatedMarkdown.replace(placeholder, imageMarkdown);
379
+ }
380
+ // Log graph generation results
381
+ for (const [graphName, result] of Object.entries(graphResults)) {
382
+ if (result.success) {
383
+ console.log(` ✅ ${graphName}: ${result.graphPath}`);
384
+ }
385
+ else {
386
+ console.warn(` ⚠️ ${graphName}: ${result.error}`);
387
+ // If graph generation failed, remove the placeholder to avoid broken markdown
388
+ const placeholderKey = `[GRAPH:${graphName}]`;
389
+ updatedMarkdown = updatedMarkdown.replace(placeholderKey, `*Graph generation failed: ${result.error}*`);
390
+ }
391
+ }
392
+ return updatedMarkdown;
393
+ }
394
+ catch (error) {
395
+ console.error('⚠️ Failed to generate graphs, returning report without visualizations:', error);
396
+ // If graph generation completely fails, remove all placeholders
397
+ return markdownContent.replace(/\[GRAPH:[^\]]+\]/g, '*Graph generation failed*');
398
+ }
399
+ }
358
400
  async saveSynthesisReport(markdownContent, outputPath = './eval/analysis/platform/synthesis-report.md') {
359
401
  const dir = path.dirname(outputPath);
360
402
  if (!fs.existsSync(dir)) {
@@ -13,6 +13,21 @@ const model_config_js_1 = require("../core/model-config.js");
13
13
  async function runPlatformSynthesis() {
14
14
  console.log('🚀 Starting Platform-Wide AI Model Synthesis...\n');
15
15
  try {
16
+ // Parse command line arguments for graph filtering
17
+ const args = process.argv.slice(2);
18
+ let graphsToGenerate;
19
+ let skipReport = false;
20
+ if (args.length > 0) {
21
+ const graphArg = args.find(arg => arg.startsWith('--graphs='));
22
+ if (graphArg) {
23
+ graphsToGenerate = graphArg.split('=')[1].split(',');
24
+ console.log(`📊 Generating specific graphs: ${graphsToGenerate.join(', ')}\n`);
25
+ }
26
+ skipReport = args.includes('--skip-report');
27
+ if (skipReport) {
28
+ console.log('⏭️ Skipping AI report generation (graphs only)\n');
29
+ }
30
+ }
16
31
  // Initialize AI provider for synthesis analysis (use Claude for comprehensive analysis)
17
32
  const aiProvider = new vercel_provider_js_1.VercelProvider({
18
33
  provider: 'anthropic',
@@ -22,12 +37,14 @@ async function runPlatformSynthesis() {
22
37
  });
23
38
  // Initialize synthesizer
24
39
  const synthesizer = new platform_synthesizer_js_1.PlatformSynthesizer(aiProvider);
25
- // Generate comprehensive platform-wide analysis
40
+ // Generate comprehensive platform-wide analysis (or just graphs if skip-report is set)
26
41
  console.log('📊 Generating platform-wide analysis...');
27
- const markdownReport = await synthesizer.generatePlatformWideAnalysis();
28
- // Save synthesis report
29
- console.log('\n💾 Saving synthesis report...');
30
- await synthesizer.saveSynthesisReport(markdownReport);
42
+ const markdownReport = await synthesizer.generatePlatformWideAnalysis(graphsToGenerate, skipReport);
43
+ // Save synthesis report only if we generated it
44
+ if (!skipReport) {
45
+ console.log('\n💾 Saving synthesis report...');
46
+ await synthesizer.saveSynthesisReport(markdownReport);
47
+ }
31
48
  console.log('\n✅ Platform-wide synthesis complete!');
32
49
  console.log('📄 Report saved: ./eval/analysis/platform/synthesis-report.md');
33
50
  console.log('\n✨ AI-generated comprehensive report includes:');
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vfarcic/dot-ai",
3
- "version": "0.115.0",
3
+ "version": "0.116.0",
4
4
  "description": "AI-powered development productivity platform that enhances software development workflows through intelligent automation and AI-driven assistance",
5
5
  "mcpName": "io.github.vfarcic/dot-ai",
6
6
  "main": "dist/index.js",