@vfarcic/dot-ai 0.115.0 → 0.116.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/evaluation/eval-runner.js +12 -3
- package/dist/evaluation/evaluators/base-comparative.d.ts +2 -0
- package/dist/evaluation/evaluators/base-comparative.d.ts.map +1 -1
- package/dist/evaluation/evaluators/base-comparative.js +13 -1
- package/dist/evaluation/graph-generator.d.ts +56 -0
- package/dist/evaluation/graph-generator.d.ts.map +1 -0
- package/dist/evaluation/graph-generator.js +694 -0
- package/dist/evaluation/metadata-loader.d.ts +39 -0
- package/dist/evaluation/metadata-loader.d.ts.map +1 -0
- package/dist/evaluation/metadata-loader.js +74 -0
- package/dist/evaluation/platform-synthesizer.d.ts +5 -1
- package/dist/evaluation/platform-synthesizer.d.ts.map +1 -1
- package/dist/evaluation/platform-synthesizer.js +65 -23
- package/dist/evaluation/run-platform-synthesis.js +22 -5
- package/package.json +1 -1
|
@@ -295,9 +295,8 @@ async function runEvaluation(evaluatorType, datasetsDir, modelMetadata) {
|
|
|
295
295
|
const reportContent = generateMarkdownReport(results, stats, evaluatorType, finalAssessment);
|
|
296
296
|
const jsonResults = generateJsonReport(results, stats, evaluatorType, modelMetadata, finalAssessment);
|
|
297
297
|
// Save reports to files
|
|
298
|
-
const
|
|
299
|
-
const
|
|
300
|
-
const jsonPath = `./eval/analysis/individual/${evaluatorType}-results-${dateStamp}.json`;
|
|
298
|
+
const markdownPath = `./eval/analysis/individual/${evaluatorType}-evaluation.md`;
|
|
299
|
+
const jsonPath = `./eval/analysis/individual/${evaluatorType}-results.json`;
|
|
301
300
|
const reportDir = './eval/analysis/individual';
|
|
302
301
|
// Ensure report directory exists
|
|
303
302
|
const fs = await Promise.resolve().then(() => __importStar(require('fs')));
|
|
@@ -328,6 +327,16 @@ async function main() {
|
|
|
328
327
|
catch (error) {
|
|
329
328
|
console.warn('⚠️ Could not clean debug files:', error instanceof Error ? error.message : String(error));
|
|
330
329
|
}
|
|
330
|
+
// Clean old evaluation result files from eval/results
|
|
331
|
+
console.log('🧹 Cleaning old evaluation result files...');
|
|
332
|
+
try {
|
|
333
|
+
await execAsync('rm -f ./eval/results/*_comparative_evaluation_*.jsonl 2>/dev/null || true');
|
|
334
|
+
await execAsync('mkdir -p ./eval/results');
|
|
335
|
+
console.log('✅ Old evaluation results cleaned\n');
|
|
336
|
+
}
|
|
337
|
+
catch (error) {
|
|
338
|
+
console.warn('⚠️ Could not clean old evaluation results:', error instanceof Error ? error.message : String(error));
|
|
339
|
+
}
|
|
331
340
|
// Check model metadata freshness before starting any evaluation work
|
|
332
341
|
const modelMetadata = loadModelMetadata();
|
|
333
342
|
const datasetsDir = './eval/datasets';
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
import { EvaluationScore } from './base.js';
|
|
8
8
|
import { VercelProvider } from '../../core/providers/vercel-provider';
|
|
9
9
|
import { DatasetAnalyzer, ComparisonScenario } from '../dataset-analyzer.js';
|
|
10
|
+
import { type EvaluationMetadata } from '../metadata-loader.js';
|
|
10
11
|
export interface ComparativeEvaluationResult {
|
|
11
12
|
scenario_summary: string;
|
|
12
13
|
models_compared: string[];
|
|
@@ -49,6 +50,7 @@ export declare abstract class BaseComparativeEvaluator {
|
|
|
49
50
|
protected evaluatorModel: VercelProvider;
|
|
50
51
|
protected datasetAnalyzer: DatasetAnalyzer;
|
|
51
52
|
protected promptTemplate: string;
|
|
53
|
+
protected metadata: EvaluationMetadata;
|
|
52
54
|
constructor(datasetDir?: string);
|
|
53
55
|
/**
|
|
54
56
|
* Initialize the evaluator - must be called by subclass constructor
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"base-comparative.d.ts","sourceRoot":"","sources":["../../../src/evaluation/evaluators/base-comparative.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;AAKtE,OAAO,EAAE,eAAe,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAC;
|
|
1
|
+
{"version":3,"file":"base-comparative.d.ts","sourceRoot":"","sources":["../../../src/evaluation/evaluators/base-comparative.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;AAKtE,OAAO,EAAE,eAAe,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAC;AAC7E,OAAO,EAAsE,KAAK,kBAAkB,EAAE,MAAM,uBAAuB,CAAC;AAEpI,MAAM,WAAW,2BAA2B;IAC1C,gBAAgB,EAAE,MAAM,CAAC;IACzB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,oBAAoB,EAAE,MAAM,CAAC,MAAM,EAAE;QACnC,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;QAC1B,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAC3B,mBAAmB,CAAC,EAAE,MAAM,CAAC;QAC7B,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAC3B,cAAc,EAAE,MAAM,CAAC;QACvB,SAAS,EAAE,MAAM,CAAC;QAClB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;IACH,OAAO,EAAE,KAAK,CAAC;QACb,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED,MAAM,WAAW,0BAA2B,SAAQ,eAAe;IACjE,aAAa,EAAE,KAAK,CAAC;QACnB,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;KACf,CAAC,CAAC;IACH,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,8BAAsB,wBAAwB;IAC5C,QAAQ,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IACtC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IACnD,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAE7C,SAAS,CAAC,cAAc,EAAE,cAAc,CAAC;IACzC,SAAS,CAAC,eAAe,EAAE,eAAe,CAAC;IAC3C,SAAS,CAAC,cAAc,EAAE,MAAM,CAAC;IACjC,SAAS,CAAC,QAAQ,EAAE,kBAAkB,CAAC;gBAE3B,UAAU,CAAC,EAAE,MAAM;IAkB/B;;OAEG;IACH,SAAS,CAAC,gBAAgB;IAK1B;;;OAGG;IACG,oBAAoB,IAAI,OAAO,CAAC,0BAA0B,EAAE,CAAC;IAkBnE;;OAEG;IACG,sBAAsB,CAAC,eAAe,EAAE,0BAA0B,EAAE,GAAG,OAAO,CAAC,GAAG,CAAC;IA6CzF;;OAEG;IACG,gBAAgB,CAAC,QAAQ,EAAE,kBAAkB,GAAG,OAAO,CAAC,0BAA0B,CAAC;IAuFzF;;OAEG;IACH,SAAS,CAAC,qBAAqB,CAAC,QAAQ,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM;IAgB
pH;;OAEG;IACH,eAAe;;;;;;IAIf;;;OAGG;IACH,QAAQ,CAAC,mBAAmB,IAAI;QAC9B,KAAK,EAAE,MAAM,CAAC;QACd,WAAW,EAAE,MAAM,CAAC;QACpB,eAAe,EAAE,MAAM,EAAE,CAAC;QAC1B,aAAa,EAAE,MAAM,CAAC;KACvB,EAAE;CACJ"}
|
|
@@ -13,10 +13,12 @@ const platform_utils_1 = require("../../core/platform-utils");
|
|
|
13
13
|
const fs_1 = require("fs");
|
|
14
14
|
const path_1 = require("path");
|
|
15
15
|
const dataset_analyzer_js_1 = require("../dataset-analyzer.js");
|
|
16
|
+
const metadata_loader_js_1 = require("../metadata-loader.js");
|
|
16
17
|
class BaseComparativeEvaluator {
|
|
17
18
|
evaluatorModel;
|
|
18
19
|
datasetAnalyzer;
|
|
19
20
|
promptTemplate;
|
|
21
|
+
metadata;
|
|
20
22
|
constructor(datasetDir) {
|
|
21
23
|
// Use Claude via VercelProvider as the evaluator (most reliable for complex comparative evaluation)
|
|
22
24
|
this.evaluatorModel = new vercel_provider_1.VercelProvider({
|
|
@@ -28,6 +30,8 @@ class BaseComparativeEvaluator {
|
|
|
28
30
|
this.datasetAnalyzer = new dataset_analyzer_js_1.DatasetAnalyzer(datasetDir || './eval/datasets');
|
|
29
31
|
// Prompt template will be loaded by subclass
|
|
30
32
|
this.promptTemplate = '';
|
|
33
|
+
// Load metadata
|
|
34
|
+
this.metadata = (0, metadata_loader_js_1.loadEvaluationMetadata)();
|
|
31
35
|
}
|
|
32
36
|
/**
|
|
33
37
|
* Initialize the evaluator - must be called by subclass constructor
|
|
@@ -122,6 +126,7 @@ class BaseComparativeEvaluator {
|
|
|
122
126
|
${reliabilityContext}
|
|
123
127
|
|
|
124
128
|
**Response:**
|
|
129
|
+
|
|
125
130
|
${modelResponse.response}
|
|
126
131
|
|
|
127
132
|
---`;
|
|
@@ -171,11 +176,18 @@ ${modelResponse.response}
|
|
|
171
176
|
* Build the evaluation prompt - can be overridden by subclasses for custom behavior
|
|
172
177
|
*/
|
|
173
178
|
buildEvaluationPrompt(scenario, modelResponsesText, modelList) {
|
|
179
|
+
// Build metadata context sections
|
|
180
|
+
const pricingContext = (0, metadata_loader_js_1.buildModelPricingContext)(this.metadata.models);
|
|
181
|
+
const toolContext = (0, metadata_loader_js_1.buildToolContext)(this.toolName, this.metadata.tools);
|
|
182
|
+
// Inject all data into prompt template via placeholders
|
|
174
183
|
return this.promptTemplate
|
|
184
|
+
.replace('{pricing_context}', pricingContext)
|
|
185
|
+
.replace('{tool_context}', toolContext)
|
|
175
186
|
.replace('{issue}', scenario.issue)
|
|
176
187
|
.replace('{model_responses}', modelResponsesText)
|
|
177
188
|
.replace('{model_list}', modelList)
|
|
178
|
-
.replace('{phase}', scenario.interaction_id)
|
|
189
|
+
.replace('{phase}', scenario.interaction_id)
|
|
190
|
+
.replace('{scenario_name}', scenario.interaction_id);
|
|
179
191
|
}
|
|
180
192
|
/**
|
|
181
193
|
* Get statistics about available datasets
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import type { ModelPerformance } from './platform-synthesizer.js';
|
|
2
|
+
export interface GraphGenerationResult {
|
|
3
|
+
success: boolean;
|
|
4
|
+
graphPath?: string;
|
|
5
|
+
error?: string;
|
|
6
|
+
}
|
|
7
|
+
/**
|
|
8
|
+
* GraphGenerator creates data visualizations for platform synthesis reports.
|
|
9
|
+
* Uses QuickChart.io API to generate chart images without requiring native dependencies.
|
|
10
|
+
*/
|
|
11
|
+
export declare class GraphGenerator {
|
|
12
|
+
private outputDir;
|
|
13
|
+
private quickchartBaseUrl;
|
|
14
|
+
constructor(outputDir?: string);
|
|
15
|
+
/**
|
|
16
|
+
* Generates all or specific graphs for the platform report
|
|
17
|
+
* @param modelPerformances Model performance data
|
|
18
|
+
* @param graphNames Optional array of specific graph names to generate. If not provided, generates all graphs.
|
|
19
|
+
* Valid names: 'performance-tiers', 'cost-vs-quality', 'reliability-comparison',
|
|
20
|
+
* 'tool-performance-heatmap', 'context-window-correlation'
|
|
21
|
+
*/
|
|
22
|
+
generateAllGraphs(modelPerformances: ModelPerformance[], graphNames?: string[]): Promise<Record<string, GraphGenerationResult>>;
|
|
23
|
+
/**
|
|
24
|
+
* Graph 1: Performance Tiers - Grouped bar chart showing score, reliability, and consistency
|
|
25
|
+
*/
|
|
26
|
+
private generatePerformanceTiersGraph;
|
|
27
|
+
/**
|
|
28
|
+
* Graph 2: Cost vs Quality - Line chart showing input/output cost range per model
|
|
29
|
+
*/
|
|
30
|
+
private generateCostVsQualityGraph;
|
|
31
|
+
/**
|
|
32
|
+
* Graph 3: Reliability Comparison - Bar chart with reliability scores
|
|
33
|
+
*/
|
|
34
|
+
private generateReliabilityComparisonGraph;
|
|
35
|
+
/**
|
|
36
|
+
* Graph 4: Tool Performance Heatmap - Shows model scores per tool
|
|
37
|
+
*/
|
|
38
|
+
private generateToolPerformanceHeatmap;
|
|
39
|
+
/**
|
|
40
|
+
* Graph 5: Context Window Correlation - Scatter plot showing context window vs performance
|
|
41
|
+
*/
|
|
42
|
+
private generateContextWindowCorrelationGraph;
|
|
43
|
+
/**
|
|
44
|
+
* Downloads a chart from QuickChart.io API and saves it as PNG
|
|
45
|
+
*/
|
|
46
|
+
private downloadChart;
|
|
47
|
+
/**
|
|
48
|
+
* Cleans model names by removing provider prefixes
|
|
49
|
+
*/
|
|
50
|
+
private cleanModelName;
|
|
51
|
+
/**
|
|
52
|
+
* Returns a consistent color for each tool index (supports up to 10 tools)
|
|
53
|
+
*/
|
|
54
|
+
private getToolColor;
|
|
55
|
+
}
|
|
56
|
+
//# sourceMappingURL=graph-generator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"graph-generator.d.ts","sourceRoot":"","sources":["../../src/evaluation/graph-generator.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,2BAA2B,CAAC;AAElE,MAAM,WAAW,qBAAqB;IACpC,OAAO,EAAE,OAAO,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;GAGG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,iBAAiB,CAAiC;gBAE9C,SAAS,SAAoC;IAIzD;;;;;;OAMG;IACG,iBAAiB,CACrB,iBAAiB,EAAE,gBAAgB,EAAE,EACrC,UAAU,CAAC,EAAE,MAAM,EAAE,GACpB,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,qBAAqB,CAAC,CAAC;IAkCjD;;OAEG;YACW,6BAA6B;IAiH3C;;OAEG;YACW,0BAA0B;IAsHxC;;OAEG;YACW,kCAAkC;IA4GhD;;OAEG;YACW,8BAA8B;IAqG5C;;OAEG;YACW,qCAAqC;IAyHnD;;OAEG;YACW,aAAa;IA8B3B;;OAEG;IACH,OAAO,CAAC,cAAc;IAQtB;;OAEG;IACH,OAAO,CAAC,YAAY;CAerB"}
|
|
@@ -0,0 +1,694 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.GraphGenerator = void 0;
|
|
37
|
+
const fs = __importStar(require("fs"));
|
|
38
|
+
const path = __importStar(require("path"));
|
|
39
|
+
const https = __importStar(require("https"));
|
|
40
|
+
/**
|
|
41
|
+
* GraphGenerator creates data visualizations for platform synthesis reports.
|
|
42
|
+
* Uses QuickChart.io API to generate chart images without requiring native dependencies.
|
|
43
|
+
*/
|
|
44
|
+
class GraphGenerator {
|
|
45
|
+
outputDir;
|
|
46
|
+
quickchartBaseUrl = 'https://quickchart.io/chart';
|
|
47
|
+
constructor(outputDir = './eval/analysis/platform/graphs') {
|
|
48
|
+
this.outputDir = outputDir;
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Generates all or specific graphs for the platform report
|
|
52
|
+
* @param modelPerformances Model performance data
|
|
53
|
+
* @param graphNames Optional array of specific graph names to generate. If not provided, generates all graphs.
|
|
54
|
+
* Valid names: 'performance-tiers', 'cost-vs-quality', 'reliability-comparison',
|
|
55
|
+
* 'tool-performance-heatmap', 'context-window-correlation'
|
|
56
|
+
*/
|
|
57
|
+
async generateAllGraphs(modelPerformances, graphNames) {
|
|
58
|
+
// Ensure output directory exists
|
|
59
|
+
if (!fs.existsSync(this.outputDir)) {
|
|
60
|
+
fs.mkdirSync(this.outputDir, { recursive: true });
|
|
61
|
+
}
|
|
62
|
+
const results = {};
|
|
63
|
+
// Define all available graphs
|
|
64
|
+
const allGraphs = {
|
|
65
|
+
'performance-tiers': () => this.generatePerformanceTiersGraph(modelPerformances),
|
|
66
|
+
'cost-vs-quality': () => this.generateCostVsQualityGraph(modelPerformances),
|
|
67
|
+
'reliability-comparison': () => this.generateReliabilityComparisonGraph(modelPerformances),
|
|
68
|
+
'tool-performance-heatmap': () => this.generateToolPerformanceHeatmap(modelPerformances),
|
|
69
|
+
'context-window-correlation': () => this.generateContextWindowCorrelationGraph(modelPerformances)
|
|
70
|
+
};
|
|
71
|
+
// If specific graphs requested, only generate those
|
|
72
|
+
const graphsToGenerate = graphNames && graphNames.length > 0
|
|
73
|
+
? graphNames
|
|
74
|
+
: Object.keys(allGraphs);
|
|
75
|
+
// Generate requested graphs
|
|
76
|
+
for (const graphName of graphsToGenerate) {
|
|
77
|
+
if (allGraphs[graphName]) {
|
|
78
|
+
results[graphName] = await allGraphs[graphName]();
|
|
79
|
+
}
|
|
80
|
+
else {
|
|
81
|
+
console.warn(`⚠️ Unknown graph name: ${graphName}`);
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
return results;
|
|
85
|
+
}
|
|
86
|
+
/**
|
|
87
|
+
* Graph 1: Performance Tiers - Grouped bar chart showing score, reliability, and consistency
|
|
88
|
+
*/
|
|
89
|
+
async generatePerformanceTiersGraph(modelPerformances) {
|
|
90
|
+
try {
|
|
91
|
+
// Sort by average score descending, take top 10 models
|
|
92
|
+
const topModels = modelPerformances
|
|
93
|
+
.sort((a, b) => b.averageScore - a.averageScore)
|
|
94
|
+
.slice(0, 10);
|
|
95
|
+
// Clean model names (remove "vercel_" prefix)
|
|
96
|
+
const labels = topModels.map(m => this.cleanModelName(m.modelId));
|
|
97
|
+
const scores = topModels.map(m => m.averageScore);
|
|
98
|
+
const reliability = topModels.map(m => m.reliabilityScore);
|
|
99
|
+
const consistency = topModels.map(m => m.consistencyAcrossTools);
|
|
100
|
+
const chartConfig = {
|
|
101
|
+
type: 'bar',
|
|
102
|
+
data: {
|
|
103
|
+
labels,
|
|
104
|
+
datasets: [
|
|
105
|
+
{
|
|
106
|
+
label: 'Overall Score',
|
|
107
|
+
data: scores,
|
|
108
|
+
backgroundColor: 'rgba(54, 162, 235, 0.9)',
|
|
109
|
+
borderColor: 'rgba(54, 162, 235, 1)',
|
|
110
|
+
borderWidth: 1
|
|
111
|
+
},
|
|
112
|
+
{
|
|
113
|
+
label: 'Reliability',
|
|
114
|
+
data: reliability,
|
|
115
|
+
backgroundColor: 'rgba(75, 192, 192, 0.9)',
|
|
116
|
+
borderColor: 'rgba(75, 192, 192, 1)',
|
|
117
|
+
borderWidth: 1
|
|
118
|
+
},
|
|
119
|
+
{
|
|
120
|
+
label: 'Consistency',
|
|
121
|
+
data: consistency,
|
|
122
|
+
backgroundColor: 'rgba(153, 102, 255, 0.9)',
|
|
123
|
+
borderColor: 'rgba(153, 102, 255, 1)',
|
|
124
|
+
borderWidth: 1
|
|
125
|
+
}
|
|
126
|
+
]
|
|
127
|
+
},
|
|
128
|
+
options: {
|
|
129
|
+
plugins: {
|
|
130
|
+
datalabels: {
|
|
131
|
+
display: false
|
|
132
|
+
}
|
|
133
|
+
},
|
|
134
|
+
title: {
|
|
135
|
+
display: true,
|
|
136
|
+
text: 'Model Performance Tiers: Score, Reliability, and Consistency',
|
|
137
|
+
fontSize: 18,
|
|
138
|
+
fontColor: '#FFFFFF',
|
|
139
|
+
fontStyle: 'bold'
|
|
140
|
+
},
|
|
141
|
+
scales: {
|
|
142
|
+
yAxes: [{
|
|
143
|
+
ticks: {
|
|
144
|
+
beginAtZero: true,
|
|
145
|
+
max: 1.0,
|
|
146
|
+
stepSize: 0.1,
|
|
147
|
+
fontColor: '#FFFFFF',
|
|
148
|
+
fontSize: 12
|
|
149
|
+
},
|
|
150
|
+
scaleLabel: {
|
|
151
|
+
display: true,
|
|
152
|
+
labelString: 'Score (0-1)',
|
|
153
|
+
fontColor: '#FFFFFF',
|
|
154
|
+
fontSize: 14
|
|
155
|
+
},
|
|
156
|
+
gridLines: {
|
|
157
|
+
color: 'rgba(255, 255, 255, 0.2)',
|
|
158
|
+
zeroLineColor: 'rgba(255, 255, 255, 0.4)'
|
|
159
|
+
}
|
|
160
|
+
}],
|
|
161
|
+
xAxes: [{
|
|
162
|
+
ticks: {
|
|
163
|
+
autoSkip: false,
|
|
164
|
+
maxRotation: 45,
|
|
165
|
+
minRotation: 45,
|
|
166
|
+
fontColor: '#FFFFFF',
|
|
167
|
+
fontSize: 11
|
|
168
|
+
},
|
|
169
|
+
gridLines: {
|
|
170
|
+
color: 'rgba(255, 255, 255, 0.1)'
|
|
171
|
+
}
|
|
172
|
+
}]
|
|
173
|
+
},
|
|
174
|
+
legend: {
|
|
175
|
+
display: true,
|
|
176
|
+
position: 'top',
|
|
177
|
+
labels: {
|
|
178
|
+
fontColor: '#FFFFFF',
|
|
179
|
+
fontSize: 13
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
};
|
|
184
|
+
const outputPath = path.join(this.outputDir, 'performance-tiers.png');
|
|
185
|
+
await this.downloadChart(chartConfig, outputPath);
|
|
186
|
+
return {
|
|
187
|
+
success: true,
|
|
188
|
+
graphPath: outputPath
|
|
189
|
+
};
|
|
190
|
+
}
|
|
191
|
+
catch (error) {
|
|
192
|
+
return {
|
|
193
|
+
success: false,
|
|
194
|
+
error: `Failed to generate performance tiers graph: ${error}`
|
|
195
|
+
};
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
/**
|
|
199
|
+
* Graph 2: Cost vs Quality - Line chart showing input/output cost range per model
|
|
200
|
+
*/
|
|
201
|
+
async generateCostVsQualityGraph(modelPerformances) {
|
|
202
|
+
try {
|
|
203
|
+
// Filter out models with no pricing data and sort by quality score descending
|
|
204
|
+
const modelsWithPricing = modelPerformances
|
|
205
|
+
.filter(m => m.pricing.input_cost_per_million_tokens > 0 || m.pricing.output_cost_per_million_tokens > 0)
|
|
206
|
+
.sort((a, b) => b.averageScore - a.averageScore);
|
|
207
|
+
// Create datasets: one for each model showing the cost range line
|
|
208
|
+
const datasets = modelsWithPricing.map((m, idx) => {
|
|
209
|
+
const inputCost = m.pricing.input_cost_per_million_tokens;
|
|
210
|
+
const outputCost = m.pricing.output_cost_per_million_tokens;
|
|
211
|
+
const color = this.getToolColor(idx);
|
|
212
|
+
// Line from input cost to output cost at the model's quality score
|
|
213
|
+
return {
|
|
214
|
+
label: this.cleanModelName(m.modelId),
|
|
215
|
+
data: [
|
|
216
|
+
{ x: inputCost, y: m.averageScore },
|
|
217
|
+
{ x: outputCost, y: m.averageScore }
|
|
218
|
+
],
|
|
219
|
+
borderColor: color,
|
|
220
|
+
backgroundColor: color,
|
|
221
|
+
borderWidth: 3,
|
|
222
|
+
pointRadius: 5,
|
|
223
|
+
pointHoverRadius: 7,
|
|
224
|
+
fill: false,
|
|
225
|
+
showLine: true,
|
|
226
|
+
tension: 0
|
|
227
|
+
};
|
|
228
|
+
});
|
|
229
|
+
const chartConfig = {
|
|
230
|
+
type: 'line',
|
|
231
|
+
data: { datasets },
|
|
232
|
+
options: {
|
|
233
|
+
plugins: {
|
|
234
|
+
datalabels: {
|
|
235
|
+
display: false
|
|
236
|
+
}
|
|
237
|
+
},
|
|
238
|
+
title: {
|
|
239
|
+
display: true,
|
|
240
|
+
text: 'Cost vs Quality Analysis (line shows input → output cost range)',
|
|
241
|
+
fontSize: 18,
|
|
242
|
+
fontColor: '#FFFFFF',
|
|
243
|
+
fontStyle: 'bold'
|
|
244
|
+
},
|
|
245
|
+
scales: {
|
|
246
|
+
xAxes: [{
|
|
247
|
+
type: 'linear',
|
|
248
|
+
scaleLabel: {
|
|
249
|
+
display: true,
|
|
250
|
+
labelString: 'Cost per 1M Tokens in $ (Input ← → Output)',
|
|
251
|
+
fontColor: '#FFFFFF',
|
|
252
|
+
fontSize: 14
|
|
253
|
+
},
|
|
254
|
+
ticks: {
|
|
255
|
+
callback: function (value) {
|
|
256
|
+
return '$' + value;
|
|
257
|
+
},
|
|
258
|
+
fontColor: '#FFFFFF',
|
|
259
|
+
fontSize: 12
|
|
260
|
+
},
|
|
261
|
+
gridLines: {
|
|
262
|
+
color: 'rgba(255, 255, 255, 0.2)',
|
|
263
|
+
zeroLineColor: 'rgba(255, 255, 255, 0.4)'
|
|
264
|
+
}
|
|
265
|
+
}],
|
|
266
|
+
yAxes: [{
|
|
267
|
+
scaleLabel: {
|
|
268
|
+
display: true,
|
|
269
|
+
labelString: 'Overall Score',
|
|
270
|
+
fontColor: '#FFFFFF',
|
|
271
|
+
fontSize: 14
|
|
272
|
+
},
|
|
273
|
+
ticks: {
|
|
274
|
+
beginAtZero: false,
|
|
275
|
+
min: 0.3,
|
|
276
|
+
max: 1.0,
|
|
277
|
+
fontColor: '#FFFFFF',
|
|
278
|
+
fontSize: 12
|
|
279
|
+
},
|
|
280
|
+
gridLines: {
|
|
281
|
+
color: 'rgba(255, 255, 255, 0.2)',
|
|
282
|
+
zeroLineColor: 'rgba(255, 255, 255, 0.4)'
|
|
283
|
+
}
|
|
284
|
+
}]
|
|
285
|
+
},
|
|
286
|
+
legend: {
|
|
287
|
+
display: true,
|
|
288
|
+
position: 'right',
|
|
289
|
+
labels: {
|
|
290
|
+
fontColor: '#FFFFFF',
|
|
291
|
+
fontSize: 10,
|
|
292
|
+
boxWidth: 15,
|
|
293
|
+
usePointStyle: true
|
|
294
|
+
}
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
};
|
|
298
|
+
const outputPath = path.join(this.outputDir, 'cost-vs-quality.png');
|
|
299
|
+
await this.downloadChart(chartConfig, outputPath);
|
|
300
|
+
return {
|
|
301
|
+
success: true,
|
|
302
|
+
graphPath: outputPath
|
|
303
|
+
};
|
|
304
|
+
}
|
|
305
|
+
catch (error) {
|
|
306
|
+
return {
|
|
307
|
+
success: false,
|
|
308
|
+
error: `Failed to generate cost vs quality graph: ${error}`
|
|
309
|
+
};
|
|
310
|
+
}
|
|
311
|
+
}
|
|
312
|
+
/**
|
|
313
|
+
* Graph 3: Reliability Comparison - Bar chart with reliability scores
|
|
314
|
+
*/
|
|
315
|
+
async generateReliabilityComparisonGraph(modelPerformances) {
|
|
316
|
+
try {
|
|
317
|
+
// Sort by reliability descending
|
|
318
|
+
const sortedModels = modelPerformances
|
|
319
|
+
.sort((a, b) => b.reliabilityScore - a.reliabilityScore);
|
|
320
|
+
const labels = sortedModels.map(m => this.cleanModelName(m.modelId));
|
|
321
|
+
const reliabilityScores = sortedModels.map(m => m.reliabilityScore);
|
|
322
|
+
// Create separate datasets for legend
|
|
323
|
+
const datasets = [
|
|
324
|
+
{
|
|
325
|
+
label: 'High Reliability (≥0.9)',
|
|
326
|
+
data: reliabilityScores.map(score => score >= 0.9 ? score : null),
|
|
327
|
+
backgroundColor: 'rgba(75, 192, 192, 0.8)',
|
|
328
|
+
borderWidth: 1
|
|
329
|
+
},
|
|
330
|
+
{
|
|
331
|
+
label: 'Medium Reliability (0.7-0.9)',
|
|
332
|
+
data: reliabilityScores.map(score => score >= 0.7 && score < 0.9 ? score : null),
|
|
333
|
+
backgroundColor: 'rgba(255, 206, 86, 0.8)',
|
|
334
|
+
borderWidth: 1
|
|
335
|
+
},
|
|
336
|
+
{
|
|
337
|
+
label: 'Low Reliability (<0.7)',
|
|
338
|
+
data: reliabilityScores.map(score => score < 0.7 ? score : null),
|
|
339
|
+
backgroundColor: 'rgba(255, 99, 132, 0.8)',
|
|
340
|
+
borderWidth: 1
|
|
341
|
+
}
|
|
342
|
+
];
|
|
343
|
+
const chartConfig = {
|
|
344
|
+
type: 'horizontalBar',
|
|
345
|
+
data: {
|
|
346
|
+
labels,
|
|
347
|
+
datasets
|
|
348
|
+
},
|
|
349
|
+
options: {
|
|
350
|
+
plugins: {
|
|
351
|
+
datalabels: {
|
|
352
|
+
display: false
|
|
353
|
+
}
|
|
354
|
+
},
|
|
355
|
+
title: {
|
|
356
|
+
display: true,
|
|
357
|
+
text: 'Model Reliability Comparison',
|
|
358
|
+
fontSize: 18,
|
|
359
|
+
fontColor: '#FFFFFF',
|
|
360
|
+
fontStyle: 'bold'
|
|
361
|
+
},
|
|
362
|
+
scales: {
|
|
363
|
+
xAxes: [{
|
|
364
|
+
stacked: true,
|
|
365
|
+
ticks: {
|
|
366
|
+
beginAtZero: true,
|
|
367
|
+
max: 1.0,
|
|
368
|
+
stepSize: 0.1,
|
|
369
|
+
fontColor: '#FFFFFF',
|
|
370
|
+
fontSize: 12
|
|
371
|
+
},
|
|
372
|
+
scaleLabel: {
|
|
373
|
+
display: true,
|
|
374
|
+
labelString: 'Reliability Score (0-1)',
|
|
375
|
+
fontColor: '#FFFFFF',
|
|
376
|
+
fontSize: 14
|
|
377
|
+
},
|
|
378
|
+
gridLines: {
|
|
379
|
+
color: 'rgba(255, 255, 255, 0.2)',
|
|
380
|
+
zeroLineColor: 'rgba(255, 255, 255, 0.4)'
|
|
381
|
+
}
|
|
382
|
+
}],
|
|
383
|
+
yAxes: [{
|
|
384
|
+
stacked: true,
|
|
385
|
+
ticks: {
|
|
386
|
+
fontColor: '#FFFFFF',
|
|
387
|
+
fontSize: 11
|
|
388
|
+
},
|
|
389
|
+
gridLines: {
|
|
390
|
+
color: 'rgba(255, 255, 255, 0.1)'
|
|
391
|
+
}
|
|
392
|
+
}]
|
|
393
|
+
},
|
|
394
|
+
legend: {
|
|
395
|
+
display: true,
|
|
396
|
+
position: 'top',
|
|
397
|
+
labels: {
|
|
398
|
+
fontColor: '#FFFFFF',
|
|
399
|
+
fontSize: 12
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
};
|
|
404
|
+
const outputPath = path.join(this.outputDir, 'reliability-comparison.png');
|
|
405
|
+
await this.downloadChart(chartConfig, outputPath);
|
|
406
|
+
return {
|
|
407
|
+
success: true,
|
|
408
|
+
graphPath: outputPath
|
|
409
|
+
};
|
|
410
|
+
}
|
|
411
|
+
catch (error) {
|
|
412
|
+
return {
|
|
413
|
+
success: false,
|
|
414
|
+
error: `Failed to generate reliability comparison graph: ${error}`
|
|
415
|
+
};
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
/**
|
|
419
|
+
* Graph 4: Tool Performance Heatmap - Shows model scores per tool
|
|
420
|
+
*/
|
|
421
|
+
async generateToolPerformanceHeatmap(modelPerformances) {
|
|
422
|
+
try {
|
|
423
|
+
// Get all unique tool names
|
|
424
|
+
const toolNames = new Set();
|
|
425
|
+
modelPerformances.forEach(m => {
|
|
426
|
+
Object.keys(m.toolScores).forEach(tool => toolNames.add(tool));
|
|
427
|
+
});
|
|
428
|
+
const tools = Array.from(toolNames).sort();
|
|
429
|
+
// Sort models by average score
|
|
430
|
+
const sortedModels = modelPerformances
|
|
431
|
+
.sort((a, b) => b.averageScore - a.averageScore)
|
|
432
|
+
.slice(0, 10); // Top 10 models
|
|
433
|
+
// Create matrix data
|
|
434
|
+
const labels = sortedModels.map(m => this.cleanModelName(m.modelId));
|
|
435
|
+
const datasets = tools.map((tool, idx) => ({
|
|
436
|
+
label: tool.charAt(0).toUpperCase() + tool.slice(1),
|
|
437
|
+
data: sortedModels.map(m => m.toolScores[tool] || 0),
|
|
438
|
+
backgroundColor: this.getToolColor(idx),
|
|
439
|
+
borderWidth: 1
|
|
440
|
+
}));
|
|
441
|
+
const chartConfig = {
|
|
442
|
+
type: 'horizontalBar',
|
|
443
|
+
data: {
|
|
444
|
+
labels,
|
|
445
|
+
datasets
|
|
446
|
+
},
|
|
447
|
+
options: {
|
|
448
|
+
plugins: {
|
|
449
|
+
datalabels: {
|
|
450
|
+
display: false
|
|
451
|
+
}
|
|
452
|
+
},
|
|
453
|
+
title: {
|
|
454
|
+
display: true,
|
|
455
|
+
text: 'Tool-Specific Performance Patterns',
|
|
456
|
+
fontSize: 18,
|
|
457
|
+
fontColor: '#FFFFFF',
|
|
458
|
+
fontStyle: 'bold'
|
|
459
|
+
},
|
|
460
|
+
scales: {
|
|
461
|
+
xAxes: [{
|
|
462
|
+
stacked: false,
|
|
463
|
+
ticks: {
|
|
464
|
+
beginAtZero: true,
|
|
465
|
+
max: 1.0,
|
|
466
|
+
stepSize: 0.2,
|
|
467
|
+
fontColor: '#FFFFFF',
|
|
468
|
+
fontSize: 12
|
|
469
|
+
},
|
|
470
|
+
scaleLabel: {
|
|
471
|
+
display: true,
|
|
472
|
+
labelString: 'Tool Score',
|
|
473
|
+
fontColor: '#FFFFFF',
|
|
474
|
+
fontSize: 14
|
|
475
|
+
},
|
|
476
|
+
gridLines: {
|
|
477
|
+
color: 'rgba(255, 255, 255, 0.2)',
|
|
478
|
+
zeroLineColor: 'rgba(255, 255, 255, 0.4)'
|
|
479
|
+
}
|
|
480
|
+
}],
|
|
481
|
+
yAxes: [{
|
|
482
|
+
stacked: false,
|
|
483
|
+
ticks: {
|
|
484
|
+
fontColor: '#FFFFFF',
|
|
485
|
+
fontSize: 11
|
|
486
|
+
},
|
|
487
|
+
gridLines: {
|
|
488
|
+
color: 'rgba(255, 255, 255, 0.1)'
|
|
489
|
+
}
|
|
490
|
+
}]
|
|
491
|
+
},
|
|
492
|
+
legend: {
|
|
493
|
+
display: true,
|
|
494
|
+
position: 'right',
|
|
495
|
+
labels: {
|
|
496
|
+
fontColor: '#FFFFFF',
|
|
497
|
+
fontSize: 12
|
|
498
|
+
}
|
|
499
|
+
}
|
|
500
|
+
}
|
|
501
|
+
};
|
|
502
|
+
const outputPath = path.join(this.outputDir, 'tool-performance-heatmap.png');
|
|
503
|
+
await this.downloadChart(chartConfig, outputPath);
|
|
504
|
+
return {
|
|
505
|
+
success: true,
|
|
506
|
+
graphPath: outputPath
|
|
507
|
+
};
|
|
508
|
+
}
|
|
509
|
+
catch (error) {
|
|
510
|
+
return {
|
|
511
|
+
success: false,
|
|
512
|
+
error: `Failed to generate tool performance heatmap: ${error}`
|
|
513
|
+
};
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
/**
|
|
517
|
+
* Graph 5: Context Window Correlation - Scatter plot showing context window vs performance
|
|
518
|
+
*/
|
|
519
|
+
async generateContextWindowCorrelationGraph(modelPerformances) {
|
|
520
|
+
try {
|
|
521
|
+
const scatterData = modelPerformances.map((m) => ({
|
|
522
|
+
x: m.capabilities.context_window / 1000, // Convert to thousands for readability
|
|
523
|
+
y: m.averageScore,
|
|
524
|
+
r: 8,
|
|
525
|
+
label: this.cleanModelName(m.modelId)
|
|
526
|
+
}));
|
|
527
|
+
const chartConfig = {
|
|
528
|
+
type: 'scatter',
|
|
529
|
+
data: {
|
|
530
|
+
datasets: [{
|
|
531
|
+
label: 'Models',
|
|
532
|
+
data: scatterData,
|
|
533
|
+
backgroundColor: 'rgba(153, 102, 255, 0.7)',
|
|
534
|
+
borderColor: 'rgba(153, 102, 255, 1)',
|
|
535
|
+
borderWidth: 2,
|
|
536
|
+
pointRadius: 10
|
|
537
|
+
}]
|
|
538
|
+
},
|
|
539
|
+
options: {
|
|
540
|
+
layout: {
|
|
541
|
+
padding: {
|
|
542
|
+
right: 300,
|
|
543
|
+
left: 20,
|
|
544
|
+
top: 20,
|
|
545
|
+
bottom: 20
|
|
546
|
+
}
|
|
547
|
+
},
|
|
548
|
+
plugins: {
|
|
549
|
+
datalabels: {
|
|
550
|
+
display: true,
|
|
551
|
+
align: 'right',
|
|
552
|
+
offset: 12,
|
|
553
|
+
color: '#FFFFFF',
|
|
554
|
+
font: {
|
|
555
|
+
size: 20
|
|
556
|
+
},
|
|
557
|
+
formatter: (value) => value.label
|
|
558
|
+
}
|
|
559
|
+
},
|
|
560
|
+
title: {
|
|
561
|
+
display: true,
|
|
562
|
+
text: 'Context Window Size vs Performance',
|
|
563
|
+
fontSize: 18,
|
|
564
|
+
fontColor: '#FFFFFF',
|
|
565
|
+
fontStyle: 'bold'
|
|
566
|
+
},
|
|
567
|
+
scales: {
|
|
568
|
+
xAxes: [{
|
|
569
|
+
type: 'linear',
|
|
570
|
+
scaleLabel: {
|
|
571
|
+
display: true,
|
|
572
|
+
labelString: 'Context Window Size (K tokens)',
|
|
573
|
+
fontColor: '#FFFFFF',
|
|
574
|
+
fontSize: 14
|
|
575
|
+
},
|
|
576
|
+
ticks: {
|
|
577
|
+
callback: (value) => value + 'K',
|
|
578
|
+
fontColor: '#FFFFFF',
|
|
579
|
+
fontSize: 12
|
|
580
|
+
},
|
|
581
|
+
gridLines: {
|
|
582
|
+
color: 'rgba(255, 255, 255, 0.2)',
|
|
583
|
+
zeroLineColor: 'rgba(255, 255, 255, 0.4)'
|
|
584
|
+
}
|
|
585
|
+
}],
|
|
586
|
+
yAxes: [{
|
|
587
|
+
scaleLabel: {
|
|
588
|
+
display: true,
|
|
589
|
+
labelString: 'Overall Score',
|
|
590
|
+
fontColor: '#FFFFFF',
|
|
591
|
+
fontSize: 14
|
|
592
|
+
},
|
|
593
|
+
ticks: {
|
|
594
|
+
beginAtZero: false,
|
|
595
|
+
min: 0.3,
|
|
596
|
+
max: 1.0,
|
|
597
|
+
fontColor: '#FFFFFF',
|
|
598
|
+
fontSize: 12
|
|
599
|
+
},
|
|
600
|
+
gridLines: {
|
|
601
|
+
color: 'rgba(255, 255, 255, 0.2)',
|
|
602
|
+
zeroLineColor: 'rgba(255, 255, 255, 0.4)'
|
|
603
|
+
}
|
|
604
|
+
}]
|
|
605
|
+
},
|
|
606
|
+
legend: {
|
|
607
|
+
display: false
|
|
608
|
+
},
|
|
609
|
+
tooltips: {
|
|
610
|
+
backgroundColor: 'rgba(0, 0, 0, 0.8)',
|
|
611
|
+
titleFontColor: '#FFFFFF',
|
|
612
|
+
bodyFontColor: '#FFFFFF',
|
|
613
|
+
callbacks: {
|
|
614
|
+
label: (tooltipItem, data) => {
|
|
615
|
+
const dataset = data.datasets[tooltipItem.datasetIndex];
|
|
616
|
+
const point = dataset.data[tooltipItem.index];
|
|
617
|
+
return `${point.label}: ${point.y.toFixed(3)} (${Math.round(point.x)}K tokens)`;
|
|
618
|
+
}
|
|
619
|
+
}
|
|
620
|
+
}
|
|
621
|
+
}
|
|
622
|
+
};
|
|
623
|
+
const outputPath = path.join(this.outputDir, 'context-window-correlation.png');
|
|
624
|
+
await this.downloadChart(chartConfig, outputPath, 1400, 700);
|
|
625
|
+
return {
|
|
626
|
+
success: true,
|
|
627
|
+
graphPath: outputPath
|
|
628
|
+
};
|
|
629
|
+
}
|
|
630
|
+
catch (error) {
|
|
631
|
+
return {
|
|
632
|
+
success: false,
|
|
633
|
+
error: `Failed to generate context window correlation graph: ${error}`
|
|
634
|
+
};
|
|
635
|
+
}
|
|
636
|
+
}
|
|
637
|
+
/**
|
|
638
|
+
* Downloads a chart from QuickChart.io API and saves it as PNG
|
|
639
|
+
*/
|
|
640
|
+
async downloadChart(chartConfig, outputPath, width = 1000, height = 600) {
|
|
641
|
+
return new Promise((resolve, reject) => {
|
|
642
|
+
const chartJson = JSON.stringify(chartConfig);
|
|
643
|
+
const url = `${this.quickchartBaseUrl}?c=${encodeURIComponent(chartJson)}&width=${width}&height=${height}&format=png&backgroundColor=black`;
|
|
644
|
+
https.get(url, (response) => {
|
|
645
|
+
if (response.statusCode !== 200) {
|
|
646
|
+
reject(new Error(`QuickChart API returned status ${response.statusCode}`));
|
|
647
|
+
return;
|
|
648
|
+
}
|
|
649
|
+
const fileStream = fs.createWriteStream(outputPath);
|
|
650
|
+
response.pipe(fileStream);
|
|
651
|
+
fileStream.on('finish', () => {
|
|
652
|
+
fileStream.close();
|
|
653
|
+
console.log(`✅ Graph saved: ${outputPath}`);
|
|
654
|
+
resolve();
|
|
655
|
+
});
|
|
656
|
+
fileStream.on('error', (err) => {
|
|
657
|
+
fs.unlink(outputPath, () => { }); // Clean up partial file
|
|
658
|
+
reject(err);
|
|
659
|
+
});
|
|
660
|
+
}).on('error', (err) => {
|
|
661
|
+
reject(err);
|
|
662
|
+
});
|
|
663
|
+
});
|
|
664
|
+
}
|
|
665
|
+
/**
|
|
666
|
+
* Cleans model names by removing provider prefixes
|
|
667
|
+
*/
|
|
668
|
+
cleanModelName(modelId) {
|
|
669
|
+
// Remove "vercel_" prefix and timestamp suffix
|
|
670
|
+
return modelId
|
|
671
|
+
.replace(/^vercel_/, '')
|
|
672
|
+
.replace(/_\d{4}-\d{2}-\d{2}$/, '')
|
|
673
|
+
.replace(/_/g, '-');
|
|
674
|
+
}
|
|
675
|
+
/**
|
|
676
|
+
* Returns a consistent color for each tool index (supports up to 10 tools)
|
|
677
|
+
*/
|
|
678
|
+
getToolColor(index) {
|
|
679
|
+
const colors = [
|
|
680
|
+
'rgba(255, 99, 132, 0.8)', // Red
|
|
681
|
+
'rgba(54, 162, 235, 0.8)', // Blue
|
|
682
|
+
'rgba(255, 206, 86, 0.8)', // Yellow
|
|
683
|
+
'rgba(75, 192, 192, 0.8)', // Green
|
|
684
|
+
'rgba(153, 102, 255, 0.8)', // Purple
|
|
685
|
+
'rgba(255, 159, 64, 0.8)', // Orange
|
|
686
|
+
'rgba(199, 199, 199, 0.8)', // Grey
|
|
687
|
+
'rgba(83, 102, 255, 0.8)', // Indigo
|
|
688
|
+
'rgba(255, 99, 255, 0.8)', // Pink
|
|
689
|
+
'rgba(99, 255, 132, 0.8)' // Light Green
|
|
690
|
+
];
|
|
691
|
+
return colors[index % colors.length];
|
|
692
|
+
}
|
|
693
|
+
}
|
|
694
|
+
exports.GraphGenerator = GraphGenerator;
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared Metadata Loader
|
|
3
|
+
*
|
|
4
|
+
* Provides consistent access to model and tool metadata across all evaluators
|
|
5
|
+
*/
|
|
6
|
+
export interface ModelMetadata {
|
|
7
|
+
provider: string;
|
|
8
|
+
pricing: {
|
|
9
|
+
input_cost_per_million_tokens: number;
|
|
10
|
+
output_cost_per_million_tokens: number;
|
|
11
|
+
};
|
|
12
|
+
context_window: number;
|
|
13
|
+
supports_function_calling: boolean;
|
|
14
|
+
}
|
|
15
|
+
export interface ToolMetadata {
|
|
16
|
+
name: string;
|
|
17
|
+
description: string;
|
|
18
|
+
primaryFunction: string;
|
|
19
|
+
testTimeout: string;
|
|
20
|
+
successCriteria: string[];
|
|
21
|
+
modelRequirements: Record<string, string>;
|
|
22
|
+
}
|
|
23
|
+
export interface EvaluationMetadata {
|
|
24
|
+
models: Record<string, ModelMetadata>;
|
|
25
|
+
tools: Record<string, ToolMetadata>;
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* Load model and tool metadata from model-metadata.json
|
|
29
|
+
*/
|
|
30
|
+
export declare function loadEvaluationMetadata(): EvaluationMetadata;
|
|
31
|
+
/**
|
|
32
|
+
* Build model pricing context for evaluation prompts
|
|
33
|
+
*/
|
|
34
|
+
export declare function buildModelPricingContext(models: Record<string, ModelMetadata>): string;
|
|
35
|
+
/**
|
|
36
|
+
* Build tool context for evaluation prompts (tool-specific description and constraints)
|
|
37
|
+
*/
|
|
38
|
+
export declare function buildToolContext(toolName: string, tools: Record<string, ToolMetadata>): string;
|
|
39
|
+
//# sourceMappingURL=metadata-loader.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metadata-loader.d.ts","sourceRoot":"","sources":["../../src/evaluation/metadata-loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAKH,MAAM,WAAW,aAAa;IAC5B,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE;QACP,6BAA6B,EAAE,MAAM,CAAC;QACtC,8BAA8B,EAAE,MAAM,CAAC;KACxC,CAAC;IACF,cAAc,EAAE,MAAM,CAAC;IACvB,yBAAyB,EAAE,OAAO,CAAC;CACpC;AAED,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,EAAE,MAAM,CAAC;IACxB,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,iBAAiB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAC3C;AAED,MAAM,WAAW,kBAAkB;IACjC,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,aAAa,CAAC,CAAC;IACtC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;CACrC;AAED;;GAEG;AACH,wBAAgB,sBAAsB,IAAI,kBAAkB,CAa3D;AAED;;GAEG;AACH,wBAAgB,wBAAwB,CAAC,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,aAAa,CAAC,GAAG,MAAM,CAkBtF;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC,GAAG,MAAM,CAqB9F"}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Shared Metadata Loader
|
|
4
|
+
*
|
|
5
|
+
* Provides consistent access to model and tool metadata across all evaluators
|
|
6
|
+
*/
|
|
7
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
8
|
+
exports.loadEvaluationMetadata = loadEvaluationMetadata;
|
|
9
|
+
exports.buildModelPricingContext = buildModelPricingContext;
|
|
10
|
+
exports.buildToolContext = buildToolContext;
|
|
11
|
+
const fs_1 = require("fs");
|
|
12
|
+
const path_1 = require("path");
|
|
13
|
+
/**
 * Reads `src/evaluation/model-metadata.json` (resolved against the current
 * working directory) and returns its `models` and `tools` sections.
 *
 * A missing section becomes an empty object. Any failure (unreadable file,
 * invalid JSON, ...) is logged as a warning and yields empty metadata
 * instead of throwing.
 *
 * @returns Object of the form `{ models, tools }`.
 */
function loadEvaluationMetadata() {
    try {
        const raw = (0, fs_1.readFileSync)((0, path_1.join)(process.cwd(), 'src', 'evaluation', 'model-metadata.json'), 'utf8');
        const parsed = JSON.parse(raw);
        const models = parsed.models || {};
        const tools = parsed.tools || {};
        console.log(`✅ Loaded metadata for ${Object.keys(models).length} models and ${Object.keys(tools).length} tools`);
        return { models, tools };
    }
    catch (error) {
        console.warn('⚠️ Failed to load evaluation metadata:', error);
        return { models: {}, tools: {} };
    }
}
|
|
31
|
+
/**
 * Build model pricing context for evaluation prompts.
 *
 * Emits one markdown bullet per model with the average, input and output
 * cost per million tokens and the context window in thousands of tokens.
 * A cost that is missing or not a finite number is rendered as "N/A"; the
 * average is "N/A" unless both input and output costs are available.
 *
 * @param models Map of model id to model metadata.
 * @returns Markdown section, or 'No pricing information available.' when
 *          `models` has no entries.
 */
function buildModelPricingContext(models) {
    const modelIds = Object.keys(models);
    if (modelIds.length === 0) {
        return 'No pricing information available.';
    }
    const isCost = (value) => typeof value === 'number' && Number.isFinite(value);
    const formatCost = (value) => (isCost(value) ? value.toFixed(2) : 'N/A');
    const pricingLines = modelIds.map(modelId => {
        const model = models[modelId];
        const input = model.pricing?.input_cost_per_million_tokens;
        const output = model.pricing?.output_cost_per_million_tokens;
        const inputCost = formatCost(input);
        const outputCost = formatCost(output);
        // Average only when both sides are real numbers, so "$NaN" is never emitted.
        const avgCost = isCost(input) && isCost(output)
            ? ((input + output) / 2).toFixed(2)
            : 'N/A';
        const contextWindow = model.context_window ? `${(model.context_window / 1000).toFixed(0)}K` : 'N/A';
        return `- **${modelId}** (${model.provider}): $${avgCost}/1M tokens ($${inputCost} input, $${outputCost} output) | Context: ${contextWindow} tokens`;
    });
    return `## Model Pricing Information\n\n${pricingLines.join('\n')}`;
}
|
|
51
|
+
/**
 * Build tool context for evaluation prompts (tool-specific description and constraints).
 *
 * Renders a markdown section with the tool's description, primary function,
 * timeout constraint, success criteria and model requirements, followed by
 * a fixed reminder about timeout-related failures.
 *
 * @param toolName Key of the tool inside `tools`.
 * @param tools Map of tool key to tool metadata.
 * @returns Markdown section, or a "No metadata available" message when the
 *          tool key is unknown.
 */
function buildToolContext(toolName, tools) {
    const tool = tools[toolName];
    if (!tool) {
        return `No metadata available for tool: ${toolName}`;
    }
    // Metadata is loaded from unvalidated JSON, so tolerate absent lists.
    const criteriaLines = (tool.successCriteria ?? []).map((c) => `- ${c}`).join('\n');
    const requirementLines = Object.entries(tool.modelRequirements ?? {})
        .map(([key, value]) => `- **${key}**: ${value}`)
        .join('\n');
    return [
        `## Tool Being Evaluated: ${tool.name}`,
        '',
        `**Description**: ${tool.description}`,
        '',
        `**Primary Function**: ${tool.primaryFunction}`,
        '',
        `**Test Timeout Constraint**: ${tool.testTimeout}`,
        '',
        '**Success Criteria**:',
        criteriaLines,
        '',
        '**Model Requirements**:',
        requirementLines,
        '',
        '**IMPORTANT**: When analyzing model failures, consider whether the model exceeded the timeout constraint. Models that timeout should be noted as failing due to timeout constraints rather than quality issues.'
    ].join('\n');
}
|
|
@@ -35,7 +35,7 @@ export declare class PlatformSynthesizer {
|
|
|
35
35
|
private aiProvider;
|
|
36
36
|
private reportsDir;
|
|
37
37
|
constructor(aiProvider: VercelProvider, reportsDir?: string);
|
|
38
|
-
generatePlatformWideAnalysis(): Promise<string>;
|
|
38
|
+
generatePlatformWideAnalysis(graphsToGenerate?: string[], skipReport?: boolean): Promise<string>;
|
|
39
39
|
private loadToolMetadata;
|
|
40
40
|
private loadAllReports;
|
|
41
41
|
private analyzeCrossToolPerformance;
|
|
@@ -49,6 +49,10 @@ export declare class PlatformSynthesizer {
|
|
|
49
49
|
private generateProductionRecommendations;
|
|
50
50
|
private calculateCostEstimate;
|
|
51
51
|
private extractBaseModelId;
|
|
52
|
+
/**
|
|
53
|
+
* Generates graphs and replaces placeholders in the markdown report
|
|
54
|
+
*/
|
|
55
|
+
private addGraphsToReport;
|
|
52
56
|
saveSynthesisReport(markdownContent: string, outputPath?: string): Promise<void>;
|
|
53
57
|
}
|
|
54
58
|
//# sourceMappingURL=platform-synthesizer.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"platform-synthesizer.d.ts","sourceRoot":"","sources":["../../src/evaluation/platform-synthesizer.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;
|
|
1
|
+
{"version":3,"file":"platform-synthesizer.d.ts","sourceRoot":"","sources":["../../src/evaluation/platform-synthesizer.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;AAItE,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACnC,YAAY,EAAE,MAAM,CAAC;IACrB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,gBAAgB,EAAE,MAAM,CAAC;IACzB,sBAAsB,EAAE,MAAM,CAAC;IAC/B,OAAO,EAAE;QACP,6BAA6B,EAAE,MAAM,CAAC;QACtC,8BAA8B,EAAE,MAAM,CAAC;KACxC,CAAC;IACF,YAAY,EAAE;QACZ,cAAc,EAAE,MAAM,CAAC;QACvB,yBAAyB,EAAE,OAAO,CAAC;KACpC,CAAC;CACH;AAED,MAAM,WAAW,cAAc;IAC7B,cAAc,EAAE,gBAAgB,EAAE,CAAC;IACnC,cAAc,EAAE,gBAAgB,EAAE,CAAC;IACnC,aAAa,EAAE,gBAAgB,EAAE,CAAC;IAClC,QAAQ,EAAE,gBAAgB,EAAE,CAAC;IAC7B,kBAAkB,EAAE,gBAAgB,EAAE,CAAC;CACxC;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,EAAE,eAAe,GAAG,aAAa,GAAG,YAAY,GAAG,UAAU,CAAC;IACtE,YAAY,EAAE,MAAM,CAAC;IACrB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,UAAU,CAAiB;IACnC,OAAO,CAAC,UAAU,CAAS;gBAEf,UAAU,EAAE,cAAc,EAAE,UAAU,SAA+B;IAK3E,4BAA4B,CAAC,gBAAgB,CAAC,EAAE,MAAM,EAAE,EAAE,UAAU,UAAQ,GAAG,OAAO,CAAC,MAAM,CAAC;IA6CpG,OAAO,CAAC,gBAAgB;YAKV,cAAc;YAyBd,2BAA2B;IA8DzC,OAAO,CAAC,0BAA0B;IA2ElC,OAAO,CAAC,wBAAwB;IAiDhC,OAAO,CAAC,4BAA4B;YA0CtB,wBAAwB;IAoBtC,OAAO,CAAC,kBAAkB;IAe1B,OAAO,CAAC,oBAAoB;IAsB5B,OAAO,CAAC,yBAAyB;IAWjC,OAAO,CAAC,iCAAiC;IASzC,OAAO,CAAC,qBAAqB;IAQ7B,OAAO,CAAC,kBAAkB;IAS1B;;OAEG;YACW,iBAAiB;IA8CzB,mBAAmB,CACvB,eAAe,EAAE,MAAM,EACvB,UAAU,SAAiD,GAC1D,OAAO,CAAC,IAAI,CAAC;CAWjB"}
|
|
@@ -36,6 +36,8 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
36
36
|
exports.PlatformSynthesizer = void 0;
|
|
37
37
|
const fs = __importStar(require("fs"));
|
|
38
38
|
const path = __importStar(require("path"));
|
|
39
|
+
const graph_generator_js_1 = require("./graph-generator.js");
|
|
40
|
+
const metadata_loader_js_1 = require("./metadata-loader.js");
|
|
39
41
|
class PlatformSynthesizer {
|
|
40
42
|
aiProvider;
|
|
41
43
|
reportsDir;
|
|
@@ -43,48 +45,48 @@ class PlatformSynthesizer {
|
|
|
43
45
|
this.aiProvider = aiProvider;
|
|
44
46
|
this.reportsDir = reportsDir;
|
|
45
47
|
}
|
|
46
|
-
async generatePlatformWideAnalysis() {
|
|
48
|
+
async generatePlatformWideAnalysis(graphsToGenerate, skipReport = false) {
|
|
47
49
|
console.log('🔍 Loading all evaluation reports...');
|
|
48
50
|
const allReports = await this.loadAllReports();
|
|
49
51
|
console.log('🔧 Loading tool metadata...');
|
|
50
52
|
const toolMetadata = this.loadToolMetadata();
|
|
51
53
|
console.log('📊 Analyzing cross-tool performance patterns...');
|
|
52
54
|
const crossToolAnalysis = await this.analyzeCrossToolPerformance(allReports);
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
const markdownReport = await this.generatePlatformInsights(crossToolAnalysis, decisionMatrices, usageRecommendations, toolMetadata);
|
|
59
|
-
return markdownReport;
|
|
60
|
-
}
|
|
61
|
-
loadToolMetadata() {
|
|
62
|
-
try {
|
|
63
|
-
const metadataPath = path.join(process.cwd(), 'src', 'evaluation', 'model-metadata.json');
|
|
64
|
-
const metadataContent = fs.readFileSync(metadataPath, 'utf8');
|
|
65
|
-
const metadata = JSON.parse(metadataContent);
|
|
66
|
-
console.log(`✅ Loaded tool metadata with ${Object.keys(metadata.tools || {}).length} tools`);
|
|
67
|
-
return metadata.tools || {};
|
|
55
|
+
let markdownReport;
|
|
56
|
+
if (skipReport) {
|
|
57
|
+
console.log('⏭️ Skipping AI report generation...');
|
|
58
|
+
// Return empty string if we're only generating graphs
|
|
59
|
+
markdownReport = '';
|
|
68
60
|
}
|
|
69
|
-
|
|
70
|
-
console.
|
|
71
|
-
|
|
61
|
+
else {
|
|
62
|
+
console.log('🎯 Generating decision matrices...');
|
|
63
|
+
const decisionMatrices = this.generateDecisionMatrices(crossToolAnalysis.modelPerformances);
|
|
64
|
+
console.log('💡 Creating usage recommendations...');
|
|
65
|
+
const usageRecommendations = this.generateUsageRecommendations(crossToolAnalysis, decisionMatrices);
|
|
66
|
+
console.log('🚀 Generating comprehensive AI-powered report...');
|
|
67
|
+
markdownReport = await this.generatePlatformInsights(crossToolAnalysis, decisionMatrices, usageRecommendations, toolMetadata);
|
|
72
68
|
}
|
|
69
|
+
console.log('📊 Generating data visualizations...');
|
|
70
|
+
const reportWithGraphs = await this.addGraphsToReport(markdownReport, crossToolAnalysis.modelPerformances, graphsToGenerate);
|
|
71
|
+
return reportWithGraphs;
|
|
72
|
+
}
|
|
73
|
+
loadToolMetadata() {
|
|
74
|
+
const metadata = (0, metadata_loader_js_1.loadEvaluationMetadata)();
|
|
75
|
+
return { tools: metadata.tools };
|
|
73
76
|
}
|
|
74
77
|
async loadAllReports() {
|
|
75
78
|
const reports = {};
|
|
76
79
|
// Load all JSON result files from the directory
|
|
77
80
|
const reportFiles = fs.readdirSync(this.reportsDir)
|
|
78
|
-
.filter(file => file.endsWith('-results
|
|
79
|
-
.filter(file => file.endsWith('.json'));
|
|
81
|
+
.filter(file => file.endsWith('-results.json'));
|
|
80
82
|
if (reportFiles.length === 0) {
|
|
81
83
|
throw new Error(`No evaluation result files found in ${this.reportsDir}`);
|
|
82
84
|
}
|
|
83
85
|
for (const fileName of reportFiles) {
|
|
84
86
|
const reportPath = path.join(this.reportsDir, fileName);
|
|
85
87
|
const reportContent = JSON.parse(fs.readFileSync(reportPath, 'utf8'));
|
|
86
|
-
// Extract tool type from filename (e.g., "capability-results
|
|
87
|
-
const toolType = fileName.split('-results
|
|
88
|
+
// Extract tool type from filename (e.g., "capability-results.json" -> "capability")
|
|
89
|
+
const toolType = fileName.split('-results.json')[0];
|
|
88
90
|
reports[toolType] = reportContent;
|
|
89
91
|
console.log(`✅ Loaded ${toolType} report: ${fileName}`);
|
|
90
92
|
}
|
|
@@ -355,6 +357,46 @@ class PlatformSynthesizer {
|
|
|
355
357
|
}
|
|
356
358
|
return fullModelId;
|
|
357
359
|
}
|
|
360
|
+
/**
|
|
361
|
+
* Generates graphs and replaces placeholders in the markdown report
|
|
362
|
+
*/
|
|
363
|
+
async addGraphsToReport(markdownContent, modelPerformances, graphsToGenerate) {
|
|
364
|
+
const graphGenerator = new graph_generator_js_1.GraphGenerator('./eval/analysis/platform/graphs');
|
|
365
|
+
try {
|
|
366
|
+
// Generate all or specific graphs
|
|
367
|
+
const graphResults = await graphGenerator.generateAllGraphs(modelPerformances, graphsToGenerate);
|
|
368
|
+
// Replace placeholders with actual image markdown
|
|
369
|
+
let updatedMarkdown = markdownContent;
|
|
370
|
+
const graphMappings = {
|
|
371
|
+
'[GRAPH:performance-tiers]': '',
|
|
372
|
+
'[GRAPH:cost-vs-quality]': '',
|
|
373
|
+
'[GRAPH:reliability-comparison]': '',
|
|
374
|
+
'[GRAPH:tool-performance-heatmap]': '',
|
|
375
|
+
'[GRAPH:context-window-correlation]': ''
|
|
376
|
+
};
|
|
377
|
+
for (const [placeholder, imageMarkdown] of Object.entries(graphMappings)) {
|
|
378
|
+
updatedMarkdown = updatedMarkdown.replace(placeholder, imageMarkdown);
|
|
379
|
+
}
|
|
380
|
+
// Log graph generation results
|
|
381
|
+
for (const [graphName, result] of Object.entries(graphResults)) {
|
|
382
|
+
if (result.success) {
|
|
383
|
+
console.log(` ✅ ${graphName}: ${result.graphPath}`);
|
|
384
|
+
}
|
|
385
|
+
else {
|
|
386
|
+
console.warn(` ⚠️ ${graphName}: ${result.error}`);
|
|
387
|
+
// If graph generation failed, remove the placeholder to avoid broken markdown
|
|
388
|
+
const placeholderKey = `[GRAPH:${graphName}]`;
|
|
389
|
+
updatedMarkdown = updatedMarkdown.replace(placeholderKey, `*Graph generation failed: ${result.error}*`);
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
return updatedMarkdown;
|
|
393
|
+
}
|
|
394
|
+
catch (error) {
|
|
395
|
+
console.error('⚠️ Failed to generate graphs, returning report without visualizations:', error);
|
|
396
|
+
// If graph generation completely fails, remove all placeholders
|
|
397
|
+
return markdownContent.replace(/\[GRAPH:[^\]]+\]/g, '*Graph generation failed*');
|
|
398
|
+
}
|
|
399
|
+
}
|
|
358
400
|
async saveSynthesisReport(markdownContent, outputPath = './eval/analysis/platform/synthesis-report.md') {
|
|
359
401
|
const dir = path.dirname(outputPath);
|
|
360
402
|
if (!fs.existsSync(dir)) {
|
|
@@ -13,6 +13,21 @@ const model_config_js_1 = require("../core/model-config.js");
|
|
|
13
13
|
async function runPlatformSynthesis() {
|
|
14
14
|
console.log('🚀 Starting Platform-Wide AI Model Synthesis...\n');
|
|
15
15
|
try {
|
|
16
|
+
// Parse command line arguments for graph filtering
|
|
17
|
+
const args = process.argv.slice(2);
|
|
18
|
+
let graphsToGenerate;
|
|
19
|
+
let skipReport = false;
|
|
20
|
+
if (args.length > 0) {
|
|
21
|
+
const graphArg = args.find(arg => arg.startsWith('--graphs='));
|
|
22
|
+
if (graphArg) {
|
|
23
|
+
graphsToGenerate = graphArg.split('=')[1].split(',');
|
|
24
|
+
console.log(`📊 Generating specific graphs: ${graphsToGenerate.join(', ')}\n`);
|
|
25
|
+
}
|
|
26
|
+
skipReport = args.includes('--skip-report');
|
|
27
|
+
if (skipReport) {
|
|
28
|
+
console.log('⏭️ Skipping AI report generation (graphs only)\n');
|
|
29
|
+
}
|
|
30
|
+
}
|
|
16
31
|
// Initialize AI provider for synthesis analysis (use Claude for comprehensive analysis)
|
|
17
32
|
const aiProvider = new vercel_provider_js_1.VercelProvider({
|
|
18
33
|
provider: 'anthropic',
|
|
@@ -22,12 +37,14 @@ async function runPlatformSynthesis() {
|
|
|
22
37
|
});
|
|
23
38
|
// Initialize synthesizer
|
|
24
39
|
const synthesizer = new platform_synthesizer_js_1.PlatformSynthesizer(aiProvider);
|
|
25
|
-
// Generate comprehensive platform-wide analysis
|
|
40
|
+
// Generate comprehensive platform-wide analysis (or just graphs if skip-report is set)
|
|
26
41
|
console.log('📊 Generating platform-wide analysis...');
|
|
27
|
-
const markdownReport = await synthesizer.generatePlatformWideAnalysis();
|
|
28
|
-
// Save synthesis report
|
|
29
|
-
|
|
30
|
-
|
|
42
|
+
const markdownReport = await synthesizer.generatePlatformWideAnalysis(graphsToGenerate, skipReport);
|
|
43
|
+
// Save synthesis report only if we generated it
|
|
44
|
+
if (!skipReport) {
|
|
45
|
+
console.log('\n💾 Saving synthesis report...');
|
|
46
|
+
await synthesizer.saveSynthesisReport(markdownReport);
|
|
47
|
+
}
|
|
31
48
|
console.log('\n✅ Platform-wide synthesis complete!');
|
|
32
49
|
console.log('📄 Report saved: ./eval/analysis/platform/synthesis-report.md');
|
|
33
50
|
console.log('\n✨ AI-generated comprehensive report includes:');
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vfarcic/dot-ai",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.116.0",
|
|
4
4
|
"description": "AI-powered development productivity platform that enhances software development workflows through intelligent automation and AI-driven assistance",
|
|
5
5
|
"mcpName": "io.github.vfarcic/dot-ai",
|
|
6
6
|
"main": "dist/index.js",
|