npm - @vfarcic/dot-ai - Versions diffs - 0.115.0 → 0.116.0 - Mend

@vfarcic/dot-ai 0.115.0 → 0.116.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/dist/evaluation/eval-runner.js +12 -3
package/dist/evaluation/evaluators/base-comparative.d.ts +2 -0
package/dist/evaluation/evaluators/base-comparative.d.ts.map +1 -1
package/dist/evaluation/evaluators/base-comparative.js +13 -1
package/dist/evaluation/graph-generator.d.ts +56 -0
package/dist/evaluation/graph-generator.d.ts.map +1 -0
package/dist/evaluation/graph-generator.js +694 -0
package/dist/evaluation/metadata-loader.d.ts +39 -0
package/dist/evaluation/metadata-loader.d.ts.map +1 -0
package/dist/evaluation/metadata-loader.js +74 -0
package/dist/evaluation/platform-synthesizer.d.ts +5 -1
package/dist/evaluation/platform-synthesizer.d.ts.map +1 -1
package/dist/evaluation/platform-synthesizer.js +65 -23
package/dist/evaluation/run-platform-synthesis.js +22 -5
package/package.json +1 -1

package/dist/evaluation/eval-runner.js CHANGED

@@ -295,9 +295,8 @@ async function runEvaluation(evaluatorType, datasetsDir, modelMetadata) {
     const reportContent = generateMarkdownReport(results, stats, evaluatorType, finalAssessment);
     const jsonResults = generateJsonReport(results, stats, evaluatorType, modelMetadata, finalAssessment);
     // Save reports to files
-    const dateStamp = new Date().toISOString().split('T')[0];
-    const markdownPath = `./eval/analysis/individual/${evaluatorType}-evaluation-${dateStamp}.md`;
-    const jsonPath = `./eval/analysis/individual/${evaluatorType}-results-${dateStamp}.json`;
+    const markdownPath = `./eval/analysis/individual/${evaluatorType}-evaluation.md`;
+    const jsonPath = `./eval/analysis/individual/${evaluatorType}-results.json`;
     const reportDir = './eval/analysis/individual';
     // Ensure report directory exists
     const fs = await Promise.resolve().then(() => __importStar(require('fs')));
@@ -328,6 +327,16 @@ async function main() {
     catch (error) {
         console.warn('⚠️  Could not clean debug files:', error instanceof Error ? error.message : String(error));
     }
+    // Clean old evaluation result files from eval/results
+    console.log('🧹 Cleaning old evaluation result files...');
+    try {
+        await execAsync('rm -f ./eval/results/*_comparative_evaluation_*.jsonl 2>/dev/null || true');
+        await execAsync('mkdir -p ./eval/results');
+        console.log('✅ Old evaluation results cleaned\n');
+    }
+    catch (error) {
+        console.warn('⚠️  Could not clean old evaluation results:', error instanceof Error ? error.message : String(error));
+    }
     // Check model metadata freshness before starting any evaluation work
     const modelMetadata = loadModelMetadata();
     const datasetsDir = './eval/datasets';

package/dist/evaluation/evaluators/base-comparative.d.ts CHANGED

@@ -7,6 +7,7 @@
 import { EvaluationScore } from './base.js';
 import { VercelProvider } from '../../core/providers/vercel-provider';
 import { DatasetAnalyzer, ComparisonScenario } from '../dataset-analyzer.js';
+import { type EvaluationMetadata } from '../metadata-loader.js';
 export interface ComparativeEvaluationResult {
     scenario_summary: string;
     models_compared: string[];
@@ -49,6 +50,7 @@ export declare abstract class BaseComparativeEvaluator {
     protected evaluatorModel: VercelProvider;
     protected datasetAnalyzer: DatasetAnalyzer;
     protected promptTemplate: string;
+    protected metadata: EvaluationMetadata;
     constructor(datasetDir?: string);
     /**
      * Initialize the evaluator - must be called by subclass constructor

package/dist/evaluation/evaluators/base-comparative.d.ts.map CHANGED

	@@ -1 +1 @@
1	- {"version":3,"file":"base-comparative.d.ts","sourceRoot":"","sources":["../../../src/evaluation/evaluators/base-comparative.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;AAKtE,OAAO,EAAE,eAAe,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAC;~~AAE7E~~,MAAM,WAAW,2BAA2B;IAC1C,gBAAgB,EAAE,MAAM,CAAC;IACzB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,oBAAoB,EAAE,MAAM,CAAC,MAAM,EAAE;QACnC,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;QAC1B,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAC3B,mBAAmB,CAAC,EAAE,MAAM,CAAC;QAC7B,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAC3B,cAAc,EAAE,MAAM,CAAC;QACvB,SAAS,EAAE,MAAM,CAAC;QAClB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;IACH,OAAO,EAAE,KAAK,CAAC;QACb,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED,MAAM,WAAW,0BAA2B,SAAQ,eAAe;IACjE,aAAa,EAAE,KAAK,CAAC;QACnB,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;KACf,CAAC,CAAC;IACH,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,8BAAsB,wBAAwB;IAC5C,QAAQ,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IACtC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IACnD,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAE7C,SAAS,CAAC,cAAc,EAAE,cAAc,CAAC;IACzC,SAAS,CAAC,eAAe,EAAE,eAAe,CAAC;IAC3C,SAAS,CAAC,cAAc,EAAE,MAAM,CAAC;~~gBAErB~~,UAAU,CAAC,EAAE,MAAM;~~IAe~~/B;;OAEG;IACH,SAAS,CAAC,gBAAgB;IAK1B;;;OAGG;IACG,oBAAoB,IAAI,OAAO,CAAC,0BAA0B,EAAE,CAAC;IAkBnE;;OAEG;IACG,sBAAsB,CAAC,eAAe,EAAE,0BAA0B,EAAE,GAAG,OAAO,CAAC,GAAG,CAAC;IA6CzF;;OAEG;IACG,gBAAgB,CAAC,QAAQ,EAAE,kBAAkB,GAAG,OAAO,CAAC,0BAA0B,CAAC;~~IAsFzF~~;;OAEG;IACH,SAAS,CAAC,qBAAqB,CAAC,QAAQ,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM;~~IAQpH~~;;OAEG;IACH,eAAe;;;;;;IAIf;;;OAGG;IACH,QAAQ,CAAC,mBAAmB,IAAI;QAC9
B,KAAK,EAAE,MAAM,CAAC;QACd,WAAW,EAAE,MAAM,CAAC;QACpB,eAAe,EAAE,MAAM,EAAE,CAAC;QAC1B,aAAa,EAAE,MAAM,CAAC;KACvB,EAAE;CACJ"}
1	+ {"version":3,"file":"base-comparative.d.ts","sourceRoot":"","sources":["../../../src/evaluation/evaluators/base-comparative.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAC5C,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;AAKtE,OAAO,EAAE,eAAe,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAC;AAC7E,OAAO,EAAsE,KAAK,kBAAkB,EAAE,MAAM,uBAAuB,CAAC;AAEpI,MAAM,WAAW,2BAA2B;IAC1C,gBAAgB,EAAE,MAAM,CAAC;IACzB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,oBAAoB,EAAE,MAAM,CAAC,MAAM,EAAE;QACnC,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;QAC1B,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAC3B,mBAAmB,CAAC,EAAE,MAAM,CAAC;QAC7B,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,iBAAiB,CAAC,EAAE,MAAM,CAAC;QAC3B,cAAc,EAAE,MAAM,CAAC;QACvB,SAAS,EAAE,MAAM,CAAC;QAClB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;IACH,OAAO,EAAE,KAAK,CAAC;QACb,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC,CAAC;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED,MAAM,WAAW,0BAA2B,SAAQ,eAAe;IACjE,aAAa,EAAE,KAAK,CAAC;QACnB,IAAI,EAAE,MAAM,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;KACf,CAAC,CAAC;IACH,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,8BAAsB,wBAAwB;IAC5C,QAAQ,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IACtC,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IACnD,SAAS,CAAC,QAAQ,CAAC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAE7C,SAAS,CAAC,cAAc,EAAE,cAAc,CAAC;IACzC,SAAS,CAAC,eAAe,EAAE,eAAe,CAAC;IAC3C,SAAS,CAAC,cAAc,EAAE,MAAM,CAAC;IACjC,SAAS,CAAC,QAAQ,EAAE,kBAAkB,CAAC;gBAE3B,UAAU,CAAC,EAAE,MAAM;IAkB/B;;OAEG;IACH,SAAS,CAAC,gBAAgB;IAK1B;;;OAGG;IACG,oBAAoB,IAAI,OAAO,CAAC,0BAA0B,EAAE,CAAC;IAkBnE;;OAEG;IACG,sBAAsB,CAAC,eAAe,EAAE,0BAA0B,EAAE,GAAG,OAAO,CAAC,GAAG,CAAC;IA6CzF;;OAEG;IACG,gBAAgB,CAAC,QAAQ,EAAE,kBAAkB,GAAG,OAAO,CAAC,0BAA0B,CAAC;IAuFzF;;OAEG;IACH,SAAS,CAAC,qBAAqB,CAAC,QAAQ,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM;
IAgBpH;;OAEG;IACH,eAAe;;;;;;IAIf;;;OAGG;IACH,QAAQ,CAAC,mBAAmB,IAAI;QAC9B,KAAK,EAAE,MAAM,CAAC;QACd,WAAW,EAAE,MAAM,CAAC;QACpB,eAAe,EAAE,MAAM,EAAE,CAAC;QAC1B,aAAa,EAAE,MAAM,CAAC;KACvB,EAAE;CACJ"}

package/dist/evaluation/evaluators/base-comparative.js CHANGED

@@ -13,10 +13,12 @@ const platform_utils_1 = require("../../core/platform-utils");
 const fs_1 = require("fs");
 const path_1 = require("path");
 const dataset_analyzer_js_1 = require("../dataset-analyzer.js");
+const metadata_loader_js_1 = require("../metadata-loader.js");
 class BaseComparativeEvaluator {
     evaluatorModel;
     datasetAnalyzer;
     promptTemplate;
+    metadata;
     constructor(datasetDir) {
         // Use Claude via VercelProvider as the evaluator (most reliable for complex comparative evaluation)
         this.evaluatorModel = new vercel_provider_1.VercelProvider({
@@ -28,6 +30,8 @@ class BaseComparativeEvaluator {
         this.datasetAnalyzer = new dataset_analyzer_js_1.DatasetAnalyzer(datasetDir || './eval/datasets');
         // Prompt template will be loaded by subclass
         this.promptTemplate = '';
+        // Load metadata
+        this.metadata = (0, metadata_loader_js_1.loadEvaluationMetadata)();
     }
     /**
      * Initialize the evaluator - must be called by subclass constructor
@@ -122,6 +126,7 @@ class BaseComparativeEvaluator {
 ${reliabilityContext}
 **Response:**
 ${modelResponse.response}
 ---`;
@@ -171,11 +176,18 @@ ${modelResponse.response}
      * Build the evaluation prompt - can be overridden by subclasses for custom behavior
      */
     buildEvaluationPrompt(scenario, modelResponsesText, modelList) {
+        // Build metadata context sections
+        const pricingContext = (0, metadata_loader_js_1.buildModelPricingContext)(this.metadata.models);
+        const toolContext = (0, metadata_loader_js_1.buildToolContext)(this.toolName, this.metadata.tools);
+        // Inject all data into prompt template via placeholders
         return this.promptTemplate
+            .replace('{pricing_context}', pricingContext)
+            .replace('{tool_context}', toolContext)
             .replace('{issue}', scenario.issue)
             .replace('{model_responses}', modelResponsesText)
             .replace('{model_list}', modelList)
-            .replace('{phase}', scenario.interaction_id);
+            .replace('{phase}', scenario.interaction_id)
+            .replace('{scenario_name}', scenario.interaction_id);
     }
     /**
      * Get statistics about available datasets

package/dist/evaluation/graph-generator.d.ts ADDED

@@ -0,0 +1,56 @@
+import type { ModelPerformance } from './platform-synthesizer.js';
+export interface GraphGenerationResult {
+    success: boolean;
+    graphPath?: string;
+    error?: string;
+}
+/**
+ * GraphGenerator creates data visualizations for platform synthesis reports.
+ * Uses QuickChart.io API to generate chart images without requiring native dependencies.
+ */
+export declare class GraphGenerator {
+    private outputDir;
+    private quickchartBaseUrl;
+    constructor(outputDir?: string);
+    /**
+     * Generates all or specific graphs for the platform report
+     * @param modelPerformances Model performance data
+     * @param graphNames Optional array of specific graph names to generate. If not provided, generates all graphs.
+     *                   Valid names: 'performance-tiers', 'cost-vs-quality', 'reliability-comparison',
+     *                   'tool-performance-heatmap', 'context-window-correlation'
+     */
+    generateAllGraphs(modelPerformances: ModelPerformance[], graphNames?: string[]): Promise<Record<string, GraphGenerationResult>>;
+    /**
+     * Graph 1: Performance Tiers - Grouped bar chart showing score, reliability, and consistency
+     */
+    private generatePerformanceTiersGraph;
+    /**
+     * Graph 2: Cost vs Quality - Line chart showing input/output cost range per model
+     */
+    private generateCostVsQualityGraph;
+    /**
+     * Graph 3: Reliability Comparison - Bar chart with reliability scores
+     */
+    private generateReliabilityComparisonGraph;
+    /**
+     * Graph 4: Tool Performance Heatmap - Shows model scores per tool
+     */
+    private generateToolPerformanceHeatmap;
+    /**
+     * Graph 5: Context Window Correlation - Scatter plot showing context window vs performance
+     */
+    private generateContextWindowCorrelationGraph;
+    /**
+     * Downloads a chart from QuickChart.io API and saves it as PNG
+     */
+    private downloadChart;
+    /**
+     * Cleans model names by removing provider prefixes
+     */
+    private cleanModelName;
+    /**
+     * Returns a consistent color for each tool index (supports up to 10 tools)
+     */
+    private getToolColor;
+}
+//# sourceMappingURL=graph-generator.d.ts.map

package/dist/evaluation/graph-generator.d.ts.map ADDED

@@ -0,0 +1 @@

+ {"version":3,"file":"graph-generator.d.ts","sourceRoot":"","sources":["../../src/evaluation/graph-generator.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,2BAA2B,CAAC;AAElE,MAAM,WAAW,qBAAqB;IACpC,OAAO,EAAE,OAAO,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;GAGG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,iBAAiB,CAAiC;gBAE9C,SAAS,SAAoC;IAIzD;;;;;;OAMG;IACG,iBAAiB,CACrB,iBAAiB,EAAE,gBAAgB,EAAE,EACrC,UAAU,CAAC,EAAE,MAAM,EAAE,GACpB,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,qBAAqB,CAAC,CAAC;IAkCjD;;OAEG;YACW,6BAA6B;IAiH3C;;OAEG;YACW,0BAA0B;IAsHxC;;OAEG;YACW,kCAAkC;IA4GhD;;OAEG;YACW,8BAA8B;IAqG5C;;OAEG;YACW,qCAAqC;IAyHnD;;OAEG;YACW,aAAa;IA8B3B;;OAEG;IACH,OAAO,CAAC,cAAc;IAQtB;;OAEG;IACH,OAAO,CAAC,YAAY;CAerB"}

package/dist/evaluation/graph-generator.js ADDED

@@ -0,0 +1,694 @@
+"use strict";
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+      desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+var __importStar = (this && this.__importStar) || (function () {
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.GraphGenerator = void 0;
+const fs = __importStar(require("fs"));
+const path = __importStar(require("path"));
+const https = __importStar(require("https"));
+/**
+ * GraphGenerator creates data visualizations for platform synthesis reports.
+ * Uses QuickChart.io API to generate chart images without requiring native dependencies.
+ */
+class GraphGenerator {
+    outputDir;
+    quickchartBaseUrl = 'https://quickchart.io/chart';
+    constructor(outputDir = './eval/analysis/platform/graphs') {
+        this.outputDir = outputDir;
+    }
+    /**
+     * Generates all or specific graphs for the platform report
+     * @param modelPerformances Model performance data
+     * @param graphNames Optional array of specific graph names to generate. If not provided, generates all graphs.
+     *                   Valid names: 'performance-tiers', 'cost-vs-quality', 'reliability-comparison',
+     *                   'tool-performance-heatmap', 'context-window-correlation'
+     */
+    async generateAllGraphs(modelPerformances, graphNames) {
+        // Ensure output directory exists
+        if (!fs.existsSync(this.outputDir)) {
+            fs.mkdirSync(this.outputDir, { recursive: true });
+        }
+        const results = {};
+        // Define all available graphs
+        const allGraphs = {
+            'performance-tiers': () => this.generatePerformanceTiersGraph(modelPerformances),
+            'cost-vs-quality': () => this.generateCostVsQualityGraph(modelPerformances),
+            'reliability-comparison': () => this.generateReliabilityComparisonGraph(modelPerformances),
+            'tool-performance-heatmap': () => this.generateToolPerformanceHeatmap(modelPerformances),
+            'context-window-correlation': () => this.generateContextWindowCorrelationGraph(modelPerformances)
+        };
+        // If specific graphs requested, only generate those
+        const graphsToGenerate = graphNames && graphNames.length > 0
+            ? graphNames
+            : Object.keys(allGraphs);
+        // Generate requested graphs
+        for (const graphName of graphsToGenerate) {
+            if (allGraphs[graphName]) {
+                results[graphName] = await allGraphs[graphName]();
+            }
+            else {
+                console.warn(`⚠️  Unknown graph name: ${graphName}`);
+            }
+        }
+        return results;
+    }
+    /**
+     * Graph 1: Performance Tiers - Grouped bar chart showing score, reliability, and consistency
+     */
+    async generatePerformanceTiersGraph(modelPerformances) {
+        try {
+            // Sort by average score descending, take top 10 models
+            const topModels = modelPerformances
+                .sort((a, b) => b.averageScore - a.averageScore)
+                .slice(0, 10);
+            // Clean model names (remove "vercel_" prefix)
+            const labels = topModels.map(m => this.cleanModelName(m.modelId));
+            const scores = topModels.map(m => m.averageScore);
+            const reliability = topModels.map(m => m.reliabilityScore);
+            const consistency = topModels.map(m => m.consistencyAcrossTools);
+            const chartConfig = {
+                type: 'bar',
+                data: {
+                    labels,
+                    datasets: [
+                        {
+                            label: 'Overall Score',
+                            data: scores,
+                            backgroundColor: 'rgba(54, 162, 235, 0.9)',
+                            borderColor: 'rgba(54, 162, 235, 1)',
+                            borderWidth: 1
+                        },
+                        {
+                            label: 'Reliability',
+                            data: reliability,
+                            backgroundColor: 'rgba(75, 192, 192, 0.9)',
+                            borderColor: 'rgba(75, 192, 192, 1)',
+                            borderWidth: 1
+                        },
+                        {
+                            label: 'Consistency',
+                            data: consistency,
+                            backgroundColor: 'rgba(153, 102, 255, 0.9)',
+                            borderColor: 'rgba(153, 102, 255, 1)',
+                            borderWidth: 1
+                        }
+                    ]
+                },
+                options: {
+                    plugins: {
+                        datalabels: {
+                            display: false
+                        }
+                    },
+                    title: {
+                        display: true,
+                        text: 'Model Performance Tiers: Score, Reliability, and Consistency',
+                        fontSize: 18,
+                        fontColor: '#FFFFFF',
+                        fontStyle: 'bold'
+                    },
+                    scales: {
+                        yAxes: [{
+                                ticks: {
+                                    beginAtZero: true,
+                                    max: 1.0,
+                                    stepSize: 0.1,
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 12
+                                },
+                                scaleLabel: {
+                                    display: true,
+                                    labelString: 'Score (0-1)',
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 14
+                                },
+                                gridLines: {
+                                    color: 'rgba(255, 255, 255, 0.2)',
+                                    zeroLineColor: 'rgba(255, 255, 255, 0.4)'
+                                }
+                            }],
+                        xAxes: [{
+                                ticks: {
+                                    autoSkip: false,
+                                    maxRotation: 45,
+                                    minRotation: 45,
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 11
+                                },
+                                gridLines: {
+                                    color: 'rgba(255, 255, 255, 0.1)'
+                                }
+                            }]
+                    },
+                    legend: {
+                        display: true,
+                        position: 'top',
+                        labels: {
+                            fontColor: '#FFFFFF',
+                            fontSize: 13
+                        }
+                    }
+                }
+            };
+            const outputPath = path.join(this.outputDir, 'performance-tiers.png');
+            await this.downloadChart(chartConfig, outputPath);
+            return {
+                success: true,
+                graphPath: outputPath
+            };
+        }
+        catch (error) {
+            return {
+                success: false,
+                error: `Failed to generate performance tiers graph: ${error}`
+            };
+        }
+    }
+    /**
+     * Graph 2: Cost vs Quality - Line chart showing input/output cost range per model
+     */
+    async generateCostVsQualityGraph(modelPerformances) {
+        try {
+            // Filter out models with no pricing data and sort by quality score descending
+            const modelsWithPricing = modelPerformances
+                .filter(m => m.pricing.input_cost_per_million_tokens > 0 || m.pricing.output_cost_per_million_tokens > 0)
+                .sort((a, b) => b.averageScore - a.averageScore);
+            // Create datasets: one for each model showing the cost range line
+            const datasets = modelsWithPricing.map((m, idx) => {
+                const inputCost = m.pricing.input_cost_per_million_tokens;
+                const outputCost = m.pricing.output_cost_per_million_tokens;
+                const color = this.getToolColor(idx);
+                // Line from input cost to output cost at the model's quality score
+                return {
+                    label: this.cleanModelName(m.modelId),
+                    data: [
+                        { x: inputCost, y: m.averageScore },
+                        { x: outputCost, y: m.averageScore }
+                    ],
+                    borderColor: color,
+                    backgroundColor: color,
+                    borderWidth: 3,
+                    pointRadius: 5,
+                    pointHoverRadius: 7,
+                    fill: false,
+                    showLine: true,
+                    tension: 0
+                };
+            });
+            const chartConfig = {
+                type: 'line',
+                data: { datasets },
+                options: {
+                    plugins: {
+                        datalabels: {
+                            display: false
+                        }
+                    },
+                    title: {
+                        display: true,
+                        text: 'Cost vs Quality Analysis (line shows input → output cost range)',
+                        fontSize: 18,
+                        fontColor: '#FFFFFF',
+                        fontStyle: 'bold'
+                    },
+                    scales: {
+                        xAxes: [{
+                                type: 'linear',
+                                scaleLabel: {
+                                    display: true,
+                                    labelString: 'Cost per 1M Tokens in $ (Input ← → Output)',
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 14
+                                },
+                                ticks: {
+                                    callback: function (value) {
+                                        return '$' + value;
+                                    },
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 12
+                                },
+                                gridLines: {
+                                    color: 'rgba(255, 255, 255, 0.2)',
+                                    zeroLineColor: 'rgba(255, 255, 255, 0.4)'
+                                }
+                            }],
+                        yAxes: [{
+                                scaleLabel: {
+                                    display: true,
+                                    labelString: 'Overall Score',
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 14
+                                },
+                                ticks: {
+                                    beginAtZero: false,
+                                    min: 0.3,
+                                    max: 1.0,
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 12
+                                },
+                                gridLines: {
+                                    color: 'rgba(255, 255, 255, 0.2)',
+                                    zeroLineColor: 'rgba(255, 255, 255, 0.4)'
+                                }
+                            }]
+                    },
+                    legend: {
+                        display: true,
+                        position: 'right',
+                        labels: {
+                            fontColor: '#FFFFFF',
+                            fontSize: 10,
+                            boxWidth: 15,
+                            usePointStyle: true
+                        }
+                    }
+                }
+            };
+            const outputPath = path.join(this.outputDir, 'cost-vs-quality.png');
+            await this.downloadChart(chartConfig, outputPath);
+            return {
+                success: true,
+                graphPath: outputPath
+            };
+        }
+        catch (error) {
+            return {
+                success: false,
+                error: `Failed to generate cost vs quality graph: ${error}`
+            };
+        }
+    }
+    /**
+     * Graph 3: Reliability Comparison - Bar chart with reliability scores
+     */
+    async generateReliabilityComparisonGraph(modelPerformances) {
+        try {
+            // Sort by reliability descending
+            const sortedModels = modelPerformances
+                .sort((a, b) => b.reliabilityScore - a.reliabilityScore);
+            const labels = sortedModels.map(m => this.cleanModelName(m.modelId));
+            const reliabilityScores = sortedModels.map(m => m.reliabilityScore);
+            // Create separate datasets for legend
+            const datasets = [
+                {
+                    label: 'High Reliability (≥0.9)',
+                    data: reliabilityScores.map(score => score >= 0.9 ? score : null),
+                    backgroundColor: 'rgba(75, 192, 192, 0.8)',
+                    borderWidth: 1
+                },
+                {
+                    label: 'Medium Reliability (0.7-0.9)',
+                    data: reliabilityScores.map(score => score >= 0.7 && score < 0.9 ? score : null),
+                    backgroundColor: 'rgba(255, 206, 86, 0.8)',
+                    borderWidth: 1
+                },
+                {
+                    label: 'Low Reliability (<0.7)',
+                    data: reliabilityScores.map(score => score < 0.7 ? score : null),
+                    backgroundColor: 'rgba(255, 99, 132, 0.8)',
+                    borderWidth: 1
+                }
+            ];
+            const chartConfig = {
+                type: 'horizontalBar',
+                data: {
+                    labels,
+                    datasets
+                },
+                options: {
+                    plugins: {
+                        datalabels: {
+                            display: false
+                        }
+                    },
+                    title: {
+                        display: true,
+                        text: 'Model Reliability Comparison',
+                        fontSize: 18,
+                        fontColor: '#FFFFFF',
+                        fontStyle: 'bold'
+                    },
+                    scales: {
+                        xAxes: [{
+                                stacked: true,
+                                ticks: {
+                                    beginAtZero: true,
+                                    max: 1.0,
+                                    stepSize: 0.1,
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 12
+                                },
+                                scaleLabel: {
+                                    display: true,
+                                    labelString: 'Reliability Score (0-1)',
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 14
+                                },
+                                gridLines: {
+                                    color: 'rgba(255, 255, 255, 0.2)',
+                                    zeroLineColor: 'rgba(255, 255, 255, 0.4)'
+                                }
+                            }],
+                        yAxes: [{
+                                stacked: true,
+                                ticks: {
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 11
+                                },
+                                gridLines: {
+                                    color: 'rgba(255, 255, 255, 0.1)'
+                                }
+                            }]
+                    },
+                    legend: {
+                        display: true,
+                        position: 'top',
+                        labels: {
+                            fontColor: '#FFFFFF',
+                            fontSize: 12
+                        }
+                    }
+                }
+            };
+            const outputPath = path.join(this.outputDir, 'reliability-comparison.png');
+            await this.downloadChart(chartConfig, outputPath);
+            return {
+                success: true,
+                graphPath: outputPath
+            };
+        }
+        catch (error) {
+            return {
+                success: false,
+                error: `Failed to generate reliability comparison graph: ${error}`
+            };
+        }
+    }
+    /**
+     * Graph 4: Tool Performance Heatmap - Shows model scores per tool
+     */
+    async generateToolPerformanceHeatmap(modelPerformances) {
+        try {
+            // Get all unique tool names
+            const toolNames = new Set();
+            modelPerformances.forEach(m => {
+                Object.keys(m.toolScores).forEach(tool => toolNames.add(tool));
+            });
+            const tools = Array.from(toolNames).sort();
+            // Sort models by average score
+            const sortedModels = modelPerformances
+                .sort((a, b) => b.averageScore - a.averageScore)
+                .slice(0, 10); // Top 10 models
+            // Create matrix data
+            const labels = sortedModels.map(m => this.cleanModelName(m.modelId));
+            const datasets = tools.map((tool, idx) => ({
+                label: tool.charAt(0).toUpperCase() + tool.slice(1),
+                data: sortedModels.map(m => m.toolScores[tool] || 0),
+                backgroundColor: this.getToolColor(idx),
+                borderWidth: 1
+            }));
+            const chartConfig = {
+                type: 'horizontalBar',
+                data: {
+                    labels,
+                    datasets
+                },
+                options: {
+                    plugins: {
+                        datalabels: {
+                            display: false
+                        }
+                    },
+                    title: {
+                        display: true,
+                        text: 'Tool-Specific Performance Patterns',
+                        fontSize: 18,
+                        fontColor: '#FFFFFF',
+                        fontStyle: 'bold'
+                    },
+                    scales: {
+                        xAxes: [{
+                                stacked: false,
+                                ticks: {
+                                    beginAtZero: true,
+                                    max: 1.0,
+                                    stepSize: 0.2,
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 12
+                                },
+                                scaleLabel: {
+                                    display: true,
+                                    labelString: 'Tool Score',
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 14
+                                },
+                                gridLines: {
+                                    color: 'rgba(255, 255, 255, 0.2)',
+                                    zeroLineColor: 'rgba(255, 255, 255, 0.4)'
+                                }
+                            }],
+                        yAxes: [{
+                                stacked: false,
+                                ticks: {
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 11
+                                },
+                                gridLines: {
+                                    color: 'rgba(255, 255, 255, 0.1)'
+                                }
+                            }]
+                    },
+                    legend: {
+                        display: true,
+                        position: 'right',
+                        labels: {
+                            fontColor: '#FFFFFF',
+                            fontSize: 12
+                        }
+                    }
+                }
+            };
+            const outputPath = path.join(this.outputDir, 'tool-performance-heatmap.png');
+            await this.downloadChart(chartConfig, outputPath);
+            return {
+                success: true,
+                graphPath: outputPath
+            };
+        }
+        catch (error) {
+            return {
+                success: false,
+                error: `Failed to generate tool performance heatmap: ${error}`
+            };
+        }
+    }
+    /**
+     * Graph 5: Context Window Correlation - Scatter plot showing context window vs performance
+     */
+    async generateContextWindowCorrelationGraph(modelPerformances) {
+        try {
+            const scatterData = modelPerformances.map((m) => ({
+                x: m.capabilities.context_window / 1000, // Convert to thousands for readability
+                y: m.averageScore,
+                r: 8,
+                label: this.cleanModelName(m.modelId)
+            }));
+            const chartConfig = {
+                type: 'scatter',
+                data: {
+                    datasets: [{
+                            label: 'Models',
+                            data: scatterData,
+                            backgroundColor: 'rgba(153, 102, 255, 0.7)',
+                            borderColor: 'rgba(153, 102, 255, 1)',
+                            borderWidth: 2,
+                            pointRadius: 10
+                        }]
+                },
+                options: {
+                    layout: {
+                        padding: {
+                            right: 300,
+                            left: 20,
+                            top: 20,
+                            bottom: 20
+                        }
+                    },
+                    plugins: {
+                        datalabels: {
+                            display: true,
+                            align: 'right',
+                            offset: 12,
+                            color: '#FFFFFF',
+                            font: {
+                                size: 20
+                            },
+                            formatter: (value) => value.label
+                        }
+                    },
+                    title: {
+                        display: true,
+                        text: 'Context Window Size vs Performance',
+                        fontSize: 18,
+                        fontColor: '#FFFFFF',
+                        fontStyle: 'bold'
+                    },
+                    scales: {
+                        xAxes: [{
+                                type: 'linear',
+                                scaleLabel: {
+                                    display: true,
+                                    labelString: 'Context Window Size (K tokens)',
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 14
+                                },
+                                ticks: {
+                                    callback: (value) => value + 'K',
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 12
+                                },
+                                gridLines: {
+                                    color: 'rgba(255, 255, 255, 0.2)',
+                                    zeroLineColor: 'rgba(255, 255, 255, 0.4)'
+                                }
+                            }],
+                        yAxes: [{
+                                scaleLabel: {
+                                    display: true,
+                                    labelString: 'Overall Score',
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 14
+                                },
+                                ticks: {
+                                    beginAtZero: false,
+                                    min: 0.3,
+                                    max: 1.0,
+                                    fontColor: '#FFFFFF',
+                                    fontSize: 12
+                                },
+                                gridLines: {
+                                    color: 'rgba(255, 255, 255, 0.2)',
+                                    zeroLineColor: 'rgba(255, 255, 255, 0.4)'
+                                }
+                            }]
+                    },
+                    legend: {
+                        display: false
+                    },
+                    tooltips: {
+                        backgroundColor: 'rgba(0, 0, 0, 0.8)',
+                        titleFontColor: '#FFFFFF',
+                        bodyFontColor: '#FFFFFF',
+                        callbacks: {
+                            label: (tooltipItem, data) => {
+                                const dataset = data.datasets[tooltipItem.datasetIndex];
+                                const point = dataset.data[tooltipItem.index];
+                                return `${point.label}: ${point.y.toFixed(3)} (${Math.round(point.x)}K tokens)`;
+                            }
+                        }
+                    }
+                }
+            };
+            const outputPath = path.join(this.outputDir, 'context-window-correlation.png');
+            await this.downloadChart(chartConfig, outputPath, 1400, 700);
+            return {
+                success: true,
+                graphPath: outputPath
+            };
+        }
+        catch (error) {
+            return {
+                success: false,
+                error: `Failed to generate context window correlation graph: ${error}`
+            };
+        }
+    }
+    /**
+     * Downloads a chart from QuickChart.io API and saves it as PNG
+     */
+    async downloadChart(chartConfig, outputPath, width = 1000, height = 600) {
+        return new Promise((resolve, reject) => {
+            const chartJson = JSON.stringify(chartConfig);
+            const url = `${this.quickchartBaseUrl}?c=${encodeURIComponent(chartJson)}&width=${width}&height=${height}&format=png&backgroundColor=black`;
+            https.get(url, (response) => {
+                if (response.statusCode !== 200) {
+                    reject(new Error(`QuickChart API returned status ${response.statusCode}`));
+                    return;
+                }
+                const fileStream = fs.createWriteStream(outputPath);
+                response.pipe(fileStream);
+                fileStream.on('finish', () => {
+                    fileStream.close();
+                    console.log(`✅ Graph saved: ${outputPath}`);
+                    resolve();
+                });
+                fileStream.on('error', (err) => {
+                    fs.unlink(outputPath, () => { }); // Clean up partial file
+                    reject(err);
+                });
+            }).on('error', (err) => {
+                reject(err);
+            });
+        });
+    }
+    /**
+     * Cleans model names by removing provider prefixes
+     */
+    cleanModelName(modelId) {
+        // Remove "vercel_" prefix and timestamp suffix
+        return modelId
+            .replace(/^vercel_/, '')
+            .replace(/_\d{4}-\d{2}-\d{2}$/, '')
+            .replace(/_/g, '-');
+    }
+    /**
+     * Returns a consistent color for each tool index (supports up to 10 tools)
+     */
+    getToolColor(index) {
+        const colors = [
+            'rgba(255, 99, 132, 0.8)', // Red
+            'rgba(54, 162, 235, 0.8)', // Blue
+            'rgba(255, 206, 86, 0.8)', // Yellow
+            'rgba(75, 192, 192, 0.8)', // Green
+            'rgba(153, 102, 255, 0.8)', // Purple
+            'rgba(255, 159, 64, 0.8)', // Orange
+            'rgba(199, 199, 199, 0.8)', // Grey
+            'rgba(83, 102, 255, 0.8)', // Indigo
+            'rgba(255, 99, 255, 0.8)', // Pink
+            'rgba(99, 255, 132, 0.8)' // Light Green
+        ];
+        return colors[index % colors.length];
+    }
+}
+exports.GraphGenerator = GraphGenerator;

package/dist/evaluation/metadata-loader.d.ts ADDED Viewed

@@ -0,0 +1,39 @@
+/**
+ * Shared Metadata Loader
+ *
+ * Provides consistent access to model and tool metadata across all evaluators
+ */
+export interface ModelMetadata { // metadata for one model; stored under its model id in EvaluationMetadata.models
+    provider: string; // printed in parentheses after the model id by buildModelPricingContext
+    pricing: {
+        input_cost_per_million_tokens: number; // printed as "$<n> input" (two decimals) per 1M tokens
+        output_cost_per_million_tokens: number; // printed as "$<n> output" (two decimals) per 1M tokens
+    };
+    context_window: number; // token count; printed divided by 1000 as "<n>K tokens"
+    supports_function_calling: boolean;
+}
+export interface ToolMetadata { // metadata for one evaluated tool; rendered into prompt text by buildToolContext
+    name: string; // shown in the "Tool Being Evaluated" heading
+    description: string;
+    primaryFunction: string;
+    testTimeout: string; // free-form text shown as the "Test Timeout Constraint"
+    successCriteria: string[]; // rendered as one markdown bullet per entry
+    modelRequirements: Record<string, string>; // rendered as "- **key**: value" bullets
+}
+export interface EvaluationMetadata { // shape returned by loadEvaluationMetadata
+    models: Record<string, ModelMetadata>; // keyed by model id
+    tools: Record<string, ToolMetadata>; // keyed by tool name
+}
+/**
+ * Load model and tool metadata from model-metadata.json
+ */
+export declare function loadEvaluationMetadata(): EvaluationMetadata;
+/**
+ * Build model pricing context for evaluation prompts
+ */
+export declare function buildModelPricingContext(models: Record<string, ModelMetadata>): string;
+/**
+ * Build tool context for evaluation prompts (tool-specific description and constraints)
+ */
+export declare function buildToolContext(toolName: string, tools: Record<string, ToolMetadata>): string;
+//# sourceMappingURL=metadata-loader.d.ts.map

package/dist/evaluation/metadata-loader.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"metadata-loader.d.ts","sourceRoot":"","sources":["../../src/evaluation/metadata-loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAKH,MAAM,WAAW,aAAa;IAC5B,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE;QACP,6BAA6B,EAAE,MAAM,CAAC;QACtC,8BAA8B,EAAE,MAAM,CAAC;KACxC,CAAC;IACF,cAAc,EAAE,MAAM,CAAC;IACvB,yBAAyB,EAAE,OAAO,CAAC;CACpC;AAED,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,EAAE,MAAM,CAAC;IACxB,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,iBAAiB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAC3C;AAED,MAAM,WAAW,kBAAkB;IACjC,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,aAAa,CAAC,CAAC;IACtC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;CACrC;AAED;;GAEG;AACH,wBAAgB,sBAAsB,IAAI,kBAAkB,CAa3D;AAED;;GAEG;AACH,wBAAgB,wBAAwB,CAAC,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,aAAa,CAAC,GAAG,MAAM,CAkBtF;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC,GAAG,MAAM,CAqB9F"}

package/dist/evaluation/metadata-loader.js ADDED Viewed

@@ -0,0 +1,74 @@
+"use strict";
+/**
+ * Shared Metadata Loader
+ *
+ * Provides consistent access to model and tool metadata across all evaluators
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.loadEvaluationMetadata = loadEvaluationMetadata;
+exports.buildModelPricingContext = buildModelPricingContext;
+exports.buildToolContext = buildToolContext;
+const fs_1 = require("fs");
+const path_1 = require("path");
+/**
+ * Load model and tool metadata from model-metadata.json
+ */
+function loadEvaluationMetadata() {
+    try {
+        const metadataPath = (0, path_1.join)(process.cwd(), 'src', 'evaluation', 'model-metadata.json');
+        const metadata = JSON.parse((0, fs_1.readFileSync)(metadataPath, 'utf8'));
+        console.log(`✅ Loaded metadata for ${Object.keys(metadata.models || {}).length} models and ${Object.keys(metadata.tools || {}).length} tools`);
+        return {
+            models: metadata.models || {},
+            tools: metadata.tools || {}
+        };
+    }
+    catch (error) {
+        console.warn('⚠️  Failed to load evaluation metadata:', error);
+        return { models: {}, tools: {} };
+    }
+}
+/**
+ * Build model pricing context for evaluation prompts
+ */
+function buildModelPricingContext(models) {
+    const modelIds = Object.keys(models);
+    if (modelIds.length === 0) {
+        return 'No pricing information available.';
+    }
+    const pricingLines = modelIds.map(modelId => {
+        const model = models[modelId];
+        const inputCost = model.pricing?.input_cost_per_million_tokens?.toFixed(2) || 'N/A';
+        const outputCost = model.pricing?.output_cost_per_million_tokens?.toFixed(2) || 'N/A';
+        const avgCost = model.pricing
+            ? ((model.pricing.input_cost_per_million_tokens + model.pricing.output_cost_per_million_tokens) / 2).toFixed(2)
+            : 'N/A';
+        const contextWindow = model.context_window ? `${(model.context_window / 1000).toFixed(0)}K` : 'N/A';
+        return `- **${modelId}** (${model.provider}): $${avgCost}/1M tokens ($${inputCost} input, $${outputCost} output) | Context: ${contextWindow} tokens`;
+    });
+    return `## Model Pricing Information\n\n${pricingLines.join('\n')}`;
+}
+/**
+ * Build tool context for evaluation prompts (tool-specific description and constraints)
+ */
+function buildToolContext(toolName, tools) {
+    const tool = tools[toolName];
+    if (!tool) {
+        return `No metadata available for tool: ${toolName}`;
+    }
+    return `## Tool Being Evaluated: ${tool.name}
+**Description**: ${tool.description}
+**Primary Function**: ${tool.primaryFunction}
+**Test Timeout Constraint**: ${tool.testTimeout}
+**Success Criteria**:
+${tool.successCriteria.map((c) => `- ${c}`).join('\n')}
+**Model Requirements**:
+${Object.entries(tool.modelRequirements).map(([key, value]) => `- **${key}**: ${value}`).join('\n')}
+**IMPORTANT**: When analyzing model failures, consider whether the model exceeded the timeout constraint. Models that timeout should be noted as failing due to timeout constraints rather than quality issues.`;
+}

package/dist/evaluation/platform-synthesizer.d.ts CHANGED Viewed

@@ -35,7 +35,7 @@ export declare class PlatformSynthesizer {
     private aiProvider;
     private reportsDir;
     constructor(aiProvider: VercelProvider, reportsDir?: string);
-    generatePlatformWideAnalysis(): Promise<string>;
+    generatePlatformWideAnalysis(graphsToGenerate?: string[], skipReport?: boolean): Promise<string>;
     private loadToolMetadata;
     private loadAllReports;
     private analyzeCrossToolPerformance;
@@ -49,6 +49,10 @@ export declare class PlatformSynthesizer {
     private generateProductionRecommendations;
     private calculateCostEstimate;
     private extractBaseModelId;
+    /**
+     * Generates graphs and replaces placeholders in the markdown report
+     */
+    private addGraphsToReport;
     saveSynthesisReport(markdownContent: string, outputPath?: string): Promise<void>;
 }
 //# sourceMappingURL=platform-synthesizer.d.ts.map

package/dist/evaluation/platform-synthesizer.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"platform-synthesizer.d.ts","sourceRoot":"","sources":["../../src/evaluation/platform-synthesizer.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;~~AAEtE~~,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACnC,YAAY,EAAE,MAAM,CAAC;IACrB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,gBAAgB,EAAE,MAAM,CAAC;IACzB,sBAAsB,EAAE,MAAM,CAAC;IAC/B,OAAO,EAAE;QACP,6BAA6B,EAAE,MAAM,CAAC;QACtC,8BAA8B,EAAE,MAAM,CAAC;KACxC,CAAC;IACF,YAAY,EAAE;QACZ,cAAc,EAAE,MAAM,CAAC;QACvB,yBAAyB,EAAE,OAAO,CAAC;KACpC,CAAC;CACH;AAED,MAAM,WAAW,cAAc;IAC7B,cAAc,EAAE,gBAAgB,EAAE,CAAC;IACnC,cAAc,EAAE,gBAAgB,EAAE,CAAC;IACnC,aAAa,EAAE,gBAAgB,EAAE,CAAC;IAClC,QAAQ,EAAE,gBAAgB,EAAE,CAAC;IAC7B,kBAAkB,EAAE,gBAAgB,EAAE,CAAC;CACxC;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,EAAE,eAAe,GAAG,aAAa,GAAG,YAAY,GAAG,UAAU,CAAC;IACtE,YAAY,EAAE,MAAM,CAAC;IACrB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,UAAU,CAAiB;IACnC,OAAO,CAAC,UAAU,CAAS;gBAEf,UAAU,EAAE,cAAc,EAAE,UAAU,SAA+B;IAK3E,4BAA4B,~~IAAI~~,OAAO,CAAC,MAAM,CAAC;~~IA8BrD~~,OAAO,CAAC,gBAAgB;~~YAaV~~,cAAc;~~YA0Bd~~,2BAA2B;IA8DzC,OAAO,CAAC,0BAA0B;IA2ElC,OAAO,CAAC,wBAAwB;IAiDhC,OAAO,CAAC,4BAA4B;YA0CtB,wBAAwB;IAoBtC,OAAO,CAAC,kBAAkB;IAe1B,OAAO,CAAC,oBAAoB;IAsB5B,OAAO,CAAC,yBAAyB;IAWjC,OAAO,CAAC,iCAAiC;IASzC,OAAO,CAAC,qBAAqB;IAQ7B,OAAO,CAAC,kBAAkB;~~IASpB~~,mBAAmB,CACvB,eAAe,EAAE,MAAM,EACvB,UAAU,SAAiD,GAC1D,OAAO,CAAC,IAAI,CAAC;CAWjB"}
1	+ {"version":3,"file":"platform-synthesizer.d.ts","sourceRoot":"","sources":["../../src/evaluation/platform-synthesizer.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,cAAc,EAAE,MAAM,sCAAsC,CAAC;AAItE,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACnC,YAAY,EAAE,MAAM,CAAC;IACrB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,gBAAgB,EAAE,MAAM,CAAC;IACzB,sBAAsB,EAAE,MAAM,CAAC;IAC/B,OAAO,EAAE;QACP,6BAA6B,EAAE,MAAM,CAAC;QACtC,8BAA8B,EAAE,MAAM,CAAC;KACxC,CAAC;IACF,YAAY,EAAE;QACZ,cAAc,EAAE,MAAM,CAAC;QACvB,yBAAyB,EAAE,OAAO,CAAC;KACpC,CAAC;CACH;AAED,MAAM,WAAW,cAAc;IAC7B,cAAc,EAAE,gBAAgB,EAAE,CAAC;IACnC,cAAc,EAAE,gBAAgB,EAAE,CAAC;IACnC,aAAa,EAAE,gBAAgB,EAAE,CAAC;IAClC,QAAQ,EAAE,gBAAgB,EAAE,CAAC;IAC7B,kBAAkB,EAAE,gBAAgB,EAAE,CAAC;CACxC;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,EAAE,eAAe,GAAG,aAAa,GAAG,YAAY,GAAG,UAAU,CAAC;IACtE,YAAY,EAAE,MAAM,CAAC;IACrB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,gBAAgB,EAAE,MAAM,CAAC;IACzB,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED,qBAAa,mBAAmB;IAC9B,OAAO,CAAC,UAAU,CAAiB;IACnC,OAAO,CAAC,UAAU,CAAS;gBAEf,UAAU,EAAE,cAAc,EAAE,UAAU,SAA+B;IAK3E,4BAA4B,CAAC,gBAAgB,CAAC,EAAE,MAAM,EAAE,EAAE,UAAU,UAAQ,GAAG,OAAO,CAAC,MAAM,CAAC;IA6CpG,OAAO,CAAC,gBAAgB;YAKV,cAAc;YAyBd,2BAA2B;IA8DzC,OAAO,CAAC,0BAA0B;IA2ElC,OAAO,CAAC,wBAAwB;IAiDhC,OAAO,CAAC,4BAA4B;YA0CtB,wBAAwB;IAoBtC,OAAO,CAAC,kBAAkB;IAe1B,OAAO,CAAC,oBAAoB;IAsB5B,OAAO,CAAC,yBAAyB;IAWjC,OAAO,CAAC,iCAAiC;IASzC,OAAO,CAAC,qBAAqB;IAQ7B,OAAO,CAAC,kBAAkB;IAS1B;;OAEG;YACW,iBAAiB;IA8CzB,mBAAmB,CACvB,eAAe,EAAE,MAAM,EACvB,UAAU,SAAiD,GAC1D,OAAO,CAAC,IAAI,CAAC;CAWjB"}

package/dist/evaluation/platform-synthesizer.js CHANGED Viewed

@@ -36,6 +36,8 @@ Object.defineProperty(exports, "__esModule", { value: true });
 exports.PlatformSynthesizer = void 0;
 const fs = __importStar(require("fs"));
 const path = __importStar(require("path"));
+const graph_generator_js_1 = require("./graph-generator.js");
+const metadata_loader_js_1 = require("./metadata-loader.js");
 class PlatformSynthesizer {
     aiProvider;
     reportsDir;
@@ -43,48 +45,48 @@ class PlatformSynthesizer {
         this.aiProvider = aiProvider;
         this.reportsDir = reportsDir;
     }
-    async generatePlatformWideAnalysis() {
+    async generatePlatformWideAnalysis(graphsToGenerate, skipReport = false) {
         console.log('🔍 Loading all evaluation reports...');
         const allReports = await this.loadAllReports();
         console.log('🔧 Loading tool metadata...');
         const toolMetadata = this.loadToolMetadata();
         console.log('📊 Analyzing cross-tool performance patterns...');
         const crossToolAnalysis = await this.analyzeCrossToolPerformance(allReports);
-        console.log('🎯 Generating decision matrices...');
-        const decisionMatrices = this.generateDecisionMatrices(crossToolAnalysis.modelPerformances);
-        console.log('💡 Creating usage recommendations...');
-        const usageRecommendations = this.generateUsageRecommendations(crossToolAnalysis, decisionMatrices);
-        console.log('🚀 Generating comprehensive AI-powered report...');
-        const markdownReport = await this.generatePlatformInsights(crossToolAnalysis, decisionMatrices, usageRecommendations, toolMetadata);
-        return markdownReport;
-    }
-    loadToolMetadata() {
-        try {
-            const metadataPath = path.join(process.cwd(), 'src', 'evaluation', 'model-metadata.json');
-            const metadataContent = fs.readFileSync(metadataPath, 'utf8');
-            const metadata = JSON.parse(metadataContent);
-            console.log(`✅ Loaded tool metadata with ${Object.keys(metadata.tools || {}).length} tools`);
-            return metadata.tools || {};
+        let markdownReport;
+        if (skipReport) {
+            console.log('⏭️  Skipping AI report generation...');
+            // Return empty string if we're only generating graphs
+            markdownReport = '';
         }
-        catch (error) {
-            console.warn('⚠️  Failed to load tool metadata, proceeding without it:', error);
-            return {};
+        else {
+            console.log('🎯 Generating decision matrices...');
+            const decisionMatrices = this.generateDecisionMatrices(crossToolAnalysis.modelPerformances);
+            console.log('💡 Creating usage recommendations...');
+            const usageRecommendations = this.generateUsageRecommendations(crossToolAnalysis, decisionMatrices);
+            console.log('🚀 Generating comprehensive AI-powered report...');
+            markdownReport = await this.generatePlatformInsights(crossToolAnalysis, decisionMatrices, usageRecommendations, toolMetadata);
         }
+        console.log('📊 Generating data visualizations...');
+        const reportWithGraphs = await this.addGraphsToReport(markdownReport, crossToolAnalysis.modelPerformances, graphsToGenerate);
+        return reportWithGraphs;
+    }
+    loadToolMetadata() {
+        const metadata = (0, metadata_loader_js_1.loadEvaluationMetadata)();
+        return { tools: metadata.tools };
     }
     async loadAllReports() {
         const reports = {};
         // Load all JSON result files from the directory
         const reportFiles = fs.readdirSync(this.reportsDir)
-            .filter(file => file.endsWith('-results-*.json') || file.includes('-results-'))
-            .filter(file => file.endsWith('.json'));
+            .filter(file => file.endsWith('-results.json'));
         if (reportFiles.length === 0) {
             throw new Error(`No evaluation result files found in ${this.reportsDir}`);
         }
         for (const fileName of reportFiles) {
             const reportPath = path.join(this.reportsDir, fileName);
             const reportContent = JSON.parse(fs.readFileSync(reportPath, 'utf8'));
-            // Extract tool type from filename (e.g., "capability-results-2025-10-15.json" -> "capability")
-            const toolType = fileName.split('-results-')[0];
+            // Extract tool type from filename (e.g., "capability-results.json" -> "capability")
+            const toolType = fileName.split('-results.json')[0];
             reports[toolType] = reportContent;
             console.log(`✅ Loaded ${toolType} report: ${fileName}`);
         }
@@ -355,6 +357,46 @@ class PlatformSynthesizer {
         }
         return fullModelId;
     }
+    /**
+     * Generates graphs and replaces placeholders in the markdown report
+     */
+    async addGraphsToReport(markdownContent, modelPerformances, graphsToGenerate) {
+        const graphGenerator = new graph_generator_js_1.GraphGenerator('./eval/analysis/platform/graphs');
+        try {
+            // Generate all or specific graphs
+            const graphResults = await graphGenerator.generateAllGraphs(modelPerformances, graphsToGenerate);
+            // Replace placeholders with actual image markdown
+            let updatedMarkdown = markdownContent;
+            const graphMappings = {
+                '[GRAPH:performance-tiers]': '![Performance Tiers](./graphs/performance-tiers.png)',
+                '[GRAPH:cost-vs-quality]': '![Cost vs Quality](./graphs/cost-vs-quality.png)',
+                '[GRAPH:reliability-comparison]': '![Reliability Comparison](./graphs/reliability-comparison.png)',
+                '[GRAPH:tool-performance-heatmap]': '![Tool Performance Heatmap](./graphs/tool-performance-heatmap.png)',
+                '[GRAPH:context-window-correlation]': '![Context Window Correlation](./graphs/context-window-correlation.png)'
+            };
+            for (const [placeholder, imageMarkdown] of Object.entries(graphMappings)) {
+                updatedMarkdown = updatedMarkdown.replace(placeholder, imageMarkdown);
+            }
+            // Log graph generation results
+            for (const [graphName, result] of Object.entries(graphResults)) {
+                if (result.success) {
+                    console.log(`  ✅ ${graphName}: ${result.graphPath}`);
+                }
+                else {
+                    console.warn(`  ⚠️  ${graphName}: ${result.error}`);
+                    // If graph generation failed, remove the placeholder to avoid broken markdown
+                    const placeholderKey = `[GRAPH:${graphName}]`;
+                    updatedMarkdown = updatedMarkdown.replace(placeholderKey, `*Graph generation failed: ${result.error}*`);
+                }
+            }
+            return updatedMarkdown;
+        }
+        catch (error) {
+            console.error('⚠️  Failed to generate graphs, returning report without visualizations:', error);
+            // If graph generation completely fails, remove all placeholders
+            return markdownContent.replace(/\[GRAPH:[^\]]+\]/g, '*Graph generation failed*');
+        }
+    }
     async saveSynthesisReport(markdownContent, outputPath = './eval/analysis/platform/synthesis-report.md') {
         const dir = path.dirname(outputPath);
         if (!fs.existsSync(dir)) {

package/dist/evaluation/run-platform-synthesis.js CHANGED Viewed

@@ -13,6 +13,21 @@ const model_config_js_1 = require("../core/model-config.js");
 async function runPlatformSynthesis() {
     console.log('🚀 Starting Platform-Wide AI Model Synthesis...\n');
     try {
+        // Parse command line arguments for graph filtering
+        const args = process.argv.slice(2);
+        let graphsToGenerate;
+        let skipReport = false;
+        if (args.length > 0) {
+            const graphArg = args.find(arg => arg.startsWith('--graphs='));
+            if (graphArg) {
+                graphsToGenerate = graphArg.split('=')[1].split(',');
+                console.log(`📊 Generating specific graphs: ${graphsToGenerate.join(', ')}\n`);
+            }
+            skipReport = args.includes('--skip-report');
+            if (skipReport) {
+                console.log('⏭️  Skipping AI report generation (graphs only)\n');
+            }
+        }
         // Initialize AI provider for synthesis analysis (use Claude for comprehensive analysis)
         const aiProvider = new vercel_provider_js_1.VercelProvider({
             provider: 'anthropic',
@@ -22,12 +37,14 @@ async function runPlatformSynthesis() {
         });
         // Initialize synthesizer
         const synthesizer = new platform_synthesizer_js_1.PlatformSynthesizer(aiProvider);
-        // Generate comprehensive platform-wide analysis
+        // Generate comprehensive platform-wide analysis (or just graphs if skip-report is set)
         console.log('📊 Generating platform-wide analysis...');
-        const markdownReport = await synthesizer.generatePlatformWideAnalysis();
-        // Save synthesis report
-        console.log('\n💾 Saving synthesis report...');
-        await synthesizer.saveSynthesisReport(markdownReport);
+        const markdownReport = await synthesizer.generatePlatformWideAnalysis(graphsToGenerate, skipReport);
+        // Save synthesis report only if we generated it
+        if (!skipReport) {
+            console.log('\n💾 Saving synthesis report...');
+            await synthesizer.saveSynthesisReport(markdownReport);
+        }
         console.log('\n✅ Platform-wide synthesis complete!');
         console.log('📄 Report saved: ./eval/analysis/platform/synthesis-report.md');
         console.log('\n✨ AI-generated comprehensive report includes:');

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@vfarcic/dot-ai",
-  "version": "0.115.0",
+  "version": "0.116.0",
   "description": "AI-powered development productivity platform that enhances software development workflows through intelligent automation and AI-driven assistance",
   "mcpName": "io.github.vfarcic/dot-ai",
   "main": "dist/index.js",