@vfarcic/dot-ai 0.111.0 → 0.112.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/ai-provider-factory.d.ts +5 -0
- package/dist/core/ai-provider-factory.d.ts.map +1 -1
- package/dist/core/ai-provider-factory.js +13 -2
- package/dist/core/ai-provider.interface.d.ts +16 -1
- package/dist/core/ai-provider.interface.d.ts.map +1 -1
- package/dist/core/capabilities.d.ts +1 -1
- package/dist/core/capabilities.d.ts.map +1 -1
- package/dist/core/capabilities.js +7 -4
- package/dist/core/capability-scan-workflow.js +2 -2
- package/dist/core/model-config.d.ts +17 -0
- package/dist/core/model-config.d.ts.map +1 -0
- package/dist/core/model-config.js +22 -0
- package/dist/core/platform-operations.d.ts.map +1 -1
- package/dist/core/platform-operations.js +3 -5
- package/dist/core/platform-utils.d.ts +3 -2
- package/dist/core/platform-utils.d.ts.map +1 -1
- package/dist/core/platform-utils.js +35 -9
- package/dist/core/providers/anthropic-provider.d.ts +4 -1
- package/dist/core/providers/anthropic-provider.d.ts.map +1 -1
- package/dist/core/providers/anthropic-provider.js +89 -27
- package/dist/core/providers/provider-debug-utils.d.ts +49 -20
- package/dist/core/providers/provider-debug-utils.d.ts.map +1 -1
- package/dist/core/providers/provider-debug-utils.js +117 -51
- package/dist/core/providers/vercel-provider.d.ts +4 -1
- package/dist/core/providers/vercel-provider.d.ts.map +1 -1
- package/dist/core/providers/vercel-provider.js +105 -114
- package/dist/core/schema.d.ts +1 -5
- package/dist/core/schema.d.ts.map +1 -1
- package/dist/core/schema.js +16 -42
- package/dist/core/unified-creation-session.d.ts.map +1 -1
- package/dist/core/unified-creation-session.js +12 -6
- package/dist/evaluation/dataset-analyzer.d.ts +118 -0
- package/dist/evaluation/dataset-analyzer.d.ts.map +1 -0
- package/dist/evaluation/dataset-analyzer.js +234 -0
- package/dist/evaluation/datasets/loader.d.ts +42 -0
- package/dist/evaluation/datasets/loader.d.ts.map +1 -0
- package/dist/evaluation/datasets/loader.js +104 -0
- package/dist/evaluation/eval-runner.d.ts +9 -0
- package/dist/evaluation/eval-runner.d.ts.map +1 -0
- package/dist/evaluation/eval-runner.js +255 -0
- package/dist/evaluation/evaluators/base-comparative.d.ts +91 -0
- package/dist/evaluation/evaluators/base-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/base-comparative.js +152 -0
- package/dist/evaluation/evaluators/base.d.ts +47 -0
- package/dist/evaluation/evaluators/base.d.ts.map +1 -0
- package/dist/evaluation/evaluators/base.js +10 -0
- package/dist/evaluation/evaluators/capability-comparative.d.ts +32 -0
- package/dist/evaluation/evaluators/capability-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/capability-comparative.js +104 -0
- package/dist/evaluation/evaluators/pattern-comparative.d.ts +31 -0
- package/dist/evaluation/evaluators/pattern-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/pattern-comparative.js +97 -0
- package/dist/evaluation/evaluators/policy-comparative.d.ts +31 -0
- package/dist/evaluation/evaluators/policy-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/policy-comparative.js +97 -0
- package/dist/evaluation/evaluators/recommendation-comparative.d.ts +25 -0
- package/dist/evaluation/evaluators/recommendation-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/recommendation-comparative.js +55 -0
- package/dist/evaluation/evaluators/remediation-comparative.d.ts +25 -0
- package/dist/evaluation/evaluators/remediation-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/remediation-comparative.js +54 -0
- package/dist/interfaces/rest-api.d.ts.map +1 -1
- package/dist/tools/answer-question.d.ts +2 -0
- package/dist/tools/answer-question.d.ts.map +1 -1
- package/dist/tools/answer-question.js +18 -11
- package/dist/tools/generate-manifests.d.ts +2 -0
- package/dist/tools/generate-manifests.d.ts.map +1 -1
- package/dist/tools/generate-manifests.js +8 -4
- package/dist/tools/organizational-data.d.ts +1 -0
- package/dist/tools/organizational-data.d.ts.map +1 -1
- package/dist/tools/organizational-data.js +2 -1
- package/dist/tools/recommend.d.ts +1 -0
- package/dist/tools/recommend.d.ts.map +1 -1
- package/dist/tools/recommend.js +10 -5
- package/dist/tools/remediate.d.ts +3 -0
- package/dist/tools/remediate.d.ts.map +1 -1
- package/dist/tools/remediate.js +25 -12
- package/dist/tools/test-docs.d.ts +1 -0
- package/dist/tools/test-docs.d.ts.map +1 -1
- package/dist/tools/test-docs.js +4 -2
- package/dist/tools/version.d.ts +4 -1
- package/dist/tools/version.d.ts.map +1 -1
- package/dist/tools/version.js +12 -4
- package/package.json +5 -1
|
@@ -14,32 +14,52 @@ export declare function ensureDebugDirectory(): string;
|
|
|
14
14
|
*/
|
|
15
15
|
export declare function generateDebugId(operation: string): string;
|
|
16
16
|
/**
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
* PRD #143 Decision 5: Extended metrics for model comparison analysis
|
|
17
|
+
* Unified evaluation metrics entry for AI quality assessment and performance tracking
|
|
18
|
+
* PRD #154: Single interface for all metrics and evaluation data
|
|
20
19
|
*/
|
|
21
|
-
export
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
20
|
+
export interface EvaluationMetrics {
|
|
21
|
+
operation: string;
|
|
22
|
+
sdk: string;
|
|
23
|
+
inputTokens: number;
|
|
24
|
+
outputTokens: number;
|
|
25
|
+
durationMs: number;
|
|
26
|
+
iterationCount: number;
|
|
27
|
+
toolCallCount: number;
|
|
28
|
+
status: string;
|
|
29
|
+
completionReason: string;
|
|
30
|
+
modelVersion: string;
|
|
31
|
+
cacheCreationTokens?: number;
|
|
32
|
+
cacheReadTokens?: number;
|
|
33
|
+
cacheHitRate?: number;
|
|
34
|
+
uniqueToolsUsed?: string[];
|
|
35
|
+
test_scenario: string;
|
|
36
|
+
ai_response_summary: string;
|
|
37
|
+
debug_files?: {
|
|
38
|
+
full_prompt: string;
|
|
39
|
+
full_response: string;
|
|
27
40
|
};
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
41
|
+
user_intent: string;
|
|
42
|
+
interaction_id: string;
|
|
43
|
+
failure_analysis?: string;
|
|
44
|
+
}
|
|
45
|
+
/**
|
|
46
|
+
* Determine if dataset generation should be skipped for specific operations
|
|
47
|
+
*/
|
|
48
|
+
export declare function shouldSkipDatasetGeneration(operation: string): boolean;
|
|
49
|
+
/**
|
|
50
|
+
* Log unified evaluation metrics when DEBUG_DOT_AI=true
|
|
51
|
+
* Single function for all metrics and evaluation data capture
|
|
52
|
+
*/
|
|
53
|
+
/**
|
|
54
|
+
* Generate eval dataset entry in standard OpenAI Evals format
|
|
55
|
+
* Logs evaluation metrics to JSONL dataset files for AI quality assessment
|
|
56
|
+
*/
|
|
57
|
+
export declare function logEvaluationDataset(metrics: EvaluationMetrics, debugMode?: boolean): void;
|
|
38
58
|
/**
|
|
39
59
|
* Create AgenticResult and log metrics in one step
|
|
40
60
|
* Reduces code duplication across providers
|
|
41
61
|
*
|
|
42
|
-
* PRD #
|
|
62
|
+
* PRD #154: Updated to use unified evaluation metrics
|
|
43
63
|
*/
|
|
44
64
|
export declare function createAndLogAgenticResult(config: {
|
|
45
65
|
finalMessage: string;
|
|
@@ -62,6 +82,15 @@ export declare function createAndLogAgenticResult(config: {
|
|
|
62
82
|
sdk: string;
|
|
63
83
|
startTime: number;
|
|
64
84
|
debugMode: boolean;
|
|
85
|
+
debugFiles?: {
|
|
86
|
+
promptFile: string;
|
|
87
|
+
responseFile: string;
|
|
88
|
+
} | null;
|
|
89
|
+
evaluationContext?: {
|
|
90
|
+
user_intent?: string;
|
|
91
|
+
failure_analysis?: string;
|
|
92
|
+
};
|
|
93
|
+
interaction_id?: string;
|
|
65
94
|
}): AgenticResult;
|
|
66
95
|
/**
|
|
67
96
|
* Save AI interaction for debugging when DEBUG_DOT_AI=true
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"provider-debug-utils.d.ts","sourceRoot":"","sources":["../../../src/core/providers/provider-debug-utils.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAKH,OAAO,EAAE,UAAU,EAAE,aAAa,EAAE,MAAM,0BAA0B,CAAC;AAErE;;GAEG;AACH,wBAAgB,oBAAoB,IAAI,MAAM,CAM7C;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAKzD;AAED
|
|
1
|
+
{"version":3,"file":"provider-debug-utils.d.ts","sourceRoot":"","sources":["../../../src/core/providers/provider-debug-utils.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAKH,OAAO,EAAE,UAAU,EAAE,aAAa,EAAE,MAAM,0BAA0B,CAAC;AAErE;;GAEG;AACH,wBAAgB,oBAAoB,IAAI,MAAM,CAM7C;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAKzD;AAED;;;GAGG;AACH,MAAM,WAAW,iBAAiB;IAEhC,SAAS,EAAE,MAAM,CAAC;IAClB,GAAG,EAAE,MAAM,CAAC;IACZ,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IAGnB,cAAc,EAAE,MAAM,CAAC;IACvB,aAAa,EAAE,MAAM,CAAC;IACtB,MAAM,EAAE,MAAM,CAAC;IACf,gBAAgB,EAAE,MAAM,CAAC;IACzB,YAAY,EAAE,MAAM,CAAC;IAGrB,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAG3B,aAAa,EAAE,MAAM,CAAC;IACtB,mBAAmB,EAAE,MAAM,CAAC;IAC5B,WAAW,CAAC,EAAE;QACZ,WAAW,EAAE,MAAM,CAAC;QACpB,aAAa,EAAE,MAAM,CAAC;KACvB,CAAC;IAGF,WAAW,EAAE,MAAM,CAAC;IACpB,cAAc,EAAE,MAAM,CAAC;IAGvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED;;GAEG;AACH,wBAAgB,2BAA2B,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAGtE;AAED;;;GAGG;AACH;;;GAGG;AACH,wBAAgB,oBAAoB,CAClC,OAAO,EAAE,iBAAiB,EAC1B,SAAS,GAAE,OAAe,GACzB,IAAI,CAiFN;AAGD;;;;;GAKG;AACH,wBAAgB,yBAAyB,CAAC,MAAM,EAAE;IAChD,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,iBAAiB,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,GAAG,CAAC;QAAC,MAAM,EAAE,GAAG,CAAA;KAAE,CAAC,CAAC;IACpE,WAAW,EAAE;QACX,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,EAAE,MAAM,CAAC;QACf,aAAa,EAAE,MAAM,CAAC;QACtB,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,MAAM,EAAE,SAAS,GAAG,QAAQ,GAAG,SAAS,GAAG,aAAa,CAAC;IACzD,gBAAgB,EAAE,wBAAwB,GAAG,gBAAgB,GAAG,eAAe,GAAG,eAAe,GAAG,OAAO,CAAC;IAC5G,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,GAAG,EAAE,MAAM,CAAC;IACZ,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,OAAO,CAAC;IACnB,UAAU,CAAC,EAAE;QAAE,UAAU,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,MAAM,CAAA;KAAE,GAAG,IAAI,CAAC;IAGjE,iBAAiB,CAAC,EAAE;QAClB,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,gBAAgB,CAAC,EAAE,MAAM,CAAC;KAC3B,CAAC;IAGF,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB,GAAG,aAAa,CAsDhB;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CACjC,OAAO,EAAE,MAAM,EACf,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,UAAU,EACpB,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,EAChB,KAAK,EAAE,MAAM,EACb,SAAS,EAAE,OAAO,GACjB,IAAI,CAkCN"}
|
|
@@ -41,7 +41,8 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
41
41
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
42
42
|
exports.ensureDebugDirectory = ensureDebugDirectory;
|
|
43
43
|
exports.generateDebugId = generateDebugId;
|
|
44
|
-
exports.
|
|
44
|
+
exports.shouldSkipDatasetGeneration = shouldSkipDatasetGeneration;
|
|
45
|
+
exports.logEvaluationDataset = logEvaluationDataset;
|
|
45
46
|
exports.createAndLogAgenticResult = createAndLogAgenticResult;
|
|
46
47
|
exports.debugLogInteraction = debugLogInteraction;
|
|
47
48
|
const fs = __importStar(require("fs"));
|
|
@@ -67,70 +68,103 @@ function generateDebugId(operation) {
|
|
|
67
68
|
return `${dateTime}_${randomHex}_${operation}`;
|
|
68
69
|
}
|
|
69
70
|
/**
|
|
70
|
-
*
|
|
71
|
-
*
|
|
72
|
-
* PRD #143 Decision 5: Extended metrics for model comparison analysis
|
|
71
|
+
* Determine if dataset generation should be skipped for specific operations
|
|
73
72
|
*/
|
|
74
|
-
function
|
|
73
|
+
function shouldSkipDatasetGeneration(operation) {
|
|
74
|
+
const skipDatasetOperations = ['version-connectivity-check', 'generic'];
|
|
75
|
+
return skipDatasetOperations.includes(operation);
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Log unified evaluation metrics when DEBUG_DOT_AI=true
|
|
79
|
+
* Single function for all metrics and evaluation data capture
|
|
80
|
+
*/
|
|
81
|
+
/**
|
|
82
|
+
* Generate eval dataset entry in standard OpenAI Evals format
|
|
83
|
+
* Logs evaluation metrics to JSONL dataset files for AI quality assessment
|
|
84
|
+
*/
|
|
85
|
+
function logEvaluationDataset(metrics, debugMode = false) {
|
|
75
86
|
if (!debugMode)
|
|
76
87
|
return;
|
|
88
|
+
// Skip dataset generation for non-evaluable operations
|
|
89
|
+
if (shouldSkipDatasetGeneration(metrics.test_scenario))
|
|
90
|
+
return;
|
|
77
91
|
try {
|
|
78
|
-
const
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
sdk,
|
|
83
|
-
operation,
|
|
84
|
-
inputTokens: result.totalTokens.input,
|
|
85
|
-
outputTokens: result.totalTokens.output,
|
|
86
|
-
durationMs
|
|
87
|
-
};
|
|
88
|
-
// Add cache metrics if present
|
|
89
|
-
if (result.totalTokens.cacheCreation !== undefined) {
|
|
90
|
-
entry.cacheCreationTokens = result.totalTokens.cacheCreation;
|
|
92
|
+
const evalDir = path.join(process.cwd(), 'eval', 'datasets');
|
|
93
|
+
// Ensure eval datasets directory exists
|
|
94
|
+
if (!fs.existsSync(evalDir)) {
|
|
95
|
+
fs.mkdirSync(evalDir, { recursive: true });
|
|
91
96
|
}
|
|
92
|
-
|
|
93
|
-
|
|
97
|
+
// Parse operation for tool name
|
|
98
|
+
const operationParts = metrics.operation.split('-');
|
|
99
|
+
const toolName = operationParts[0]; // e.g., "remediate"
|
|
100
|
+
// Check if this is a comparative evaluation
|
|
101
|
+
const isComparativeEvaluation = metrics.operation.includes('-comparative-');
|
|
102
|
+
let datasetFile;
|
|
103
|
+
const timestamp = new Date().toISOString().replace(/[:.]/g, '').split('T').join('_');
|
|
104
|
+
if (isComparativeEvaluation) {
|
|
105
|
+
// For comparative evaluations, don't include single model name since it compares multiple models
|
|
106
|
+
datasetFile = path.join(evalDir, `${toolName}_comparative_evaluation_${timestamp}.jsonl`);
|
|
94
107
|
}
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
108
|
+
else {
|
|
109
|
+
// Extract model name from modelVersion or sdk for single-model datasets
|
|
110
|
+
let modelName = 'unknown';
|
|
111
|
+
if (metrics.modelVersion) {
|
|
112
|
+
if (metrics.modelVersion.includes('sonnet')) {
|
|
113
|
+
modelName = 'sonnet';
|
|
114
|
+
}
|
|
115
|
+
else if (metrics.modelVersion.includes('gpt-5-pro')) {
|
|
116
|
+
modelName = 'gpt-pro';
|
|
117
|
+
}
|
|
118
|
+
else if (metrics.modelVersion.includes('gpt')) {
|
|
119
|
+
modelName = 'gpt';
|
|
120
|
+
}
|
|
121
|
+
else if (metrics.modelVersion.includes('gemini')) {
|
|
122
|
+
modelName = 'gemini';
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
// Create filename with interaction ID, SDK, model, and timestamp for single-model datasets
|
|
126
|
+
datasetFile = path.join(evalDir, `${toolName}_${metrics.interaction_id}_${metrics.sdk}_${modelName}_${timestamp}.jsonl`);
|
|
98
127
|
}
|
|
99
|
-
//
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
128
|
+
// Transform metrics into OpenAI Evals format (no ideal field - using model-graded evaluation)
|
|
129
|
+
const evalEntry = {
|
|
130
|
+
input: {
|
|
131
|
+
issue: metrics.user_intent || "Tool execution scenario"
|
|
132
|
+
},
|
|
133
|
+
output: metrics.ai_response_summary || "",
|
|
134
|
+
performance: {
|
|
135
|
+
duration_ms: metrics.durationMs,
|
|
136
|
+
input_tokens: metrics.inputTokens,
|
|
137
|
+
output_tokens: metrics.outputTokens,
|
|
138
|
+
total_tokens: metrics.inputTokens + metrics.outputTokens,
|
|
139
|
+
sdk: metrics.sdk,
|
|
140
|
+
model_version: metrics.modelVersion,
|
|
141
|
+
iterations: metrics.iterationCount,
|
|
142
|
+
tool_calls_executed: metrics.toolCallCount,
|
|
143
|
+
cache_read_tokens: metrics.cacheReadTokens || 0,
|
|
144
|
+
cache_creation_tokens: metrics.cacheCreationTokens || 0
|
|
145
|
+
},
|
|
146
|
+
metadata: {
|
|
147
|
+
timestamp: new Date().toISOString(),
|
|
148
|
+
complexity: "medium",
|
|
149
|
+
tags: ["troubleshooting"],
|
|
150
|
+
source: "integration_test",
|
|
151
|
+
tool: toolName,
|
|
152
|
+
test_scenario: metrics.test_scenario || `${toolName}_test`,
|
|
153
|
+
failure_analysis: metrics.failure_analysis || ""
|
|
154
|
+
}
|
|
155
|
+
};
|
|
156
|
+
fs.writeFileSync(datasetFile, JSON.stringify(evalEntry) + '\n');
|
|
157
|
+
console.log(`📊 Generated eval dataset: ${path.basename(datasetFile)} (${metrics.interaction_id}, ${metrics.durationMs}ms, ${metrics.inputTokens}+${metrics.outputTokens} tokens)`);
|
|
124
158
|
}
|
|
125
159
|
catch (error) {
|
|
126
|
-
console.
|
|
160
|
+
console.error(`❌ Failed to generate eval dataset for ${metrics.interaction_id} (${metrics.test_scenario}):`, error);
|
|
127
161
|
}
|
|
128
162
|
}
|
|
129
163
|
/**
|
|
130
164
|
* Create AgenticResult and log metrics in one step
|
|
131
165
|
* Reduces code duplication across providers
|
|
132
166
|
*
|
|
133
|
-
* PRD #
|
|
167
|
+
* PRD #154: Updated to use unified evaluation metrics
|
|
134
168
|
*/
|
|
135
169
|
function createAndLogAgenticResult(config) {
|
|
136
170
|
const result = {
|
|
@@ -144,7 +178,39 @@ function createAndLogAgenticResult(config) {
|
|
|
144
178
|
};
|
|
145
179
|
const durationMs = Date.now() - config.startTime;
|
|
146
180
|
if (config.debugMode) {
|
|
147
|
-
|
|
181
|
+
// PRD #154: Use unified evaluation metrics system
|
|
182
|
+
const evaluationMetrics = {
|
|
183
|
+
// Core execution data
|
|
184
|
+
operation: config.operation,
|
|
185
|
+
sdk: config.sdk,
|
|
186
|
+
inputTokens: config.totalTokens.input,
|
|
187
|
+
outputTokens: config.totalTokens.output,
|
|
188
|
+
durationMs,
|
|
189
|
+
// Required fields
|
|
190
|
+
iterationCount: config.iterations,
|
|
191
|
+
toolCallCount: config.toolCallsExecuted.length,
|
|
192
|
+
status: config.status,
|
|
193
|
+
completionReason: config.completionReason,
|
|
194
|
+
modelVersion: config.modelVersion,
|
|
195
|
+
// Required evaluation context - NO DEFAULTS, must be provided
|
|
196
|
+
test_scenario: config.operation,
|
|
197
|
+
ai_response_summary: config.finalMessage,
|
|
198
|
+
user_intent: config.evaluationContext?.user_intent || '', // Will be enhanced later by EvalDatasetEnhancer
|
|
199
|
+
interaction_id: config.interaction_id || '', // Will be enhanced later if missing
|
|
200
|
+
// Optional performance data
|
|
201
|
+
...(config.totalTokens.cacheCreation !== undefined && { cacheCreationTokens: config.totalTokens.cacheCreation }),
|
|
202
|
+
...(config.totalTokens.cacheRead !== undefined && { cacheReadTokens: config.totalTokens.cacheRead }),
|
|
203
|
+
...(config.toolCallsExecuted.length > 0 && {
|
|
204
|
+
uniqueToolsUsed: [...new Set(config.toolCallsExecuted.map(tc => tc.tool))]
|
|
205
|
+
}),
|
|
206
|
+
...(config.debugFiles && { debug_files: { full_prompt: config.debugFiles.promptFile, full_response: config.debugFiles.responseFile } }),
|
|
207
|
+
...(config.evaluationContext?.failure_analysis && { failure_analysis: config.evaluationContext.failure_analysis })
|
|
208
|
+
};
|
|
209
|
+
// Calculate cache hit rate if applicable
|
|
210
|
+
if (config.totalTokens.cacheRead !== undefined && config.totalTokens.input > 0) {
|
|
211
|
+
evaluationMetrics.cacheHitRate = Math.round((config.totalTokens.cacheRead / config.totalTokens.input) * 100);
|
|
212
|
+
}
|
|
213
|
+
logEvaluationDataset(evaluationMetrics, config.debugMode);
|
|
148
214
|
}
|
|
149
215
|
return result;
|
|
150
216
|
}
|
|
@@ -18,7 +18,10 @@ export declare class VercelProvider implements AIProvider {
|
|
|
18
18
|
getDefaultModel(): string;
|
|
19
19
|
isInitialized(): boolean;
|
|
20
20
|
private logDebugIfEnabled;
|
|
21
|
-
sendMessage(message: string, operation?: string
|
|
21
|
+
sendMessage(message: string, operation?: string, evaluationContext?: {
|
|
22
|
+
user_intent?: string;
|
|
23
|
+
interaction_id?: string;
|
|
24
|
+
}): Promise<AIResponse>;
|
|
22
25
|
/**
|
|
23
26
|
* Agentic tool loop using Vercel AI SDK
|
|
24
27
|
*
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"vercel-provider.d.ts","sourceRoot":"","sources":["../../../src/core/providers/vercel-provider.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAMH,OAAO,EACL,UAAU,EACV,UAAU,EACV,gBAAgB,EAChB,cAAc,EACd,aAAa,EACd,MAAM,0BAA0B,CAAC;
|
|
1
|
+
{"version":3,"file":"vercel-provider.d.ts","sourceRoot":"","sources":["../../../src/core/providers/vercel-provider.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAMH,OAAO,EACL,UAAU,EACV,UAAU,EACV,gBAAgB,EAChB,cAAc,EACd,aAAa,EACd,MAAM,0BAA0B,CAAC;AAMlC,qBAAa,cAAe,YAAW,UAAU;IAC/C,OAAO,CAAC,YAAY,CAAoB;IACxC,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,SAAS,CAAU;IAC3B,OAAO,CAAC,aAAa,CAAM;gBAEf,MAAM,EAAE,gBAAgB;IAUpC,OAAO,CAAC,qBAAqB;IAU7B,OAAO,CAAC,eAAe;IAiCvB,eAAe,IAAI,MAAM;IAIzB,eAAe,IAAI,MAAM;IAIzB,aAAa,IAAI,OAAO;IAIxB,OAAO,CAAC,iBAAiB;IAiBnB,WAAW,CACf,OAAO,EAAE,MAAM,EACf,SAAS,GAAE,MAAkB,EAC7B,iBAAiB,CAAC,EAAE;QAClB,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,cAAc,CAAC,EAAE,MAAM,CAAC;KACzB,GACA,OAAO,CAAC,UAAU,CAAC;IAwEtB;;;;;;;;;;;;OAYG;IACG,QAAQ,CAAC,MAAM,EAAE,cAAc,GAAG,OAAO,CAAC,aAAa,CAAC;CAwQ/D"}
|
|
@@ -5,39 +5,6 @@
|
|
|
5
5
|
* Implements AIProvider interface using Vercel AI SDK.
|
|
6
6
|
* Supports OpenAI and Google Gemini providers through unified interface.
|
|
7
7
|
*/
|
|
8
|
-
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
9
|
-
if (k2 === undefined) k2 = k;
|
|
10
|
-
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
11
|
-
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
12
|
-
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
13
|
-
}
|
|
14
|
-
Object.defineProperty(o, k2, desc);
|
|
15
|
-
}) : (function(o, m, k, k2) {
|
|
16
|
-
if (k2 === undefined) k2 = k;
|
|
17
|
-
o[k2] = m[k];
|
|
18
|
-
}));
|
|
19
|
-
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
20
|
-
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
21
|
-
}) : function(o, v) {
|
|
22
|
-
o["default"] = v;
|
|
23
|
-
});
|
|
24
|
-
var __importStar = (this && this.__importStar) || (function () {
|
|
25
|
-
var ownKeys = function(o) {
|
|
26
|
-
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
27
|
-
var ar = [];
|
|
28
|
-
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
29
|
-
return ar;
|
|
30
|
-
};
|
|
31
|
-
return ownKeys(o);
|
|
32
|
-
};
|
|
33
|
-
return function (mod) {
|
|
34
|
-
if (mod && mod.__esModule) return mod;
|
|
35
|
-
var result = {};
|
|
36
|
-
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
37
|
-
__setModuleDefault(result, mod);
|
|
38
|
-
return result;
|
|
39
|
-
};
|
|
40
|
-
})();
|
|
41
8
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
42
9
|
exports.VercelProvider = void 0;
|
|
43
10
|
const ai_1 = require("ai");
|
|
@@ -45,14 +12,7 @@ const openai_1 = require("@ai-sdk/openai");
|
|
|
45
12
|
const google_1 = require("@ai-sdk/google");
|
|
46
13
|
const anthropic_1 = require("@ai-sdk/anthropic");
|
|
47
14
|
const provider_debug_utils_1 = require("./provider-debug-utils");
|
|
48
|
-
|
|
49
|
-
* Provider-specific default models
|
|
50
|
-
*/
|
|
51
|
-
const PROVIDER_MODELS = {
|
|
52
|
-
openai: 'gpt-5',
|
|
53
|
-
google: 'gemini-2.5-pro',
|
|
54
|
-
anthropic: 'claude-sonnet-4-5-20250929'
|
|
55
|
-
};
|
|
15
|
+
const model_config_1 = require("../model-config");
|
|
56
16
|
class VercelProvider {
|
|
57
17
|
providerType;
|
|
58
18
|
model;
|
|
@@ -71,14 +31,15 @@ class VercelProvider {
|
|
|
71
31
|
if (!this.apiKey) {
|
|
72
32
|
throw new Error(`API key is required for ${this.providerType} provider`);
|
|
73
33
|
}
|
|
74
|
-
if (!['openai', 'google', 'anthropic'].includes(this.providerType)) {
|
|
75
|
-
throw new Error(`Unsupported provider: ${this.providerType}. Must be 'openai', 'google', or 'anthropic'`);
|
|
34
|
+
if (!['openai', 'openai_pro', 'google', 'anthropic'].includes(this.providerType)) {
|
|
35
|
+
throw new Error(`Unsupported provider: ${this.providerType}. Must be 'openai', 'openai_pro', 'google', or 'anthropic'`);
|
|
76
36
|
}
|
|
77
37
|
}
|
|
78
38
|
initializeModel() {
|
|
79
39
|
try {
|
|
80
40
|
switch (this.providerType) {
|
|
81
|
-
case 'openai':
|
|
41
|
+
case 'openai':
|
|
42
|
+
case 'openai_pro': {
|
|
82
43
|
const provider = (0, openai_1.createOpenAI)({
|
|
83
44
|
apiKey: this.apiKey
|
|
84
45
|
});
|
|
@@ -111,27 +72,23 @@ class VercelProvider {
|
|
|
111
72
|
return 'vercel';
|
|
112
73
|
}
|
|
113
74
|
getDefaultModel() {
|
|
114
|
-
return
|
|
75
|
+
return model_config_1.CURRENT_MODELS[this.providerType];
|
|
115
76
|
}
|
|
116
77
|
isInitialized() {
|
|
117
78
|
return this.modelInstance !== undefined;
|
|
118
79
|
}
|
|
119
|
-
logDebugIfEnabled(operation, prompt, response
|
|
80
|
+
logDebugIfEnabled(operation, prompt, response) {
|
|
120
81
|
if (!this.debugMode)
|
|
121
|
-
return;
|
|
82
|
+
return null;
|
|
122
83
|
const debugId = (0, provider_debug_utils_1.generateDebugId)(operation);
|
|
123
84
|
(0, provider_debug_utils_1.debugLogInteraction)(debugId, prompt, response, operation, this.getProviderType(), this.model, this.debugMode);
|
|
124
|
-
//
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
cacheCreation: response.usage.cache_creation_input_tokens,
|
|
130
|
-
cacheRead: response.usage.cache_read_input_tokens
|
|
131
|
-
}
|
|
132
|
-
}, durationMs, this.debugMode);
|
|
85
|
+
// Return the actual debug file names created
|
|
86
|
+
return {
|
|
87
|
+
promptFile: `${debugId}_prompt.md`,
|
|
88
|
+
responseFile: `${debugId}_response.md`
|
|
89
|
+
};
|
|
133
90
|
}
|
|
134
|
-
async sendMessage(message, operation = 'generic') {
|
|
91
|
+
async sendMessage(message, operation = 'generic', evaluationContext) {
|
|
135
92
|
if (!this.isInitialized()) {
|
|
136
93
|
throw new Error(`${this.providerType} provider not initialized`);
|
|
137
94
|
}
|
|
@@ -146,8 +103,8 @@ class VercelProvider {
|
|
|
146
103
|
const response = {
|
|
147
104
|
content: result.text,
|
|
148
105
|
usage: {
|
|
149
|
-
input_tokens: result.usage.inputTokens || 0,
|
|
150
|
-
output_tokens: result.usage.outputTokens || 0
|
|
106
|
+
input_tokens: (result.totalUsage || result.usage).inputTokens || 0,
|
|
107
|
+
output_tokens: (result.totalUsage || result.usage).outputTokens || 0
|
|
151
108
|
}
|
|
152
109
|
};
|
|
153
110
|
const durationMs = Date.now() - startTime;
|
|
@@ -155,14 +112,34 @@ class VercelProvider {
|
|
|
155
112
|
if (this.debugMode) {
|
|
156
113
|
const debugId = (0, provider_debug_utils_1.generateDebugId)(operation);
|
|
157
114
|
(0, provider_debug_utils_1.debugLogInteraction)(debugId, message, response, operation, this.getProviderType(), this.model, this.debugMode);
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
115
|
+
// PRD #154: Always use new evaluation dataset system
|
|
116
|
+
const evaluationMetrics = {
|
|
117
|
+
// Core execution data
|
|
118
|
+
operation,
|
|
119
|
+
sdk: this.getProviderType(),
|
|
120
|
+
inputTokens: response.usage.input_tokens,
|
|
121
|
+
outputTokens: response.usage.output_tokens,
|
|
122
|
+
durationMs,
|
|
123
|
+
// Required fields
|
|
124
|
+
iterationCount: 1,
|
|
125
|
+
toolCallCount: 0,
|
|
126
|
+
status: 'completed',
|
|
127
|
+
completionReason: 'stop',
|
|
128
|
+
modelVersion: this.model,
|
|
129
|
+
// Required evaluation context - NO DEFAULTS, must be provided
|
|
130
|
+
test_scenario: operation,
|
|
131
|
+
ai_response_summary: response.content,
|
|
132
|
+
user_intent: evaluationContext?.user_intent || '',
|
|
133
|
+
interaction_id: evaluationContext?.interaction_id || '',
|
|
134
|
+
// Optional performance data
|
|
135
|
+
...(response.usage.cache_creation_input_tokens && { cacheCreationTokens: response.usage.cache_creation_input_tokens }),
|
|
136
|
+
...(response.usage.cache_read_input_tokens && { cacheReadTokens: response.usage.cache_read_input_tokens })
|
|
137
|
+
};
|
|
138
|
+
// Calculate cache hit rate if applicable
|
|
139
|
+
if (response.usage.cache_read_input_tokens && response.usage.input_tokens > 0) {
|
|
140
|
+
evaluationMetrics.cacheHitRate = Math.round((response.usage.cache_read_input_tokens / response.usage.input_tokens) * 100);
|
|
141
|
+
}
|
|
142
|
+
(0, provider_debug_utils_1.logEvaluationDataset)(evaluationMetrics, this.debugMode);
|
|
166
143
|
}
|
|
167
144
|
return response;
|
|
168
145
|
}
|
|
@@ -272,48 +249,6 @@ class VercelProvider {
|
|
|
272
249
|
generateConfig.system = systemParam;
|
|
273
250
|
}
|
|
274
251
|
const result = await (0, ai_1.generateText)(generateConfig);
|
|
275
|
-
// Debug: Log the full cumulative context that was actually sent to the AI
|
|
276
|
-
if (this.debugMode && result.response?.messages) {
|
|
277
|
-
const path = await Promise.resolve().then(() => __importStar(require('path')));
|
|
278
|
-
const debugId = (0, provider_debug_utils_1.generateDebugId)(`${operation}-final-context`);
|
|
279
|
-
const debugDir = path.join(process.cwd(), 'tmp', 'debug-ai');
|
|
280
|
-
const contextFile = path.join(debugDir, `${debugId}_full-context.md`);
|
|
281
|
-
// Build full conversation history representation
|
|
282
|
-
const messages = result.response.messages;
|
|
283
|
-
const contextParts = [`# Full Conversation Context - ${operation}\n`];
|
|
284
|
-
contextParts.push(`\nTimestamp: ${new Date().toISOString()}`);
|
|
285
|
-
contextParts.push(`Provider: ${this.getProviderType()}`);
|
|
286
|
-
contextParts.push(`Model: ${this.model}`);
|
|
287
|
-
contextParts.push(`Total Messages: ${messages.length}`);
|
|
288
|
-
contextParts.push(`Total Steps: ${result.steps?.length || 0}`);
|
|
289
|
-
contextParts.push('\n---\n');
|
|
290
|
-
for (let i = 0; i < messages.length; i++) {
|
|
291
|
-
const msg = messages[i];
|
|
292
|
-
contextParts.push(`\n## Message ${i + 1} - Role: ${msg.role}\n`);
|
|
293
|
-
if (typeof msg.content === 'string') {
|
|
294
|
-
contextParts.push(msg.content);
|
|
295
|
-
}
|
|
296
|
-
else if (Array.isArray(msg.content)) {
|
|
297
|
-
for (const part of msg.content) {
|
|
298
|
-
if (part.type === 'text') {
|
|
299
|
-
contextParts.push(part.text || '');
|
|
300
|
-
}
|
|
301
|
-
else if (part.type === 'tool-call') {
|
|
302
|
-
contextParts.push(`\n[TOOL CALL: ${part.toolName}]`);
|
|
303
|
-
contextParts.push(JSON.stringify(part.args, null, 2));
|
|
304
|
-
}
|
|
305
|
-
else if (part.type === 'tool-result') {
|
|
306
|
-
contextParts.push(`\n[TOOL RESULT: ${part.toolName}]`);
|
|
307
|
-
const resultData = part.output || part.result || part.content || part;
|
|
308
|
-
contextParts.push(JSON.stringify(resultData, null, 2));
|
|
309
|
-
}
|
|
310
|
-
}
|
|
311
|
-
}
|
|
312
|
-
}
|
|
313
|
-
const fs = await Promise.resolve().then(() => __importStar(require('fs')));
|
|
314
|
-
fs.writeFileSync(contextFile, contextParts.join('\n'));
|
|
315
|
-
console.log(`🐛 DEBUG: Full conversation context logged to ${contextFile}`);
|
|
316
|
-
}
|
|
317
252
|
// Extract tool call history from steps
|
|
318
253
|
const toolCallsExecuted = [];
|
|
319
254
|
for (const step of result.steps || []) {
|
|
@@ -332,11 +267,11 @@ class VercelProvider {
|
|
|
332
267
|
// - GitHub Issue #8795: Token reporting issues with Anthropic provider (streaming)
|
|
333
268
|
// Our version (5.0.60, released Oct 2, 2025) includes these fixes.
|
|
334
269
|
// However, testing still shows ~70% fewer tokens reported vs Anthropic native SDK.
|
|
335
|
-
// Root cause
|
|
336
|
-
const usage = result.usage;
|
|
270
|
+
// Root cause: We were using result.usage (final step only) instead of result.totalUsage (sum of all steps)!
|
|
271
|
+
const usage = result.totalUsage || result.usage;
|
|
337
272
|
let cacheReadTokens = 0;
|
|
338
273
|
let cacheCreationTokens = 0;
|
|
339
|
-
// Anthropic via Vercel uses cachedInputTokens
|
|
274
|
+
// Anthropic via Vercel uses cachedInputTokens (confirmed in AI SDK 5+)
|
|
340
275
|
if (usage.cachedInputTokens) {
|
|
341
276
|
cacheReadTokens = usage.cachedInputTokens;
|
|
342
277
|
}
|
|
@@ -367,6 +302,57 @@ class VercelProvider {
|
|
|
367
302
|
}
|
|
368
303
|
}
|
|
369
304
|
}
|
|
305
|
+
// Log debug for summary operations to capture complete prompts/responses for evaluation
|
|
306
|
+
let debugFiles = null;
|
|
307
|
+
if (this.debugMode) {
|
|
308
|
+
// Build the full conversation context like Anthropic provider does
|
|
309
|
+
let finalPrompt = `System: ${config.systemPrompt}\n\n`;
|
|
310
|
+
// Always include the original user intent first
|
|
311
|
+
finalPrompt += `user: ${config.userMessage}\n\n`;
|
|
312
|
+
// Then add the conversation history if available
|
|
313
|
+
if (result.response?.messages) {
|
|
314
|
+
finalPrompt += result.response.messages
|
|
315
|
+
.map(msg => {
|
|
316
|
+
if (typeof msg.content === 'string') {
|
|
317
|
+
return `${msg.role}: ${msg.content}`;
|
|
318
|
+
}
|
|
319
|
+
else if (Array.isArray(msg.content)) {
|
|
320
|
+
const contentParts = msg.content.map(part => {
|
|
321
|
+
if (part.type === 'text') {
|
|
322
|
+
return part.text;
|
|
323
|
+
}
|
|
324
|
+
else if (part.type === 'tool-call') {
|
|
325
|
+
return `[TOOL_USE: ${part.toolName}]`;
|
|
326
|
+
}
|
|
327
|
+
else if (part.type === 'tool-result') {
|
|
328
|
+
const resultData = part.output || part.result || part.content;
|
|
329
|
+
if (typeof resultData === 'string') {
|
|
330
|
+
return `[TOOL_RESULT: ${part.toolName}]\n${resultData}`;
|
|
331
|
+
}
|
|
332
|
+
else if (resultData) {
|
|
333
|
+
return `[TOOL_RESULT: ${part.toolName}]\n${JSON.stringify(resultData, null, 2)}`;
|
|
334
|
+
}
|
|
335
|
+
return `[TOOL_RESULT: ${part.toolName}]`;
|
|
336
|
+
}
|
|
337
|
+
return `[${part.type}]`;
|
|
338
|
+
}).join(' ');
|
|
339
|
+
return `${msg.role}: ${contentParts}`;
|
|
340
|
+
}
|
|
341
|
+
return `${msg.role}: [complex_content]`;
|
|
342
|
+
})
|
|
343
|
+
.join('\n\n');
|
|
344
|
+
}
|
|
345
|
+
const aiResponse = {
|
|
346
|
+
content: finalText || '',
|
|
347
|
+
usage: {
|
|
348
|
+
input_tokens: usage.inputTokens || 0,
|
|
349
|
+
output_tokens: usage.outputTokens || 0,
|
|
350
|
+
cache_creation_input_tokens: cacheCreationTokens,
|
|
351
|
+
cache_read_input_tokens: cacheReadTokens
|
|
352
|
+
}
|
|
353
|
+
};
|
|
354
|
+
debugFiles = this.logDebugIfEnabled(`${operation}-summary`, finalPrompt, aiResponse);
|
|
355
|
+
}
|
|
370
356
|
return (0, provider_debug_utils_1.createAndLogAgenticResult)({
|
|
371
357
|
finalMessage: finalText || '',
|
|
372
358
|
iterations: result.steps?.length || 1,
|
|
@@ -383,7 +369,10 @@ class VercelProvider {
|
|
|
383
369
|
operation: `${operation}-summary`,
|
|
384
370
|
sdk: this.getProviderType(),
|
|
385
371
|
startTime,
|
|
386
|
-
debugMode: this.debugMode
|
|
372
|
+
debugMode: this.debugMode,
|
|
373
|
+
debugFiles,
|
|
374
|
+
evaluationContext: config.evaluationContext,
|
|
375
|
+
interaction_id: config.interaction_id
|
|
387
376
|
});
|
|
388
377
|
}
|
|
389
378
|
catch (error) {
|
|
@@ -404,7 +393,9 @@ class VercelProvider {
|
|
|
404
393
|
operation: `${operation}-error`,
|
|
405
394
|
sdk: this.getProviderType(),
|
|
406
395
|
startTime,
|
|
407
|
-
debugMode: this.debugMode
|
|
396
|
+
debugMode: this.debugMode,
|
|
397
|
+
evaluationContext: config.evaluationContext,
|
|
398
|
+
interaction_id: config.interaction_id
|
|
408
399
|
});
|
|
409
400
|
}
|
|
410
401
|
}
|