@vfarcic/dot-ai 0.111.0 → 0.112.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/dist/core/ai-provider-factory.d.ts +5 -0
  2. package/dist/core/ai-provider-factory.d.ts.map +1 -1
  3. package/dist/core/ai-provider-factory.js +13 -2
  4. package/dist/core/ai-provider.interface.d.ts +16 -1
  5. package/dist/core/ai-provider.interface.d.ts.map +1 -1
  6. package/dist/core/capabilities.d.ts +1 -1
  7. package/dist/core/capabilities.d.ts.map +1 -1
  8. package/dist/core/capabilities.js +7 -4
  9. package/dist/core/capability-scan-workflow.js +2 -2
  10. package/dist/core/model-config.d.ts +17 -0
  11. package/dist/core/model-config.d.ts.map +1 -0
  12. package/dist/core/model-config.js +22 -0
  13. package/dist/core/platform-operations.d.ts.map +1 -1
  14. package/dist/core/platform-operations.js +3 -5
  15. package/dist/core/platform-utils.d.ts +3 -2
  16. package/dist/core/platform-utils.d.ts.map +1 -1
  17. package/dist/core/platform-utils.js +35 -9
  18. package/dist/core/providers/anthropic-provider.d.ts +4 -1
  19. package/dist/core/providers/anthropic-provider.d.ts.map +1 -1
  20. package/dist/core/providers/anthropic-provider.js +89 -27
  21. package/dist/core/providers/provider-debug-utils.d.ts +49 -20
  22. package/dist/core/providers/provider-debug-utils.d.ts.map +1 -1
  23. package/dist/core/providers/provider-debug-utils.js +117 -51
  24. package/dist/core/providers/vercel-provider.d.ts +4 -1
  25. package/dist/core/providers/vercel-provider.d.ts.map +1 -1
  26. package/dist/core/providers/vercel-provider.js +105 -114
  27. package/dist/core/schema.d.ts +1 -5
  28. package/dist/core/schema.d.ts.map +1 -1
  29. package/dist/core/schema.js +16 -42
  30. package/dist/core/unified-creation-session.d.ts.map +1 -1
  31. package/dist/core/unified-creation-session.js +12 -6
  32. package/dist/evaluation/dataset-analyzer.d.ts +118 -0
  33. package/dist/evaluation/dataset-analyzer.d.ts.map +1 -0
  34. package/dist/evaluation/dataset-analyzer.js +234 -0
  35. package/dist/evaluation/datasets/loader.d.ts +42 -0
  36. package/dist/evaluation/datasets/loader.d.ts.map +1 -0
  37. package/dist/evaluation/datasets/loader.js +104 -0
  38. package/dist/evaluation/eval-runner.d.ts +9 -0
  39. package/dist/evaluation/eval-runner.d.ts.map +1 -0
  40. package/dist/evaluation/eval-runner.js +255 -0
  41. package/dist/evaluation/evaluators/base-comparative.d.ts +91 -0
  42. package/dist/evaluation/evaluators/base-comparative.d.ts.map +1 -0
  43. package/dist/evaluation/evaluators/base-comparative.js +152 -0
  44. package/dist/evaluation/evaluators/base.d.ts +47 -0
  45. package/dist/evaluation/evaluators/base.d.ts.map +1 -0
  46. package/dist/evaluation/evaluators/base.js +10 -0
  47. package/dist/evaluation/evaluators/capability-comparative.d.ts +32 -0
  48. package/dist/evaluation/evaluators/capability-comparative.d.ts.map +1 -0
  49. package/dist/evaluation/evaluators/capability-comparative.js +104 -0
  50. package/dist/evaluation/evaluators/pattern-comparative.d.ts +31 -0
  51. package/dist/evaluation/evaluators/pattern-comparative.d.ts.map +1 -0
  52. package/dist/evaluation/evaluators/pattern-comparative.js +97 -0
  53. package/dist/evaluation/evaluators/policy-comparative.d.ts +31 -0
  54. package/dist/evaluation/evaluators/policy-comparative.d.ts.map +1 -0
  55. package/dist/evaluation/evaluators/policy-comparative.js +97 -0
  56. package/dist/evaluation/evaluators/recommendation-comparative.d.ts +25 -0
  57. package/dist/evaluation/evaluators/recommendation-comparative.d.ts.map +1 -0
  58. package/dist/evaluation/evaluators/recommendation-comparative.js +55 -0
  59. package/dist/evaluation/evaluators/remediation-comparative.d.ts +25 -0
  60. package/dist/evaluation/evaluators/remediation-comparative.d.ts.map +1 -0
  61. package/dist/evaluation/evaluators/remediation-comparative.js +54 -0
  62. package/dist/interfaces/rest-api.d.ts.map +1 -1
  63. package/dist/tools/answer-question.d.ts +2 -0
  64. package/dist/tools/answer-question.d.ts.map +1 -1
  65. package/dist/tools/answer-question.js +18 -11
  66. package/dist/tools/generate-manifests.d.ts +2 -0
  67. package/dist/tools/generate-manifests.d.ts.map +1 -1
  68. package/dist/tools/generate-manifests.js +8 -4
  69. package/dist/tools/organizational-data.d.ts +1 -0
  70. package/dist/tools/organizational-data.d.ts.map +1 -1
  71. package/dist/tools/organizational-data.js +2 -1
  72. package/dist/tools/recommend.d.ts +1 -0
  73. package/dist/tools/recommend.d.ts.map +1 -1
  74. package/dist/tools/recommend.js +10 -5
  75. package/dist/tools/remediate.d.ts +3 -0
  76. package/dist/tools/remediate.d.ts.map +1 -1
  77. package/dist/tools/remediate.js +25 -12
  78. package/dist/tools/test-docs.d.ts +1 -0
  79. package/dist/tools/test-docs.d.ts.map +1 -1
  80. package/dist/tools/test-docs.js +4 -2
  81. package/dist/tools/version.d.ts +4 -1
  82. package/dist/tools/version.d.ts.map +1 -1
  83. package/dist/tools/version.js +12 -4
  84. package/package.json +5 -1
@@ -14,32 +14,52 @@ export declare function ensureDebugDirectory(): string;
14
14
  */
15
15
  export declare function generateDebugId(operation: string): string;
16
16
  /**
17
- * Log metrics for token usage and execution time when DEBUG_DOT_AI=true
18
- *
19
- * PRD #143 Decision 5: Extended metrics for model comparison analysis
17
+ * Unified evaluation metrics entry for AI quality assessment and performance tracking
18
+ * PRD #154: Single interface for all metrics and evaluation data
20
19
  */
21
- export declare function logMetrics(operation: string, sdk: string, result: {
22
- totalTokens: {
23
- input: number;
24
- output: number;
25
- cacheCreation?: number;
26
- cacheRead?: number;
20
+ export interface EvaluationMetrics {
21
+ operation: string;
22
+ sdk: string;
23
+ inputTokens: number;
24
+ outputTokens: number;
25
+ durationMs: number;
26
+ iterationCount: number;
27
+ toolCallCount: number;
28
+ status: string;
29
+ completionReason: string;
30
+ modelVersion: string;
31
+ cacheCreationTokens?: number;
32
+ cacheReadTokens?: number;
33
+ cacheHitRate?: number;
34
+ uniqueToolsUsed?: string[];
35
+ test_scenario: string;
36
+ ai_response_summary: string;
37
+ debug_files?: {
38
+ full_prompt: string;
39
+ full_response: string;
27
40
  };
28
- iterations?: number;
29
- toolCallsExecuted?: Array<{
30
- tool: string;
31
- input: any;
32
- output: any;
33
- }>;
34
- status?: string;
35
- completionReason?: string;
36
- modelVersion?: string;
37
- }, durationMs: number, debugMode: boolean): void;
41
+ user_intent: string;
42
+ interaction_id: string;
43
+ failure_analysis?: string;
44
+ }
45
+ /**
46
+ * Determine if dataset generation should be skipped for specific operations
47
+ */
48
+ export declare function shouldSkipDatasetGeneration(operation: string): boolean;
49
+ /**
50
+ * Log unified evaluation metrics when DEBUG_DOT_AI=true
51
+ * Single function for all metrics and evaluation data capture
52
+ */
53
+ /**
54
+ * Generate eval dataset entry in standard OpenAI Evals format
55
+ * Logs evaluation metrics to JSONL dataset files for AI quality assessment
56
+ */
57
+ export declare function logEvaluationDataset(metrics: EvaluationMetrics, debugMode?: boolean): void;
38
58
  /**
39
59
  * Create AgenticResult and log metrics in one step
40
60
  * Reduces code duplication across providers
41
61
  *
42
- * PRD #143 Decision 5: Standardized metrics logging
62
+ * PRD #154: Updated to use unified evaluation metrics
43
63
  */
44
64
  export declare function createAndLogAgenticResult(config: {
45
65
  finalMessage: string;
@@ -62,6 +82,15 @@ export declare function createAndLogAgenticResult(config: {
62
82
  sdk: string;
63
83
  startTime: number;
64
84
  debugMode: boolean;
85
+ debugFiles?: {
86
+ promptFile: string;
87
+ responseFile: string;
88
+ } | null;
89
+ evaluationContext?: {
90
+ user_intent?: string;
91
+ failure_analysis?: string;
92
+ };
93
+ interaction_id?: string;
65
94
  }): AgenticResult;
66
95
  /**
67
96
  * Save AI interaction for debugging when DEBUG_DOT_AI=true
@@ -1 +1 @@
1
- {"version":3,"file":"provider-debug-utils.d.ts","sourceRoot":"","sources":["../../../src/core/providers/provider-debug-utils.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAKH,OAAO,EAAE,UAAU,EAAE,aAAa,EAAE,MAAM,0BAA0B,CAAC;AAErE;;GAEG;AACH,wBAAgB,oBAAoB,IAAI,MAAM,CAM7C;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAKzD;AAED;;;;GAIG;AACH,wBAAgB,UAAU,CACxB,SAAS,EAAE,MAAM,EACjB,GAAG,EAAE,MAAM,EACX,MAAM,EAAE;IACN,WAAW,EAAE;QACX,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,EAAE,MAAM,CAAC;QACf,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC;IACF,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,iBAAiB,CAAC,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,GAAG,CAAC;QAAC,MAAM,EAAE,GAAG,CAAA;KAAE,CAAC,CAAC;IACrE,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB,EACD,UAAU,EAAE,MAAM,EAClB,SAAS,EAAE,OAAO,GACjB,IAAI,CA2DN;AAED;;;;;GAKG;AACH,wBAAgB,yBAAyB,CAAC,MAAM,EAAE;IAChD,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,iBAAiB,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,GAAG,CAAC;QAAC,MAAM,EAAE,GAAG,CAAA;KAAE,CAAC,CAAC;IACpE,WAAW,EAAE;QACX,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,EAAE,MAAM,CAAC;QACf,aAAa,EAAE,MAAM,CAAC;QACtB,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,MAAM,EAAE,SAAS,GAAG,QAAQ,GAAG,SAAS,GAAG,aAAa,CAAC;IACzD,gBAAgB,EAAE,wBAAwB,GAAG,gBAAgB,GAAG,eAAe,GAAG,eAAe,GAAG,OAAO,CAAC;IAC5G,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,GAAG,EAAE,MAAM,CAAC;IACZ,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,OAAO,CAAC;CACpB,GAAG,aAAa,CAiBhB;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CACjC,OAAO,EAAE,MAAM,EACf,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,UAAU,EACpB,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,EAChB,KAAK,EAAE,MAAM,EACb,SAAS,EAAE,OAAO,GACjB,IAAI,CAkCN"}
1
+ {"version":3,"file":"provider-debug-utils.d.ts","sourceRoot":"","sources":["../../../src/core/providers/provider-debug-utils.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAKH,OAAO,EAAE,UAAU,EAAE,aAAa,EAAE,MAAM,0BAA0B,CAAC;AAErE;;GAEG;AACH,wBAAgB,oBAAoB,IAAI,MAAM,CAM7C;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAKzD;AAED;;;GAGG;AACH,MAAM,WAAW,iBAAiB;IAEhC,SAAS,EAAE,MAAM,CAAC;IAClB,GAAG,EAAE,MAAM,CAAC;IACZ,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IAGnB,cAAc,EAAE,MAAM,CAAC;IACvB,aAAa,EAAE,MAAM,CAAC;IACtB,MAAM,EAAE,MAAM,CAAC;IACf,gBAAgB,EAAE,MAAM,CAAC;IACzB,YAAY,EAAE,MAAM,CAAC;IAGrB,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAG3B,aAAa,EAAE,MAAM,CAAC;IACtB,mBAAmB,EAAE,MAAM,CAAC;IAC5B,WAAW,CAAC,EAAE;QACZ,WAAW,EAAE,MAAM,CAAC;QACpB,aAAa,EAAE,MAAM,CAAC;KACvB,CAAC;IAGF,WAAW,EAAE,MAAM,CAAC;IACpB,cAAc,EAAE,MAAM,CAAC;IAGvB,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED;;GAEG;AACH,wBAAgB,2BAA2B,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAGtE;AAED;;;GAGG;AACH;;;GAGG;AACH,wBAAgB,oBAAoB,CAClC,OAAO,EAAE,iBAAiB,EAC1B,SAAS,GAAE,OAAe,GACzB,IAAI,CAiFN;AAGD;;;;;GAKG;AACH,wBAAgB,yBAAyB,CAAC,MAAM,EAAE;IAChD,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,iBAAiB,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,GAAG,CAAC;QAAC,MAAM,EAAE,GAAG,CAAA;KAAE,CAAC,CAAC;IACpE,WAAW,EAAE;QACX,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,EAAE,MAAM,CAAC;QACf,aAAa,EAAE,MAAM,CAAC;QACtB,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,MAAM,EAAE,SAAS,GAAG,QAAQ,GAAG,SAAS,GAAG,aAAa,CAAC;IACzD,gBAAgB,EAAE,wBAAwB,GAAG,gBAAgB,GAAG,eAAe,GAAG,eAAe,GAAG,OAAO,CAAC;IAC5G,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,GAAG,EAAE,MAAM,CAAC;IACZ,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,OAAO,CAAC;IACnB,UAAU,CAAC,EAAE;QAAE,UAAU,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,MAAM,CAAA;KAAE,GAAG,IAAI,CAAC;IAGjE,iBAAiB,CAAC,EAAE;QAClB,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,gBAAgB,CAAC,EAAE,MAAM,CAAC;KAC3B,CAAC;IAGF,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB,GAAG,aAAa,CAsDhB;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CACjC,OAAO,EAAE,MAAM,EACf,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,UAAU,EACpB,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,EAChB,KAAK,EAAE,MAAM,EACb,SAAS,EAAE,OAAO,GACjB,IAAI,CAkCN"}
@@ -41,7 +41,8 @@ var __importStar = (this && this.__importStar) || (function () {
41
41
  Object.defineProperty(exports, "__esModule", { value: true });
42
42
  exports.ensureDebugDirectory = ensureDebugDirectory;
43
43
  exports.generateDebugId = generateDebugId;
44
- exports.logMetrics = logMetrics;
44
+ exports.shouldSkipDatasetGeneration = shouldSkipDatasetGeneration;
45
+ exports.logEvaluationDataset = logEvaluationDataset;
45
46
  exports.createAndLogAgenticResult = createAndLogAgenticResult;
46
47
  exports.debugLogInteraction = debugLogInteraction;
47
48
  const fs = __importStar(require("fs"));
@@ -67,70 +68,103 @@ function generateDebugId(operation) {
67
68
  return `${dateTime}_${randomHex}_${operation}`;
68
69
  }
69
70
  /**
70
- * Log metrics for token usage and execution time when DEBUG_DOT_AI=true
71
- *
72
- * PRD #143 Decision 5: Extended metrics for model comparison analysis
71
+ * Determine if dataset generation should be skipped for specific operations
73
72
  */
74
- function logMetrics(operation, sdk, result, durationMs, debugMode) {
73
+ function shouldSkipDatasetGeneration(operation) {
74
+ const skipDatasetOperations = ['version-connectivity-check', 'generic'];
75
+ return skipDatasetOperations.includes(operation);
76
+ }
77
+ /**
78
+ * Log unified evaluation metrics when DEBUG_DOT_AI=true
79
+ * Single function for all metrics and evaluation data capture
80
+ */
81
+ /**
82
+ * Generate eval dataset entry in standard OpenAI Evals format
83
+ * Logs evaluation metrics to JSONL dataset files for AI quality assessment
84
+ */
85
+ function logEvaluationDataset(metrics, debugMode = false) {
75
86
  if (!debugMode)
76
87
  return;
88
+ // Skip dataset generation for non-evaluable operations
89
+ if (shouldSkipDatasetGeneration(metrics.test_scenario))
90
+ return;
77
91
  try {
78
- const debugDir = ensureDebugDirectory();
79
- const metricsFile = path.join(debugDir, 'metrics.jsonl');
80
- const entry = {
81
- timestamp: new Date().toISOString(),
82
- sdk,
83
- operation,
84
- inputTokens: result.totalTokens.input,
85
- outputTokens: result.totalTokens.output,
86
- durationMs
87
- };
88
- // Add cache metrics if present
89
- if (result.totalTokens.cacheCreation !== undefined) {
90
- entry.cacheCreationTokens = result.totalTokens.cacheCreation;
92
+ const evalDir = path.join(process.cwd(), 'eval', 'datasets');
93
+ // Ensure eval datasets directory exists
94
+ if (!fs.existsSync(evalDir)) {
95
+ fs.mkdirSync(evalDir, { recursive: true });
91
96
  }
92
- if (result.totalTokens.cacheRead !== undefined) {
93
- entry.cacheReadTokens = result.totalTokens.cacheRead;
97
+ // Parse operation for tool name
98
+ const operationParts = metrics.operation.split('-');
99
+ const toolName = operationParts[0]; // e.g., "remediate"
100
+ // Check if this is a comparative evaluation
101
+ const isComparativeEvaluation = metrics.operation.includes('-comparative-');
102
+ let datasetFile;
103
+ const timestamp = new Date().toISOString().replace(/[:.]/g, '').split('T').join('_');
104
+ if (isComparativeEvaluation) {
105
+ // For comparative evaluations, don't include single model name since it compares multiple models
106
+ datasetFile = path.join(evalDir, `${toolName}_comparative_evaluation_${timestamp}.jsonl`);
94
107
  }
95
- // Calculate cache hit rate (percentage)
96
- if (result.totalTokens.cacheRead !== undefined && result.totalTokens.input > 0) {
97
- entry.cacheHitRate = Math.round((result.totalTokens.cacheRead / result.totalTokens.input) * 100);
108
+ else {
109
+ // Extract model name from modelVersion or sdk for single-model datasets
110
+ let modelName = 'unknown';
111
+ if (metrics.modelVersion) {
112
+ if (metrics.modelVersion.includes('sonnet')) {
113
+ modelName = 'sonnet';
114
+ }
115
+ else if (metrics.modelVersion.includes('gpt-5-pro')) {
116
+ modelName = 'gpt-pro';
117
+ }
118
+ else if (metrics.modelVersion.includes('gpt')) {
119
+ modelName = 'gpt';
120
+ }
121
+ else if (metrics.modelVersion.includes('gemini')) {
122
+ modelName = 'gemini';
123
+ }
124
+ }
125
+ // Create filename with interaction ID, SDK, model, and timestamp for single-model datasets
126
+ datasetFile = path.join(evalDir, `${toolName}_${metrics.interaction_id}_${metrics.sdk}_${modelName}_${timestamp}.jsonl`);
98
127
  }
99
- // Add extended metrics (PRD #143 Decision 5)
100
- if (result.iterations !== undefined) {
101
- entry.iterationCount = result.iterations;
102
- }
103
- if (result.toolCallsExecuted) {
104
- entry.toolCallCount = result.toolCallsExecuted.length;
105
- // Extract unique tool names
106
- const uniqueTools = [...new Set(result.toolCallsExecuted.map(tc => tc.tool))];
107
- entry.uniqueToolsUsed = uniqueTools;
108
- }
109
- if (result.status) {
110
- entry.status = result.status;
111
- }
112
- if (result.completionReason) {
113
- entry.completionReason = result.completionReason;
114
- }
115
- if (result.modelVersion) {
116
- entry.modelVersion = result.modelVersion;
117
- }
118
- // Manual annotation placeholders (populate after test analysis)
119
- entry.manualNotes = '';
120
- entry.failureReason = '';
121
- entry.qualityIssues = [];
122
- entry.comparisonNotes = '';
123
- fs.appendFileSync(metricsFile, JSON.stringify(entry) + '\n');
128
+ // Transform metrics into OpenAI Evals format (no ideal field - using model-graded evaluation)
129
+ const evalEntry = {
130
+ input: {
131
+ issue: metrics.user_intent || "Tool execution scenario"
132
+ },
133
+ output: metrics.ai_response_summary || "",
134
+ performance: {
135
+ duration_ms: metrics.durationMs,
136
+ input_tokens: metrics.inputTokens,
137
+ output_tokens: metrics.outputTokens,
138
+ total_tokens: metrics.inputTokens + metrics.outputTokens,
139
+ sdk: metrics.sdk,
140
+ model_version: metrics.modelVersion,
141
+ iterations: metrics.iterationCount,
142
+ tool_calls_executed: metrics.toolCallCount,
143
+ cache_read_tokens: metrics.cacheReadTokens || 0,
144
+ cache_creation_tokens: metrics.cacheCreationTokens || 0
145
+ },
146
+ metadata: {
147
+ timestamp: new Date().toISOString(),
148
+ complexity: "medium",
149
+ tags: ["troubleshooting"],
150
+ source: "integration_test",
151
+ tool: toolName,
152
+ test_scenario: metrics.test_scenario || `${toolName}_test`,
153
+ failure_analysis: metrics.failure_analysis || ""
154
+ }
155
+ };
156
+ fs.writeFileSync(datasetFile, JSON.stringify(evalEntry) + '\n');
157
+ console.log(`📊 Generated eval dataset: ${path.basename(datasetFile)} (${metrics.interaction_id}, ${metrics.durationMs}ms, ${metrics.inputTokens}+${metrics.outputTokens} tokens)`);
124
158
  }
125
159
  catch (error) {
126
- console.warn('Failed to log metrics:', error);
160
+ console.error(`❌ Failed to generate eval dataset for ${metrics.interaction_id} (${metrics.test_scenario}):`, error);
127
161
  }
128
162
  }
129
163
  /**
130
164
  * Create AgenticResult and log metrics in one step
131
165
  * Reduces code duplication across providers
132
166
  *
133
- * PRD #143 Decision 5: Standardized metrics logging
167
+ * PRD #154: Updated to use unified evaluation metrics
134
168
  */
135
169
  function createAndLogAgenticResult(config) {
136
170
  const result = {
@@ -144,7 +178,39 @@ function createAndLogAgenticResult(config) {
144
178
  };
145
179
  const durationMs = Date.now() - config.startTime;
146
180
  if (config.debugMode) {
147
- logMetrics(config.operation, config.sdk, result, durationMs, config.debugMode);
181
+ // PRD #154: Use unified evaluation metrics system
182
+ const evaluationMetrics = {
183
+ // Core execution data
184
+ operation: config.operation,
185
+ sdk: config.sdk,
186
+ inputTokens: config.totalTokens.input,
187
+ outputTokens: config.totalTokens.output,
188
+ durationMs,
189
+ // Required fields
190
+ iterationCount: config.iterations,
191
+ toolCallCount: config.toolCallsExecuted.length,
192
+ status: config.status,
193
+ completionReason: config.completionReason,
194
+ modelVersion: config.modelVersion,
195
+ // Required evaluation context - NO DEFAULTS, must be provided
196
+ test_scenario: config.operation,
197
+ ai_response_summary: config.finalMessage,
198
+ user_intent: config.evaluationContext?.user_intent || '', // Will be enhanced later by EvalDatasetEnhancer
199
+ interaction_id: config.interaction_id || '', // Will be enhanced later if missing
200
+ // Optional performance data
201
+ ...(config.totalTokens.cacheCreation !== undefined && { cacheCreationTokens: config.totalTokens.cacheCreation }),
202
+ ...(config.totalTokens.cacheRead !== undefined && { cacheReadTokens: config.totalTokens.cacheRead }),
203
+ ...(config.toolCallsExecuted.length > 0 && {
204
+ uniqueToolsUsed: [...new Set(config.toolCallsExecuted.map(tc => tc.tool))]
205
+ }),
206
+ ...(config.debugFiles && { debug_files: { full_prompt: config.debugFiles.promptFile, full_response: config.debugFiles.responseFile } }),
207
+ ...(config.evaluationContext?.failure_analysis && { failure_analysis: config.evaluationContext.failure_analysis })
208
+ };
209
+ // Calculate cache hit rate if applicable
210
+ if (config.totalTokens.cacheRead !== undefined && config.totalTokens.input > 0) {
211
+ evaluationMetrics.cacheHitRate = Math.round((config.totalTokens.cacheRead / config.totalTokens.input) * 100);
212
+ }
213
+ logEvaluationDataset(evaluationMetrics, config.debugMode);
148
214
  }
149
215
  return result;
150
216
  }
@@ -18,7 +18,10 @@ export declare class VercelProvider implements AIProvider {
18
18
  getDefaultModel(): string;
19
19
  isInitialized(): boolean;
20
20
  private logDebugIfEnabled;
21
- sendMessage(message: string, operation?: string): Promise<AIResponse>;
21
+ sendMessage(message: string, operation?: string, evaluationContext?: {
22
+ user_intent?: string;
23
+ interaction_id?: string;
24
+ }): Promise<AIResponse>;
22
25
  /**
23
26
  * Agentic tool loop using Vercel AI SDK
24
27
  *
@@ -1 +1 @@
1
- {"version":3,"file":"vercel-provider.d.ts","sourceRoot":"","sources":["../../../src/core/providers/vercel-provider.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAMH,OAAO,EACL,UAAU,EACV,UAAU,EACV,gBAAgB,EAChB,cAAc,EACd,aAAa,EACd,MAAM,0BAA0B,CAAC;AAclC,qBAAa,cAAe,YAAW,UAAU;IAC/C,OAAO,CAAC,YAAY,CAAoB;IACxC,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,SAAS,CAAU;IAC3B,OAAO,CAAC,aAAa,CAAM;gBAEf,MAAM,EAAE,gBAAgB;IAUpC,OAAO,CAAC,qBAAqB;IAU7B,OAAO,CAAC,eAAe;IAgCvB,eAAe,IAAI,MAAM;IAIzB,eAAe,IAAI,MAAM;IAIzB,aAAa,IAAI,OAAO;IAIxB,OAAO,CAAC,iBAAiB;IAqBnB,WAAW,CAAC,OAAO,EAAE,MAAM,EAAE,SAAS,GAAE,MAAkB,GAAG,OAAO,CAAC,UAAU,CAAC;IA8CtF;;;;;;;;;;;;OAYG;IACG,QAAQ,CAAC,MAAM,EAAE,cAAc,GAAG,OAAO,CAAC,aAAa,CAAC;CA0P/D"}
1
+ {"version":3,"file":"vercel-provider.d.ts","sourceRoot":"","sources":["../../../src/core/providers/vercel-provider.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAMH,OAAO,EACL,UAAU,EACV,UAAU,EACV,gBAAgB,EAChB,cAAc,EACd,aAAa,EACd,MAAM,0BAA0B,CAAC;AAMlC,qBAAa,cAAe,YAAW,UAAU;IAC/C,OAAO,CAAC,YAAY,CAAoB;IACxC,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,SAAS,CAAU;IAC3B,OAAO,CAAC,aAAa,CAAM;gBAEf,MAAM,EAAE,gBAAgB;IAUpC,OAAO,CAAC,qBAAqB;IAU7B,OAAO,CAAC,eAAe;IAiCvB,eAAe,IAAI,MAAM;IAIzB,eAAe,IAAI,MAAM;IAIzB,aAAa,IAAI,OAAO;IAIxB,OAAO,CAAC,iBAAiB;IAiBnB,WAAW,CACf,OAAO,EAAE,MAAM,EACf,SAAS,GAAE,MAAkB,EAC7B,iBAAiB,CAAC,EAAE;QAClB,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,cAAc,CAAC,EAAE,MAAM,CAAC;KACzB,GACA,OAAO,CAAC,UAAU,CAAC;IAwEtB;;;;;;;;;;;;OAYG;IACG,QAAQ,CAAC,MAAM,EAAE,cAAc,GAAG,OAAO,CAAC,aAAa,CAAC;CAwQ/D"}
@@ -5,39 +5,6 @@
5
5
  * Implements AIProvider interface using Vercel AI SDK.
6
6
  * Supports OpenAI and Google Gemini providers through unified interface.
7
7
  */
8
- var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
9
- if (k2 === undefined) k2 = k;
10
- var desc = Object.getOwnPropertyDescriptor(m, k);
11
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
12
- desc = { enumerable: true, get: function() { return m[k]; } };
13
- }
14
- Object.defineProperty(o, k2, desc);
15
- }) : (function(o, m, k, k2) {
16
- if (k2 === undefined) k2 = k;
17
- o[k2] = m[k];
18
- }));
19
- var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
20
- Object.defineProperty(o, "default", { enumerable: true, value: v });
21
- }) : function(o, v) {
22
- o["default"] = v;
23
- });
24
- var __importStar = (this && this.__importStar) || (function () {
25
- var ownKeys = function(o) {
26
- ownKeys = Object.getOwnPropertyNames || function (o) {
27
- var ar = [];
28
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
29
- return ar;
30
- };
31
- return ownKeys(o);
32
- };
33
- return function (mod) {
34
- if (mod && mod.__esModule) return mod;
35
- var result = {};
36
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
37
- __setModuleDefault(result, mod);
38
- return result;
39
- };
40
- })();
41
8
  Object.defineProperty(exports, "__esModule", { value: true });
42
9
  exports.VercelProvider = void 0;
43
10
  const ai_1 = require("ai");
@@ -45,14 +12,7 @@ const openai_1 = require("@ai-sdk/openai");
45
12
  const google_1 = require("@ai-sdk/google");
46
13
  const anthropic_1 = require("@ai-sdk/anthropic");
47
14
  const provider_debug_utils_1 = require("./provider-debug-utils");
48
- /**
49
- * Provider-specific default models
50
- */
51
- const PROVIDER_MODELS = {
52
- openai: 'gpt-5',
53
- google: 'gemini-2.5-pro',
54
- anthropic: 'claude-sonnet-4-5-20250929'
55
- };
15
+ const model_config_1 = require("../model-config");
56
16
  class VercelProvider {
57
17
  providerType;
58
18
  model;
@@ -71,14 +31,15 @@ class VercelProvider {
71
31
  if (!this.apiKey) {
72
32
  throw new Error(`API key is required for ${this.providerType} provider`);
73
33
  }
74
- if (!['openai', 'google', 'anthropic'].includes(this.providerType)) {
75
- throw new Error(`Unsupported provider: ${this.providerType}. Must be 'openai', 'google', or 'anthropic'`);
34
+ if (!['openai', 'openai_pro', 'google', 'anthropic'].includes(this.providerType)) {
35
+ throw new Error(`Unsupported provider: ${this.providerType}. Must be 'openai', 'openai_pro', 'google', or 'anthropic'`);
76
36
  }
77
37
  }
78
38
  initializeModel() {
79
39
  try {
80
40
  switch (this.providerType) {
81
- case 'openai': {
41
+ case 'openai':
42
+ case 'openai_pro': {
82
43
  const provider = (0, openai_1.createOpenAI)({
83
44
  apiKey: this.apiKey
84
45
  });
@@ -111,27 +72,23 @@ class VercelProvider {
111
72
  return 'vercel';
112
73
  }
113
74
  getDefaultModel() {
114
- return PROVIDER_MODELS[this.providerType];
75
+ return model_config_1.CURRENT_MODELS[this.providerType];
115
76
  }
116
77
  isInitialized() {
117
78
  return this.modelInstance !== undefined;
118
79
  }
119
- logDebugIfEnabled(operation, prompt, response, durationMs) {
80
+ logDebugIfEnabled(operation, prompt, response) {
120
81
  if (!this.debugMode)
121
- return;
82
+ return null;
122
83
  const debugId = (0, provider_debug_utils_1.generateDebugId)(operation);
123
84
  (0, provider_debug_utils_1.debugLogInteraction)(debugId, prompt, response, operation, this.getProviderType(), this.model, this.debugMode);
124
- // Use logMetrics for sendMessage calls (simple token structure, no extended metrics)
125
- (0, provider_debug_utils_1.logMetrics)(operation, this.getProviderType(), {
126
- totalTokens: {
127
- input: response.usage.input_tokens,
128
- output: response.usage.output_tokens,
129
- cacheCreation: response.usage.cache_creation_input_tokens,
130
- cacheRead: response.usage.cache_read_input_tokens
131
- }
132
- }, durationMs, this.debugMode);
85
+ // Return the actual debug file names created
86
+ return {
87
+ promptFile: `${debugId}_prompt.md`,
88
+ responseFile: `${debugId}_response.md`
89
+ };
133
90
  }
134
- async sendMessage(message, operation = 'generic') {
91
+ async sendMessage(message, operation = 'generic', evaluationContext) {
135
92
  if (!this.isInitialized()) {
136
93
  throw new Error(`${this.providerType} provider not initialized`);
137
94
  }
@@ -146,8 +103,8 @@ class VercelProvider {
146
103
  const response = {
147
104
  content: result.text,
148
105
  usage: {
149
- input_tokens: result.usage.inputTokens || 0,
150
- output_tokens: result.usage.outputTokens || 0
106
+ input_tokens: (result.totalUsage || result.usage).inputTokens || 0,
107
+ output_tokens: (result.totalUsage || result.usage).outputTokens || 0
151
108
  }
152
109
  };
153
110
  const durationMs = Date.now() - startTime;
@@ -155,14 +112,34 @@ class VercelProvider {
155
112
  if (this.debugMode) {
156
113
  const debugId = (0, provider_debug_utils_1.generateDebugId)(operation);
157
114
  (0, provider_debug_utils_1.debugLogInteraction)(debugId, message, response, operation, this.getProviderType(), this.model, this.debugMode);
158
- (0, provider_debug_utils_1.logMetrics)(operation, this.getProviderType(), {
159
- totalTokens: {
160
- input: response.usage.input_tokens,
161
- output: response.usage.output_tokens,
162
- cacheCreation: response.usage.cache_creation_input_tokens,
163
- cacheRead: response.usage.cache_read_input_tokens
164
- }
165
- }, durationMs, this.debugMode);
115
+ // PRD #154: Always use new evaluation dataset system
116
+ const evaluationMetrics = {
117
+ // Core execution data
118
+ operation,
119
+ sdk: this.getProviderType(),
120
+ inputTokens: response.usage.input_tokens,
121
+ outputTokens: response.usage.output_tokens,
122
+ durationMs,
123
+ // Required fields
124
+ iterationCount: 1,
125
+ toolCallCount: 0,
126
+ status: 'completed',
127
+ completionReason: 'stop',
128
+ modelVersion: this.model,
129
+ // Required evaluation context - NO DEFAULTS, must be provided
130
+ test_scenario: operation,
131
+ ai_response_summary: response.content,
132
+ user_intent: evaluationContext?.user_intent || '',
133
+ interaction_id: evaluationContext?.interaction_id || '',
134
+ // Optional performance data
135
+ ...(response.usage.cache_creation_input_tokens && { cacheCreationTokens: response.usage.cache_creation_input_tokens }),
136
+ ...(response.usage.cache_read_input_tokens && { cacheReadTokens: response.usage.cache_read_input_tokens })
137
+ };
138
+ // Calculate cache hit rate if applicable
139
+ if (response.usage.cache_read_input_tokens && response.usage.input_tokens > 0) {
140
+ evaluationMetrics.cacheHitRate = Math.round((response.usage.cache_read_input_tokens / response.usage.input_tokens) * 100);
141
+ }
142
+ (0, provider_debug_utils_1.logEvaluationDataset)(evaluationMetrics, this.debugMode);
166
143
  }
167
144
  return response;
168
145
  }
@@ -272,48 +249,6 @@ class VercelProvider {
272
249
  generateConfig.system = systemParam;
273
250
  }
274
251
  const result = await (0, ai_1.generateText)(generateConfig);
275
- // Debug: Log the full cumulative context that was actually sent to the AI
276
- if (this.debugMode && result.response?.messages) {
277
- const path = await Promise.resolve().then(() => __importStar(require('path')));
278
- const debugId = (0, provider_debug_utils_1.generateDebugId)(`${operation}-final-context`);
279
- const debugDir = path.join(process.cwd(), 'tmp', 'debug-ai');
280
- const contextFile = path.join(debugDir, `${debugId}_full-context.md`);
281
- // Build full conversation history representation
282
- const messages = result.response.messages;
283
- const contextParts = [`# Full Conversation Context - ${operation}\n`];
284
- contextParts.push(`\nTimestamp: ${new Date().toISOString()}`);
285
- contextParts.push(`Provider: ${this.getProviderType()}`);
286
- contextParts.push(`Model: ${this.model}`);
287
- contextParts.push(`Total Messages: ${messages.length}`);
288
- contextParts.push(`Total Steps: ${result.steps?.length || 0}`);
289
- contextParts.push('\n---\n');
290
- for (let i = 0; i < messages.length; i++) {
291
- const msg = messages[i];
292
- contextParts.push(`\n## Message ${i + 1} - Role: ${msg.role}\n`);
293
- if (typeof msg.content === 'string') {
294
- contextParts.push(msg.content);
295
- }
296
- else if (Array.isArray(msg.content)) {
297
- for (const part of msg.content) {
298
- if (part.type === 'text') {
299
- contextParts.push(part.text || '');
300
- }
301
- else if (part.type === 'tool-call') {
302
- contextParts.push(`\n[TOOL CALL: ${part.toolName}]`);
303
- contextParts.push(JSON.stringify(part.args, null, 2));
304
- }
305
- else if (part.type === 'tool-result') {
306
- contextParts.push(`\n[TOOL RESULT: ${part.toolName}]`);
307
- const resultData = part.output || part.result || part.content || part;
308
- contextParts.push(JSON.stringify(resultData, null, 2));
309
- }
310
- }
311
- }
312
- }
313
- const fs = await Promise.resolve().then(() => __importStar(require('fs')));
314
- fs.writeFileSync(contextFile, contextParts.join('\n'));
315
- console.log(`🐛 DEBUG: Full conversation context logged to ${contextFile}`);
316
- }
317
252
  // Extract tool call history from steps
318
253
  const toolCallsExecuted = [];
319
254
  for (const step of result.steps || []) {
@@ -332,11 +267,11 @@ class VercelProvider {
332
267
  // - GitHub Issue #8795: Token reporting issues with Anthropic provider (streaming)
333
268
  // Our version (5.0.60, released Oct 2, 2025) includes these fixes.
334
269
  // However, testing still shows ~70% fewer tokens reported vs Anthropic native SDK.
335
- // Root cause unknown - may be additional unreported bugs or different calculation methods.
336
- const usage = result.usage;
270
+ // Root cause: We were using result.usage (final step only) instead of result.totalUsage (sum of all steps)!
271
+ const usage = result.totalUsage || result.usage;
337
272
  let cacheReadTokens = 0;
338
273
  let cacheCreationTokens = 0;
339
- // Anthropic via Vercel uses cachedInputTokens
274
+ // Anthropic via Vercel uses cachedInputTokens (confirmed in AI SDK 5+)
340
275
  if (usage.cachedInputTokens) {
341
276
  cacheReadTokens = usage.cachedInputTokens;
342
277
  }
@@ -367,6 +302,57 @@ class VercelProvider {
367
302
  }
368
303
  }
369
304
  }
305
+ // Log debug for summary operations to capture complete prompts/responses for evaluation
306
+ let debugFiles = null;
307
+ if (this.debugMode) {
308
+ // Build the full conversation context like Anthropic provider does
309
+ let finalPrompt = `System: ${config.systemPrompt}\n\n`;
310
+ // Always include the original user intent first
311
+ finalPrompt += `user: ${config.userMessage}\n\n`;
312
+ // Then add the conversation history if available
313
+ if (result.response?.messages) {
314
+ finalPrompt += result.response.messages
315
+ .map(msg => {
316
+ if (typeof msg.content === 'string') {
317
+ return `${msg.role}: ${msg.content}`;
318
+ }
319
+ else if (Array.isArray(msg.content)) {
320
+ const contentParts = msg.content.map(part => {
321
+ if (part.type === 'text') {
322
+ return part.text;
323
+ }
324
+ else if (part.type === 'tool-call') {
325
+ return `[TOOL_USE: ${part.toolName}]`;
326
+ }
327
+ else if (part.type === 'tool-result') {
328
+ const resultData = part.output || part.result || part.content;
329
+ if (typeof resultData === 'string') {
330
+ return `[TOOL_RESULT: ${part.toolName}]\n${resultData}`;
331
+ }
332
+ else if (resultData) {
333
+ return `[TOOL_RESULT: ${part.toolName}]\n${JSON.stringify(resultData, null, 2)}`;
334
+ }
335
+ return `[TOOL_RESULT: ${part.toolName}]`;
336
+ }
337
+ return `[${part.type}]`;
338
+ }).join(' ');
339
+ return `${msg.role}: ${contentParts}`;
340
+ }
341
+ return `${msg.role}: [complex_content]`;
342
+ })
343
+ .join('\n\n');
344
+ }
345
+ const aiResponse = {
346
+ content: finalText || '',
347
+ usage: {
348
+ input_tokens: usage.inputTokens || 0,
349
+ output_tokens: usage.outputTokens || 0,
350
+ cache_creation_input_tokens: cacheCreationTokens,
351
+ cache_read_input_tokens: cacheReadTokens
352
+ }
353
+ };
354
+ debugFiles = this.logDebugIfEnabled(`${operation}-summary`, finalPrompt, aiResponse);
355
+ }
370
356
  return (0, provider_debug_utils_1.createAndLogAgenticResult)({
371
357
  finalMessage: finalText || '',
372
358
  iterations: result.steps?.length || 1,
@@ -383,7 +369,10 @@ class VercelProvider {
383
369
  operation: `${operation}-summary`,
384
370
  sdk: this.getProviderType(),
385
371
  startTime,
386
- debugMode: this.debugMode
372
+ debugMode: this.debugMode,
373
+ debugFiles,
374
+ evaluationContext: config.evaluationContext,
375
+ interaction_id: config.interaction_id
387
376
  });
388
377
  }
389
378
  catch (error) {
@@ -404,7 +393,9 @@ class VercelProvider {
404
393
  operation: `${operation}-error`,
405
394
  sdk: this.getProviderType(),
406
395
  startTime,
407
- debugMode: this.debugMode
396
+ debugMode: this.debugMode,
397
+ evaluationContext: config.evaluationContext,
398
+ interaction_id: config.interaction_id
408
399
  });
409
400
  }
410
401
  }