@vfarcic/dot-ai 0.111.0 → 0.113.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/dist/core/ai-provider-factory.d.ts +0 -10
  2. package/dist/core/ai-provider-factory.d.ts.map +1 -1
  3. package/dist/core/ai-provider-factory.js +14 -24
  4. package/dist/core/ai-provider.interface.d.ts +28 -1
  5. package/dist/core/ai-provider.interface.d.ts.map +1 -1
  6. package/dist/core/capabilities.d.ts +1 -1
  7. package/dist/core/capabilities.d.ts.map +1 -1
  8. package/dist/core/capabilities.js +7 -4
  9. package/dist/core/capability-scan-workflow.js +2 -2
  10. package/dist/core/embedding-service.d.ts +35 -2
  11. package/dist/core/embedding-service.d.ts.map +1 -1
  12. package/dist/core/embedding-service.js +228 -15
  13. package/dist/core/model-config.d.ts +23 -0
  14. package/dist/core/model-config.d.ts.map +1 -0
  15. package/dist/core/model-config.js +28 -0
  16. package/dist/core/platform-operations.d.ts.map +1 -1
  17. package/dist/core/platform-operations.js +3 -5
  18. package/dist/core/platform-utils.d.ts +13 -2
  19. package/dist/core/platform-utils.d.ts.map +1 -1
  20. package/dist/core/platform-utils.js +91 -9
  21. package/dist/core/providers/anthropic-provider.d.ts +6 -1
  22. package/dist/core/providers/anthropic-provider.d.ts.map +1 -1
  23. package/dist/core/providers/anthropic-provider.js +99 -27
  24. package/dist/core/providers/provider-debug-utils.d.ts +53 -20
  25. package/dist/core/providers/provider-debug-utils.d.ts.map +1 -1
  26. package/dist/core/providers/provider-debug-utils.js +106 -51
  27. package/dist/core/providers/vercel-provider.d.ts +6 -1
  28. package/dist/core/providers/vercel-provider.d.ts.map +1 -1
  29. package/dist/core/providers/vercel-provider.js +212 -130
  30. package/dist/core/schema.d.ts +1 -101
  31. package/dist/core/schema.d.ts.map +1 -1
  32. package/dist/core/schema.js +20 -154
  33. package/dist/core/unified-creation-session.d.ts.map +1 -1
  34. package/dist/core/unified-creation-session.js +15 -7
  35. package/dist/evaluation/dataset-analyzer.d.ts +118 -0
  36. package/dist/evaluation/dataset-analyzer.d.ts.map +1 -0
  37. package/dist/evaluation/dataset-analyzer.js +234 -0
  38. package/dist/evaluation/datasets/loader.d.ts +42 -0
  39. package/dist/evaluation/datasets/loader.d.ts.map +1 -0
  40. package/dist/evaluation/datasets/loader.js +104 -0
  41. package/dist/evaluation/eval-runner.d.ts +9 -0
  42. package/dist/evaluation/eval-runner.d.ts.map +1 -0
  43. package/dist/evaluation/eval-runner.js +399 -0
  44. package/dist/evaluation/evaluators/base-comparative.d.ts +94 -0
  45. package/dist/evaluation/evaluators/base-comparative.d.ts.map +1 -0
  46. package/dist/evaluation/evaluators/base-comparative.js +187 -0
  47. package/dist/evaluation/evaluators/base.d.ts +47 -0
  48. package/dist/evaluation/evaluators/base.d.ts.map +1 -0
  49. package/dist/evaluation/evaluators/base.js +10 -0
  50. package/dist/evaluation/evaluators/capability-comparative.d.ts +32 -0
  51. package/dist/evaluation/evaluators/capability-comparative.d.ts.map +1 -0
  52. package/dist/evaluation/evaluators/capability-comparative.js +104 -0
  53. package/dist/evaluation/evaluators/pattern-comparative.d.ts +31 -0
  54. package/dist/evaluation/evaluators/pattern-comparative.d.ts.map +1 -0
  55. package/dist/evaluation/evaluators/pattern-comparative.js +97 -0
  56. package/dist/evaluation/evaluators/policy-comparative.d.ts +31 -0
  57. package/dist/evaluation/evaluators/policy-comparative.d.ts.map +1 -0
  58. package/dist/evaluation/evaluators/policy-comparative.js +97 -0
  59. package/dist/evaluation/evaluators/recommendation-comparative.d.ts +25 -0
  60. package/dist/evaluation/evaluators/recommendation-comparative.d.ts.map +1 -0
  61. package/dist/evaluation/evaluators/recommendation-comparative.js +55 -0
  62. package/dist/evaluation/evaluators/remediation-comparative.d.ts +25 -0
  63. package/dist/evaluation/evaluators/remediation-comparative.d.ts.map +1 -0
  64. package/dist/evaluation/evaluators/remediation-comparative.js +54 -0
  65. package/dist/evaluation/platform-synthesizer.d.ts +54 -0
  66. package/dist/evaluation/platform-synthesizer.d.ts.map +1 -0
  67. package/dist/evaluation/platform-synthesizer.js +368 -0
  68. package/dist/evaluation/run-platform-synthesis.d.ts +9 -0
  69. package/dist/evaluation/run-platform-synthesis.d.ts.map +1 -0
  70. package/dist/evaluation/run-platform-synthesis.js +45 -0
  71. package/dist/interfaces/mcp.d.ts.map +1 -1
  72. package/dist/interfaces/mcp.js +23 -29
  73. package/dist/interfaces/rest-api.d.ts.map +1 -1
  74. package/dist/tools/answer-question.d.ts +2 -0
  75. package/dist/tools/answer-question.d.ts.map +1 -1
  76. package/dist/tools/answer-question.js +18 -11
  77. package/dist/tools/generate-manifests.d.ts +2 -0
  78. package/dist/tools/generate-manifests.d.ts.map +1 -1
  79. package/dist/tools/generate-manifests.js +11 -12
  80. package/dist/tools/organizational-data.d.ts +1 -0
  81. package/dist/tools/organizational-data.d.ts.map +1 -1
  82. package/dist/tools/organizational-data.js +2 -1
  83. package/dist/tools/recommend.d.ts +1 -0
  84. package/dist/tools/recommend.d.ts.map +1 -1
  85. package/dist/tools/recommend.js +13 -21
  86. package/dist/tools/remediate.d.ts +3 -0
  87. package/dist/tools/remediate.d.ts.map +1 -1
  88. package/dist/tools/remediate.js +35 -14
  89. package/dist/tools/test-docs.d.ts +1 -0
  90. package/dist/tools/test-docs.d.ts.map +1 -1
  91. package/dist/tools/test-docs.js +4 -2
  92. package/dist/tools/version.d.ts +5 -1
  93. package/dist/tools/version.d.ts.map +1 -1
  94. package/dist/tools/version.js +23 -8
  95. package/package.json +19 -1
@@ -41,7 +41,8 @@ var __importStar = (this && this.__importStar) || (function () {
41
41
  Object.defineProperty(exports, "__esModule", { value: true });
42
42
  exports.ensureDebugDirectory = ensureDebugDirectory;
43
43
  exports.generateDebugId = generateDebugId;
44
- exports.logMetrics = logMetrics;
44
+ exports.shouldSkipDatasetGeneration = shouldSkipDatasetGeneration;
45
+ exports.logEvaluationDataset = logEvaluationDataset;
45
46
  exports.createAndLogAgenticResult = createAndLogAgenticResult;
46
47
  exports.debugLogInteraction = debugLogInteraction;
47
48
  const fs = __importStar(require("fs"));
@@ -67,70 +68,92 @@ function generateDebugId(operation) {
67
68
  return `${dateTime}_${randomHex}_${operation}`;
68
69
  }
69
70
  /**
70
- * Log metrics for token usage and execution time when DEBUG_DOT_AI=true
71
- *
72
- * PRD #143 Decision 5: Extended metrics for model comparison analysis
71
+ * Determine if dataset generation should be skipped for specific operations
73
72
  */
74
- function logMetrics(operation, sdk, result, durationMs, debugMode) {
73
+ function shouldSkipDatasetGeneration(operation) {
74
+ const skipDatasetOperations = ['version-connectivity-check', 'generic'];
75
+ return skipDatasetOperations.includes(operation);
76
+ }
77
+ /**
78
+ * Log unified evaluation metrics when DEBUG_DOT_AI=true
79
+ * Single function for all metrics and evaluation data capture
80
+ */
81
+ /**
82
+ * Generate eval dataset entry in standard OpenAI Evals format
83
+ * Logs evaluation metrics to JSONL dataset files for AI quality assessment
84
+ */
85
+ function logEvaluationDataset(metrics, debugMode = false) {
75
86
  if (!debugMode)
76
87
  return;
88
+ // Skip dataset generation for non-evaluable operations
89
+ if (shouldSkipDatasetGeneration(metrics.test_scenario))
90
+ return;
77
91
  try {
78
- const debugDir = ensureDebugDirectory();
79
- const metricsFile = path.join(debugDir, 'metrics.jsonl');
80
- const entry = {
81
- timestamp: new Date().toISOString(),
82
- sdk,
83
- operation,
84
- inputTokens: result.totalTokens.input,
85
- outputTokens: result.totalTokens.output,
86
- durationMs
87
- };
88
- // Add cache metrics if present
89
- if (result.totalTokens.cacheCreation !== undefined) {
90
- entry.cacheCreationTokens = result.totalTokens.cacheCreation;
92
+ // Parse operation for tool name
93
+ const operationParts = metrics.operation.split('-');
94
+ const toolName = operationParts[0]; // e.g., "remediate"
95
+ // Check if this is a comparative evaluation
96
+ const isComparativeEvaluation = metrics.operation.includes('-comparative-');
97
+ // Use different directories for comparative evaluations vs raw test datasets
98
+ const baseDir = isComparativeEvaluation ?
99
+ path.join(process.cwd(), 'eval', 'results') : // Comparative evaluation results go here
100
+ path.join(process.cwd(), 'eval', 'datasets'); // Raw test datasets go here
101
+ // Ensure directory exists
102
+ if (!fs.existsSync(baseDir)) {
103
+ fs.mkdirSync(baseDir, { recursive: true });
91
104
  }
92
- if (result.totalTokens.cacheRead !== undefined) {
93
- entry.cacheReadTokens = result.totalTokens.cacheRead;
105
+ let datasetFile;
106
+ const timestamp = new Date().toISOString().replace(/[:.]/g, '').split('T').join('_');
107
+ if (isComparativeEvaluation) {
108
+ // For comparative evaluations, save to results directory
109
+ datasetFile = path.join(baseDir, `${toolName}_comparative_evaluation_${timestamp}.jsonl`);
94
110
  }
95
- // Calculate cache hit rate (percentage)
96
- if (result.totalTokens.cacheRead !== undefined && result.totalTokens.input > 0) {
97
- entry.cacheHitRate = Math.round((result.totalTokens.cacheRead / result.totalTokens.input) * 100);
111
+ else {
112
+ // Use modelVersion directly for accurate model identification
113
+ const modelName = metrics.modelVersion || 'unknown';
114
+ // Create filename with interaction ID, SDK, model, and timestamp for single-model datasets
115
+ datasetFile = path.join(baseDir, `${toolName}_${metrics.interaction_id}_${metrics.sdk}_${modelName}_${timestamp}.jsonl`);
98
116
  }
99
- // Add extended metrics (PRD #143 Decision 5)
100
- if (result.iterations !== undefined) {
101
- entry.iterationCount = result.iterations;
102
- }
103
- if (result.toolCallsExecuted) {
104
- entry.toolCallCount = result.toolCallsExecuted.length;
105
- // Extract unique tool names
106
- const uniqueTools = [...new Set(result.toolCallsExecuted.map(tc => tc.tool))];
107
- entry.uniqueToolsUsed = uniqueTools;
108
- }
109
- if (result.status) {
110
- entry.status = result.status;
111
- }
112
- if (result.completionReason) {
113
- entry.completionReason = result.completionReason;
114
- }
115
- if (result.modelVersion) {
116
- entry.modelVersion = result.modelVersion;
117
- }
118
- // Manual annotation placeholders (populate after test analysis)
119
- entry.manualNotes = '';
120
- entry.failureReason = '';
121
- entry.qualityIssues = [];
122
- entry.comparisonNotes = '';
123
- fs.appendFileSync(metricsFile, JSON.stringify(entry) + '\n');
117
+ // Transform metrics into OpenAI Evals format (no ideal field - using model-graded evaluation)
118
+ const evalEntry = {
119
+ input: {
120
+ issue: metrics.user_intent || "Tool execution scenario"
121
+ },
122
+ output: metrics.ai_response_summary || "",
123
+ performance: {
124
+ duration_ms: metrics.durationMs,
125
+ input_tokens: metrics.inputTokens,
126
+ output_tokens: metrics.outputTokens,
127
+ total_tokens: metrics.inputTokens + metrics.outputTokens,
128
+ sdk: metrics.sdk,
129
+ model_version: metrics.modelVersion,
130
+ iterations: metrics.iterationCount,
131
+ tool_calls_executed: metrics.toolCallCount,
132
+ cache_read_tokens: metrics.cacheReadTokens || 0,
133
+ cache_creation_tokens: metrics.cacheCreationTokens || 0
134
+ },
135
+ metadata: {
136
+ timestamp: new Date().toISOString(),
137
+ complexity: "medium",
138
+ tags: ["troubleshooting"],
139
+ source: "integration_test",
140
+ tool: toolName,
141
+ test_scenario: metrics.test_scenario || `${toolName}_test`,
142
+ failure_analysis: metrics.failure_analysis || ""
143
+ }
144
+ };
145
+ fs.writeFileSync(datasetFile, JSON.stringify(evalEntry) + '\n');
146
+ console.log(`📊 Generated eval dataset: ${path.basename(datasetFile)} (${metrics.interaction_id}, ${metrics.durationMs}ms, ${metrics.inputTokens}+${metrics.outputTokens} tokens)`);
124
147
  }
125
148
  catch (error) {
126
- console.warn('Failed to log metrics:', error);
149
+ console.error(`❌ Failed to generate eval dataset for ${metrics.interaction_id} (${metrics.test_scenario}):`, error);
127
150
  }
128
151
  }
129
152
  /**
130
153
  * Create AgenticResult and log metrics in one step
131
154
  * Reduces code duplication across providers
132
155
  *
133
- * PRD #143 Decision 5: Standardized metrics logging
156
+ * PRD #154: Updated to use unified evaluation metrics
134
157
  */
135
158
  function createAndLogAgenticResult(config) {
136
159
  const result = {
@@ -144,7 +167,39 @@ function createAndLogAgenticResult(config) {
144
167
  };
145
168
  const durationMs = Date.now() - config.startTime;
146
169
  if (config.debugMode) {
147
- logMetrics(config.operation, config.sdk, result, durationMs, config.debugMode);
170
+ // PRD #154: Use unified evaluation metrics system
171
+ const evaluationMetrics = {
172
+ // Core execution data
173
+ operation: config.operation,
174
+ sdk: config.sdk,
175
+ inputTokens: config.totalTokens.input,
176
+ outputTokens: config.totalTokens.output,
177
+ durationMs,
178
+ // Required fields
179
+ iterationCount: config.iterations,
180
+ toolCallCount: config.toolCallsExecuted.length,
181
+ status: config.status,
182
+ completionReason: config.completionReason,
183
+ modelVersion: config.modelVersion,
184
+ // Required evaluation context - NO DEFAULTS, must be provided
185
+ test_scenario: config.operation,
186
+ ai_response_summary: config.finalMessage,
187
+ user_intent: config.evaluationContext?.user_intent || '', // Will be enhanced later by EvalDatasetEnhancer
188
+ interaction_id: config.interaction_id || '', // Will be enhanced later if missing
189
+ // Optional performance data
190
+ ...(config.totalTokens.cacheCreation !== undefined && { cacheCreationTokens: config.totalTokens.cacheCreation }),
191
+ ...(config.totalTokens.cacheRead !== undefined && { cacheReadTokens: config.totalTokens.cacheRead }),
192
+ ...(config.toolCallsExecuted.length > 0 && {
193
+ uniqueToolsUsed: [...new Set(config.toolCallsExecuted.map(tc => tc.tool))]
194
+ }),
195
+ ...(config.debugFiles && { debug_files: { full_prompt: config.debugFiles.promptFile, full_response: config.debugFiles.responseFile } }),
196
+ ...(config.evaluationContext?.failure_analysis && { failure_analysis: config.evaluationContext.failure_analysis })
197
+ };
198
+ // Calculate cache hit rate if applicable
199
+ if (config.totalTokens.cacheRead !== undefined && config.totalTokens.input > 0) {
200
+ evaluationMetrics.cacheHitRate = Math.round((config.totalTokens.cacheRead / config.totalTokens.input) * 100);
201
+ }
202
+ logEvaluationDataset(evaluationMetrics, config.debugMode);
148
203
  }
149
204
  return result;
150
205
  }
@@ -16,9 +16,14 @@ export declare class VercelProvider implements AIProvider {
16
16
  private initializeModel;
17
17
  getProviderType(): string;
18
18
  getDefaultModel(): string;
19
+ getModelName(): string;
20
+ getSDKProvider(): string;
19
21
  isInitialized(): boolean;
20
22
  private logDebugIfEnabled;
21
- sendMessage(message: string, operation?: string): Promise<AIResponse>;
23
+ sendMessage(message: string, operation?: string, evaluationContext?: {
24
+ user_intent?: string;
25
+ interaction_id?: string;
26
+ }): Promise<AIResponse>;
22
27
  /**
23
28
  * Agentic tool loop using Vercel AI SDK
24
29
  *
@@ -1 +1 @@
1
- {"version":3,"file":"vercel-provider.d.ts","sourceRoot":"","sources":["../../../src/core/providers/vercel-provider.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAMH,OAAO,EACL,UAAU,EACV,UAAU,EACV,gBAAgB,EAChB,cAAc,EACd,aAAa,EACd,MAAM,0BAA0B,CAAC;AAclC,qBAAa,cAAe,YAAW,UAAU;IAC/C,OAAO,CAAC,YAAY,CAAoB;IACxC,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,SAAS,CAAU;IAC3B,OAAO,CAAC,aAAa,CAAM;gBAEf,MAAM,EAAE,gBAAgB;IAUpC,OAAO,CAAC,qBAAqB;IAU7B,OAAO,CAAC,eAAe;IAgCvB,eAAe,IAAI,MAAM;IAIzB,eAAe,IAAI,MAAM;IAIzB,aAAa,IAAI,OAAO;IAIxB,OAAO,CAAC,iBAAiB;IAqBnB,WAAW,CAAC,OAAO,EAAE,MAAM,EAAE,SAAS,GAAE,MAAkB,GAAG,OAAO,CAAC,UAAU,CAAC;IA8CtF;;;;;;;;;;;;OAYG;IACG,QAAQ,CAAC,MAAM,EAAE,cAAc,GAAG,OAAO,CAAC,aAAa,CAAC;CA0P/D"}
1
+ {"version":3,"file":"vercel-provider.d.ts","sourceRoot":"","sources":["../../../src/core/providers/vercel-provider.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AASH,OAAO,EACL,UAAU,EACV,UAAU,EACV,gBAAgB,EAChB,cAAc,EACd,aAAa,EACd,MAAM,0BAA0B,CAAC;AASlC,qBAAa,cAAe,YAAW,UAAU;IAC/C,OAAO,CAAC,YAAY,CAAoB;IACxC,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,SAAS,CAAU;IAC3B,OAAO,CAAC,aAAa,CAAM;gBAEf,MAAM,EAAE,gBAAgB;IAUpC,OAAO,CAAC,qBAAqB;IAU7B,OAAO,CAAC,eAAe;IA4CvB,eAAe,IAAI,MAAM;IAIzB,eAAe,IAAI,MAAM;IAIzB,YAAY,IAAI,MAAM;IAItB,cAAc,IAAI,MAAM;IAIxB,aAAa,IAAI,OAAO;IAIxB,OAAO,CAAC,iBAAiB;IAiBnB,WAAW,CACf,OAAO,EAAE,MAAM,EACf,SAAS,GAAE,MAAkB,EAC7B,iBAAiB,CAAC,EAAE;QAClB,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,cAAc,CAAC,EAAE,MAAM,CAAC;KACzB,GACA,OAAO,CAAC,UAAU,CAAC;IAoGtB;;;;;;;;;;;;OAYG;IACG,QAAQ,CAAC,MAAM,EAAE,cAAc,GAAG,OAAO,CAAC,aAAa,CAAC;CA0T/D"}