@juspay/neurolink 3.0.1 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +57 -6
- package/README.md +235 -2
- package/dist/agent/direct-tools.d.ts +6 -6
- package/dist/chat/client-utils.d.ts +92 -0
- package/dist/chat/client-utils.js +298 -0
- package/dist/chat/index.d.ts +27 -0
- package/dist/chat/index.js +41 -0
- package/dist/chat/session-storage.d.ts +77 -0
- package/dist/chat/session-storage.js +233 -0
- package/dist/chat/session.d.ts +95 -0
- package/dist/chat/session.js +257 -0
- package/dist/chat/sse-handler.d.ts +49 -0
- package/dist/chat/sse-handler.js +266 -0
- package/dist/chat/types.d.ts +73 -0
- package/dist/chat/types.js +5 -0
- package/dist/chat/websocket-chat-handler.d.ts +36 -0
- package/dist/chat/websocket-chat-handler.js +262 -0
- package/dist/cli/commands/config.js +12 -12
- package/dist/cli/commands/mcp.js +3 -4
- package/dist/cli/index.d.ts +0 -7
- package/dist/cli/index.js +247 -28
- package/dist/config/configManager.d.ts +60 -0
- package/dist/config/configManager.js +300 -0
- package/dist/config/types.d.ts +136 -0
- package/dist/config/types.js +43 -0
- package/dist/core/analytics.d.ts +23 -0
- package/dist/core/analytics.js +131 -0
- package/dist/core/constants.d.ts +41 -0
- package/dist/core/constants.js +50 -0
- package/dist/core/defaults.d.ts +18 -0
- package/dist/core/defaults.js +29 -0
- package/dist/core/evaluation-config.d.ts +29 -0
- package/dist/core/evaluation-config.js +144 -0
- package/dist/core/evaluation-providers.d.ts +30 -0
- package/dist/core/evaluation-providers.js +187 -0
- package/dist/core/evaluation.d.ts +117 -0
- package/dist/core/evaluation.js +528 -0
- package/dist/core/factory.js +33 -25
- package/dist/core/types.d.ts +165 -6
- package/dist/core/types.js +3 -4
- package/dist/index.d.ts +9 -4
- package/dist/index.js +25 -4
- package/dist/lib/agent/direct-tools.d.ts +6 -6
- package/dist/lib/chat/client-utils.d.ts +92 -0
- package/dist/lib/chat/client-utils.js +298 -0
- package/dist/lib/chat/index.d.ts +27 -0
- package/dist/lib/chat/index.js +41 -0
- package/dist/lib/chat/session-storage.d.ts +77 -0
- package/dist/lib/chat/session-storage.js +233 -0
- package/dist/lib/chat/session.d.ts +95 -0
- package/dist/lib/chat/session.js +257 -0
- package/dist/lib/chat/sse-handler.d.ts +49 -0
- package/dist/lib/chat/sse-handler.js +266 -0
- package/dist/lib/chat/types.d.ts +73 -0
- package/dist/lib/chat/types.js +5 -0
- package/dist/lib/chat/websocket-chat-handler.d.ts +36 -0
- package/dist/lib/chat/websocket-chat-handler.js +262 -0
- package/dist/lib/config/configManager.d.ts +60 -0
- package/dist/lib/config/configManager.js +300 -0
- package/dist/lib/config/types.d.ts +136 -0
- package/dist/lib/config/types.js +43 -0
- package/dist/lib/core/analytics.d.ts +23 -0
- package/dist/lib/core/analytics.js +131 -0
- package/dist/lib/core/constants.d.ts +41 -0
- package/dist/lib/core/constants.js +50 -0
- package/dist/lib/core/defaults.d.ts +18 -0
- package/dist/lib/core/defaults.js +29 -0
- package/dist/lib/core/evaluation-config.d.ts +29 -0
- package/dist/lib/core/evaluation-config.js +144 -0
- package/dist/lib/core/evaluation-providers.d.ts +30 -0
- package/dist/lib/core/evaluation-providers.js +187 -0
- package/dist/lib/core/evaluation.d.ts +117 -0
- package/dist/lib/core/evaluation.js +528 -0
- package/dist/lib/core/factory.js +33 -26
- package/dist/lib/core/types.d.ts +165 -6
- package/dist/lib/core/types.js +3 -4
- package/dist/lib/index.d.ts +9 -4
- package/dist/lib/index.js +25 -4
- package/dist/lib/mcp/contracts/mcpContract.d.ts +118 -0
- package/dist/lib/mcp/contracts/mcpContract.js +5 -0
- package/dist/lib/mcp/function-calling.js +11 -3
- package/dist/lib/mcp/logging.js +5 -0
- package/dist/lib/mcp/neurolink-mcp-client.js +2 -1
- package/dist/lib/mcp/orchestrator.js +18 -9
- package/dist/lib/mcp/registry.d.ts +49 -16
- package/dist/lib/mcp/registry.js +80 -6
- package/dist/lib/mcp/servers/ai-providers/ai-workflow-tools.js +5 -4
- package/dist/lib/mcp/tool-integration.js +1 -1
- package/dist/lib/mcp/tool-registry.d.ts +55 -34
- package/dist/lib/mcp/tool-registry.js +111 -97
- package/dist/lib/mcp/unified-mcp.js +6 -1
- package/dist/lib/mcp/unified-registry.d.ts +12 -4
- package/dist/lib/mcp/unified-registry.js +17 -4
- package/dist/lib/neurolink.d.ts +26 -0
- package/dist/lib/neurolink.js +43 -1
- package/dist/lib/providers/agent-enhanced-provider.d.ts +11 -2
- package/dist/lib/providers/agent-enhanced-provider.js +86 -15
- package/dist/lib/providers/amazonBedrock.d.ts +9 -1
- package/dist/lib/providers/amazonBedrock.js +26 -2
- package/dist/lib/providers/analytics-helper.d.ts +53 -0
- package/dist/lib/providers/analytics-helper.js +151 -0
- package/dist/lib/providers/anthropic.d.ts +11 -1
- package/dist/lib/providers/anthropic.js +29 -4
- package/dist/lib/providers/azureOpenAI.d.ts +3 -1
- package/dist/lib/providers/azureOpenAI.js +28 -4
- package/dist/lib/providers/function-calling-provider.d.ts +9 -1
- package/dist/lib/providers/function-calling-provider.js +14 -1
- package/dist/lib/providers/googleAIStudio.d.ts +15 -1
- package/dist/lib/providers/googleAIStudio.js +32 -2
- package/dist/lib/providers/googleVertexAI.d.ts +9 -1
- package/dist/lib/providers/googleVertexAI.js +31 -2
- package/dist/lib/providers/huggingFace.d.ts +3 -1
- package/dist/lib/providers/huggingFace.js +26 -3
- package/dist/lib/providers/mcp-provider.d.ts +9 -1
- package/dist/lib/providers/mcp-provider.js +12 -0
- package/dist/lib/providers/mistralAI.d.ts +3 -1
- package/dist/lib/providers/mistralAI.js +25 -2
- package/dist/lib/providers/ollama.d.ts +3 -1
- package/dist/lib/providers/ollama.js +27 -4
- package/dist/lib/providers/openAI.d.ts +15 -1
- package/dist/lib/providers/openAI.js +32 -2
- package/dist/lib/proxy/proxy-fetch.js +8 -7
- package/dist/lib/services/streaming/streaming-manager.d.ts +29 -0
- package/dist/lib/services/streaming/streaming-manager.js +244 -0
- package/dist/lib/services/types.d.ts +155 -0
- package/dist/lib/services/types.js +2 -0
- package/dist/lib/services/websocket/websocket-server.d.ts +34 -0
- package/dist/lib/services/websocket/websocket-server.js +304 -0
- package/dist/lib/telemetry/index.d.ts +15 -0
- package/dist/lib/telemetry/index.js +22 -0
- package/dist/lib/telemetry/telemetry-service.d.ts +47 -0
- package/dist/lib/telemetry/telemetry-service.js +259 -0
- package/dist/lib/utils/streaming-utils.d.ts +67 -0
- package/dist/lib/utils/streaming-utils.js +201 -0
- package/dist/mcp/contracts/mcpContract.d.ts +118 -0
- package/dist/mcp/contracts/mcpContract.js +5 -0
- package/dist/mcp/function-calling.js +11 -3
- package/dist/mcp/logging.js +5 -0
- package/dist/mcp/neurolink-mcp-client.js +2 -1
- package/dist/mcp/orchestrator.js +18 -9
- package/dist/mcp/registry.d.ts +49 -16
- package/dist/mcp/registry.js +80 -6
- package/dist/mcp/servers/ai-providers/ai-workflow-tools.d.ts +2 -2
- package/dist/mcp/servers/ai-providers/ai-workflow-tools.js +5 -4
- package/dist/mcp/tool-integration.js +1 -1
- package/dist/mcp/tool-registry.d.ts +55 -34
- package/dist/mcp/tool-registry.js +111 -97
- package/dist/mcp/unified-mcp.js +6 -1
- package/dist/mcp/unified-registry.d.ts +12 -4
- package/dist/mcp/unified-registry.js +17 -4
- package/dist/neurolink.d.ts +26 -0
- package/dist/neurolink.js +43 -1
- package/dist/providers/agent-enhanced-provider.d.ts +11 -2
- package/dist/providers/agent-enhanced-provider.js +86 -15
- package/dist/providers/amazonBedrock.d.ts +9 -1
- package/dist/providers/amazonBedrock.js +26 -2
- package/dist/providers/analytics-helper.d.ts +53 -0
- package/dist/providers/analytics-helper.js +151 -0
- package/dist/providers/anthropic.d.ts +11 -1
- package/dist/providers/anthropic.js +29 -4
- package/dist/providers/azureOpenAI.d.ts +3 -1
- package/dist/providers/azureOpenAI.js +29 -4
- package/dist/providers/function-calling-provider.d.ts +9 -1
- package/dist/providers/function-calling-provider.js +14 -1
- package/dist/providers/googleAIStudio.d.ts +15 -1
- package/dist/providers/googleAIStudio.js +32 -2
- package/dist/providers/googleVertexAI.d.ts +9 -1
- package/dist/providers/googleVertexAI.js +31 -2
- package/dist/providers/huggingFace.d.ts +3 -1
- package/dist/providers/huggingFace.js +26 -3
- package/dist/providers/mcp-provider.d.ts +9 -1
- package/dist/providers/mcp-provider.js +12 -0
- package/dist/providers/mistralAI.d.ts +3 -1
- package/dist/providers/mistralAI.js +25 -2
- package/dist/providers/ollama.d.ts +3 -1
- package/dist/providers/ollama.js +27 -4
- package/dist/providers/openAI.d.ts +15 -1
- package/dist/providers/openAI.js +33 -2
- package/dist/proxy/proxy-fetch.js +8 -7
- package/dist/services/streaming/streaming-manager.d.ts +29 -0
- package/dist/services/streaming/streaming-manager.js +244 -0
- package/dist/services/types.d.ts +155 -0
- package/dist/services/types.js +2 -0
- package/dist/services/websocket/websocket-server.d.ts +34 -0
- package/dist/services/websocket/websocket-server.js +304 -0
- package/dist/telemetry/index.d.ts +15 -0
- package/dist/telemetry/index.js +22 -0
- package/dist/telemetry/telemetry-service.d.ts +47 -0
- package/dist/telemetry/telemetry-service.js +261 -0
- package/dist/utils/streaming-utils.d.ts +67 -0
- package/dist/utils/streaming-utils.js +201 -0
- package/package.json +18 -2
package/dist/core/evaluation.js
ADDED
@@ -0,0 +1,528 @@
+/**
+ * NeuroLink Unified Evaluation System
+ *
+ * Combines Universal Evaluation with Lighthouse-Enhanced capabilities
+ * - Domain-aware evaluation with sophisticated context handling
+ * - Multi-provider support with fallback strategies
+ * - Structured output with Zod schema validation
+ * - Tool usage and conversation history analysis
+ * - Enterprise-grade reliability and performance
+ */
+import { logger } from "../utils/logger.js";
+import { AIProviderFactory } from "./factory.js";
+import { z } from "zod";
+/**
+ * Unified Evaluation Schema (Lighthouse-compatible with extensions)
+ */
+export const unifiedEvaluationSchema = z.object({
+    // Core evaluation scores
+    relevanceScore: z
+        .number()
+        .min(0)
+        .max(10)
+        .describe("Score (0-10) for how well the response addresses query intent and aligns with domain/role. 10 is most relevant."),
+    accuracyScore: z
+        .number()
+        .min(0)
+        .max(10)
+        .describe("Score (0-10) for factual correctness against data, tool outputs, and domain knowledge. 10 is most accurate."),
+    completenessScore: z
+        .number()
+        .min(0)
+        .max(10)
+        .describe("Score (0-10) for how completely the response addresses the query. 10 is most complete."),
+    // Enhanced domain scores (optional)
+    domainAlignment: z
+        .number()
+        .min(0)
+        .max(10)
+        .optional()
+        .describe("Score (0-10) for how well response aligns with specified domain expertise."),
+    terminologyAccuracy: z
+        .number()
+        .min(0)
+        .max(10)
+        .optional()
+        .describe("Score (0-10) for correct usage of domain-specific terminology."),
+    toolEffectiveness: z
+        .number()
+        .min(0)
+        .max(10)
+        .optional()
+        .describe("Score (0-10) for how effectively available tools/MCPs were utilized."),
+    // Qualitative assessment
+    isOffTopic: z
+        .boolean()
+        .describe("True if the response significantly deviates from query/domain."),
+    reasoning: z
+        .string()
+        .describe("Brief justification for scores, especially if low or off-topic. Max 150 words."),
+    suggestedImprovements: z
+        .string()
+        .optional()
+        .describe("Optional: Suggestions for improving the original response. Max 100 words."),
+    alertSeverity: z
+        .enum(["low", "medium", "high", "none"])
+        .describe("Suggested alert severity considering all scores and domain context."),
+});
+/**
+ * Main unified evaluation function
+ */
+export async function performUnifiedEvaluation(context) {
+    const functionTag = "performUnifiedEvaluation";
+    const startTime = Date.now();
+    // Determine evaluation mode
+    const mode = context.mode || detectEvaluationMode(context);
+    logger.debug(`[${functionTag}] Starting unified evaluation`, {
+        mode,
+        domain: context.primaryDomain,
+        toolsUsed: context.toolsUsed?.length || 0,
+        conversationTurns: context.conversationHistory?.length || 0,
+        queryLength: context.userQuery.length,
+        responseLength: context.aiResponse.length,
+    });
+    const { parseEvaluationConfig } = await import("./evaluation-config.js");
+    const config = parseEvaluationConfig();
+    let lastError = null;
+    for (let attempt = 0; attempt <= config.retryAttempts; attempt++) {
+        try {
+            // Get evaluation model
+            const evaluationModelResult = await getEvaluationModel();
+            if (!evaluationModelResult) {
+                logger.debug(`[${functionTag}] No evaluation model available, returning defaults`);
+                return getDefaultUnifiedEvaluation("unavailable", Date.now() - startTime, context);
+            }
+            const { provider: evaluationModel, config: modelConfig } = evaluationModelResult;
+            // Create evaluation prompt based on mode
+            const evaluationPrompt = createUnifiedEvaluationPrompt(context, mode);
+            logger.debug(`[${functionTag}] Using ${mode} evaluation mode`, {
+                provider: modelConfig.providerName,
+                model: modelConfig.modelName,
+                attempt: attempt + 1,
+            });
+            // Try structured evaluation first (preferred)
+            try {
+                const structuredResult = await evaluationModel.generateObject({
+                    schema: unifiedEvaluationSchema,
+                    prompt: evaluationPrompt,
+                    temperature: 0.1,
+                    maxTokens: 1000,
+                    system: createUnifiedSystemPrompt(mode),
+                });
+                return processStructuredEvaluationResult(structuredResult.object, modelConfig, Date.now() - startTime, context, attempt + 1);
+            }
+            catch (structuredError) {
+                logger.warn(`[${functionTag}] Structured evaluation failed, using fallback`, { structuredError });
+                // Fallback to legacy generateText
+                const result = await evaluationModel.generateText({
+                    prompt: evaluationPrompt + "\n\nRespond with valid JSON only.",
+                    temperature: 0.1,
+                    maxTokens: 1000,
+                    systemPrompt: createUnifiedSystemPrompt(mode),
+                });
+                const responseText = result?.text || result?.content;
+                if (!responseText) {
+                    throw new Error("No evaluation text received from fallback");
+                }
+                return parseUnifiedEvaluationResult(responseText, modelConfig, Date.now() - startTime, context, attempt + 1);
+            }
+        }
+        catch (error) {
+            lastError = error instanceof Error ? error : new Error(String(error));
+            logger.warn(`[${functionTag}] Evaluation attempt ${attempt + 1} failed:`, lastError.message);
+            if (attempt === config.retryAttempts) {
+                break;
+            }
+            // Exponential backoff
+            await new Promise((resolve) => setTimeout(resolve, Math.pow(2, attempt) * 1000));
+        }
+    }
+    // All attempts failed
+    logger.error(`[${functionTag}] All evaluation attempts failed:`, lastError?.message);
+    return getDefaultUnifiedEvaluation(lastError?.message || "unknown-error", Date.now() - startTime, context);
+}
+/**
+ * Detect appropriate evaluation mode based on context
+ */
+function detectEvaluationMode(context) {
+    // Lighthouse mode: Has domain awareness, tool context, or conversation history
+    if (context.primaryDomain ||
+        context.toolsUsed?.length ||
+        context.conversationHistory?.length) {
+        return "lighthouse";
+    }
+    // Enhanced mode: Has rich context
+    if (context.context && Object.keys(context.context).length > 0) {
+        return "enhanced";
+    }
+    // Simple mode: Basic evaluation
+    return "simple";
+}
+/**
+ * Create unified evaluation prompt based on mode
+ */
+function createUnifiedEvaluationPrompt(context, mode) {
+    switch (mode) {
+        case "lighthouse":
+            return createLighthouseEvaluationPrompt(context);
+        case "enhanced":
+            return createEnhancedEvaluationPrompt(context);
+        case "simple":
+        default:
+            return createSimpleEvaluationPrompt(context);
+    }
+}
+/**
+ * Create Lighthouse-style domain-aware evaluation prompt
+ */
+function createLighthouseEvaluationPrompt(context) {
+    const { userQuery, aiResponse, primaryDomain = "general AI assistant", assistantRole = "AI assistant", toolContext = "No specific tools used in this interaction", conversationHistory = [], } = context;
+    const formattedHistory = formatConversationHistory(conversationHistory);
+    return `You are an AI Response Evaluator with advanced domain awareness.
+
+**EVALUATION CONTEXT**:
+
+1. **Primary Assistant Domain**: "${primaryDomain}"
+   - This defines the AI assistant's core expertise area
+   - Responses should demonstrate competency within this domain
+   - Domain-specific terminology should be used accurately
+
+2. **Assistant Role**: "${assistantRole}"
+   - This defines the specific role the assistant should fulfill
+   - Responses should align with this role's responsibilities
+
+3. **Tool Usage Context**: "${toolContext}"
+   - Tools/MCPs are capabilities the assistant used to generate the response
+   - Evaluate how effectively these tools were utilized
+   - Consider if additional tools should have been used
+
+4. **Conversation History**:
+\`\`\`
+${formattedHistory}
+\`\`\`
+
+**CRITICAL DOMAIN FAILURE ASSESSMENT**:
+Pay special attention to domain alignment. If the query is within the assistant's domain and sufficient context is available:
+- Inability to answer ("I can't help", generic errors, evasions) = HIGH ALERT
+- Incorrect domain-specific information = HIGH ALERT
+- Misuse of domain terminology = MEDIUM-HIGH ALERT
+
+**EVALUATION CRITERIA**:
+- **relevanceScore** (0-10): Direct query addressing + domain alignment
+- **accuracyScore** (0-10): Factual correctness + terminology accuracy
+- **completenessScore** (0-10): Full query addressing + appropriate depth
+- **domainAlignment** (0-10): How well response fits the domain expertise
+- **terminologyAccuracy** (0-10): Correct use of domain-specific terms
+- **toolEffectiveness** (0-10): How well available tools were utilized
+- **isOffTopic** (boolean): True if significantly deviates from domain/query
+- **reasoning** (string): Brief explanation (max 150 words)
+- **suggestedImprovements** (string): How to improve (max 100 words)
+- **alertSeverity** ('low'|'medium'|'high'|'none'): Based on domain failure assessment
+
+**Current User Query**:
+"${userQuery}"
+
+**AI Assistant Response**:
+"${aiResponse}"
+
+Provide your assessment in the specified format.`;
+}
+/**
+ * Create enhanced evaluation prompt
+ */
+function createEnhancedEvaluationPrompt(context) {
+    const { userQuery, aiResponse, context: additionalContext } = context;
+    const contextInfo = additionalContext
+        ? `\nContext: ${JSON.stringify(additionalContext, null, 2)}`
+        : "";
+    return `Evaluate this AI response with enhanced criteria:
+
+Query: "${userQuery}"
+Response: "${aiResponse}"${contextInfo}
+
+Provide scores for:
+- relevanceScore (0-10): How well the response addresses the query
+- accuracyScore (0-10): Factual correctness and reliability
+- completenessScore (0-10): Whether the response fully answers the question
+- isOffTopic (boolean): Whether response deviates from query
+- reasoning (string): Brief explanation of scores
+- alertSeverity ('low'|'medium'|'high'|'none'): Overall quality assessment
+
+Respond in the specified format.`;
+}
+/**
+ * Create simple evaluation prompt
+ */
+function createSimpleEvaluationPrompt(context) {
+    const { userQuery, aiResponse } = context;
+    return `Rate this AI response:
+
+Q: "${userQuery}"
+A: "${aiResponse}"
+
+Provide:
+- relevanceScore (0-10)
+- accuracyScore (0-10)
+- completenessScore (0-10)
+- reasoning (brief explanation)
+
+Respond in the specified format.`;
+}
+/**
+ * Create unified system prompt based on mode
+ */
+function createUnifiedSystemPrompt(mode) {
+    const basePrompt = "You are an expert AI Response Evaluator. Respond with valid structured output only.";
+    switch (mode) {
+        case "lighthouse":
+            return `${basePrompt} Use advanced domain awareness and sophisticated context analysis for comprehensive evaluation.`;
+        case "enhanced":
+            return `${basePrompt} Consider all provided context and metadata for thorough evaluation.`;
+        case "simple":
+        default:
+            return `${basePrompt} Focus on core quality metrics: relevance, accuracy, and completeness.`;
+    }
+}
+/**
+ * Process structured evaluation result
+ */
+function processStructuredEvaluationResult(result, modelConfig, evaluationTime, context, attempt) {
+    // Calculate overall score
+    const coreScores = [
+        result.relevanceScore || 0,
+        result.accuracyScore || 0,
+        result.completenessScore || 0,
+    ];
+    const enhancedScores = [
+        result.domainAlignment,
+        result.terminologyAccuracy,
+        result.toolEffectiveness,
+    ].filter((score) => typeof score === "number" && score > 0);
+    const allScores = [...coreScores, ...enhancedScores];
+    const overall = Math.round(allScores.reduce((sum, score) => sum + score, 0) / allScores.length);
+    return {
+        // Core scores
+        relevanceScore: Math.max(0, Math.min(10, Math.round(result.relevanceScore || 0))),
+        accuracyScore: Math.max(0, Math.min(10, Math.round(result.accuracyScore || 0))),
+        completenessScore: Math.max(0, Math.min(10, Math.round(result.completenessScore || 0))),
+        overall: Math.max(0, Math.min(10, overall)),
+        // Enhanced insights
+        isOffTopic: result.isOffTopic || false,
+        alertSeverity: result.alertSeverity || "none",
+        reasoning: result.reasoning || "Evaluation completed successfully.",
+        suggestedImprovements: result.suggestedImprovements,
+        // Domain-specific scores (if available)
+        domainAlignment: result.domainAlignment
+            ? Math.max(0, Math.min(10, Math.round(result.domainAlignment)))
+            : undefined,
+        terminologyAccuracy: result.terminologyAccuracy
+            ? Math.max(0, Math.min(10, Math.round(result.terminologyAccuracy)))
+            : undefined,
+        toolEffectiveness: result.toolEffectiveness
+            ? Math.max(0, Math.min(10, Math.round(result.toolEffectiveness)))
+            : undefined,
+        // Context analysis
+        contextUtilization: {
+            conversationUsed: (context.conversationHistory?.length || 0) > 0,
+            toolsUsed: (context.toolsUsed?.length || 0) > 0,
+            domainKnowledgeUsed: !!context.primaryDomain,
+        },
+        // Enhanced metadata
+        evaluationContext: {
+            domain: context.primaryDomain || "general",
+            toolsEvaluated: context.toolsUsed || [],
+            conversationTurns: context.conversationHistory?.length || 0,
+        },
+        // Standard metadata
+        evaluationModel: `${modelConfig.providerName}/${modelConfig.modelName}`,
+        evaluationTime,
+        evaluationProvider: modelConfig.providerName,
+        evaluationAttempt: attempt,
+        evaluationConfig: {
+            mode: context.mode || "auto",
+            fallbackUsed: attempt > 1,
+            costEstimate: 0,
+        },
+    };
+}
+/**
+ * Parse evaluation result from text response
+ */
+function parseUnifiedEvaluationResult(evaluationText, modelConfig, evaluationTime, context, attempt) {
+    try {
+        // Clean and parse JSON
+        const cleanText = evaluationText.trim().replace(/```json\s*|```\s*/g, "");
+        const jsonMatch = cleanText.match(/\{[^]*?\}/s);
+        if (jsonMatch) {
+            const parsed = JSON.parse(jsonMatch[0]);
+            return processStructuredEvaluationResult(parsed, modelConfig, evaluationTime, context, attempt);
+        }
+        // Fallback to regex parsing with improved patterns
+        const relevanceMatch = evaluationText.match(/(?:relevance[Score"\s]*:?["\s]*(\d+)|Relevance["\s]*:?["\s]*(\d+)|relevance.*?(\d+))/i);
+        const accuracyMatch = evaluationText.match(/(?:accuracy[Score"\s]*:?["\s]*(\d+)|Accuracy["\s]*:?["\s]*(\d+)|accuracy.*?(\d+))/i);
+        const completenessMatch = evaluationText.match(/(?:completeness[Score"\s]*:?["\s]*(\d+)|Completeness["\s]*:?["\s]*(\d+)|completeness.*?(\d+))/i);
+        // Extract scores with fallback to default values
+        const relevance = relevanceMatch
+            ? parseInt(relevanceMatch[1] || relevanceMatch[2] || relevanceMatch[3], 10)
+            : 8; // Default fallback score
+        const accuracy = accuracyMatch
+            ? parseInt(accuracyMatch[1] || accuracyMatch[2] || accuracyMatch[3], 10)
+            : 8; // Default fallback score
+        const completeness = completenessMatch
+            ? parseInt(completenessMatch[1] || completenessMatch[2] || completenessMatch[3], 10)
+            : 8; // Default fallback score
+        return {
+            relevanceScore: Math.max(0, Math.min(10, relevance)),
+            accuracyScore: Math.max(0, Math.min(10, accuracy)),
+            completenessScore: Math.max(0, Math.min(10, completeness)),
+            overall: Math.round((relevance + accuracy + completeness) / 3),
+            isOffTopic: false,
+            alertSeverity: "none",
+            reasoning: "Parsed using regex fallback - response was not in expected JSON format.",
+            evaluationModel: `${modelConfig.providerName}/${modelConfig.modelName}`,
+            evaluationTime,
+            evaluationProvider: modelConfig.providerName,
+            evaluationAttempt: attempt,
+            evaluationConfig: {
+                mode: "fallback",
+                fallbackUsed: true,
+                costEstimate: 0,
+            },
+        };
+    }
+    catch (error) {
+        logger.error("Failed to parse unified evaluation result", { error });
+        return getDefaultUnifiedEvaluation("parse-error", evaluationTime, context);
+    }
+}
+/**
+ * Get default evaluation when evaluation fails
+ */
+function getDefaultUnifiedEvaluation(reason, evaluationTime, context) {
+    return {
+        relevanceScore: 0,
+        accuracyScore: 0,
+        completenessScore: 0,
+        overall: 0,
+        isOffTopic: false,
+        alertSeverity: "high",
+        reasoning: `Evaluation unavailable (${reason}). This may be due to missing API keys, network issues, or service unavailability.`,
+        suggestedImprovements: "Check evaluation system configuration, API credentials, and network connectivity.",
+        evaluationModel: "unavailable",
+        evaluationTime,
+        evaluationProvider: "none",
+        evaluationAttempt: 0,
+        evaluationConfig: {
+            mode: "default",
+            fallbackUsed: true,
+            costEstimate: 0,
+        },
+        contextUtilization: {
+            conversationUsed: (context.conversationHistory?.length || 0) > 0,
+            toolsUsed: (context.toolsUsed?.length || 0) > 0,
+            domainKnowledgeUsed: !!context.primaryDomain,
+        },
+        evaluationContext: {
+            domain: context.primaryDomain || "unknown",
+            toolsEvaluated: context.toolsUsed || [],
+            conversationTurns: context.conversationHistory?.length || 0,
+        },
+    };
+}
+/**
+ * Enhanced evaluation model selection
+ */
+export async function getEvaluationModel() {
+    const { parseEvaluationConfig, getProviderFallbackOrder } = await import("./evaluation-config.js");
+    const { getProviderConfig } = await import("./evaluation-providers.js");
+    const config = parseEvaluationConfig();
+    const fallbackOrder = getProviderFallbackOrder(config);
+    for (const providerName of fallbackOrder) {
+        try {
+            const providerConfig = getProviderConfig(providerName);
+            if (!providerConfig) {
+                continue;
+            }
+            let modelName = config.model;
+            if (modelName === "auto" || !config.model) {
+                modelName =
+                    providerConfig.models[config.mode] || providerConfig.models.fast;
+            }
+            const provider = await AIProviderFactory.createProvider(providerName, modelName);
+            if (provider) {
+                return {
+                    provider,
+                    config: {
+                        providerName,
+                        modelName,
+                        providerConfig,
+                        evaluationConfig: config,
+                    },
+                };
+            }
+        }
+        catch (error) {
+            if (!config.fallbackEnabled) {
+                throw error;
+            }
+            continue;
+        }
+    }
+    return null;
+}
+/**
+ * Format conversation history for evaluation
+ */
+function formatConversationHistory(history) {
+    if (!history?.length) {
+        return "No prior conversation context.";
+    }
+    return history
+        .slice(-3) // Last 3 turns
+        .map((msg, i) => `${i + 1}. ${msg.role.toUpperCase()}: ${msg.content.substring(0, 200)}${msg.content.length > 200 ? "..." : ""}`)
+        .join("\n");
+}
+/**
+ * Create simple evaluation context (backward compatibility)
+ */
+export function createSimpleEvaluationContext(prompt, response, context) {
+    return {
+        userQuery: prompt,
+        aiResponse: response,
+        context,
+        mode: "simple",
+    };
+}
+/**
+ * Create enhanced evaluation context
+ */
+export function createEnhancedEvaluationContext(userQuery, aiResponse, options = {}) {
+    return {
+        userQuery,
+        aiResponse,
+        primaryDomain: options.domain,
+        assistantRole: options.role,
+        toolsUsed: options.toolsUsed,
+        toolContext: options.toolsUsed?.length
+            ? `Tools used: ${options.toolsUsed.join(", ")}`
+            : undefined,
+        conversationHistory: options.conversationHistory,
+        sessionId: options.sessionId,
+        context: options.context,
+        mode: "lighthouse",
+    };
+}
+// Legacy compatibility wrapper for old function signature
+export async function evaluateResponse(prompt, response, context, evaluationDomain, toolUsageContext, conversationHistory) {
+    // Convert old arguments to new context format
+    const unifiedContext = {
+        userQuery: prompt,
+        aiResponse: response,
+        context,
+        primaryDomain: evaluationDomain,
+        toolContext: toolUsageContext,
+        conversationHistory: conversationHistory,
+        mode: evaluationDomain ? "lighthouse" : "simple",
+    };
+    return performUnifiedEvaluation(unifiedContext);
+}
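Usage note (illustrative, not part of the diff): a minimal sketch of how the evaluation API added above could be called. The import path from "@juspay/neurolink" is an assumption; the module itself ships at dist/core/evaluation.js, so adjust the path if the package root does not re-export these functions.

// Illustrative only; values and import path are assumptions.
import { createEnhancedEvaluationContext, performUnifiedEvaluation } from "@juspay/neurolink";

const evalContext = createEnhancedEvaluationContext(
    "What is our refund policy for annual plans?", // userQuery
    "Annual plans can be refunded within 30 days of purchase.", // aiResponse
    { domain: "customer-support", role: "billing assistant", toolsUsed: ["policy-lookup"] },
);
// createEnhancedEvaluationContext sets mode to "lighthouse", so the domain-aware prompt is used.
const evaluation = await performUnifiedEvaluation(evalContext);
console.log(evaluation.overall, evaluation.alertSeverity, evaluation.reasoning);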
package/dist/core/factory.js
CHANGED
@@ -81,31 +81,39 @@ export class AIProviderFactory {
         // error: dynamicError instanceof Error ? dynamicError.message : String(dynamicError),
         // });
         // }
-        //
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        // COMPREHENSIVE FIX: Disable dynamic model resolution completely until provider is fixed
+        // This prevents stale gemini-1.5-pro-latest from overriding correct gemini-2.5-pro defaults
+        const resolvedModelName = modelName;
+        // COMMENTED OUT: Dynamic model resolution causing 1.5 vs 2.5 Pro issues
+        // if (!modelName || modelName === "default") {
+        //   try {
+        //     const normalizedProvider = this.normalizeProviderName(providerName);
+        //     const dynamicModel = dynamicModelProvider.resolveModel(
+        //       normalizedProvider,
+        //       modelName || undefined,
+        //     );
+        //     if (dynamicModel) {
+        //       resolvedModelName = dynamicModel.id;
+        //       logger.debug(`[${functionTag}] Resolved dynamic model`, {
+        //         provider: normalizedProvider,
+        //         requestedModel: modelName || "default",
+        //         resolvedModel: resolvedModelName,
+        //         displayName: dynamicModel.displayName,
+        //         pricing: dynamicModel.pricing.input,
+        //       });
+        //     }
+        //   } catch (resolveError) {
+        //     logger.debug(
+        //       `[${functionTag}] Dynamic model resolution failed, using fallback`,
+        //       {
+        //         error:
+        //           resolveError instanceof Error
+        //             ? resolveError.message
+        //             : String(resolveError),
+        //       },
+        //     );
+        //   }
+        // }
         let provider;
         switch (providerName.toLowerCase()) {
             case "vertex":