@juspay/neurolink 7.35.0 → 7.36.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
+ import { generateText } from "ai";
  import { MiddlewareFactory } from "../middleware/factory.js";
  import { logger } from "../utils/logger.js";
  import { DEFAULT_MAX_STEPS, STEP_LIMITS } from "../core/constants.js";
@@ -5,12 +6,11 @@ import { directAgentTools } from "../agent/directTools.js";
  import { getSafeMaxTokens } from "../utils/tokenLimits.js";
  import { createTimeoutController, TimeoutError } from "../utils/timeout.js";
  import { shouldDisableBuiltinTools } from "../utils/toolUtils.js";
- import { buildMessagesArray } from "../utils/messageBuilder.js";
+ import { buildMessagesArray, buildMultimodalMessagesArray, } from "../utils/messageBuilder.js";
  import { getKeysAsString, getKeyCount } from "../utils/transformationUtils.js";
  import { validateStreamOptions as validateStreamOpts, validateTextGenerationOptions, ValidationError, createValidationSummary, } from "../utils/parameterValidation.js";
  import { recordProviderPerformanceFromMetrics, getPerformanceOptimizedProvider, } from "./evaluationProviders.js";
  import { modelConfig } from "./modelConfiguration.js";
- // Provider types moved to ../types/providers.js
  /**
  * Abstract base class for all AI providers
  * Tools are integrated as first-class citizens - always available by default
@@ -166,7 +166,7 @@ export class BaseProvider {
  try {
  // Import streamText dynamically to avoid circular dependencies
  // Using streamText instead of generateText for unified implementation
- const { streamText } = await import("ai");
+ // const { streamText } = await import("ai");
  // Get ALL available tools (direct + MCP + external from options)
  const shouldUseTools = !options.disableTools && this.supportsTools();
  const baseTools = shouldUseTools ? await this.getAllTools() : {};
@@ -211,42 +211,86 @@ export class BaseProvider {
  });
  const model = await this.getAISDKModelWithMiddleware(options);
  // Build proper message array with conversation history
- const messages = buildMessagesArray(options);
- // Use streamText and accumulate results instead of generateText
- const streamResult = await streamText({
+ // Check if this is a multimodal request (images or content present)
+ let messages;
+ // Type guard to check if options has multimodal input
+ const hasMultimodalInput = (opts) => {
+ const input = opts.input;
+ const hasImages = !!input?.images?.length;
+ const hasContent = !!input?.content?.length;
+ return hasImages || hasContent;
+ };
+ if (hasMultimodalInput(options)) {
+ if (process.env.NEUROLINK_DEBUG === "true") {
+ logger.info("🖼️ [MULTIMODAL-REQUEST] Detected multimodal input, using multimodal message builder");
+ }
+ // This is a multimodal request - use multimodal message builder
+ // Convert TextGenerationOptions to GenerateOptions format for multimodal processing
+ const input = options.input;
+ const multimodalOptions = {
+ input: {
+ text: options.prompt || options.input?.text || "",
+ images: input?.images,
+ content: input?.content,
+ },
+ provider: options.provider,
+ model: options.model,
+ temperature: options.temperature,
+ maxTokens: options.maxTokens,
+ systemPrompt: options.systemPrompt,
+ enableAnalytics: options.enableAnalytics,
+ enableEvaluation: options.enableEvaluation,
+ context: options.context,
+ };
+ messages = await buildMultimodalMessagesArray(multimodalOptions, this.providerName, this.modelName);
+ }
+ else {
+ if (process.env.NEUROLINK_DEBUG === "true") {
+ logger.info("📝 [TEXT-ONLY-REQUEST] No multimodal input detected, using standard message builder");
+ }
+ // Standard text-only request
+ messages = buildMessagesArray(options);
+ }
+ // Convert messages to Vercel AI SDK format
+ const aiSDKMessages = messages.map((msg) => {
+ if (typeof msg.content === "string") {
+ // Simple text content
+ return {
+ role: msg.role,
+ content: msg.content,
+ };
+ }
+ else {
+ // Multimodal content array - convert to Vercel AI SDK format
+ // The Vercel AI SDK expects content to be in a specific format
+ return {
+ role: msg.role,
+ content: msg.content.map((item) => {
+ if (item.type === "text") {
+ return { type: "text", text: item.text || "" };
+ }
+ else if (item.type === "image") {
+ return { type: "image", image: item.image || "" };
+ }
+ return item;
+ }),
+ };
+ }
+ });
+ const generateResult = await generateText({
  model,
- messages: messages,
+ messages: aiSDKMessages,
  tools,
  maxSteps: options.maxSteps || DEFAULT_MAX_STEPS,
  toolChoice: shouldUseTools ? "auto" : "none",
  temperature: options.temperature,
  maxTokens: options.maxTokens, // No default limit - unlimited unless specified
  });
- // Accumulate the streamed content
- let accumulatedContent = "";
- // Wait for the stream to complete and accumulate content
- try {
- for await (const chunk of streamResult.textStream) {
- accumulatedContent += chunk;
- }
- }
- catch (streamError) {
- logger.error(`Error reading text stream for ${this.providerName}:`, streamError);
- throw streamError;
- }
- // Get the final result - this should include usage, toolCalls, etc.
- const usage = await streamResult.usage;
- const toolCalls = await streamResult.toolCalls;
- const toolResults = await streamResult.toolResults;
  const responseTime = Date.now() - startTime;
- // Create a result object compatible with generateText format
- const result = {
- text: accumulatedContent,
- usage: usage,
- toolCalls: toolCalls,
- toolResults: toolResults,
- steps: streamResult.steps, // Include steps for tool execution tracking
- };
+ // Extract properties from generateResult
+ const usage = generateResult.usage;
+ const toolCalls = generateResult.toolCalls;
+ const toolResults = generateResult.toolResults;
  try {
  const actualCost = await this.calculateActualCost(usage || { promptTokens: 0, completionTokens: 0, totalTokens: 0 });
  recordProviderPerformanceFromMetrics(this.providerName, {
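For context, the conversion in this hunk targets the message shape the Vercel AI SDK's generateText accepts. A minimal sketch of a multimodal call in that format (not taken from this package; the provider package and model are illustrative choices):

import { generateText } from "ai";
import { openai } from "@ai-sdk/openai"; // illustrative provider choice

// One user message whose content mixes a text part and an image part,
// matching the { type: "text" } / { type: "image" } items produced above.
const { text, usage } = await generateText({
  model: openai("gpt-4o"),
  messages: [
    {
      role: "user",
      content: [
        { type: "text", text: "What is in this image?" },
        { type: "image", image: "data:image/png;base64,iVBORw0KGgo..." },
      ],
    },
  ],
});
console.log(text, usage);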
@@ -273,14 +317,14 @@ export class BaseProvider {
  // First check direct tool calls (fallback)
  if (toolCalls && toolCalls.length > 0) {
  toolsUsed.push(...toolCalls.map((tc) => {
- return tc.toolName || "unknown";
+ return tc.toolName || tc.name || "unknown";
  }));
  }
  // Then check steps for tool calls (primary source for multi-step)
- if (result.steps &&
- Array.isArray(result.steps)) {
- for (const step of result.steps ||
- []) {
+ if (generateResult.steps &&
+ Array.isArray(generateResult.steps)) {
+ for (const step of generateResult
+ .steps || []) {
  if (step?.toolCalls && Array.isArray(step.toolCalls)) {
  toolsUsed.push(...step.toolCalls.map((tc) => {
  return tc.toolName || tc.name || "unknown";
@@ -295,10 +339,10 @@ export class BaseProvider {
  // Create a map of tool calls to their arguments for matching with results
  const toolCallArgsMap = new Map();
  // Extract tool executions from AI SDK result steps
- if (result.steps &&
- Array.isArray(result.steps)) {
- for (const step of result.steps ||
- []) {
+ if (generateResult.steps &&
+ Array.isArray(generateResult.steps)) {
+ for (const step of generateResult
+ .steps || []) {
  // First, collect tool calls and their arguments
  if (step?.toolCalls && Array.isArray(step.toolCalls)) {
  for (const toolCall of step.toolCalls) {
@@ -359,11 +403,11 @@ export class BaseProvider {
  }
  // Format the result with tool executions included
  const enhancedResult = {
- content: result.text,
+ content: generateResult.text,
  usage: {
- input: result.usage?.promptTokens || 0,
- output: result.usage?.completionTokens || 0,
- total: result.usage?.totalTokens || 0,
+ input: generateResult.usage?.promptTokens || 0,
+ output: generateResult.usage?.completionTokens || 0,
+ total: generateResult.usage?.totalTokens || 0,
  },
  provider: this.providerName,
  model: this.modelName,
@@ -943,13 +987,23 @@ export class BaseProvider {
  const providerName = optionsOrPrompt.provider || this.providerName;
  // Apply safe maxTokens based on provider and model
  const safeMaxTokens = getSafeMaxTokens(providerName, modelName, optionsOrPrompt.maxTokens);
- return {
+ // CRITICAL FIX: Preserve the entire input object for multimodal support
+ // This ensures images and content arrays are not lost during normalization
+ const normalizedOptions = {
  ...optionsOrPrompt,
  prompt,
  provider: providerName,
  model: modelName,
  maxTokens: safeMaxTokens,
  };
+ // Ensure input object is preserved if it exists (for multimodal support)
+ if (optionsOrPrompt.input) {
+ normalizedOptions.input = {
+ ...optionsOrPrompt.input,
+ text: prompt, // Ensure text is consistent
+ };
+ }
+ return normalizedOptions;
  }
  normalizeStreamOptions(optionsOrPrompt) {
  if (typeof optionsOrPrompt === "string") {
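A rough sketch of the shape the normalization above now produces; the names follow the hunk, the values are illustrative rather than a test from the package:

import { readFileSync } from "node:fs";

const imageBuffer = readFileSync("./chart.png");
// A caller-supplied options object with multimodal input...
const callerOptions = {
  prompt: "Summarize this chart",
  input: { text: "Summarize this chart", images: [imageBuffer] },
  provider: "openai",
};
// ...normalizes to an object that still carries the full input object,
// with input.text kept in sync with prompt:
// {
//   ...callerOptions,
//   prompt: "Summarize this chart",
//   provider: "openai",
//   model: "<resolved model>",
//   maxTokens: "<safe limit>",
//   input: { text: "Summarize this chart", images: [imageBuffer] },
// }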
@@ -175,11 +175,14 @@ export interface StreamingOptions {
  }
  /**
  * Text generation options interface
+ * Extended to support multimodal content with zero breaking changes
  */
  export interface TextGenerationOptions {
  prompt?: string;
  input?: {
  text: string;
+ images?: Array<Buffer | string>;
+ content?: Array<import("../types/content.js").TextContent | import("../types/content.js").ImageContent>;
  };
  provider?: AIProviderName;
  model?: string;
@@ -854,7 +854,7 @@ export class NeuroLink {
  // Continue with warning rather than throwing - graceful degradation
  }
  }
- // Convert to TextGenerationOptions using factory utilities
+ // 🔧 CRITICAL FIX: Convert to TextGenerationOptions while preserving the input object for multimodal support
  const baseOptions = {
  prompt: options.input.text,
  provider: options.provider,
@@ -868,6 +868,7 @@ export class NeuroLink {
  context: options.context,
  evaluationDomain: options.evaluationDomain,
  toolUsageContext: options.toolUsageContext,
+ input: options.input, // This includes text, images, and content arrays
  };
  // Apply factory enhancement using centralized utilities
  const textOptions = enhanceTextGenerationOptions(baseOptions, factoryResult);
@@ -1664,7 +1665,9 @@ export class NeuroLink {
  const processedStream = (async function* (self) {
  try {
  for await (const chunk of mcpStream) {
- if (chunk && "content" in chunk && typeof chunk.content === "string") {
+ if (chunk &&
+ "content" in chunk &&
+ typeof chunk.content === "string") {
  accumulatedContent += chunk.content;
  // Emit chunk event for compatibility
  self.emitter.emit("response:chunk", chunk.content);
@@ -1941,7 +1944,9 @@ export class NeuroLink {
  const fallbackProcessedStream = (async function* (self) {
  try {
  for await (const chunk of fallbackStreamResult.stream) {
- if (chunk && "content" in chunk && typeof chunk.content === "string") {
+ if (chunk &&
+ "content" in chunk &&
+ typeof chunk.content === "string") {
  fallbackAccumulatedContent += chunk.content;
  // Emit chunk event
  self.emitter.emit("response:chunk", chunk.content);
@@ -0,0 +1,78 @@
+ /**
+ * Content type definitions for multimodal support
+ * Supports text and image content with provider-specific formatting
+ */
+ /**
+ * Text content type for multimodal messages
+ */
+ export interface TextContent {
+ type: "text";
+ text: string;
+ }
+ /**
+ * Image content type for multimodal messages
+ */
+ export interface ImageContent {
+ type: "image";
+ data: Buffer | string;
+ mediaType?: "image/jpeg" | "image/png" | "image/gif" | "image/webp" | "image/bmp" | "image/tiff";
+ metadata?: {
+ description?: string;
+ quality?: "low" | "high" | "auto";
+ dimensions?: {
+ width: number;
+ height: number;
+ };
+ filename?: string;
+ };
+ }
+ /**
+ * Union type for all content types
+ */
+ export type Content = TextContent | ImageContent;
+ /**
+ * Vision capability information for providers
+ */
+ export interface VisionCapability {
+ provider: string;
+ supportedModels: string[];
+ maxImageSize?: number;
+ supportedFormats: string[];
+ maxImagesPerRequest?: number;
+ }
+ /**
+ * Provider-specific image format requirements
+ */
+ export interface ProviderImageFormat {
+ provider: string;
+ format: "data_uri" | "base64" | "inline_data" | "source";
+ requiresPrefix?: boolean;
+ mimeTypeField?: string;
+ dataField?: string;
+ }
+ /**
+ * Image processing result
+ */
+ export interface ProcessedImage {
+ data: string;
+ mediaType: string;
+ size: number;
+ format: "data_uri" | "base64" | "inline_data" | "source";
+ }
+ /**
+ * Multimodal message structure for provider adapters
+ */
+ export interface MultimodalMessage {
+ role: "user" | "assistant" | "system";
+ content: Content[];
+ }
+ /**
+ * Provider-specific multimodal payload
+ */
+ export interface ProviderMultimodalPayload {
+ provider: string;
+ model: string;
+ messages?: MultimodalMessage[];
+ contents?: unknown[];
+ [key: string]: unknown;
+ }
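A short sketch of how these new content types compose; the import path and file names are assumptions based on the declaration file above, and the values are illustrative:

import type { TextContent, ImageContent, MultimodalMessage } from "../types/content.js"; // path assumed
import { readFileSync } from "node:fs";

const caption: TextContent = { type: "text", text: "Receipt from last week" };
const photo: ImageContent = {
  type: "image",
  data: readFileSync("./receipt.jpg"), // Buffer accepted alongside base64 / data-URI strings
  mediaType: "image/jpeg",
  metadata: { quality: "high", filename: "receipt.jpg" },
};
const message: MultimodalMessage = { role: "user", content: [caption, photo] };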
@@ -0,0 +1,5 @@
+ /**
+ * Content type definitions for multimodal support
+ * Supports text and image content with provider-specific formatting
+ */
+ export {};
@@ -66,6 +66,25 @@ export interface ChatMessage {
  /** Content of the message */
  content: string;
  }
+ /**
+ * Content format for multimodal messages (used internally)
+ */
+ export interface MessageContent {
+ type: string;
+ text?: string;
+ image?: string;
+ mimeType?: string;
+ [key: string]: unknown;
+ }
+ /**
+ * Extended chat message for multimodal support (internal use)
+ */
+ export interface MultimodalChatMessage {
+ /** Role of the message sender */
+ role: "user" | "assistant" | "system";
+ /** Content of the message - can be text or multimodal content array */
+ content: string | MessageContent[];
+ }
  /**
  * Events emitted by conversation memory system
  */
@@ -6,13 +6,16 @@ import type { EvaluationData } from "./evaluation.js";
  import type { ChatMessage, ConversationMemoryConfig } from "./conversation.js";
  import type { MiddlewareFactoryOptions } from "./middlewareTypes.js";
  import type { JsonValue } from "./common.js";
+ import type { TextContent, ImageContent } from "./content.js";
  /**
  * Generate function options type - Primary method for content generation
- * Future-ready for multi-modal capabilities while maintaining text focus
+ * Supports multimodal content while maintaining backward compatibility
  */
  export type GenerateOptions = {
  input: {
  text: string;
+ images?: Array<Buffer | string>;
+ content?: Array<TextContent | ImageContent>;
  };
  output?: {
  format?: "text" | "structured" | "json";
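A hedged usage sketch of the extended GenerateOptions at the public API surface; the generate() entry point, result shape, and provider/model values are assumptions, not taken from this diff:

import { NeuroLink } from "@juspay/neurolink";
import { readFileSync } from "node:fs";

const neurolink = new NeuroLink();
const result = await neurolink.generate({
  input: {
    text: "Describe this diagram.",
    images: [readFileSync("./diagram.png")], // new optional images?: Array<Buffer | string>
  },
  provider: "openai",
  model: "gpt-4o",
});
console.log(result.content); // result field name assumed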
@@ -5,6 +5,7 @@ import type { AnalyticsData, TokenUsage } from "./analytics.js";
  import type { EvaluationData } from "./evaluation.js";
  import type { UnknownRecord, JsonValue } from "./common.js";
  import type { ChatMessage } from "./conversation.js";
+ import type { TextContent, ImageContent } from "./content.js";
  import type { MiddlewareFactoryOptions } from "./middlewareTypes.js";
  /**
  * Progress tracking and metadata for streaming operations
@@ -118,10 +119,12 @@ export interface AudioChunk {
  channels: number;
  encoding: PCMEncoding;
  }
- export type StreamOptions = {
+ export interface StreamOptions {
  input: {
- text?: string;
+ text: string;
  audio?: AudioInputSpec;
+ images?: Array<Buffer | string>;
+ content?: Array<TextContent | ImageContent>;
  };
  output?: {
  format?: "text" | "structured" | "json";
@@ -166,7 +169,7 @@ export type StreamOptions = {
  };
  conversationMessages?: ChatMessage[];
  middleware?: MiddlewareFactoryOptions;
- };
+ }
  /**
  * Stream function result type - Primary output format for streaming
  * Future-ready for multi-modal outputs while maintaining text focus
@@ -0,0 +1,84 @@
+ /**
+ * Image processing utilities for multimodal support
+ * Handles format conversion for different AI providers
+ */
+ import type { ProcessedImage } from "../types/content.js";
+ /**
+ * Image processor class for handling provider-specific image formatting
+ */
+ export declare class ImageProcessor {
+ /**
+ * Process image for OpenAI (requires data URI format)
+ */
+ static processImageForOpenAI(image: Buffer | string): string;
+ /**
+ * Process image for Google AI (requires base64 without data URI prefix)
+ */
+ static processImageForGoogle(image: Buffer | string): {
+ mimeType: string;
+ data: string;
+ };
+ /**
+ * Process image for Anthropic (requires base64 without data URI prefix)
+ */
+ static processImageForAnthropic(image: Buffer | string): {
+ mediaType: string;
+ data: string;
+ };
+ /**
+ * Process image for Vertex AI (model-specific routing)
+ */
+ static processImageForVertex(image: Buffer | string, model: string): {
+ mimeType?: string;
+ mediaType?: string;
+ data: string;
+ };
+ /**
+ * Detect image type from filename or data
+ */
+ static detectImageType(input: string | Buffer): string;
+ /**
+ * Validate image size (default 10MB limit)
+ */
+ static validateImageSize(data: Buffer | string, maxSize?: number): boolean;
+ /**
+ * Validate image format
+ */
+ static validateImageFormat(mediaType: string): boolean;
+ /**
+ * Get image dimensions from Buffer (basic implementation)
+ */
+ static getImageDimensions(buffer: Buffer): {
+ width: number;
+ height: number;
+ } | null;
+ /**
+ * Convert image to ProcessedImage format
+ */
+ static processImage(image: Buffer | string, provider: string, model?: string): ProcessedImage;
+ }
+ /**
+ * Utility functions for image handling
+ */
+ export declare const imageUtils: {
+ /**
+ * Check if a string is a valid data URI
+ */
+ isDataUri: (str: string) => boolean;
+ /**
+ * Check if a string is a valid URL
+ */
+ isUrl: (str: string) => boolean;
+ /**
+ * Check if a string is base64 encoded
+ */
+ isBase64: (str: string) => boolean;
+ /**
+ * Extract file extension from filename or URL
+ */
+ getFileExtension: (filename: string) => string | null;
+ /**
+ * Convert file size to human readable format
+ */
+ formatFileSize: (bytes: number) => string;
+ };
+ };