@juspay/neurolink 8.18.0 → 8.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/CHANGELOG.md +12 -0
  2. package/dist/adapters/providerImageAdapter.d.ts +12 -0
  3. package/dist/adapters/providerImageAdapter.js +30 -3
  4. package/dist/cli/loop/optionsSchema.d.ts +1 -1
  5. package/dist/config/conversationMemory.d.ts +2 -1
  6. package/dist/config/conversationMemory.js +15 -7
  7. package/dist/core/baseProvider.d.ts +15 -0
  8. package/dist/core/baseProvider.js +79 -1
  9. package/dist/core/modules/GenerationHandler.d.ts +5 -0
  10. package/dist/core/modules/GenerationHandler.js +56 -9
  11. package/dist/factories/providerRegistry.js +18 -0
  12. package/dist/lib/adapters/providerImageAdapter.d.ts +12 -0
  13. package/dist/lib/adapters/providerImageAdapter.js +30 -3
  14. package/dist/lib/config/conversationMemory.d.ts +2 -1
  15. package/dist/lib/config/conversationMemory.js +15 -7
  16. package/dist/lib/core/baseProvider.d.ts +15 -0
  17. package/dist/lib/core/baseProvider.js +79 -1
  18. package/dist/lib/core/modules/GenerationHandler.d.ts +5 -0
  19. package/dist/lib/core/modules/GenerationHandler.js +56 -9
  20. package/dist/lib/factories/providerRegistry.js +18 -0
  21. package/dist/lib/mcp/servers/agent/directToolsServer.js +5 -0
  22. package/dist/lib/mcp/toolRegistry.js +5 -0
  23. package/dist/lib/neurolink.js +6 -1
  24. package/dist/lib/types/generateTypes.d.ts +31 -0
  25. package/dist/lib/types/ttsTypes.d.ts +29 -0
  26. package/dist/lib/utils/fileDetector.d.ts +25 -0
  27. package/dist/lib/utils/fileDetector.js +433 -10
  28. package/dist/lib/utils/messageBuilder.js +6 -2
  29. package/dist/lib/utils/ttsProcessor.d.ts +41 -14
  30. package/dist/lib/utils/ttsProcessor.js +10 -26
  31. package/dist/mcp/servers/agent/directToolsServer.js +5 -0
  32. package/dist/mcp/toolRegistry.js +5 -0
  33. package/dist/neurolink.js +6 -1
  34. package/dist/types/generateTypes.d.ts +31 -0
  35. package/dist/types/ttsTypes.d.ts +29 -0
  36. package/dist/utils/fileDetector.d.ts +25 -0
  37. package/dist/utils/fileDetector.js +433 -10
  38. package/dist/utils/messageBuilder.js +6 -2
  39. package/dist/utils/ttsProcessor.d.ts +41 -14
  40. package/dist/utils/ttsProcessor.js +10 -26
  41. package/package.json +1 -1
package/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
1
+ ## [8.19.1](https://github.com/juspay/neurolink/compare/v8.19.0...v8.19.1) (2025-12-20)
2
+
3
+ ### Bug Fixes
4
+
5
+ - **(files):** comprehensive extension-less file detection with fallback parsing (FD-018) ([7e9dbc7](https://github.com/juspay/neurolink/commit/7e9dbc78df48f6df051c7845824977f360f8feee))
6
+
7
+ ## [8.19.0](https://github.com/juspay/neurolink/compare/v8.18.0...v8.19.0) (2025-12-18)
8
+
9
+ ### Features
10
+
11
+ - **(tts):** Integrate TTS into BaseProvider.generate() ([ffae0b5](https://github.com/juspay/neurolink/commit/ffae0b5be9c4a2ef249876bdeee265004adf28a3))
12
+
1
13
  ## [8.18.0](https://github.com/juspay/neurolink/compare/v8.17.0...v8.18.0) (2025-12-16)
2
14
 
3
15
  ### Features
@@ -60,4 +60,16 @@ export declare class ProviderImageAdapter {
60
60
  * Get all vision-capable providers
61
61
  */
62
62
  static getVisionProviders(): string[];
63
+ /**
64
+ * Count total "images" in a message (actual images + PDF pages)
65
+ * PDF pages count toward image limits for providers
66
+ */
67
+ static countImagesInMessage(images: Array<Buffer | string>, pdfPages?: number | null): number;
68
+ /**
69
+ * Extract page count from PDF metadata array
70
+ * Returns total pages across all PDFs
71
+ */
72
+ static countImagesInPages(pdfMetadataArray: Array<{
73
+ pageCount?: number | null;
74
+ }> | undefined): number;
63
75
  }
@@ -416,13 +416,19 @@ export class ProviderImageAdapter {
416
416
  adaptedPayload = this.formatForOpenAI(text, images);
417
417
  break;
418
418
  case "litellm":
419
- adaptedPayload = this.formatForOpenAI(text, images);
419
+ // LiteLLM uses same format as OpenAI but validate with litellm provider name
420
+ this.validateImageCount(images.length, "litellm");
421
+ adaptedPayload = this.formatForOpenAI(text, images, true);
420
422
  break;
421
423
  case "mistral":
422
- adaptedPayload = this.formatForOpenAI(text, images);
424
+ // Mistral uses same format as OpenAI but validate with mistral provider name
425
+ this.validateImageCount(images.length, "mistral");
426
+ adaptedPayload = this.formatForOpenAI(text, images, true);
423
427
  break;
424
428
  case "bedrock":
425
- adaptedPayload = this.formatForAnthropic(text, images);
429
+ // Bedrock uses same format as Anthropic but validate with bedrock provider name
430
+ this.validateImageCount(images.length, "bedrock");
431
+ adaptedPayload = this.formatForAnthropic(text, images, true);
426
432
  break;
427
433
  default:
428
434
  throw new Error(`Vision not supported for provider: ${provider}`);
@@ -666,4 +672,25 @@ export class ProviderImageAdapter {
666
672
  static getVisionProviders() {
667
673
  return Object.keys(VISION_CAPABILITIES);
668
674
  }
675
+ /**
676
+ * Count total "images" in a message (actual images + PDF pages)
677
+ * PDF pages count toward image limits for providers
678
+ */
679
+ static countImagesInMessage(images, pdfPages) {
680
+ const imageCount = images?.length || 0;
681
+ const pageCount = pdfPages ?? 0;
682
+ return imageCount + pageCount;
683
+ }
684
+ /**
685
+ * Extract page count from PDF metadata array
686
+ * Returns total pages across all PDFs
687
+ */
688
+ static countImagesInPages(pdfMetadataArray) {
689
+ if (!pdfMetadataArray || pdfMetadataArray.length === 0) {
690
+ return 0;
691
+ }
692
+ return pdfMetadataArray.reduce((total, pdf) => {
693
+ return total + (pdf.pageCount ?? 0);
694
+ }, 0);
695
+ }
669
696
  }
@@ -5,4 +5,4 @@ import type { OptionSchema } from "../../lib/types/cli.js";
5
5
  * This object provides metadata for validation and help text in the CLI loop.
6
6
  * It is derived from the main TextGenerationOptions interface to ensure consistency.
7
7
  */
8
- export declare const textGenerationOptionsSchema: Record<keyof Omit<TextGenerationOptions, "prompt" | "input" | "schema" | "tools" | "context" | "conversationHistory" | "conversationMessages" | "conversationMemoryConfig" | "originalPrompt" | "middleware" | "expectedOutcome" | "evaluationCriteria" | "region" | "csvOptions">, OptionSchema>;
8
+ export declare const textGenerationOptionsSchema: Record<keyof Omit<TextGenerationOptions, "prompt" | "input" | "schema" | "tools" | "context" | "conversationHistory" | "conversationMessages" | "conversationMemoryConfig" | "originalPrompt" | "middleware" | "expectedOutcome" | "evaluationCriteria" | "region" | "csvOptions" | "tts">, OptionSchema>;
@@ -24,8 +24,9 @@ export declare const CONVERSATION_INSTRUCTIONS = "\n\nIMPORTANT: You are continu
24
24
  * Structured output instructions for JSON/structured output mode
25
25
  * Used to ensure AI providers output only valid JSON without conversational filler
26
26
  * This addresses the issue where models add text like "Excellent!" before JSON output
27
+ * and the case where tools are used but final output must still be pure JSON
27
28
  */
28
- export declare const STRUCTURED_OUTPUT_INSTRUCTIONS = "\n\nSTRUCTURED OUTPUT REQUIREMENT:\nYou MUST respond with ONLY a valid JSON object that matches the provided schema.\n- Do NOT include any text before the JSON (no greetings, acknowledgments, or preamble like \"Excellent!\", \"Sure!\", \"Here is the result:\", etc.)\n- Do NOT include any text after the JSON (no explanations, summaries, or follow-up comments)\n- Do NOT wrap the JSON in markdown code blocks\n- Output ONLY the raw JSON object, starting with { and ending with }\n- Ensure the JSON is valid and parseable";
29
+ export declare const STRUCTURED_OUTPUT_INSTRUCTIONS = "\nOutput ONLY valid JSON. No markdown, text, or decorations\u2014ever.\n\nFORBIDDEN: markdown code blocks, text before/after JSON, explanations, preambles, summaries, conversational text about tools.\n\nREQUIRED: response starts with { and ends with }, valid JSON only, no additional characters.\n\nIF YOU CALLED TOOLS: Incorporate data directly into the JSON structure. Do NOT explain what you did.\n\nWRONG: ```json\n{\"field\": \"value\"}\n```\nWRONG: Based on the data, here's the result: {\"field\": \"value\"}\nCORRECT: {\"field\": \"value\"}\n\nYour entire response = raw JSON object. Nothing else.";
29
30
  /**
30
31
  * Get default configuration values for conversation memory
31
32
  * Reads environment variables when called (not at module load time)
@@ -30,16 +30,24 @@ Always reference and build upon this conversation history when relevant. If the
30
30
  * Structured output instructions for JSON/structured output mode
31
31
  * Used to ensure AI providers output only valid JSON without conversational filler
32
32
  * This addresses the issue where models add text like "Excellent!" before JSON output
33
+ * and the case where tools are used but final output must still be pure JSON
33
34
  */
34
35
  export const STRUCTURED_OUTPUT_INSTRUCTIONS = `
36
+ Output ONLY valid JSON. No markdown, text, or decorations—ever.
35
37
 
36
- STRUCTURED OUTPUT REQUIREMENT:
37
- You MUST respond with ONLY a valid JSON object that matches the provided schema.
38
- - Do NOT include any text before the JSON (no greetings, acknowledgments, or preamble like "Excellent!", "Sure!", "Here is the result:", etc.)
39
- - Do NOT include any text after the JSON (no explanations, summaries, or follow-up comments)
40
- - Do NOT wrap the JSON in markdown code blocks
41
- - Output ONLY the raw JSON object, starting with { and ending with }
42
- - Ensure the JSON is valid and parseable`;
38
+ FORBIDDEN: markdown code blocks, text before/after JSON, explanations, preambles, summaries, conversational text about tools.
39
+
40
+ REQUIRED: response starts with { and ends with }, valid JSON only, no additional characters.
41
+
42
+ IF YOU CALLED TOOLS: Incorporate data directly into the JSON structure. Do NOT explain what you did.
43
+
44
+ WRONG: \`\`\`json
45
+ {"field": "value"}
46
+ \`\`\`
47
+ WRONG: Based on the data, here's the result: {"field": "value"}
48
+ CORRECT: {"field": "value"}
49
+
50
+ Your entire response = raw JSON object. Nothing else.`;
43
51
  /**
44
52
  * Get default configuration values for conversation memory
45
53
  * Reads environment variables when called (not at module load time)
@@ -85,6 +85,21 @@ export declare abstract class BaseProvider implements AIProvider {
85
85
  /**
86
86
  * Text generation method - implements AIProvider interface
87
87
  * Tools are always available unless explicitly disabled
88
+ *
89
+ * Supports Text-to-Speech (TTS) audio generation in two modes:
90
+ * 1. Direct synthesis (default): TTS synthesizes the input text without AI generation
91
+ * 2. AI response synthesis: TTS synthesizes the AI-generated response after generation
92
+ *
93
+ * When TTS is enabled with useAiResponse=false (default), the method returns early with
94
+ * only the audio result, skipping AI generation entirely for optimal performance.
95
+ *
96
+ * When TTS is enabled with useAiResponse=true, the method performs full AI generation
97
+ * and then synthesizes the AI response to audio.
98
+ *
99
+ * @param optionsOrPrompt - Generation options or prompt string
100
+ * @param _analysisSchema - Optional analysis schema (not used)
101
+ * @returns Enhanced result with optional audio field containing TTSResult
102
+ *
88
103
  * IMPLEMENTATION NOTE: Uses streamText() under the hood and accumulates results
89
104
  * for consistency and better performance
90
105
  */
@@ -13,6 +13,7 @@ import { GenerationHandler } from "./modules/GenerationHandler.js";
13
13
  import { TelemetryHandler } from "./modules/TelemetryHandler.js";
14
14
  import { Utilities } from "./modules/Utilities.js";
15
15
  import { ToolsManager } from "./modules/ToolsManager.js";
16
+ import { TTSProcessor } from "../utils/ttsProcessor.js";
16
17
  /**
17
18
  * Abstract base class for all AI providers
18
19
  * Tools are integrated as first-class citizens - always available by default
@@ -298,6 +299,21 @@ export class BaseProvider {
298
299
  /**
299
300
  * Text generation method - implements AIProvider interface
300
301
  * Tools are always available unless explicitly disabled
302
+ *
303
+ * Supports Text-to-Speech (TTS) audio generation in two modes:
304
+ * 1. Direct synthesis (default): TTS synthesizes the input text without AI generation
305
+ * 2. AI response synthesis: TTS synthesizes the AI-generated response after generation
306
+ *
307
+ * When TTS is enabled with useAiResponse=false (default), the method returns early with
308
+ * only the audio result, skipping AI generation entirely for optimal performance.
309
+ *
310
+ * When TTS is enabled with useAiResponse=true, the method performs full AI generation
311
+ * and then synthesizes the AI response to audio.
312
+ *
313
+ * @param optionsOrPrompt - Generation options or prompt string
314
+ * @param _analysisSchema - Optional analysis schema (not used)
315
+ * @returns Enhanced result with optional audio field containing TTSResult
316
+ *
301
317
  * IMPLEMENTATION NOTE: Uses streamText() under the hood and accumulates results
302
318
  * for consistency and better performance
303
319
  */
@@ -306,6 +322,30 @@ export class BaseProvider {
306
322
  this.validateOptions(options);
307
323
  const startTime = Date.now();
308
324
  try {
325
+ // ===== TTS MODE 1: Direct Input Synthesis (useAiResponse=false) =====
326
+ // Synthesize input text directly without AI generation
327
+ // This is optimal for simple read-aloud scenarios
328
+ if (options.tts?.enabled && !options.tts?.useAiResponse) {
329
+ const textToSynthesize = options.prompt ?? options.input?.text ?? "";
330
+ // Build base result structure - common to both paths
331
+ const baseResult = {
332
+ content: textToSynthesize,
333
+ provider: options.provider ?? this.providerName,
334
+ model: this.modelName,
335
+ usage: { input: 0, output: 0, total: 0 },
336
+ };
337
+ try {
338
+ const ttsResult = await TTSProcessor.synthesize(textToSynthesize, options.provider ?? this.providerName, options.tts);
339
+ baseResult.audio = ttsResult;
340
+ }
341
+ catch (ttsError) {
342
+ logger.error(`TTS synthesis failed in Mode 1 (direct input synthesis):`, ttsError);
343
+ // baseResult remains without audio - graceful degradation
344
+ }
345
+ // Call enhanceResult for consistency - enables analytics/evaluation for TTS-only requests
346
+ return await this.enhanceResult(baseResult, options, startTime);
347
+ }
348
+ // ===== Normal AI Generation Flow =====
309
349
  const { tools, model } = await this.prepareGenerationContext(options);
310
350
  const messages = await this.buildMessages(options);
311
351
  const generateResult = await this.executeGeneration(model, messages, tools, options);
@@ -314,7 +354,44 @@ export class BaseProvider {
314
354
  const responseTime = Date.now() - startTime;
315
355
  await this.recordPerformanceMetrics(generateResult.usage, responseTime);
316
356
  const { toolsUsed, toolExecutions } = this.extractToolInformation(generateResult);
317
- const enhancedResult = this.formatEnhancedResult(generateResult, tools, toolsUsed, toolExecutions, options);
357
+ let enhancedResult = this.formatEnhancedResult(generateResult, tools, toolsUsed, toolExecutions, options);
358
+ // ===== TTS MODE 2: AI Response Synthesis (useAiResponse=true) =====
359
+ // Synthesize AI-generated response after generation completes
360
+ if (options.tts?.enabled && options.tts?.useAiResponse) {
361
+ const aiResponse = enhancedResult.content;
362
+ const provider = options.provider ?? this.providerName;
363
+ // Validate AI response and provider before synthesis
364
+ if (aiResponse && provider) {
365
+ try {
366
+ const ttsResult = await TTSProcessor.synthesize(aiResponse, provider, options.tts);
367
+ // Add audio to enhanced result (TTSProcessor already includes latency in metadata)
368
+ enhancedResult = {
369
+ ...enhancedResult,
370
+ audio: ttsResult,
371
+ };
372
+ }
373
+ catch (ttsError) {
374
+ // Log TTS error but continue with text-only result
375
+ logger.error(`TTS synthesis failed in Mode 2 (AI response synthesis):`, ttsError);
376
+ // enhancedResult remains unchanged (no audio field added)
377
+ }
378
+ }
379
+ else {
380
+ logger.warn(`TTS synthesis skipped despite being enabled`, {
381
+ provider: this.providerName,
382
+ hasAiResponse: !!aiResponse,
383
+ aiResponseLength: aiResponse?.length ?? 0,
384
+ hasProvider: !!provider,
385
+ ttsConfig: {
386
+ enabled: options.tts?.enabled,
387
+ useAiResponse: options.tts?.useAiResponse,
388
+ },
389
+ reason: !aiResponse
390
+ ? "AI response is empty or undefined"
391
+ : "Provider is missing",
392
+ });
393
+ }
394
+ }
318
395
  return await this.enhanceResult(enhancedResult, options, startTime);
319
396
  }
320
397
  catch (error) {
@@ -361,6 +438,7 @@ export class BaseProvider {
361
438
  enhancedWithTools: !!(result.toolsUsed && result.toolsUsed.length > 0),
362
439
  analytics: result.analytics,
363
440
  evaluation: result.evaluation,
441
+ audio: result.audio,
364
442
  };
365
443
  }
366
444
  /**
@@ -29,6 +29,11 @@ export declare class GenerationHandler {
29
29
  functionId?: string;
30
30
  metadata?: Record<string, string | number | boolean>;
31
31
  } | undefined, handleToolStorageFn: (toolCalls: unknown[], toolResults: unknown[], options: TextGenerationOptions, timestamp: Date) => Promise<void>);
32
+ /**
33
+ * Helper method to call generateText with optional structured output
34
+ * @private
35
+ */
36
+ private callGenerateText;
32
37
  /**
33
38
  * Execute the generation with AI SDK
34
39
  */
@@ -12,7 +12,7 @@
12
12
  *
13
13
  * @module core/modules/GenerationHandler
14
14
  */
15
- import { generateText, Output } from "ai";
15
+ import { generateText, Output, NoObjectGeneratedError } from "ai";
16
16
  import { logger } from "../../utils/logger.js";
17
17
  import { DEFAULT_MAX_STEPS } from "../constants.js";
18
18
  /**
@@ -32,11 +32,12 @@ export class GenerationHandler {
32
32
  this.handleToolStorageFn = handleToolStorageFn;
33
33
  }
34
34
  /**
35
- * Execute the generation with AI SDK
35
+ * Helper method to call generateText with optional structured output
36
+ * @private
36
37
  */
37
- async executeGeneration(model, messages, tools, options) {
38
- const shouldUseTools = !options.disableTools && this.supportsToolsFn();
39
- const useStructuredOutput = !!options.schema &&
38
+ async callGenerateText(model, messages, tools, options, shouldUseTools, includeStructuredOutput) {
39
+ const useStructuredOutput = includeStructuredOutput &&
40
+ !!options.schema &&
40
41
  (options.output?.format === "json" ||
41
42
  options.output?.format === "structured");
42
43
  return await generateText({
@@ -64,6 +65,34 @@ export class GenerationHandler {
64
65
  },
65
66
  });
66
67
  }
68
+ /**
69
+ * Execute the generation with AI SDK
70
+ */
71
+ async executeGeneration(model, messages, tools, options) {
72
+ const shouldUseTools = !options.disableTools && this.supportsToolsFn();
73
+ const useStructuredOutput = !!options.schema &&
74
+ (options.output?.format === "json" ||
75
+ options.output?.format === "structured");
76
+ try {
77
+ return await this.callGenerateText(model, messages, tools, options, shouldUseTools, true);
78
+ }
79
+ catch (error) {
80
+ // If NoObjectGeneratedError is thrown when using schema + tools together,
81
+ // fall back to generating without experimental_output and extract JSON manually
82
+ if (error instanceof NoObjectGeneratedError && useStructuredOutput) {
83
+ logger.debug("[GenerationHandler] NoObjectGeneratedError caught - falling back to manual JSON extraction", {
84
+ provider: this.providerName,
85
+ model: this.modelName,
86
+ error: error.message,
87
+ });
88
+ // Retry without experimental_output - the formatEnhancedResult method
89
+ // will extract JSON from the text response
90
+ return await this.callGenerateText(model, messages, tools, options, shouldUseTools, false);
91
+ }
92
+ // Re-throw other errors
93
+ throw error;
94
+ }
95
+ }
67
96
  /**
68
97
  * Log generation completion information
69
98
  */
@@ -164,11 +193,29 @@ export class GenerationHandler {
164
193
  options.output?.format === "structured");
165
194
  let content;
166
195
  if (useStructuredOutput) {
167
- if (generateResult.experimental_output !== undefined) {
168
- content = JSON.stringify(generateResult.experimental_output);
196
+ try {
197
+ const experimentalOutput = generateResult.experimental_output;
198
+ if (experimentalOutput !== undefined) {
199
+ content = JSON.stringify(experimentalOutput);
200
+ }
201
+ else {
202
+ // Fall back to text parsing
203
+ const rawText = generateResult.text || "";
204
+ const strippedText = rawText
205
+ .replace(/^```(?:json)?\s*\n?/i, "")
206
+ .replace(/\n?```\s*$/i, "")
207
+ .trim();
208
+ content = strippedText;
209
+ }
169
210
  }
170
- else {
171
- logger.debug("[GenerationHandler] experimental_output not available, falling back to text parsing");
211
+ catch (outputError) {
212
+ // experimental_output is a getter that can throw NoObjectGeneratedError
213
+ // Fall back to text parsing when structured output fails
214
+ logger.debug("[GenerationHandler] experimental_output threw, falling back to text parsing", {
215
+ error: outputError instanceof Error
216
+ ? outputError.message
217
+ : String(outputError),
218
+ });
172
219
  const rawText = generateResult.text || "";
173
220
  const strippedText = rawText
174
221
  .replace(/^```(?:json)?\s*\n?/i, "")
@@ -89,6 +89,24 @@ export class ProviderRegistry {
89
89
  }, process.env.SAGEMAKER_MODEL || "sagemaker-model", ["sagemaker", "aws-sagemaker"]);
90
90
  logger.debug("All providers registered successfully");
91
91
  this.registered = true;
92
+ // ===== TTS HANDLER REGISTRATION =====
93
+ try {
94
+ // Create handler instance and register explicitly
95
+ const { GoogleTTSHandler } = await import("../adapters/tts/googleTTSHandler.js");
96
+ const { TTSProcessor } = await import("../utils/ttsProcessor.js");
97
+ const googleHandler = new GoogleTTSHandler();
98
+ TTSProcessor.registerHandler("google-ai", googleHandler);
99
+ TTSProcessor.registerHandler("vertex", googleHandler);
100
+ logger.debug("TTS handlers registered successfully", {
101
+ providers: ["google-ai", "vertex"],
102
+ });
103
+ }
104
+ catch (ttsError) {
105
+ logger.warn("Failed to register TTS handlers - TTS functionality will be unavailable", {
106
+ error: ttsError instanceof Error ? ttsError.message : String(ttsError),
107
+ });
108
+ // Don't throw - TTS is optional functionality
109
+ }
92
110
  }
93
111
  catch (error) {
94
112
  logger.error("Failed to register providers:", error);
@@ -60,4 +60,16 @@ export declare class ProviderImageAdapter {
60
60
  * Get all vision-capable providers
61
61
  */
62
62
  static getVisionProviders(): string[];
63
+ /**
64
+ * Count total "images" in a message (actual images + PDF pages)
65
+ * PDF pages count toward image limits for providers
66
+ */
67
+ static countImagesInMessage(images: Array<Buffer | string>, pdfPages?: number | null): number;
68
+ /**
69
+ * Extract page count from PDF metadata array
70
+ * Returns total pages across all PDFs
71
+ */
72
+ static countImagesInPages(pdfMetadataArray: Array<{
73
+ pageCount?: number | null;
74
+ }> | undefined): number;
63
75
  }
@@ -416,13 +416,19 @@ export class ProviderImageAdapter {
416
416
  adaptedPayload = this.formatForOpenAI(text, images);
417
417
  break;
418
418
  case "litellm":
419
- adaptedPayload = this.formatForOpenAI(text, images);
419
+ // LiteLLM uses same format as OpenAI but validate with litellm provider name
420
+ this.validateImageCount(images.length, "litellm");
421
+ adaptedPayload = this.formatForOpenAI(text, images, true);
420
422
  break;
421
423
  case "mistral":
422
- adaptedPayload = this.formatForOpenAI(text, images);
424
+ // Mistral uses same format as OpenAI but validate with mistral provider name
425
+ this.validateImageCount(images.length, "mistral");
426
+ adaptedPayload = this.formatForOpenAI(text, images, true);
423
427
  break;
424
428
  case "bedrock":
425
- adaptedPayload = this.formatForAnthropic(text, images);
429
+ // Bedrock uses same format as Anthropic but validate with bedrock provider name
430
+ this.validateImageCount(images.length, "bedrock");
431
+ adaptedPayload = this.formatForAnthropic(text, images, true);
426
432
  break;
427
433
  default:
428
434
  throw new Error(`Vision not supported for provider: ${provider}`);
@@ -666,5 +672,26 @@ export class ProviderImageAdapter {
666
672
  static getVisionProviders() {
667
673
  return Object.keys(VISION_CAPABILITIES);
668
674
  }
675
+ /**
676
+ * Count total "images" in a message (actual images + PDF pages)
677
+ * PDF pages count toward image limits for providers
678
+ */
679
+ static countImagesInMessage(images, pdfPages) {
680
+ const imageCount = images?.length || 0;
681
+ const pageCount = pdfPages ?? 0;
682
+ return imageCount + pageCount;
683
+ }
684
+ /**
685
+ * Extract page count from PDF metadata array
686
+ * Returns total pages across all PDFs
687
+ */
688
+ static countImagesInPages(pdfMetadataArray) {
689
+ if (!pdfMetadataArray || pdfMetadataArray.length === 0) {
690
+ return 0;
691
+ }
692
+ return pdfMetadataArray.reduce((total, pdf) => {
693
+ return total + (pdf.pageCount ?? 0);
694
+ }, 0);
695
+ }
669
696
  }
670
697
  //# sourceMappingURL=providerImageAdapter.js.map
@@ -24,8 +24,9 @@ export declare const CONVERSATION_INSTRUCTIONS = "\n\nIMPORTANT: You are continu
24
24
  * Structured output instructions for JSON/structured output mode
25
25
  * Used to ensure AI providers output only valid JSON without conversational filler
26
26
  * This addresses the issue where models add text like "Excellent!" before JSON output
27
+ * and the case where tools are used but final output must still be pure JSON
27
28
  */
28
- export declare const STRUCTURED_OUTPUT_INSTRUCTIONS = "\n\nSTRUCTURED OUTPUT REQUIREMENT:\nYou MUST respond with ONLY a valid JSON object that matches the provided schema.\n- Do NOT include any text before the JSON (no greetings, acknowledgments, or preamble like \"Excellent!\", \"Sure!\", \"Here is the result:\", etc.)\n- Do NOT include any text after the JSON (no explanations, summaries, or follow-up comments)\n- Do NOT wrap the JSON in markdown code blocks\n- Output ONLY the raw JSON object, starting with { and ending with }\n- Ensure the JSON is valid and parseable";
29
+ export declare const STRUCTURED_OUTPUT_INSTRUCTIONS = "\nOutput ONLY valid JSON. No markdown, text, or decorations\u2014ever.\n\nFORBIDDEN: markdown code blocks, text before/after JSON, explanations, preambles, summaries, conversational text about tools.\n\nREQUIRED: response starts with { and ends with }, valid JSON only, no additional characters.\n\nIF YOU CALLED TOOLS: Incorporate data directly into the JSON structure. Do NOT explain what you did.\n\nWRONG: ```json\n{\"field\": \"value\"}\n```\nWRONG: Based on the data, here's the result: {\"field\": \"value\"}\nCORRECT: {\"field\": \"value\"}\n\nYour entire response = raw JSON object. Nothing else.";
29
30
  /**
30
31
  * Get default configuration values for conversation memory
31
32
  * Reads environment variables when called (not at module load time)
@@ -30,16 +30,24 @@ Always reference and build upon this conversation history when relevant. If the
30
30
  * Structured output instructions for JSON/structured output mode
31
31
  * Used to ensure AI providers output only valid JSON without conversational filler
32
32
  * This addresses the issue where models add text like "Excellent!" before JSON output
33
+ * and the case where tools are used but final output must still be pure JSON
33
34
  */
34
35
  export const STRUCTURED_OUTPUT_INSTRUCTIONS = `
36
+ Output ONLY valid JSON. No markdown, text, or decorations—ever.
35
37
 
36
- STRUCTURED OUTPUT REQUIREMENT:
37
- You MUST respond with ONLY a valid JSON object that matches the provided schema.
38
- - Do NOT include any text before the JSON (no greetings, acknowledgments, or preamble like "Excellent!", "Sure!", "Here is the result:", etc.)
39
- - Do NOT include any text after the JSON (no explanations, summaries, or follow-up comments)
40
- - Do NOT wrap the JSON in markdown code blocks
41
- - Output ONLY the raw JSON object, starting with { and ending with }
42
- - Ensure the JSON is valid and parseable`;
38
+ FORBIDDEN: markdown code blocks, text before/after JSON, explanations, preambles, summaries, conversational text about tools.
39
+
40
+ REQUIRED: response starts with { and ends with }, valid JSON only, no additional characters.
41
+
42
+ IF YOU CALLED TOOLS: Incorporate data directly into the JSON structure. Do NOT explain what you did.
43
+
44
+ WRONG: \`\`\`json
45
+ {"field": "value"}
46
+ \`\`\`
47
+ WRONG: Based on the data, here's the result: {"field": "value"}
48
+ CORRECT: {"field": "value"}
49
+
50
+ Your entire response = raw JSON object. Nothing else.`;
43
51
  /**
44
52
  * Get default configuration values for conversation memory
45
53
  * Reads environment variables when called (not at module load time)
@@ -85,6 +85,21 @@ export declare abstract class BaseProvider implements AIProvider {
85
85
  /**
86
86
  * Text generation method - implements AIProvider interface
87
87
  * Tools are always available unless explicitly disabled
88
+ *
89
+ * Supports Text-to-Speech (TTS) audio generation in two modes:
90
+ * 1. Direct synthesis (default): TTS synthesizes the input text without AI generation
91
+ * 2. AI response synthesis: TTS synthesizes the AI-generated response after generation
92
+ *
93
+ * When TTS is enabled with useAiResponse=false (default), the method returns early with
94
+ * only the audio result, skipping AI generation entirely for optimal performance.
95
+ *
96
+ * When TTS is enabled with useAiResponse=true, the method performs full AI generation
97
+ * and then synthesizes the AI response to audio.
98
+ *
99
+ * @param optionsOrPrompt - Generation options or prompt string
100
+ * @param _analysisSchema - Optional analysis schema (not used)
101
+ * @returns Enhanced result with optional audio field containing TTSResult
102
+ *
88
103
  * IMPLEMENTATION NOTE: Uses streamText() under the hood and accumulates results
89
104
  * for consistency and better performance
90
105
  */