neural-ai-sdk 0.1.1 → 0.1.2

package/README.md CHANGED
@@ -116,6 +116,101 @@ const deepseekModel = NeuralAI.createModel(AIProvider.DEEPSEEK, {
  });
  ```
 
+ ### Using Multimodal Capabilities
+
+ The SDK supports multimodal capabilities for providers with vision-capable models. You can pass images to any model - the SDK will attempt to process them appropriately and provide helpful error messages if the model doesn't support vision inputs.
+
+ #### Simple Image + Text Example
+
+ ```typescript
+ import { NeuralAI, AIProvider } from "neural-ai-sdk";
+
+ // Create an OpenAI model with vision capabilities
+ const openaiModel = NeuralAI.createModel(AIProvider.OPENAI, {
+   model: "gpt-4o", // Model that supports vision
+ });
+
+ // Process an image with a text prompt
+ async function analyzeImage() {
+   const response = await openaiModel.generate({
+     prompt: "What's in this image? Please describe it in detail.",
+     // The image can be a URL, local file path, or Buffer
+     image: "https://example.com/image.jpg",
+   });
+
+   console.log(response.text);
+ }
+
+ analyzeImage();
+ ```
+
+ #### Using Multiple Images
+
+ For more complex scenarios with multiple images or mixed content:
+
+ ```typescript
+ import { NeuralAI, AIProvider } from "neural-ai-sdk";
+
+ // Create a Google model with multimodal support
+ const googleModel = NeuralAI.createModel(AIProvider.GOOGLE, {
+   model: "gemini-2.0-flash",
+ });
+
+ async function compareImages() {
+   const response = await googleModel.generate({
+     prompt: "Compare these two images and tell me the differences:",
+     content: [
+       {
+         type: "image",
+         source: "https://example.com/image1.jpg",
+       },
+       {
+         type: "text",
+         text: "This is the first image.",
+       },
+       {
+         type: "image",
+         source: "https://example.com/image2.jpg",
+       },
+       {
+         type: "text",
+         text: "This is the second image.",
+       },
+     ],
+   });
+
+   console.log(response.text);
+ }
+
+ compareImages();
+ ```
+
+ #### Supported Image Sources
+
+ The SDK handles various image sources:
+
+ - **URLs**: `"https://example.com/image.jpg"`
+ - **Local file paths**: `"/path/to/local/image.jpg"`
+ - **Buffers**: Direct image data as a Buffer object
+
+ The SDK automatically handles:
+
+ - Base64 encoding
+ - MIME type detection
+ - Image formatting for each provider's API
+
+ #### Multimodal Support Across Providers
+
+ All providers can attempt to process images - the SDK will automatically handle errors gracefully if a specific model doesn't support multimodal inputs.
+
+ | Provider    | Common Vision-Capable Models                         |
+ | ----------- | ---------------------------------------------------- |
+ | OpenAI      | gpt-4o, gpt-4-vision                                 |
+ | Google      | gemini-2.0-flash                                     |
+ | Ollama      | llama-3.2-vision, llama3-vision, bakllava, llava     |
+ | HuggingFace | llava, cogvlm, idefics, instructblip                 |
+ | DeepSeek    | (Check provider documentation for supported models)  |
+
  ## Environment Configuration
 
  You can set up environment variables by:
@@ -196,6 +291,31 @@ console.log(`Completion tokens: ${response.usage?.completionTokens}`);
  console.log(`Total tokens: ${response.usage?.totalTokens}`);
  ```
 
+ ### Multimodal Streaming
+
+ You can also stream responses from multimodal prompts:
+
+ ```typescript
+ import { NeuralAI, AIProvider } from "neural-ai-sdk";
+
+ const model = NeuralAI.createModel(AIProvider.OPENAI, {
+   model: "gpt-4o",
+ });
+
+ async function streamImageAnalysis() {
+   const stream = model.stream({
+     prompt: "Describe this image in detail:",
+     image: "https://example.com/image.jpg",
+   });
+
+   for await (const chunk of stream) {
+     process.stdout.write(chunk);
+   }
+ }
+
+ streamImageAnalysis();
+ ```
+
  ## License
 
  MIT
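
The README examples above pass image URLs only. As a complement, here is a minimal sketch of the same `generate` call using the other two documented sources, a local file path and a `Buffer`; the file paths are placeholders.

```typescript
import fs from "fs";
import { NeuralAI, AIProvider } from "neural-ai-sdk";

const visionModel = NeuralAI.createModel(AIProvider.OPENAI, {
  model: "gpt-4o",
});

async function analyzeLocalImages() {
  // Local file path: the SDK reads the file and base64-encodes it
  const fromPath = await visionModel.generate({
    prompt: "Describe this image.",
    image: "./photos/cat.jpg", // placeholder path
  });
  console.log(fromPath.text);

  // Buffer: raw image bytes already in memory (MIME type defaults to JPEG)
  const imageBuffer = fs.readFileSync("./photos/dog.jpg"); // placeholder path
  const fromBuffer = await visionModel.generate({
    prompt: "Describe this image.",
    image: imageBuffer,
  });
  console.log(fromBuffer.text);
}

analyzeLocalImages();
```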
@@ -6,5 +6,8 @@ export declare class GoogleModel extends BaseModel {
  constructor(config: AIModelConfig);
  generate(request: AIModelRequest): Promise<AIModelResponse>;
  stream(request: AIModelRequest): AsyncGenerator<string, void, unknown>;
- private formatPrompt;
+ /**
+ * Format content for Google's Gemini API, handling both text and images
+ */
+ private formatMultiModalContent;
  }
@@ -5,6 +5,7 @@ const generative_ai_1 = require("@google/generative-ai");
  const types_1 = require("../types");
  const base_model_1 = require("./base-model");
  const utils_1 = require("../utils");
+ const image_utils_1 = require("../utils/image-utils");
  class GoogleModel extends base_model_1.BaseModel {
  constructor(config) {
  super(config);
@@ -22,8 +23,8 @@ class GoogleModel extends base_model_1.BaseModel {
  topP: config.topP,
  },
  });
- const prompt = this.formatPrompt(request);
- const result = await model.generateContent(prompt);
+ const content = await this.formatMultiModalContent(request);
+ const result = await model.generateContent(content);
  const response = result.response;
  return {
  text: response.text(),
@@ -40,8 +41,8 @@ class GoogleModel extends base_model_1.BaseModel {
  topP: config.topP,
  },
  });
- const prompt = this.formatPrompt(request);
- const result = await model.generateContentStream(prompt);
+ const content = await this.formatMultiModalContent(request);
+ const result = await model.generateContentStream(content);
  for await (const chunk of result.stream) {
  const text = chunk.text();
  if (text) {
@@ -49,12 +50,47 @@ class GoogleModel extends base_model_1.BaseModel {
  }
  }
  }
- formatPrompt(request) {
+ /**
+ * Format content for Google's Gemini API, handling both text and images
+ */
+ async formatMultiModalContent(request) {
  const parts = [];
+ // Add system prompt if provided
  if (request.systemPrompt) {
- parts.push(request.systemPrompt);
+ parts.push({ text: request.systemPrompt });
+ }
+ // Add main prompt text
+ if (request.prompt) {
+ parts.push({ text: request.prompt });
+ }
+ // Process structured content array if provided
+ if (request.content) {
+ for (const item of request.content) {
+ if (item.type === "text") {
+ parts.push({ text: item.text });
+ }
+ else if (item.type === "image") {
+ // Process image and add to parts
+ const { base64, mimeType } = await (0, image_utils_1.processImage)(item.source);
+ parts.push({
+ inlineData: {
+ data: base64,
+ mimeType: mimeType,
+ },
+ });
+ }
+ }
+ }
+ // Process single image if provided via convenience property
+ if (request.image) {
+ const { base64, mimeType } = await (0, image_utils_1.processImage)(request.image);
+ parts.push({
+ inlineData: {
+ data: base64,
+ mimeType: mimeType,
+ },
+ });
  }
- parts.push(request.prompt);
  return parts;
  }
  }
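
For reference, the `parts` array that the new `formatMultiModalContent` method assembles follows Gemini's `inlineData` convention. A hypothetical result for a request carrying a system prompt, a text prompt, and one PNG image (base64 abbreviated):

```typescript
// Hypothetical output of formatMultiModalContent for:
// { systemPrompt: "Be concise.", prompt: "What is shown here?", image: "./chart.png" }
const parts = [
  { text: "Be concise." },         // system prompt, when provided
  { text: "What is shown here?" }, // main prompt
  {
    inlineData: {
      data: "iVBORw0KGgoAAAANSUhEUg...", // base64-encoded image, abbreviated
      mimeType: "image/png",             // detected from the .png extension
    },
  },
];
```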
@@ -5,5 +5,33 @@ export declare class HuggingFaceModel extends BaseModel {
  private baseURL;
  constructor(config: AIModelConfig);
  generate(request: AIModelRequest): Promise<AIModelResponse>;
+ /**
+ * Generate a text-only response
+ */
+ private generateTextOnly;
+ /**
+ * Generate a response using multimodal inputs (text + images)
+ */
+ private generateWithImages;
+ /**
+ * Try generating with nested inputs format (common in newer models)
+ */
+ private generateWithNestedFormat;
+ /**
+ * Try generating with flat inputs format (common in some models)
+ */
+ private generateWithFlatFormat;
+ /**
+ * Helper to parse HuggingFace response in various formats
+ */
+ private parseResponse;
+ /**
+ * Fallback method that uses multipart/form-data for older HuggingFace models
+ */
+ private generateWithMultipartForm;
  stream(request: AIModelRequest): AsyncGenerator<string, void, unknown>;
+ /**
+ * Convert a base64 string to a Blob object
+ */
+ private base64ToBlob;
  }
@@ -8,6 +8,7 @@ const axios_1 = __importDefault(require("axios"));
  const types_1 = require("../types");
  const base_model_1 = require("./base-model");
  const utils_1 = require("../utils");
+ const image_utils_1 = require("../utils/image-utils");
  class HuggingFaceModel extends base_model_1.BaseModel {
  constructor(config) {
  super(config);
@@ -18,6 +19,46 @@ class HuggingFaceModel extends base_model_1.BaseModel {
  async generate(request) {
  const config = this.mergeConfig(request.options);
  const model = config.model || "meta-llama/Llama-2-7b-chat-hf";
+ try {
+ // Try multimodal approach if images are present
+ if (request.image ||
+ (request.content &&
+ request.content.some((item) => item.type === "image"))) {
+ return await this.generateWithImages(request, config, model);
+ }
+ else {
+ return await this.generateTextOnly(request, config, model);
+ }
+ }
+ catch (error) {
+ // Enhance error messages for multimodal related errors
+ if ((request.image || request.content) &&
+ (error.response?.data?.includes("Content-Type") ||
+ error.response?.status === 415 ||
+ error.response?.data?.error?.includes("image") ||
+ error.message?.includes("multimodal") ||
+ error.message?.toLowerCase().includes("vision") ||
+ error.message?.toLowerCase().includes("unsupported"))) {
+ let errorMessage = `Model "${model}" doesn't appear to support multimodal inputs properly.`;
+ // Add more specific guidance based on error details
+ if (error.response?.status === 415) {
+ errorMessage +=
+ " The model may require a different format for image inputs.";
+ }
+ // Include original error message for debugging
+ errorMessage += ` Original error: ${error.response?.data || error.message}`;
+ // Suggest known working models
+ errorMessage +=
+ " Try a different vision-capable model like 'llava-hf/llava-1.5-7b-hf' or check HuggingFace's documentation for this specific model.";
+ throw new Error(errorMessage);
+ }
+ throw error;
+ }
+ }
+ /**
+ * Generate a text-only response
+ */
+ async generateTextOnly(request, config, model) {
  let fullPrompt = request.prompt;
  if (request.systemPrompt) {
  fullPrompt = `${request.systemPrompt}\n\n${fullPrompt}`;
@@ -54,19 +95,251 @@ class HuggingFaceModel extends base_model_1.BaseModel {
  raw: response.data,
  };
  }
+ /**
+ * Generate a response using multimodal inputs (text + images)
+ */
+ async generateWithImages(request, config, model) {
+ // Some HF models expect different input formats, try various formats one by one
+ const errors = [];
+ // Format 1: Nested inputs object with text and image
+ try {
+ return await this.generateWithNestedFormat(request, config, model);
+ }
+ catch (error) {
+ errors.push(error);
+ }
+ // Format 2: Plain inputs with string prompt and image in the main object
+ try {
+ return await this.generateWithFlatFormat(request, config, model);
+ }
+ catch (error) {
+ errors.push(error);
+ }
+ // Format 3: Try multipart form data as last resort
+ try {
+ return await this.generateWithMultipartForm(request, config, model);
+ }
+ catch (error) {
+ errors.push(error);
+ }
+ // If we get here, all formats failed, throw an enhanced error
+ const errorMessage = `Model "${model}" doesn't appear to support multimodal inputs in any of the attempted formats. Try a different vision-capable model like 'llava-hf/llava-1.5-7b-hf'. Errors: ${errors
+ .map((e) => e.message || e)
+ .join("; ")}`;
+ throw new Error(errorMessage);
+ }
+ /**
+ * Try generating with nested inputs format (common in newer models)
+ */
+ async generateWithNestedFormat(request, config, model) {
+ const prompt = request.systemPrompt
+ ? `${request.systemPrompt}\n\n${request.prompt}`
+ : request.prompt;
+ let payload = {
+ inputs: {
+ text: prompt,
+ },
+ parameters: {
+ temperature: config.temperature,
+ max_new_tokens: config.maxTokens,
+ top_p: config.topP,
+ return_full_text: false,
+ },
+ };
+ // Process the convenience 'image' property
+ if (request.image) {
+ const { base64 } = await (0, image_utils_1.processImage)(request.image);
+ payload.inputs.image = base64;
+ }
+ // Process content array if provided
+ if (request.content) {
+ // Initialize images array if multiple images
+ const hasMultipleImages = request.content.filter((item) => item.type === "image").length > 1;
+ if (hasMultipleImages) {
+ payload.inputs.images = [];
+ }
+ for (const item of request.content) {
+ if (item.type === "image") {
+ const { base64 } = await (0, image_utils_1.processImage)(item.source);
+ if (hasMultipleImages) {
+ payload.inputs.images.push(base64);
+ }
+ else {
+ payload.inputs.image = base64;
+ }
+ }
+ // Text content is already included in the prompt
+ }
+ }
+ const response = await axios_1.default.post(`${this.baseURL}/${model}`, payload, {
+ headers: {
+ Authorization: `Bearer ${config.apiKey ||
+ (0, utils_1.getApiKey)(config.apiKey, "HUGGINGFACE_API_KEY", "HuggingFace")}`,
+ "Content-Type": "application/json",
+ },
+ });
+ // Parse response
+ return this.parseResponse(response);
+ }
+ /**
+ * Try generating with flat inputs format (common in some models)
+ */
+ async generateWithFlatFormat(request, config, model) {
+ const prompt = request.systemPrompt
+ ? `${request.systemPrompt}\n\n${request.prompt}`
+ : request.prompt;
+ // Some models expect a flat structure with inputs as a string
+ let payload = {
+ inputs: prompt,
+ parameters: {
+ temperature: config.temperature,
+ max_new_tokens: config.maxTokens,
+ top_p: config.topP,
+ return_full_text: false,
+ },
+ };
+ // For single image, add it directly to the payload
+ if (request.image) {
+ const { base64 } = await (0, image_utils_1.processImage)(request.image);
+ payload.image = base64; // At top level, not in inputs
+ }
+ // Process only the first image from content if available and no direct image
+ if (!request.image && request.content) {
+ const imageContent = request.content.find((item) => item.type === "image");
+ if (imageContent) {
+ const { base64 } = await (0, image_utils_1.processImage)(imageContent.source);
+ payload.image = base64; // At top level, not in inputs
+ }
+ }
+ const response = await axios_1.default.post(`${this.baseURL}/${model}`, payload, {
+ headers: {
+ Authorization: `Bearer ${config.apiKey ||
+ (0, utils_1.getApiKey)(config.apiKey, "HUGGINGFACE_API_KEY", "HuggingFace")}`,
+ "Content-Type": "application/json",
+ },
+ });
+ // Parse response
+ return this.parseResponse(response);
+ }
+ /**
+ * Helper to parse HuggingFace response in various formats
+ */
+ parseResponse(response) {
+ let text = "";
+ if (Array.isArray(response.data)) {
+ text = response.data[0]?.generated_text || "";
+ }
+ else if (response.data.generated_text) {
+ text = response.data.generated_text;
+ }
+ else if (typeof response.data === "string") {
+ text = response.data;
+ }
+ else {
+ text = JSON.stringify(response.data);
+ }
+ return {
+ text,
+ raw: response.data,
+ };
+ }
+ /**
+ * Fallback method that uses multipart/form-data for older HuggingFace models
+ */
+ async generateWithMultipartForm(request, config, model) {
+ // Create a multipart form-data payload for multimodal models
+ const formData = new FormData();
+ // Add text prompt
+ const prompt = request.systemPrompt
+ ? `${request.systemPrompt}\n\n${request.prompt}`
+ : request.prompt;
+ formData.append("text", prompt);
+ // Process the convenience 'image' property
+ if (request.image) {
+ const { base64 } = await (0, image_utils_1.processImage)(request.image);
+ const imageBlob = this.base64ToBlob(base64);
+ formData.append("image", imageBlob, "image.jpg");
+ }
+ // Process content array if provided
+ if (request.content) {
+ let imageIndex = 0;
+ for (const item of request.content) {
+ if (item.type === "image") {
+ const { base64 } = await (0, image_utils_1.processImage)(item.source);
+ const imageBlob = this.base64ToBlob(base64);
+ formData.append(`image_${imageIndex}`, imageBlob, `image_${imageIndex}.jpg`);
+ imageIndex++;
+ }
+ // Text content is already included in the prompt
+ }
+ }
+ // Add model parameters
+ if (config.temperature) {
+ formData.append("temperature", config.temperature.toString());
+ }
+ if (config.maxTokens) {
+ formData.append("max_new_tokens", config.maxTokens.toString());
+ }
+ if (config.topP) {
+ formData.append("top_p", config.topP.toString());
+ }
+ const response = await axios_1.default.post(`${this.baseURL}/${model}`, formData, {
+ headers: {
+ Authorization: `Bearer ${config.apiKey ||
+ (0, utils_1.getApiKey)(config.apiKey, "HUGGINGFACE_API_KEY", "HuggingFace")}`,
+ "Content-Type": "multipart/form-data",
+ },
+ });
+ // Parse response based on return format
+ let text = "";
+ if (Array.isArray(response.data)) {
+ text = response.data[0]?.generated_text || "";
+ }
+ else if (response.data.generated_text) {
+ text = response.data.generated_text;
+ }
+ else if (typeof response.data === "string") {
+ text = response.data;
+ }
+ else {
+ text = JSON.stringify(response.data);
+ }
+ return {
+ text,
+ raw: response.data,
+ };
+ }
  async *stream(request) {
- // HuggingFace Inference API doesn't natively support streaming for all models
- // We'll implement a basic chunking on top of the non-streaming API
- const response = await this.generate(request);
- // Simple chunking for demonstration purposes
- const chunkSize = 10;
- const text = response.text;
- for (let i = 0; i < text.length; i += chunkSize) {
- const chunk = text.slice(i, i + chunkSize);
- yield chunk;
- // Add a small delay to simulate streaming
- await new Promise((resolve) => setTimeout(resolve, 10));
+ try {
+ // HuggingFace Inference API doesn't natively support streaming for all models
+ // We'll implement a basic chunking on top of the non-streaming API
+ const response = await this.generate(request);
+ // Simple chunking for demonstration purposes
+ const chunkSize = 10;
+ const text = response.text;
+ for (let i = 0; i < text.length; i += chunkSize) {
+ const chunk = text.slice(i, i + chunkSize);
+ yield chunk;
+ // Add a small delay to simulate streaming
+ await new Promise((resolve) => setTimeout(resolve, 10));
+ }
+ }
+ catch (error) {
+ // Rethrow with enhanced error message
+ throw error;
+ }
+ }
+ /**
+ * Convert a base64 string to a Blob object
+ */
+ base64ToBlob(base64) {
+ const byteString = atob(base64);
+ const ab = new ArrayBuffer(byteString.length);
+ const ia = new Uint8Array(ab);
+ for (let i = 0; i < byteString.length; i++) {
+ ia[i] = byteString.charCodeAt(i);
  }
+ return new Blob([ab], { type: "image/jpeg" });
  }
  }
  exports.HuggingFaceModel = HuggingFaceModel;
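
The HuggingFace changes try three request shapes in turn before giving up. A rough sketch of the payload each attempt sends; the parameter values are placeholders, and any given hosted model typically accepts only one of these forms.

```typescript
// 1. Nested format: text and image(s) inside an `inputs` object
const nestedPayload = {
  inputs: { text: "Describe this image.", image: "<base64 image>" },
  parameters: { temperature: 0.7, max_new_tokens: 256, top_p: 0.9, return_full_text: false },
};

// 2. Flat format: `inputs` is the prompt string, the image sits at the top level
const flatPayload = {
  inputs: "Describe this image.",
  image: "<base64 image>",
  parameters: { temperature: 0.7, max_new_tokens: 256, top_p: 0.9, return_full_text: false },
};

// 3. Multipart form-data fallback: text, image blobs, and parameters as separate fields
const formData = new FormData();
formData.append("text", "Describe this image.");
formData.append("image", new Blob(["<image bytes>"], { type: "image/jpeg" }), "image.jpg");
formData.append("max_new_tokens", "256");
```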
@@ -6,4 +6,8 @@ export declare class OllamaModel extends BaseModel {
  constructor(config: AIModelConfig);
  generate(request: AIModelRequest): Promise<AIModelResponse>;
  stream(request: AIModelRequest): AsyncGenerator<string, void, unknown>;
+ /**
+ * Creates the request payload for Ollama, handling multimodal content if provided
+ */
+ private createRequestPayload;
  }
@@ -8,6 +8,7 @@ const axios_1 = __importDefault(require("axios"));
  const types_1 = require("../types");
  const base_model_1 = require("./base-model");
  const utils_1 = require("../utils");
+ const image_utils_1 = require("../utils/image-utils");
  class OllamaModel extends base_model_1.BaseModel {
  constructor(config) {
  super(config);
@@ -16,59 +17,149 @@ class OllamaModel extends base_model_1.BaseModel {
  }
  async generate(request) {
  const config = this.mergeConfig(request.options);
- let prompt = request.prompt;
- // Add system prompt if provided
- if (request.systemPrompt) {
- prompt = `${request.systemPrompt}\n\n${prompt}`;
+ try {
+ const payload = await this.createRequestPayload(request, config);
+ const response = await axios_1.default.post(`${this.baseURL}/generate`, payload);
+ return {
+ text: response.data.response,
+ usage: {
+ promptTokens: response.data.prompt_eval_count,
+ completionTokens: response.data.eval_count,
+ totalTokens: response.data.prompt_eval_count + response.data.eval_count,
+ },
+ raw: response.data,
+ };
+ }
+ catch (error) {
+ // Enhance error message if it appears to be related to multimodal support
+ if (error.response?.status === 400 &&
+ (request.image || request.content) &&
+ (error.response?.data?.error?.includes("image") ||
+ error.response?.data?.error?.includes("multimodal") ||
+ error.response?.data?.error?.includes("vision"))) {
+ throw new Error(`The model "${config.model || "default"}" doesn't support multimodal inputs. Try a vision-capable model like "llama-3.2-vision" or "llava". Original error: ${error.message}`);
+ }
+ throw error;
  }
- const response = await axios_1.default.post(`${this.baseURL}/generate`, {
- model: config.model || "llama2",
- prompt,
- temperature: config.temperature,
- num_predict: config.maxTokens,
- top_p: config.topP,
- });
- return {
- text: response.data.response,
- usage: {
- promptTokens: response.data.prompt_eval_count,
- completionTokens: response.data.eval_count,
- totalTokens: response.data.prompt_eval_count + response.data.eval_count,
- },
- raw: response.data,
- };
  }
  async *stream(request) {
  const config = this.mergeConfig(request.options);
- let prompt = request.prompt;
- if (request.systemPrompt) {
- prompt = `${request.systemPrompt}\n\n${prompt}`;
+ try {
+ const payload = await this.createRequestPayload(request, config, true);
+ const response = await axios_1.default.post(`${this.baseURL}/generate`, payload, {
+ responseType: "stream",
+ });
+ const reader = response.data;
+ for await (const chunk of reader) {
+ const lines = chunk.toString().split("\n").filter(Boolean);
+ for (const line of lines) {
+ try {
+ const parsed = JSON.parse(line);
+ if (parsed.response) {
+ yield parsed.response;
+ }
+ }
+ catch (error) {
+ console.error("Error parsing Ollama stream data:", error);
+ }
+ }
+ }
  }
- const response = await axios_1.default.post(`${this.baseURL}/generate`, {
+ catch (error) {
+ // Enhance error message if it appears to be related to multimodal support
+ if (error.response?.status === 400 &&
+ (request.image || request.content) &&
+ (error.response?.data?.error?.includes("image") ||
+ error.response?.data?.error?.includes("multimodal") ||
+ error.response?.data?.error?.includes("vision"))) {
+ throw new Error(`The model "${config.model || "default"}" doesn't support multimodal inputs. Try a vision-capable model like "llama-3.2-vision" or "llava". Original error: ${error.message}`);
+ }
+ throw error;
+ }
+ }
+ /**
+ * Creates the request payload for Ollama, handling multimodal content if provided
+ */
+ async createRequestPayload(request, config, isStream = false) {
+ // Base payload
+ const payload = {
  model: config.model || "llama2",
- prompt,
  temperature: config.temperature,
  num_predict: config.maxTokens,
  top_p: config.topP,
- stream: true,
- }, {
- responseType: "stream",
- });
- const reader = response.data;
- for await (const chunk of reader) {
- const lines = chunk.toString().split("\n").filter(Boolean);
- for (const line of lines) {
- try {
- const parsed = JSON.parse(line);
- if (parsed.response) {
- yield parsed.response;
+ };
+ // Handle streaming
+ if (isStream) {
+ payload.stream = true;
+ }
+ // If there are any image inputs, use the messages format
+ if (request.image ||
+ (request.content && request.content.some((item) => item.type === "image"))) {
+ // Create a messages array for multimodal models (similar to OpenAI format)
+ const messages = [];
+ // Add system prompt if provided
+ if (request.systemPrompt) {
+ messages.push({
+ role: "system",
+ content: request.systemPrompt,
+ });
+ }
+ // Create a user message with potentially multiple content parts
+ const userMessage = { role: "user", content: [] };
+ // Add the main prompt as text content
+ if (request.prompt) {
+ userMessage.content.push({
+ type: "text",
+ text: request.prompt,
+ });
+ }
+ // Process structured content if available
+ if (request.content) {
+ for (const item of request.content) {
+ if (item.type === "text") {
+ userMessage.content.push({
+ type: "text",
+ text: item.text,
+ });
+ }
+ else if (item.type === "image") {
+ const { base64, mimeType } = await (0, image_utils_1.processImage)(item.source);
+ userMessage.content.push({
+ type: "image",
+ image: {
+ data: base64,
+ mimeType: mimeType,
+ },
+ });
  }
  }
- catch (error) {
- console.error("Error parsing Ollama stream data:", error);
- }
  }
+ // Handle the convenience image property
+ if (request.image) {
+ const { base64, mimeType } = await (0, image_utils_1.processImage)(request.image);
+ userMessage.content.push({
+ type: "image",
+ image: {
+ data: base64,
+ mimeType: mimeType,
+ },
+ });
+ }
+ // Add the user message
+ messages.push(userMessage);
+ // Set the messages in the payload
+ payload.messages = messages;
+ }
+ else {
+ // Traditional text-only format
+ let prompt = request.prompt;
+ // Add system prompt if provided
+ if (request.systemPrompt) {
+ prompt = `${request.systemPrompt}\n\n${prompt}`;
+ }
+ payload.prompt = prompt;
  }
+ return payload;
  }
  }
  exports.OllamaModel = OllamaModel;
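
A condensed sketch of the two payload shapes `createRequestPayload` can produce: the traditional prompt form for text-only requests, and the messages form when images are present. The values shown are placeholders.

```typescript
// Text-only request: classic prompt-style payload
const textPayload = {
  model: "llama2",
  prompt: "You are helpful.\n\nSummarize this article.",
  temperature: 0.7,
  num_predict: 256,
  top_p: 0.9,
};

// Request with an image: messages payload with typed content parts
const multimodalPayload = {
  model: "llama-3.2-vision",
  temperature: 0.7,
  num_predict: 256,
  top_p: 0.9,
  messages: [
    { role: "system", content: "You are helpful." },
    {
      role: "user",
      content: [
        { type: "text", text: "What is in this picture?" },
        { type: "image", image: { data: "<base64 image>", mimeType: "image/jpeg" } },
      ],
    },
  ],
};
```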
@@ -6,4 +6,8 @@ export declare class OpenAIModel extends BaseModel {
  constructor(config: AIModelConfig);
  generate(request: AIModelRequest): Promise<AIModelResponse>;
  stream(request: AIModelRequest): AsyncGenerator<string, void, unknown>;
+ /**
+ * Format messages for OpenAI API, including handling multimodal content
+ */
+ private formatMessages;
  }
@@ -5,6 +5,7 @@ const openai_1 = require("openai");
  const types_1 = require("../types");
  const base_model_1 = require("./base-model");
  const utils_1 = require("../utils");
+ const image_utils_1 = require("../utils/image-utils");
  class OpenAIModel extends base_model_1.BaseModel {
  constructor(config) {
  super(config);
@@ -17,19 +18,8 @@ class OpenAIModel extends base_model_1.BaseModel {
  }
  async generate(request) {
  const config = this.mergeConfig(request.options);
- const messages = [];
- // Add system prompt if provided
- if (request.systemPrompt) {
- messages.push({
- role: "system",
- content: request.systemPrompt,
- });
- }
- // Add user prompt
- messages.push({
- role: "user",
- content: request.prompt,
- });
+ // Process messages for OpenAI API
+ const messages = await this.formatMessages(request);
  const response = await this.client.chat.completions.create({
  model: config.model || "gpt-3.5-turbo",
  messages,
@@ -49,17 +39,8 @@ class OpenAIModel extends base_model_1.BaseModel {
  }
  async *stream(request) {
  const config = this.mergeConfig(request.options);
- const messages = [];
- if (request.systemPrompt) {
- messages.push({
- role: "system",
- content: request.systemPrompt,
- });
- }
- messages.push({
- role: "user",
- content: request.prompt,
- });
+ // Process messages for OpenAI API
+ const messages = await this.formatMessages(request);
  const stream = await this.client.chat.completions.create({
  model: config.model || "gpt-3.5-turbo",
  messages,
@@ -75,5 +56,65 @@ class OpenAIModel extends base_model_1.BaseModel {
  }
  }
  }
+ /**
+ * Format messages for OpenAI API, including handling multimodal content
+ */
+ async formatMessages(request) {
+ const messages = [];
+ // Add system prompt if provided
+ if (request.systemPrompt) {
+ messages.push({
+ role: "system",
+ content: request.systemPrompt,
+ });
+ }
+ // Handle multimodal content
+ if (request.content || request.image) {
+ const content = [];
+ // Add the text prompt
+ if (request.prompt) {
+ content.push({ type: "text", text: request.prompt });
+ }
+ // Add any structured content
+ if (request.content) {
+ for (const item of request.content) {
+ if (item.type === "text") {
+ content.push({ type: "text", text: item.text });
+ }
+ else if (item.type === "image") {
+ const { base64, mimeType } = await (0, image_utils_1.processImage)(item.source);
+ content.push({
+ type: "image_url",
+ image_url: {
+ url: `data:${mimeType};base64,${base64}`,
+ },
+ });
+ }
+ }
+ }
+ // Add single image if provided via the convenience property
+ if (request.image) {
+ const { base64, mimeType } = await (0, image_utils_1.processImage)(request.image);
+ content.push({
+ type: "image_url",
+ image_url: {
+ url: `data:${mimeType};base64,${base64}`,
+ },
+ });
+ }
+ messages.push({
+ role: "user",
+ content,
+ });
+ }
+ else {
+ // Traditional text-only message
+ messages.push({
+ role: "user",
+ content: request.prompt,
+ });
+ }
+ return messages;
+ }
  }
  exports.OpenAIModel = OpenAIModel;
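
On the OpenAI side, `formatMessages` encodes each image as a data-URI `image_url` content part. A hypothetical user message for a text prompt plus one JPEG (base64 abbreviated):

```typescript
const userMessage = {
  role: "user",
  content: [
    { type: "text", text: "What's in this image?" },
    {
      type: "image_url",
      image_url: { url: "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQ..." }, // abbreviated
    },
  ],
};
```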
package/dist/types.d.ts CHANGED
@@ -13,6 +13,16 @@ export declare enum AIProvider {
  OLLAMA = "ollama",
  HUGGINGFACE = "huggingface"
  }
+ export type ContentType = "text" | "image";
+ export interface TextContent {
+ type: "text";
+ text: string;
+ }
+ export interface ImageContent {
+ type: "image";
+ source: string | Buffer;
+ }
+ export type Content = TextContent | ImageContent;
  export interface AIModelResponse {
  text: string;
  usage?: {
@@ -26,6 +36,8 @@ export interface AIModelRequest {
  prompt: string;
  systemPrompt?: string;
  options?: Partial<AIModelConfig>;
+ content?: Content[];
+ image?: string | Buffer;
  }
  export interface AIModel {
  provider: AIProvider;
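
The new `Content` union is discriminated on `type`, so callers can narrow items with a plain switch. A minimal sketch, assuming `Content` is re-exported from the package entry point like the other types; the helper name is illustrative.

```typescript
import { Content } from "neural-ai-sdk";

// Illustrative helper: tally text length and image count in a request's content array
function summarizeContent(content: Content[]): { textChars: number; images: number } {
  let textChars = 0;
  let images = 0;
  for (const item of content) {
    switch (item.type) {
      case "text":
        textChars += item.text.length; // narrowed to TextContent here
        break;
      case "image":
        images += 1; // narrowed to ImageContent (source: string | Buffer)
        break;
    }
  }
  return { textChars, images };
}

console.log(summarizeContent([{ type: "text", text: "hello" }])); // { textChars: 5, images: 0 }
```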
@@ -0,0 +1,26 @@
+ /**
+ * Checks if a string is a valid URL
+ */
+ export declare function isUrl(str: string): boolean;
+ /**
+ * Checks if a string is a valid file path
+ */
+ export declare function isFilePath(str: string): boolean;
+ /**
+ * Converts an image to base64 from various sources
+ * @param source - URL, file path, or Buffer
+ * @returns Promise with base64 encoded image
+ */
+ export declare function imageToBase64(source: string | Buffer): Promise<string>;
+ /**
+ * Determines the MIME type based on file extension
+ * @param filePath - Path to the file or URL
+ */
+ export declare function getMimeType(filePath: string): string;
+ /**
+ * Processes an image source and returns data needed for API requests
+ */
+ export declare function processImage(source: string | Buffer): Promise<{
+ base64: string;
+ mimeType: string;
+ }>;
@@ -0,0 +1,103 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.isUrl = isUrl;
+ exports.isFilePath = isFilePath;
+ exports.imageToBase64 = imageToBase64;
+ exports.getMimeType = getMimeType;
+ exports.processImage = processImage;
+ const fs_1 = __importDefault(require("fs"));
+ const path_1 = __importDefault(require("path"));
+ const axios_1 = __importDefault(require("axios"));
+ /**
+ * Checks if a string is a valid URL
+ */
+ function isUrl(str) {
+ try {
+ const url = new URL(str);
+ return url.protocol === "http:" || url.protocol === "https:";
+ }
+ catch {
+ return false;
+ }
+ }
+ /**
+ * Checks if a string is a valid file path
+ */
+ function isFilePath(str) {
+ try {
+ return fs_1.default.existsSync(str) && fs_1.default.statSync(str).isFile();
+ }
+ catch {
+ return false;
+ }
+ }
+ /**
+ * Converts an image to base64 from various sources
+ * @param source - URL, file path, or Buffer
+ * @returns Promise with base64 encoded image
+ */
+ async function imageToBase64(source) {
+ // If source is already a Buffer
+ if (Buffer.isBuffer(source)) {
+ return source.toString("base64");
+ }
+ // If source is a URL
+ if (isUrl(source)) {
+ try {
+ const response = await axios_1.default.get(source, { responseType: "arraybuffer" });
+ const buffer = Buffer.from(response.data, "binary");
+ return buffer.toString("base64");
+ }
+ catch (error) {
+ throw new Error(`Failed to fetch image from URL: ${error.message}`);
+ }
+ }
+ // If source is a file path
+ if (isFilePath(source)) {
+ try {
+ const buffer = fs_1.default.readFileSync(source);
+ return buffer.toString("base64");
+ }
+ catch (error) {
+ throw new Error(`Failed to read image file: ${error.message}`);
+ }
+ }
+ throw new Error("Invalid image source. Must be URL, file path, or Buffer");
+ }
+ /**
+ * Determines the MIME type based on file extension
+ * @param filePath - Path to the file or URL
+ */
+ function getMimeType(filePath) {
+ if (!filePath)
+ return "image/jpeg"; // Default
+ const ext = path_1.default.extname(filePath).toLowerCase();
+ switch (ext) {
+ case ".jpg":
+ case ".jpeg":
+ return "image/jpeg";
+ case ".png":
+ return "image/png";
+ case ".gif":
+ return "image/gif";
+ case ".webp":
+ return "image/webp";
+ case ".bmp":
+ return "image/bmp";
+ case ".svg":
+ return "image/svg+xml";
+ default:
+ return "image/jpeg"; // Default to JPEG
+ }
+ }
+ /**
+ * Processes an image source and returns data needed for API requests
+ */
+ async function processImage(source) {
+ const base64 = await imageToBase64(source);
+ const mimeType = typeof source === "string" ? getMimeType(source) : "image/jpeg";
+ return { base64, mimeType };
+ }
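
Taken together, the new helpers resolve any supported source to a base64 string plus a MIME type. A minimal usage sketch; the deep import path is inferred from this diff's file layout (the helpers may also be re-exported from the package root), and the file paths are placeholders.

```typescript
import fs from "fs";
// Path inferred from the diff layout; adjust if the package re-exports these helpers
import { processImage, getMimeType, isUrl } from "neural-ai-sdk/dist/utils/image-utils";

async function demo() {
  // URL source: fetched with axios, MIME type guessed from the extension
  const fromUrl = await processImage("https://example.com/image.png");
  console.log(isUrl("https://example.com/image.png"), fromUrl.mimeType); // true "image/png"

  // Local file source: read from disk and base64-encoded
  const fromFile = await processImage("./photos/cat.jpg");
  console.log(fromFile.base64.slice(0, 12), fromFile.mimeType); // "<base64...>" "image/jpeg"

  // Buffer source: no file name available, so the MIME type defaults to "image/jpeg"
  const fromBuffer = await processImage(fs.readFileSync("./photos/cat.jpg"));
  console.log(fromBuffer.mimeType); // "image/jpeg"

  // Extension-based MIME lookup on its own
  console.log(getMimeType("diagram.webp")); // "image/webp"
}

demo();
```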
@@ -0,0 +1 @@
+ export * from "./image-utils";
@@ -0,0 +1,17 @@
+ "use strict";
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+ if (k2 === undefined) k2 = k;
+ var desc = Object.getOwnPropertyDescriptor(m, k);
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+ desc = { enumerable: true, get: function() { return m[k]; } };
+ }
+ Object.defineProperty(o, k2, desc);
+ }) : (function(o, m, k, k2) {
+ if (k2 === undefined) k2 = k;
+ o[k2] = m[k];
+ }));
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ __exportStar(require("./image-utils"), exports);
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "neural-ai-sdk",
- "version": "0.1.1",
+ "version": "0.1.2",
  "description": "Unified SDK for interacting with various AI LLM providers",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",