neural-ai-sdk 0.1.1 → 0.1.2

package/README.md CHANGED
@@ -116,6 +116,101 @@ const deepseekModel = NeuralAI.createModel(AIProvider.DEEPSEEK, {
  });
  ```
 
+ ### Using Multimodal Capabilities
+
+ The SDK supports multimodal capabilities for providers with vision-capable models. You can pass images to any model - the SDK will attempt to process them appropriately and provide helpful error messages if the model doesn't support vision inputs.
+
+ #### Simple Image + Text Example
+
+ ```typescript
+ import { NeuralAI, AIProvider } from "neural-ai-sdk";
+
+ // Create an OpenAI model with vision capabilities
+ const openaiModel = NeuralAI.createModel(AIProvider.OPENAI, {
+   model: "gpt-4o", // Model that supports vision
+ });
+
+ // Process an image with a text prompt
+ async function analyzeImage() {
+   const response = await openaiModel.generate({
+     prompt: "What's in this image? Please describe it in detail.",
+     // The image can be a URL, local file path, or Buffer
+     image: "https://example.com/image.jpg",
+   });
+
+   console.log(response.text);
+ }
+
+ analyzeImage();
+ ```
+
+ #### Using Multiple Images
+
+ For more complex scenarios with multiple images or mixed content:
+
+ ```typescript
+ import { NeuralAI, AIProvider } from "neural-ai-sdk";
+
+ // Create a Google model with multimodal support
+ const googleModel = NeuralAI.createModel(AIProvider.GOOGLE, {
+   model: "gemini-2.0-flash",
+ });
+
+ async function compareImages() {
+   const response = await googleModel.generate({
+     prompt: "Compare these two images and tell me the differences:",
+     content: [
+       {
+         type: "image",
+         source: "https://example.com/image1.jpg",
+       },
+       {
+         type: "text",
+         text: "This is the first image.",
+       },
+       {
+         type: "image",
+         source: "https://example.com/image2.jpg",
+       },
+       {
+         type: "text",
+         text: "This is the second image.",
+       },
+     ],
+   });
+
+   console.log(response.text);
+ }
+
+ compareImages();
+ ```
+
+ #### Supported Image Sources
+
+ The SDK handles various image sources:
+
+ - **URLs**: `"https://example.com/image.jpg"`
+ - **Local file paths**: `"/path/to/local/image.jpg"`
+ - **Buffers**: Direct image data as a Buffer object
+
+ The SDK automatically handles:
+
+ - Base64 encoding
+ - MIME type detection
+ - Image formatting for each provider's API
+
+ #### Multimodal Support Across Providers
+
+ All providers can attempt to process images - the SDK will automatically handle errors gracefully if a specific model doesn't support multimodal inputs.
+
+ | Provider    | Common Vision-Capable Models                         |
+ | ----------- | ---------------------------------------------------- |
+ | OpenAI      | gpt-4o, gpt-4-vision                                 |
+ | Google      | gemini-2.0-flash                                     |
+ | Ollama      | llama-3.2-vision, llama3-vision, bakllava, llava     |
+ | HuggingFace | llava, cogvlm, idefics, instructblip                 |
+ | DeepSeek    | (Check provider documentation for supported models)  |
+
  ## Environment Configuration
 
  You can set up environment variables by:
@@ -196,6 +291,31 @@ console.log(`Completion tokens: ${response.usage?.completionTokens}`);
  console.log(`Total tokens: ${response.usage?.totalTokens}`);
  ```
 
+ ### Multimodal Streaming
+
+ You can also stream responses from multimodal prompts:
+
+ ```typescript
+ import { NeuralAI, AIProvider } from "neural-ai-sdk";
+
+ const model = NeuralAI.createModel(AIProvider.OPENAI, {
+   model: "gpt-4o",
+ });
+
+ async function streamImageAnalysis() {
+   const stream = model.stream({
+     prompt: "Describe this image in detail:",
+     image: "https://example.com/image.jpg",
+   });
+
+   for await (const chunk of stream) {
+     process.stdout.write(chunk);
+   }
+ }
+
+ streamImageAnalysis();
+ ```
+
  ## License
 
  MIT
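
The README examples above pass image URLs only. As a complement, here is a minimal sketch of the same `generate` call using the other two documented sources, a local file path and a `Buffer`; the file paths are placeholders.

```typescript
import fs from "fs";
import { NeuralAI, AIProvider } from "neural-ai-sdk";

const visionModel = NeuralAI.createModel(AIProvider.OPENAI, {
  model: "gpt-4o",
});

async function analyzeLocalImages() {
  // Local file path: the SDK reads the file and base64-encodes it
  const fromPath = await visionModel.generate({
    prompt: "Describe this image.",
    image: "./photos/cat.jpg", // placeholder path
  });
  console.log(fromPath.text);

  // Buffer: raw image bytes already in memory (MIME type defaults to JPEG)
  const imageBuffer = fs.readFileSync("./photos/dog.jpg"); // placeholder path
  const fromBuffer = await visionModel.generate({
    prompt: "Describe this image.",
    image: imageBuffer,
  });
  console.log(fromBuffer.text);
}

analyzeLocalImages();
```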
@@ -6,5 +6,8 @@ export declare class GoogleModel extends BaseModel {
  constructor(config: AIModelConfig);
  generate(request: AIModelRequest): Promise<AIModelResponse>;
  stream(request: AIModelRequest): AsyncGenerator<string, void, unknown>;
- private formatPrompt;
+ /**
+ * Format content for Google's Gemini API, handling both text and images
+ */
+ private formatMultiModalContent;
  }
@@ -5,6 +5,7 @@ const generative_ai_1 = require("@google/generative-ai");
  const types_1 = require("../types");
  const base_model_1 = require("./base-model");
  const utils_1 = require("../utils");
+ const image_utils_1 = require("../utils/image-utils");
  class GoogleModel extends base_model_1.BaseModel {
  constructor(config) {
  super(config);
@@ -22,8 +23,8 @@ class GoogleModel extends base_model_1.BaseModel {
  topP: config.topP,
  },
  });
- const prompt = this.formatPrompt(request);
- const result = await model.generateContent(prompt);
+ const content = await this.formatMultiModalContent(request);
+ const result = await model.generateContent(content);
  const response = result.response;
  return {
  text: response.text(),
@@ -40,8 +41,8 @@ class GoogleModel extends base_model_1.BaseModel {
  topP: config.topP,
  },
  });
- const prompt = this.formatPrompt(request);
- const result = await model.generateContentStream(prompt);
+ const content = await this.formatMultiModalContent(request);
+ const result = await model.generateContentStream(content);
  for await (const chunk of result.stream) {
  const text = chunk.text();
  if (text) {
@@ -49,12 +50,47 @@ class GoogleModel extends base_model_1.BaseModel {
  }
  }
  }
- formatPrompt(request) {
+ /**
+ * Format content for Google's Gemini API, handling both text and images
+ */
+ async formatMultiModalContent(request) {
  const parts = [];
+ // Add system prompt if provided
  if (request.systemPrompt) {
- parts.push(request.systemPrompt);
+ parts.push({ text: request.systemPrompt });
+ }
+ // Add main prompt text
+ if (request.prompt) {
+ parts.push({ text: request.prompt });
+ }
+ // Process structured content array if provided
+ if (request.content) {
+ for (const item of request.content) {
+ if (item.type === "text") {
+ parts.push({ text: item.text });
+ }
+ else if (item.type === "image") {
+ // Process image and add to parts
+ const { base64, mimeType } = await (0, image_utils_1.processImage)(item.source);
+ parts.push({
+ inlineData: {
+ data: base64,
+ mimeType: mimeType,
+ },
+ });
+ }
+ }
+ }
+ // Process single image if provided via convenience property
+ if (request.image) {
+ const { base64, mimeType } = await (0, image_utils_1.processImage)(request.image);
+ parts.push({
+ inlineData: {
+ data: base64,
+ mimeType: mimeType,
+ },
+ });
  }
- parts.push(request.prompt);
  return parts;
  }
  }
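
For reference, the `parts` array that the new `formatMultiModalContent` method assembles follows Gemini's `inlineData` convention. A hypothetical result for a request carrying a system prompt, a text prompt, and one PNG image (base64 abbreviated):

```typescript
// Hypothetical output of formatMultiModalContent for:
// { systemPrompt: "Be concise.", prompt: "What is shown here?", image: "./chart.png" }
const parts = [
  { text: "Be concise." },         // system prompt, when provided
  { text: "What is shown here?" }, // main prompt
  {
    inlineData: {
      data: "iVBORw0KGgoAAAANSUhEUg...", // base64-encoded image, abbreviated
      mimeType: "image/png",             // detected from the .png extension
    },
  },
];
```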
@@ -5,5 +5,33 @@ export declare class HuggingFaceModel extends BaseModel {
  private baseURL;
  constructor(config: AIModelConfig);
  generate(request: AIModelRequest): Promise<AIModelResponse>;
+ /**
+ * Generate a text-only response
+ */
+ private generateTextOnly;
+ /**
+ * Generate a response using multimodal inputs (text + images)
+ */
+ private generateWithImages;
+ /**
+ * Try generating with nested inputs format (common in newer models)
+ */
+ private generateWithNestedFormat;
+ /**
+ * Try generating with flat inputs format (common in some models)
+ */
+ private generateWithFlatFormat;
+ /**
+ * Helper to parse HuggingFace response in various formats
+ */
+ private parseResponse;
+ /**
+ * Fallback method that uses multipart/form-data for older HuggingFace models
+ */
+ private generateWithMultipartForm;
  stream(request: AIModelRequest): AsyncGenerator<string, void, unknown>;
+ /**
+ * Convert a base64 string to a Blob object
+ */
+ private base64ToBlob;
  }
@@ -8,6 +8,7 @@ const axios_1 = __importDefault(require("axios"));
  const types_1 = require("../types");
  const base_model_1 = require("./base-model");
  const utils_1 = require("../utils");
+ const image_utils_1 = require("../utils/image-utils");
  class HuggingFaceModel extends base_model_1.BaseModel {
  constructor(config) {
  super(config);
@@ -18,6 +19,46 @@ class HuggingFaceModel extends base_model_1.BaseModel {
  async generate(request) {
  const config = this.mergeConfig(request.options);
  const model = config.model || "meta-llama/Llama-2-7b-chat-hf";
+ try {
+ // Try multimodal approach if images are present
+ if (request.image ||
+ (request.content &&
+ request.content.some((item) => item.type === "image"))) {
+ return await this.generateWithImages(request, config, model);
+ }
+ else {
+ return await this.generateTextOnly(request, config, model);
+ }
+ }
+ catch (error) {
+ // Enhance error messages for multimodal related errors
+ if ((request.image || request.content) &&
+ (error.response?.data?.includes("Content-Type") ||
+ error.response?.status === 415 ||
+ error.response?.data?.error?.includes("image") ||
+ error.message?.includes("multimodal") ||
+ error.message?.toLowerCase().includes("vision") ||
+ error.message?.toLowerCase().includes("unsupported"))) {
+ let errorMessage = `Model "${model}" doesn't appear to support multimodal inputs properly.`;
+ // Add more specific guidance based on error details
+ if (error.response?.status === 415) {
+ errorMessage +=
+ " The model may require a different format for image inputs.";
+ }
+ // Include original error message for debugging
+ errorMessage += ` Original error: ${error.response?.data || error.message}`;
+ // Suggest known working models
+ errorMessage +=
+ " Try a different vision-capable model like 'llava-hf/llava-1.5-7b-hf' or check HuggingFace's documentation for this specific model.";
+ throw new Error(errorMessage);
+ }
+ throw error;
+ }
+ }
+ /**
+ * Generate a text-only response
+ */
+ async generateTextOnly(request, config, model) {
  let fullPrompt = request.prompt;
  if (request.systemPrompt) {
  fullPrompt = `${request.systemPrompt}\n\n${fullPrompt}`;
@@ -54,19 +95,251 @@ class HuggingFaceModel extends base_model_1.BaseModel {
  raw: response.data,
  };
  }
+ /**
+ * Generate a response using multimodal inputs (text + images)
+ */
+ async generateWithImages(request, config, model) {
+ // Some HF models expect different input formats, try various formats one by one
+ const errors = [];
+ // Format 1: Nested inputs object with text and image
+ try {
+ return await this.generateWithNestedFormat(request, config, model);
+ }
+ catch (error) {
+ errors.push(error);
+ }
+ // Format 2: Plain inputs with string prompt and image in the main object
+ try {
+ return await this.generateWithFlatFormat(request, config, model);
+ }
+ catch (error) {
+ errors.push(error);
+ }
+ // Format 3: Try multipart form data as last resort
+ try {
+ return await this.generateWithMultipartForm(request, config, model);
+ }
+ catch (error) {
+ errors.push(error);
+ }
+ // If we get here, all formats failed, throw an enhanced error
+ const errorMessage = `Model "${model}" doesn't appear to support multimodal inputs in any of the attempted formats. Try a different vision-capable model like 'llava-hf/llava-1.5-7b-hf'. Errors: ${errors
+ .map((e) => e.message || e)
+ .join("; ")}`;
+ throw new Error(errorMessage);
+ }
+ /**
+ * Try generating with nested inputs format (common in newer models)
+ */
+ async generateWithNestedFormat(request, config, model) {
+ const prompt = request.systemPrompt
+ ? `${request.systemPrompt}\n\n${request.prompt}`
+ : request.prompt;
+ let payload = {
+ inputs: {
+ text: prompt,
+ },
+ parameters: {
+ temperature: config.temperature,
+ max_new_tokens: config.maxTokens,
+ top_p: config.topP,
+ return_full_text: false,
+ },
+ };
+ // Process the convenience 'image' property
+ if (request.image) {
+ const { base64 } = await (0, image_utils_1.processImage)(request.image);
+ payload.inputs.image = base64;
+ }
+ // Process content array if provided
+ if (request.content) {
+ // Initialize images array if multiple images
+ const hasMultipleImages = request.content.filter((item) => item.type === "image").length > 1;
+ if (hasMultipleImages) {
+ payload.inputs.images = [];
+ }
+ for (const item of request.content) {
+ if (item.type === "image") {
+ const { base64 } = await (0, image_utils_1.processImage)(item.source);
+ if (hasMultipleImages) {
+ payload.inputs.images.push(base64);
+ }
+ else {
+ payload.inputs.image = base64;
+ }
+ }
+ // Text content is already included in the prompt
+ }
+ }
+ const response = await axios_1.default.post(`${this.baseURL}/${model}`, payload, {
+ headers: {
+ Authorization: `Bearer ${config.apiKey ||
+ (0, utils_1.getApiKey)(config.apiKey, "HUGGINGFACE_API_KEY", "HuggingFace")}`,
+ "Content-Type": "application/json",
+ },
+ });
+ // Parse response
+ return this.parseResponse(response);
+ }
+ /**
+ * Try generating with flat inputs format (common in some models)
+ */
+ async generateWithFlatFormat(request, config, model) {
+ const prompt = request.systemPrompt
+ ? `${request.systemPrompt}\n\n${request.prompt}`
+ : request.prompt;
+ // Some models expect a flat structure with inputs as a string
+ let payload = {
+ inputs: prompt,
+ parameters: {
+ temperature: config.temperature,
+ max_new_tokens: config.maxTokens,
+ top_p: config.topP,
+ return_full_text: false,
+ },
+ };
+ // For single image, add it directly to the payload
+ if (request.image) {
+ const { base64 } = await (0, image_utils_1.processImage)(request.image);
+ payload.image = base64; // At top level, not in inputs
+ }
+ // Process only the first image from content if available and no direct image
+ if (!request.image && request.content) {
+ const imageContent = request.content.find((item) => item.type === "image");
+ if (imageContent) {
+ const { base64 } = await (0, image_utils_1.processImage)(imageContent.source);
+ payload.image = base64; // At top level, not in inputs
+ }
+ }
+ const response = await axios_1.default.post(`${this.baseURL}/${model}`, payload, {
+ headers: {
+ Authorization: `Bearer ${config.apiKey ||
+ (0, utils_1.getApiKey)(config.apiKey, "HUGGINGFACE_API_KEY", "HuggingFace")}`,
+ "Content-Type": "application/json",
+ },
+ });
+ // Parse response
+ return this.parseResponse(response);
+ }
+ /**
+ * Helper to parse HuggingFace response in various formats
+ */
+ parseResponse(response) {
+ let text = "";
+ if (Array.isArray(response.data)) {
+ text = response.data[0]?.generated_text || "";
+ }
+ else if (response.data.generated_text) {
+ text = response.data.generated_text;
+ }
+ else if (typeof response.data === "string") {
+ text = response.data;
+ }
+ else {
+ text = JSON.stringify(response.data);
+ }
+ return {
+ text,
+ raw: response.data,
+ };
+ }
+ /**
+ * Fallback method that uses multipart/form-data for older HuggingFace models
+ */
+ async generateWithMultipartForm(request, config, model) {
+ // Create a multipart form-data payload for multimodal models
+ const formData = new FormData();
+ // Add text prompt
+ const prompt = request.systemPrompt
+ ? `${request.systemPrompt}\n\n${request.prompt}`
+ : request.prompt;
+ formData.append("text", prompt);
+ // Process the convenience 'image' property
+ if (request.image) {
+ const { base64 } = await (0, image_utils_1.processImage)(request.image);
+ const imageBlob = this.base64ToBlob(base64);
+ formData.append("image", imageBlob, "image.jpg");
+ }
+ // Process content array if provided
+ if (request.content) {
+ let imageIndex = 0;
+ for (const item of request.content) {
+ if (item.type === "image") {
+ const { base64 } = await (0, image_utils_1.processImage)(item.source);
+ const imageBlob = this.base64ToBlob(base64);
+ formData.append(`image_${imageIndex}`, imageBlob, `image_${imageIndex}.jpg`);
+ imageIndex++;
+ }
+ // Text content is already included in the prompt
+ }
+ }
+ // Add model parameters
+ if (config.temperature) {
+ formData.append("temperature", config.temperature.toString());
+ }
+ if (config.maxTokens) {
+ formData.append("max_new_tokens", config.maxTokens.toString());
+ }
+ if (config.topP) {
+ formData.append("top_p", config.topP.toString());
+ }
+ const response = await axios_1.default.post(`${this.baseURL}/${model}`, formData, {
+ headers: {
+ Authorization: `Bearer ${config.apiKey ||
+ (0, utils_1.getApiKey)(config.apiKey, "HUGGINGFACE_API_KEY", "HuggingFace")}`,
+ "Content-Type": "multipart/form-data",
+ },
+ });
+ // Parse response based on return format
+ let text = "";
+ if (Array.isArray(response.data)) {
+ text = response.data[0]?.generated_text || "";
+ }
+ else if (response.data.generated_text) {
+ text = response.data.generated_text;
+ }
+ else if (typeof response.data === "string") {
+ text = response.data;
+ }
+ else {
+ text = JSON.stringify(response.data);
+ }
+ return {
+ text,
+ raw: response.data,
+ };
+ }
  async *stream(request) {
- // HuggingFace Inference API doesn't natively support streaming for all models
- // We'll implement a basic chunking on top of the non-streaming API
- const response = await this.generate(request);
- // Simple chunking for demonstration purposes
- const chunkSize = 10;
- const text = response.text;
- for (let i = 0; i < text.length; i += chunkSize) {
- const chunk = text.slice(i, i + chunkSize);
- yield chunk;
- // Add a small delay to simulate streaming
- await new Promise((resolve) => setTimeout(resolve, 10));
+ try {
+ // HuggingFace Inference API doesn't natively support streaming for all models
+ // We'll implement a basic chunking on top of the non-streaming API
+ const response = await this.generate(request);
+ // Simple chunking for demonstration purposes
+ const chunkSize = 10;
+ const text = response.text;
+ for (let i = 0; i < text.length; i += chunkSize) {
+ const chunk = text.slice(i, i + chunkSize);
+ yield chunk;
+ // Add a small delay to simulate streaming
+ await new Promise((resolve) => setTimeout(resolve, 10));
+ }
+ }
+ catch (error) {
+ // Rethrow with enhanced error message
+ throw error;
+ }
+ }
+ /**
+ * Convert a base64 string to a Blob object
+ */
+ base64ToBlob(base64) {
+ const byteString = atob(base64);
+ const ab = new ArrayBuffer(byteString.length);
+ const ia = new Uint8Array(ab);
+ for (let i = 0; i < byteString.length; i++) {
+ ia[i] = byteString.charCodeAt(i);
  }
+ return new Blob([ab], { type: "image/jpeg" });
  }
  }
  exports.HuggingFaceModel = HuggingFaceModel;
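
The HuggingFace changes try three request shapes in turn before giving up. A rough sketch of the payload each attempt sends; the parameter values are placeholders, and any given hosted model typically accepts only one of these forms.

```typescript
// 1. Nested format: text and image(s) inside an `inputs` object
const nestedPayload = {
  inputs: { text: "Describe this image.", image: "<base64 image>" },
  parameters: { temperature: 0.7, max_new_tokens: 256, top_p: 0.9, return_full_text: false },
};

// 2. Flat format: `inputs` is the prompt string, the image sits at the top level
const flatPayload = {
  inputs: "Describe this image.",
  image: "<base64 image>",
  parameters: { temperature: 0.7, max_new_tokens: 256, top_p: 0.9, return_full_text: false },
};

// 3. Multipart form-data fallback: text, image blobs, and parameters as separate fields
const formData = new FormData();
formData.append("text", "Describe this image.");
formData.append("image", new Blob(["<image bytes>"], { type: "image/jpeg" }), "image.jpg");
formData.append("max_new_tokens", "256");
```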
@@ -6,4 +6,8 @@ export declare class OllamaModel extends BaseModel {
  constructor(config: AIModelConfig);
  generate(request: AIModelRequest): Promise<AIModelResponse>;
  stream(request: AIModelRequest): AsyncGenerator<string, void, unknown>;
+ /**
+ * Creates the request payload for Ollama, handling multimodal content if provided
+ */
+ private createRequestPayload;
  }
@@ -8,6 +8,7 @@ const axios_1 = __importDefault(require("axios"));
  const types_1 = require("../types");
  const base_model_1 = require("./base-model");
  const utils_1 = require("../utils");
+ const image_utils_1 = require("../utils/image-utils");
  class OllamaModel extends base_model_1.BaseModel {
  constructor(config) {
  super(config);
@@ -16,59 +17,149 @@ class OllamaModel extends base_model_1.BaseModel {
  }
  async generate(request) {
  const config = this.mergeConfig(request.options);
- let prompt = request.prompt;
- // Add system prompt if provided
- if (request.systemPrompt) {
- prompt = `${request.systemPrompt}\n\n${prompt}`;
+ try {
+ const payload = await this.createRequestPayload(request, config);
+ const response = await axios_1.default.post(`${this.baseURL}/generate`, payload);
+ return {
+ text: response.data.response,
+ usage: {
+ promptTokens: response.data.prompt_eval_count,
+ completionTokens: response.data.eval_count,
+ totalTokens: response.data.prompt_eval_count + response.data.eval_count,
+ },
+ raw: response.data,
+ };
+ }
+ catch (error) {
+ // Enhance error message if it appears to be related to multimodal support
+ if (error.response?.status === 400 &&
+ (request.image || request.content) &&
+ (error.response?.data?.error?.includes("image") ||
+ error.response?.data?.error?.includes("multimodal") ||
+ error.response?.data?.error?.includes("vision"))) {
+ throw new Error(`The model "${config.model || "default"}" doesn't support multimodal inputs. Try a vision-capable model like "llama-3.2-vision" or "llava". Original error: ${error.message}`);
+ }
+ throw error;
  }
- const response = await axios_1.default.post(`${this.baseURL}/generate`, {
- model: config.model || "llama2",
- prompt,
- temperature: config.temperature,
- num_predict: config.maxTokens,
- top_p: config.topP,
- });
- return {
- text: response.data.response,
- usage: {
- promptTokens: response.data.prompt_eval_count,
- completionTokens: response.data.eval_count,
- totalTokens: response.data.prompt_eval_count + response.data.eval_count,
- },
- raw: response.data,
- };
  }
  async *stream(request) {
  const config = this.mergeConfig(request.options);
- let prompt = request.prompt;
- if (request.systemPrompt) {
- prompt = `${request.systemPrompt}\n\n${prompt}`;
+ try {
+ const payload = await this.createRequestPayload(request, config, true);
+ const response = await axios_1.default.post(`${this.baseURL}/generate`, payload, {
+ responseType: "stream",
+ });
+ const reader = response.data;
+ for await (const chunk of reader) {
+ const lines = chunk.toString().split("\n").filter(Boolean);
+ for (const line of lines) {
+ try {
+ const parsed = JSON.parse(line);
+ if (parsed.response) {
+ yield parsed.response;
+ }
+ }
+ catch (error) {
+ console.error("Error parsing Ollama stream data:", error);
+ }
+ }
+ }
  }
- const response = await axios_1.default.post(`${this.baseURL}/generate`, {
+ catch (error) {
+ // Enhance error message if it appears to be related to multimodal support
+ if (error.response?.status === 400 &&
+ (request.image || request.content) &&
+ (error.response?.data?.error?.includes("image") ||
+ error.response?.data?.error?.includes("multimodal") ||
+ error.response?.data?.error?.includes("vision"))) {
+ throw new Error(`The model "${config.model || "default"}" doesn't support multimodal inputs. Try a vision-capable model like "llama-3.2-vision" or "llava". Original error: ${error.message}`);
+ }
+ throw error;
+ }
+ }
+ /**
+ * Creates the request payload for Ollama, handling multimodal content if provided
+ */
+ async createRequestPayload(request, config, isStream = false) {
+ // Base payload
+ const payload = {
  model: config.model || "llama2",
- prompt,
  temperature: config.temperature,
  num_predict: config.maxTokens,
  top_p: config.topP,
- stream: true,
- }, {
- responseType: "stream",
- });
- const reader = response.data;
- for await (const chunk of reader) {
- const lines = chunk.toString().split("\n").filter(Boolean);
- for (const line of lines) {
- try {
- const parsed = JSON.parse(line);
- if (parsed.response) {
- yield parsed.response;
+ };
+ // Handle streaming
+ if (isStream) {
+ payload.stream = true;
+ }
+ // If there are any image inputs, use the messages format
+ if (request.image ||
+ (request.content && request.content.some((item) => item.type === "image"))) {
+ // Create a messages array for multimodal models (similar to OpenAI format)
+ const messages = [];
+ // Add system prompt if provided
+ if (request.systemPrompt) {
+ messages.push({
+ role: "system",
+ content: request.systemPrompt,
+ });
+ }
+ // Create a user message with potentially multiple content parts
+ const userMessage = { role: "user", content: [] };
+ // Add the main prompt as text content
+ if (request.prompt) {
+ userMessage.content.push({
+ type: "text",
+ text: request.prompt,
+ });
+ }
+ // Process structured content if available
+ if (request.content) {
+ for (const item of request.content) {
+ if (item.type === "text") {
+ userMessage.content.push({
+ type: "text",
+ text: item.text,
+ });
+ }
+ else if (item.type === "image") {
+ const { base64, mimeType } = await (0, image_utils_1.processImage)(item.source);
+ userMessage.content.push({
+ type: "image",
+ image: {
+ data: base64,
+ mimeType: mimeType,
+ },
+ });
  }
  }
- catch (error) {
- console.error("Error parsing Ollama stream data:", error);
- }
  }
+ // Handle the convenience image property
+ if (request.image) {
+ const { base64, mimeType } = await (0, image_utils_1.processImage)(request.image);
+ userMessage.content.push({
+ type: "image",
+ image: {
+ data: base64,
+ mimeType: mimeType,
+ },
+ });
+ }
+ // Add the user message
+ messages.push(userMessage);
+ // Set the messages in the payload
+ payload.messages = messages;
+ }
+ else {
+ // Traditional text-only format
+ let prompt = request.prompt;
+ // Add system prompt if provided
+ if (request.systemPrompt) {
+ prompt = `${request.systemPrompt}\n\n${prompt}`;
+ }
+ payload.prompt = prompt;
  }
+ return payload;
  }
  }
  exports.OllamaModel = OllamaModel;
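
A condensed sketch of the two payload shapes `createRequestPayload` can produce: the traditional prompt form for text-only requests, and the messages form when images are present. The values shown are placeholders.

```typescript
// Text-only request: classic prompt-style payload
const textPayload = {
  model: "llama2",
  prompt: "You are helpful.\n\nSummarize this article.",
  temperature: 0.7,
  num_predict: 256,
  top_p: 0.9,
};

// Request with an image: messages payload with typed content parts
const multimodalPayload = {
  model: "llama-3.2-vision",
  temperature: 0.7,
  num_predict: 256,
  top_p: 0.9,
  messages: [
    { role: "system", content: "You are helpful." },
    {
      role: "user",
      content: [
        { type: "text", text: "What is in this picture?" },
        { type: "image", image: { data: "<base64 image>", mimeType: "image/jpeg" } },
      ],
    },
  ],
};
```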
@@ -6,4 +6,8 @@ export declare class OpenAIModel extends BaseModel {
  constructor(config: AIModelConfig);
  generate(request: AIModelRequest): Promise<AIModelResponse>;
  stream(request: AIModelRequest): AsyncGenerator<string, void, unknown>;
+ /**
+ * Format messages for OpenAI API, including handling multimodal content
+ */
+ private formatMessages;
  }
@@ -5,6 +5,7 @@ const openai_1 = require("openai");
  const types_1 = require("../types");
  const base_model_1 = require("./base-model");
  const utils_1 = require("../utils");
+ const image_utils_1 = require("../utils/image-utils");
  class OpenAIModel extends base_model_1.BaseModel {
  constructor(config) {
  super(config);
@@ -17,19 +18,8 @@ class OpenAIModel extends base_model_1.BaseModel {
  }
  async generate(request) {
  const config = this.mergeConfig(request.options);
- const messages = [];
- // Add system prompt if provided
- if (request.systemPrompt) {
- messages.push({
- role: "system",
- content: request.systemPrompt,
- });
- }
- // Add user prompt
- messages.push({
- role: "user",
- content: request.prompt,
- });
+ // Process messages for OpenAI API
+ const messages = await this.formatMessages(request);
  const response = await this.client.chat.completions.create({
  model: config.model || "gpt-3.5-turbo",
  messages,
@@ -49,17 +39,8 @@ class OpenAIModel extends base_model_1.BaseModel {
  }
  async *stream(request) {
  const config = this.mergeConfig(request.options);
- const messages = [];
- if (request.systemPrompt) {
- messages.push({
- role: "system",
- content: request.systemPrompt,
- });
- }
- messages.push({
- role: "user",
- content: request.prompt,
- });
+ // Process messages for OpenAI API
+ const messages = await this.formatMessages(request);
  const stream = await this.client.chat.completions.create({
  model: config.model || "gpt-3.5-turbo",
  messages,
@@ -75,5 +56,65 @@ class OpenAIModel extends base_model_1.BaseModel {
  }
  }
  }
+ /**
+ * Format messages for OpenAI API, including handling multimodal content
+ */
+ async formatMessages(request) {
+ const messages = [];
+ // Add system prompt if provided
+ if (request.systemPrompt) {
+ messages.push({
+ role: "system",
+ content: request.systemPrompt,
+ });
+ }
+ // Handle multimodal content
+ if (request.content || request.image) {
+ const content = [];
+ // Add the text prompt
+ if (request.prompt) {
+ content.push({ type: "text", text: request.prompt });
+ }
+ // Add any structured content
+ if (request.content) {
+ for (const item of request.content) {
+ if (item.type === "text") {
+ content.push({ type: "text", text: item.text });
+ }
+ else if (item.type === "image") {
+ const { base64, mimeType } = await (0, image_utils_1.processImage)(item.source);
+ content.push({
+ type: "image_url",
+ image_url: {
+ url: `data:${mimeType};base64,${base64}`,
+ },
+ });
+ }
+ }
+ }
+ // Add single image if provided via the convenience property
+ if (request.image) {
+ const { base64, mimeType } = await (0, image_utils_1.processImage)(request.image);
+ content.push({
+ type: "image_url",
+ image_url: {
+ url: `data:${mimeType};base64,${base64}`,
+ },
+ });
+ }
+ messages.push({
+ role: "user",
+ content,
+ });
+ }
+ else {
+ // Traditional text-only message
+ messages.push({
+ role: "user",
+ content: request.prompt,
+ });
+ }
+ return messages;
+ }
  }
  exports.OpenAIModel = OpenAIModel;
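
On the OpenAI side, `formatMessages` encodes each image as a data-URI `image_url` content part. A hypothetical user message for a text prompt plus one JPEG (base64 abbreviated):

```typescript
const userMessage = {
  role: "user",
  content: [
    { type: "text", text: "What's in this image?" },
    {
      type: "image_url",
      image_url: { url: "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQ..." }, // abbreviated
    },
  ],
};
```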
package/dist/types.d.ts CHANGED
@@ -13,6 +13,16 @@ export declare enum AIProvider {
  OLLAMA = "ollama",
  HUGGINGFACE = "huggingface"
  }
+ export type ContentType = "text" | "image";
+ export interface TextContent {
+ type: "text";
+ text: string;
+ }
+ export interface ImageContent {
+ type: "image";
+ source: string | Buffer;
+ }
+ export type Content = TextContent | ImageContent;
  export interface AIModelResponse {
  text: string;
  usage?: {
@@ -26,6 +36,8 @@ export interface AIModelRequest {
  prompt: string;
  systemPrompt?: string;
  options?: Partial<AIModelConfig>;
+ content?: Content[];
+ image?: string | Buffer;
  }
  export interface AIModel {
  provider: AIProvider;
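
The new `Content` union is discriminated on `type`, so callers can narrow items with a plain switch. A minimal sketch, assuming `Content` is re-exported from the package entry point like the other types; the helper name is illustrative.

```typescript
import { Content } from "neural-ai-sdk";

// Illustrative helper: tally text length and image count in a request's content array
function summarizeContent(content: Content[]): { textChars: number; images: number } {
  let textChars = 0;
  let images = 0;
  for (const item of content) {
    switch (item.type) {
      case "text":
        textChars += item.text.length; // narrowed to TextContent here
        break;
      case "image":
        images += 1; // narrowed to ImageContent (source: string | Buffer)
        break;
    }
  }
  return { textChars, images };
}

console.log(summarizeContent([{ type: "text", text: "hello" }])); // { textChars: 5, images: 0 }
```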
@@ -0,0 +1,26 @@
+ /**
+ * Checks if a string is a valid URL
+ */
+ export declare function isUrl(str: string): boolean;
+ /**
+ * Checks if a string is a valid file path
+ */
+ export declare function isFilePath(str: string): boolean;
+ /**
+ * Converts an image to base64 from various sources
+ * @param source - URL, file path, or Buffer
+ * @returns Promise with base64 encoded image
+ */
+ export declare function imageToBase64(source: string | Buffer): Promise<string>;
+ /**
+ * Determines the MIME type based on file extension
+ * @param filePath - Path to the file or URL
+ */
+ export declare function getMimeType(filePath: string): string;
+ /**
+ * Processes an image source and returns data needed for API requests
+ */
+ export declare function processImage(source: string | Buffer): Promise<{
+ base64: string;
+ mimeType: string;
+ }>;
@@ -0,0 +1,103 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.isUrl = isUrl;
+ exports.isFilePath = isFilePath;
+ exports.imageToBase64 = imageToBase64;
+ exports.getMimeType = getMimeType;
+ exports.processImage = processImage;
+ const fs_1 = __importDefault(require("fs"));
+ const path_1 = __importDefault(require("path"));
+ const axios_1 = __importDefault(require("axios"));
+ /**
+ * Checks if a string is a valid URL
+ */
+ function isUrl(str) {
+ try {
+ const url = new URL(str);
+ return url.protocol === "http:" || url.protocol === "https:";
+ }
+ catch {
+ return false;
+ }
+ }
+ /**
+ * Checks if a string is a valid file path
+ */
+ function isFilePath(str) {
+ try {
+ return fs_1.default.existsSync(str) && fs_1.default.statSync(str).isFile();
+ }
+ catch {
+ return false;
+ }
+ }
+ /**
+ * Converts an image to base64 from various sources
+ * @param source - URL, file path, or Buffer
+ * @returns Promise with base64 encoded image
+ */
+ async function imageToBase64(source) {
+ // If source is already a Buffer
+ if (Buffer.isBuffer(source)) {
+ return source.toString("base64");
+ }
+ // If source is a URL
+ if (isUrl(source)) {
+ try {
+ const response = await axios_1.default.get(source, { responseType: "arraybuffer" });
+ const buffer = Buffer.from(response.data, "binary");
+ return buffer.toString("base64");
+ }
+ catch (error) {
+ throw new Error(`Failed to fetch image from URL: ${error.message}`);
+ }
+ }
+ // If source is a file path
+ if (isFilePath(source)) {
+ try {
+ const buffer = fs_1.default.readFileSync(source);
+ return buffer.toString("base64");
+ }
+ catch (error) {
+ throw new Error(`Failed to read image file: ${error.message}`);
+ }
+ }
+ throw new Error("Invalid image source. Must be URL, file path, or Buffer");
+ }
+ /**
+ * Determines the MIME type based on file extension
+ * @param filePath - Path to the file or URL
+ */
+ function getMimeType(filePath) {
+ if (!filePath)
+ return "image/jpeg"; // Default
+ const ext = path_1.default.extname(filePath).toLowerCase();
+ switch (ext) {
+ case ".jpg":
+ case ".jpeg":
+ return "image/jpeg";
+ case ".png":
+ return "image/png";
+ case ".gif":
+ return "image/gif";
+ case ".webp":
+ return "image/webp";
+ case ".bmp":
+ return "image/bmp";
+ case ".svg":
+ return "image/svg+xml";
+ default:
+ return "image/jpeg"; // Default to JPEG
+ }
+ }
+ /**
+ * Processes an image source and returns data needed for API requests
+ */
+ async function processImage(source) {
+ const base64 = await imageToBase64(source);
+ const mimeType = typeof source === "string" ? getMimeType(source) : "image/jpeg";
+ return { base64, mimeType };
+ }
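
Taken together, the new helpers resolve any supported source to a base64 string plus a MIME type. A minimal usage sketch; the deep import path is inferred from this diff's file layout (the helpers may also be re-exported from the package root), and the file paths are placeholders.

```typescript
import fs from "fs";
// Path inferred from the diff layout; adjust if the package re-exports these helpers
import { processImage, getMimeType, isUrl } from "neural-ai-sdk/dist/utils/image-utils";

async function demo() {
  // URL source: fetched with axios, MIME type guessed from the extension
  const fromUrl = await processImage("https://example.com/image.png");
  console.log(isUrl("https://example.com/image.png"), fromUrl.mimeType); // true "image/png"

  // Local file source: read from disk and base64-encoded
  const fromFile = await processImage("./photos/cat.jpg");
  console.log(fromFile.base64.slice(0, 12), fromFile.mimeType); // "<base64...>" "image/jpeg"

  // Buffer source: no file name available, so the MIME type defaults to "image/jpeg"
  const fromBuffer = await processImage(fs.readFileSync("./photos/cat.jpg"));
  console.log(fromBuffer.mimeType); // "image/jpeg"

  // Extension-based MIME lookup on its own
  console.log(getMimeType("diagram.webp")); // "image/webp"
}

demo();
```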
@@ -0,0 +1 @@
+ export * from "./image-utils";
@@ -0,0 +1,17 @@
+ "use strict";
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+ if (k2 === undefined) k2 = k;
+ var desc = Object.getOwnPropertyDescriptor(m, k);
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+ desc = { enumerable: true, get: function() { return m[k]; } };
+ }
+ Object.defineProperty(o, k2, desc);
+ }) : (function(o, m, k, k2) {
+ if (k2 === undefined) k2 = k;
+ o[k2] = m[k];
+ }));
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ __exportStar(require("./image-utils"), exports);
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "neural-ai-sdk",
- "version": "0.1.1",
+ "version": "0.1.2",
  "description": "Unified SDK for interacting with various AI LLM providers",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",