stratifyai-0.1.1-py3-none-any.whl → stratifyai-0.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,7 +15,7 @@ class APIKeyHelper:
      "google": "GOOGLE_API_KEY",
      "deepseek": "DEEPSEEK_API_KEY",
      "groq": "GROQ_API_KEY",
-     "grok": "GROK_API_KEY",
+     "grok": "XAI_API_KEY",  # X.AI official environment variable
      "openrouter": "OPENROUTER_API_KEY",
      "ollama": "OLLAMA_API_KEY",
      "bedrock": "AWS_BEARER_TOKEN_BEDROCK",  # Bedrock bearer token (or AWS_ACCESS_KEY_ID)
stratifyai/config.py CHANGED
@@ -29,7 +29,7 @@ OPENAI_MODELS: Dict[str, Dict[str, Any]] = {
          "context": 128000,
          "cost_input": 10.0,
          "cost_output": 30.0,
-         "supports_vision": True,
+         "supports_vision": False,
          "supports_tools": True,
      },
      "gpt-4": {
@@ -310,16 +310,6 @@ ANTHROPIC_MODELS: Dict[str, Dict[str, Any]] = {
          "supports_tools": True,
          "supports_caching": True,
      },
-     "claude-3-5-haiku-20241022": {
-         "context": 200000,
-         "cost_input": 1.0,
-         "cost_output": 5.0,
-         "cost_cache_write": 1.25,
-         "cost_cache_read": 0.10,
-         "supports_vision": False,
-         "supports_tools": True,
-         "supports_caching": True,
-     },
  }
 
  # Google Gemini Model Catalog (OpenAI-compatible)
@@ -449,6 +439,105 @@ GROQ_MODELS: Dict[str, Dict[str, Any]] = {
 
  # Grok (X.AI) Model Catalog (OpenAI-compatible)
  GROK_MODELS: Dict[str, Dict[str, Any]] = {
+     # Grok 4.1 Fast - Best for agentic tool calling
+     "grok-4-1-fast-reasoning": {
+         "context": 2000000,  # 2M tokens
+         "cost_input": 0.20,
+         "cost_output": 0.50,
+         "supports_vision": True,
+         "supports_tools": True,
+         "reasoning_model": True,
+     },
+     "grok-4-1-fast-non-reasoning": {
+         "context": 2000000,
+         "cost_input": 0.20,
+         "cost_output": 0.50,
+         "supports_vision": True,
+         "supports_tools": True,
+     },
+     # Grok 4 Fast - High performance
+     "grok-4-fast-reasoning": {
+         "context": 2000000,
+         "cost_input": 0.50,
+         "cost_output": 1.50,
+         "supports_vision": True,
+         "supports_tools": True,
+         "reasoning_model": True,
+     },
+     "grok-4-fast-non-reasoning": {
+         "context": 2000000,
+         "cost_input": 0.50,
+         "cost_output": 1.50,
+         "supports_vision": True,
+         "supports_tools": True,
+     },
+     # Grok 4 - Flagship reasoning model
+     "grok-4": {
+         "context": 128000,
+         "cost_input": 5.0,
+         "cost_output": 15.0,
+         "supports_vision": True,
+         "supports_tools": True,
+         "reasoning_model": True,
+         "fixed_temperature": 1.0,
+     },
+     # Grok Code Fast - Specialized for coding
+     "grok-code-fast-1": {
+         "context": 256000,
+         "cost_input": 0.0,  # Free during promotional period
+         "cost_output": 0.0,
+         "supports_vision": False,
+         "supports_tools": True,
+         "reasoning_model": True,
+     },
+     # Grok 3 - Full model
+     "grok-3": {
+         "context": 128000,
+         "cost_input": 2.0,
+         "cost_output": 10.0,
+         "supports_vision": True,
+         "supports_tools": True,
+     },
+     "grok-3-fast": {
+         "context": 128000,
+         "cost_input": 1.0,
+         "cost_output": 5.0,
+         "supports_vision": True,
+         "supports_tools": True,
+     },
+     # Grok 3 Mini - Smaller, configurable reasoning
+     "grok-3-mini": {
+         "context": 128000,
+         "cost_input": 0.40,
+         "cost_output": 1.60,
+         "supports_vision": False,
+         "supports_tools": True,
+         "reasoning_model": True,  # Supports reasoning_effort parameter
+     },
+     "grok-3-mini-fast": {
+         "context": 128000,
+         "cost_input": 0.20,
+         "cost_output": 0.80,
+         "supports_vision": False,
+         "supports_tools": True,
+         "reasoning_model": True,
+     },
+     # Grok 2 - Legacy models
+     "grok-2-1212": {
+         "context": 131072,
+         "cost_input": 2.0,
+         "cost_output": 10.0,
+         "supports_vision": False,
+         "supports_tools": True,
+     },
+     "grok-2-vision": {
+         "context": 131072,
+         "cost_input": 2.0,
+         "cost_output": 10.0,
+         "supports_vision": True,
+         "supports_tools": True,
+     },
+     # Legacy aliases
      "grok-beta": {
          "context": 131072,
          "cost_input": 5.0,
@@ -1049,7 +1138,7 @@ INTERACTIVE_OPENAI_MODELS: Dict[str, Dict[str, Any]] = {
      },
      "gpt-4-turbo": {
          "display_name": "GPT-4 Turbo",
-         "description": "Legacy flagship, vision support",
+         "description": "Legacy flagship, tools support",
          "category": "Legacy Models",
      },
  }
@@ -1076,11 +1165,6 @@ INTERACTIVE_ANTHROPIC_MODELS: Dict[str, Dict[str, Any]] = {
          "description": "Proven stable, vision/tools",
          "category": "Claude 3.5 (Stable)",
      },
-     "claude-3-5-haiku-20241022": {
-         "display_name": "Claude 3.5 Haiku",
-         "description": "Budget option",
-         "category": "Claude 3.5 (Stable)",
-     },
  }
 
  # Google - 3 curated models
@@ -1140,16 +1224,46 @@ INTERACTIVE_GROQ_MODELS: Dict[str, Dict[str, Any]] = {
      },
  }
 
- # Grok (X.AI) - 1 curated model
+ # Grok (X.AI) - 7 curated models
  INTERACTIVE_GROK_MODELS: Dict[str, Dict[str, Any]] = {
-     "grok-beta": {
-         "display_name": "Grok Beta",
-         "description": "X.AI flagship model",
-         "category": "Grok",
+     "grok-4-1-fast-reasoning": {
+         "display_name": "Grok 4.1 Fast (Reasoning)",
+         "description": "BEST VALUE - 2M context, agentic tools",
+         "category": "Grok 4.1 Fast (Latest)",
+     },
+     "grok-4-1-fast-non-reasoning": {
+         "display_name": "Grok 4.1 Fast (Non-Reasoning)",
+         "description": "FASTEST - instant responses, 2M context",
+         "category": "Grok 4.1 Fast (Latest)",
+     },
+     "grok-code-fast-1": {
+         "display_name": "Grok Code Fast",
+         "description": "FREE - specialized for coding",
+         "category": "Grok Specialized",
+     },
+     "grok-4": {
+         "display_name": "Grok 4",
+         "description": "Flagship reasoning model",
+         "category": "Grok 4 (Premium)",
+     },
+     "grok-3": {
+         "display_name": "Grok 3",
+         "description": "Stable production model",
+         "category": "Grok 3",
+     },
+     "grok-3-mini": {
+         "display_name": "Grok 3 Mini",
+         "description": "Configurable reasoning effort",
+         "category": "Grok 3",
+     },
+     "grok-2-1212": {
+         "display_name": "Grok 2",
+         "description": "Legacy stable model",
+         "category": "Grok 2 (Legacy)",
      },
  }
 
- # OpenRouter - 7 curated models (mix of free and paid)
+ # OpenRouter - 11 curated models (mix of free and paid, multiple 1M context options)
  INTERACTIVE_OPENROUTER_MODELS: Dict[str, Dict[str, Any]] = {
      "anthropic/claude-sonnet-4-5": {
          "display_name": "Claude Sonnet 4.5",
@@ -1163,7 +1277,7 @@ INTERACTIVE_OPENROUTER_MODELS: Dict[str, Dict[str, Any]] = {
      },
      "google/gemini-2.5-flash": {
          "display_name": "Gemini 2.5 Flash",
-         "description": "Best value option",
+         "description": "BEST VALUE - 1M context, fast/cheap",
          "category": "Premium Models",
      },
      "meta-llama/llama-3.3-70b-instruct:free": {
@@ -1186,6 +1300,26 @@ INTERACTIVE_OPENROUTER_MODELS: Dict[str, Dict[str, Any]] = {
          "description": "European alternative",
          "category": "Premium Models",
      },
+     "anthropic/claude-opus-4-5": {
+         "display_name": "Claude Opus 4.5",
+         "description": "Premium quality, 1M context",
+         "category": "1M Context Models",
+     },
+     "google/gemini-2.5-pro": {
+         "display_name": "Gemini 2.5 Pro",
+         "description": "Best quality, 1M context",
+         "category": "1M Context Models",
+     },
+     "google/gemini-3": {
+         "display_name": "Gemini 3",
+         "description": "Latest Google, 1M context",
+         "category": "1M Context Models",
+     },
+     "google/gemini-2.0-flash-exp:free": {
+         "display_name": "Gemini 2.0 Flash Exp",
+         "description": "FREE - 1M context, vision/tools",
+         "category": "Free Models (1M Context)",
+     },
  }
 
  # Ollama - 3 curated models (local)
stratifyai/models.py CHANGED
@@ -9,9 +9,44 @@ from typing import List, Literal, Optional
  class Message:
      """Standard message format for all providers (OpenAI-compatible)."""
      role: Literal["system", "user", "assistant"]
-     content: str
+     content: str  # Can be plain text or contain [IMAGE:mime_type]\nbase64_data format
      name: Optional[str] = None  # For multi-agent scenarios
      cache_control: Optional[dict] = None  # For providers that support prompt caching (Anthropic, OpenAI)
+
+     def has_image(self) -> bool:
+         """Check if message contains image data."""
+         return "[IMAGE:" in self.content
+
+     def parse_vision_content(self) -> tuple[Optional[str], Optional[tuple[str, str]]]:
+         """Parse content into text and image data.
+
+         Returns:
+             (text_content, (mime_type, base64_data)) or (text_content, None) if no image
+         """
+         if not self.has_image():
+             return (self.content, None)
+
+         # Split content by [IMAGE:...] marker
+         parts = self.content.split("[IMAGE:")
+         text_parts = []
+         image_data = None
+
+         for i, part in enumerate(parts):
+             if i == 0:
+                 # First part is text before image
+                 if part.strip():
+                     text_parts.append(part.strip())
+             else:
+                 # This part starts with mime_type]
+                 if "]" in part:
+                     mime_type, rest = part.split("]", 1)
+                     # rest contains the base64 data (possibly with leading/trailing whitespace)
+                     base64_data = rest.strip()
+                     if base64_data:
+                         image_data = (mime_type.strip(), base64_data)
+
+         text_content = "\n".join(text_parts).strip() if text_parts else None
+         return (text_content, image_data)
 
 
  @dataclass
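
The new methods define an inline convention: an image rides inside content as "[IMAGE:<mime_type>]" followed by its base64 payload, so Message keeps a plain-string content field. A usage sketch, assuming Message is a keyword-constructible dataclass (the decorator sits above this hunk and is not shown):

import base64
from stratifyai.models import Message  # module path from the file header above

with open("chart.png", "rb") as f:
    b64 = base64.b64encode(f.read()).decode("ascii")

# Inline-image convention from this hunk: text, the [IMAGE:<mime>] marker, then base64.
msg = Message(role="user", content=f"What does this chart show?\n[IMAGE:image/png]\n{b64}")

assert msg.has_image()
text, image = msg.parse_vision_content()
# text  -> "What does this chart show?"
# image -> ("image/png", b64)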
@@ -94,7 +94,33 @@ class AnthropicProvider(BaseProvider):
              if msg.role == "system":
                  system_message = msg.content
              else:
-                 message_dict = {"role": msg.role, "content": msg.content}
+                 # Check if message contains image data
+                 if msg.has_image():
+                     # Parse vision content
+                     text_content, image_data = msg.parse_vision_content()
+
+                     # Build vision message content array
+                     content_parts = []
+                     if text_content:
+                         content_parts.append({"type": "text", "text": text_content})
+
+                     if image_data:
+                         mime_type, base64_data = image_data
+                         # Anthropic expects base64 with source
+                         content_parts.append({
+                             "type": "image",
+                             "source": {
+                                 "type": "base64",
+                                 "media_type": mime_type,
+                                 "data": base64_data
+                             }
+                         })
+
+                     message_dict = {"role": msg.role, "content": content_parts}
+                 else:
+                     # Regular text message
+                     message_dict = {"role": msg.role, "content": msg.content}
+
                  # Add cache_control if present and model supports caching
                  if msg.cache_control and self.supports_caching(request.model):
                      message_dict["cache_control"] = msg.cache_control
@@ -138,8 +164,16 @@ class AnthropicProvider(BaseProvider):
              # Normalize and return
              return self._normalize_response(raw_response.model_dump())
          except Exception as e:
+             error_str = str(e)
+             # Check for vision-related errors
+             if "image" in error_str.lower() and ("not supported" in error_str.lower() or "invalid" in error_str.lower()):
+                 raise ProviderAPIError(
+                     f"Vision not supported: The model '{request.model}' cannot process images. "
+                     f"Please use a vision-capable Claude model like 'claude-sonnet-4-5' or 'claude-opus-4-5'.",
+                     self.provider_name
+                 )
              raise ProviderAPIError(
-                 f"Chat completion failed: {str(e)}",
+                 f"Chat completion failed: {error_str}",
                  self.provider_name
              )
 
@@ -170,7 +204,7 @@ class AnthropicProvider(BaseProvider):
              constraints.get("max_temperature", 1.0)
          )
 
-         # Convert messages to Anthropic format
+         # Convert messages to Anthropic format with vision support
          system_message = None
          messages = []
 
@@ -178,7 +212,25 @@ class AnthropicProvider(BaseProvider):
              if msg.role == "system":
                  system_message = msg.content
              else:
-                 messages.append({"role": msg.role, "content": msg.content})
+                 if msg.has_image():
+                     # Parse and format vision content
+                     text_content, image_data = msg.parse_vision_content()
+                     content_parts = []
+                     if text_content:
+                         content_parts.append({"type": "text", "text": text_content})
+                     if image_data:
+                         mime_type, base64_data = image_data
+                         content_parts.append({
+                             "type": "image",
+                             "source": {
+                                 "type": "base64",
+                                 "media_type": mime_type,
+                                 "data": base64_data
+                             }
+                         })
+                     messages.append({"role": msg.role, "content": content_parts})
+                 else:
+                     messages.append({"role": msg.role, "content": msg.content})
 
          # Build request parameters
          anthropic_params = {
@@ -196,8 +248,16 @@ class AnthropicProvider(BaseProvider):
              async for chunk in stream.text_stream:
                  yield self._normalize_stream_chunk(chunk)
          except Exception as e:
+             error_str = str(e)
+             # Check for vision-related errors
+             if "image" in error_str.lower() and ("not supported" in error_str.lower() or "invalid" in error_str.lower()):
+                 raise ProviderAPIError(
+                     f"Vision not supported: The model '{request.model}' cannot process images. "
+                     f"Please use a vision-capable Claude model like 'claude-sonnet-4-5' or 'claude-opus-4-5'.",
+                     self.provider_name
+                 )
              raise ProviderAPIError(
-                 f"Streaming chat completion failed: {str(e)}",
+                 f"Streaming chat completion failed: {error_str}",
                  self.provider_name
              )
 
@@ -161,6 +161,21 @@ class BedrockProvider(BaseProvider):
          except ClientError as e:
              error_code = e.response["Error"]["Code"]
              error_message = e.response["Error"]["Message"]
+
+             # Parse and provide user-friendly error messages
+             if error_code == "ValidationException":
+                 # Extract specific validation issues
+                 if "is not less or equal to" in error_message and "/p:" in error_message:
+                     friendly_msg = "Model configuration error: top_p parameter exceeds maximum allowed value for this model."
+                 elif "is not a valid enum value" in error_message and "role" in error_message:
+                     friendly_msg = "Model configuration error: Invalid message role format for this model."
+                 else:
+                     friendly_msg = f"Request validation failed: {error_message}"
+                 raise ProviderAPIError(
+                     f"[bedrock] {friendly_msg}",
+                     self.provider_name
+                 )
+
              raise ProviderAPIError(
                  f"Bedrock API error ({error_code}): {error_message}",
                  self.provider_name
@@ -223,6 +238,20 @@ class BedrockProvider(BaseProvider):
          except ClientError as e:
              error_code = e.response["Error"]["Code"]
              error_message = e.response["Error"]["Message"]
+
+             # Parse and provide user-friendly error messages
+             if error_code == "ValidationException":
+                 if "is not less or equal to" in error_message and "/p:" in error_message:
+                     friendly_msg = "Model configuration error: top_p parameter exceeds maximum allowed value for this model."
+                 elif "is not a valid enum value" in error_message and "role" in error_message:
+                     friendly_msg = "Model configuration error: Invalid message role format for this model."
+                 else:
+                     friendly_msg = f"Request validation failed: {error_message}"
+                 raise ProviderAPIError(
+                     f"[bedrock] {friendly_msg}",
+                     self.provider_name
+                 )
+
              raise ProviderAPIError(
                  f"Bedrock streaming error ({error_code}): {error_message}",
                  self.provider_name
@@ -292,7 +321,29 @@ class BedrockProvider(BaseProvider):
              if msg.role == "system":
                  system_message = msg.content
              else:
-                 messages.append({"role": msg.role, "content": msg.content})
+                 # Check if message contains an image
+                 if msg.has_image():
+                     # Parse vision content
+                     text_content, (mime_type, base64_data) = msg.parse_vision_content()
+
+                     # Build content array for vision (Anthropic format)
+                     content_parts = []
+                     if text_content:
+                         content_parts.append({"type": "text", "text": text_content})
+
+                     # Add image in Anthropic format
+                     content_parts.append({
+                         "type": "image",
+                         "source": {
+                             "type": "base64",
+                             "media_type": mime_type,
+                             "data": base64_data
+                         }
+                     })
+
+                     messages.append({"role": msg.role, "content": content_parts})
+                 else:
+                     messages.append({"role": msg.role, "content": msg.content})
 
          body = {
              "anthropic_version": "bedrock-2023-05-31",
@@ -338,17 +389,33 @@ class BedrockProvider(BaseProvider):
 
      def _build_cohere_request(self, request: ChatRequest) -> dict:
          """Build request for Cohere models."""
-         # Cohere uses a message-based format similar to OpenAI
-         messages = []
-         for msg in request.messages:
-             messages.append({"role": msg.role, "message": msg.content})
+         # Cohere Bedrock uses USER/CHATBOT roles and requires specific format
+         # Extract user message (last message should be from user)
+         user_message = ""
+         chat_history = []
+
+         for i, msg in enumerate(request.messages):
+             # Skip system messages - Cohere handles them differently
+             if msg.role == "system":
+                 continue
+
+             # Last user message becomes the main message
+             if i == len(request.messages) - 1 and msg.role == "user":
+                 user_message = msg.content
+             else:
+                 # Map role names to Cohere's expected format
+                 cohere_role = "USER" if msg.role == "user" else "CHATBOT"
+                 chat_history.append({"role": cohere_role, "message": msg.content})
+
+         # Clamp top_p to Cohere's maximum of 0.99
+         top_p = min(request.top_p, 0.99)
 
          return {
-             "message": messages[-1]["message"] if messages else "",
-             "chat_history": messages[:-1] if len(messages) > 1 else [],
+             "message": user_message,
+             "chat_history": chat_history,
              "max_tokens": request.max_tokens or 2048,
              "temperature": request.temperature,
-             "p": request.top_p,
+             "p": top_p,
          }
 
      def _build_nova_request(self, request: ChatRequest) -> dict:
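
The rewrite changes two things: the final user turn becomes message while earlier turns move to chat_history under Cohere's USER/CHATBOT role names, and top_p is clamped to Cohere's 0.99 ceiling (the same limit the ValidationException handler above reports). A standalone re-creation of the mapping (data and variable names illustrative):

history_in = [
    {"role": "system", "content": "Be terse."},
    {"role": "user", "content": "hi"},
    {"role": "assistant", "content": "hello"},
    {"role": "user", "content": "summarize our chat"},
]

chat_history, user_message = [], ""
for i, m in enumerate(history_in):
    if m["role"] == "system":
        continue  # system prompts are handled separately for Cohere on Bedrock
    if i == len(history_in) - 1 and m["role"] == "user":
        user_message = m["content"]  # last user turn becomes the main message
    else:
        chat_history.append({
            "role": "USER" if m["role"] == "user" else "CHATBOT",
            "message": m["content"],
        })

# user_message == "summarize our chat"
# chat_history == [{"role": "USER", "message": "hi"},
#                  {"role": "CHATBOT", "message": "hello"}]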
@@ -361,7 +428,27 @@ class BedrockProvider(BaseProvider):
              if msg.role == "system":
                  system_message = msg.content
              else:
-                 messages.append({"role": msg.role, "content": [{"text": msg.content}]})
+                 # Check if message contains an image
+                 if msg.has_image():
+                     # Parse vision content
+                     text_content, (mime_type, base64_data) = msg.parse_vision_content()
+
+                     # Build content array for vision (Nova format)
+                     content_parts = []
+                     if text_content:
+                         content_parts.append({"text": text_content})
+
+                     # Add image in Nova format
+                     content_parts.append({
+                         "image": {
+                             "format": mime_type.split("/")[1] if "/" in mime_type else "png",
+                             "source": {"bytes": base64_data}
+                         }
+                     })
+
+                     messages.append({"role": msg.role, "content": content_parts})
+                 else:
+                     messages.append({"role": msg.role, "content": [{"text": msg.content}]})
 
          body = {
              "messages": messages,
@@ -20,13 +20,14 @@ class GrokProvider(OpenAICompatibleProvider):
          Initialize Grok provider.
 
          Args:
-             api_key: Grok API key (defaults to GROK_API_KEY env var)
+             api_key: Grok API key (defaults to XAI_API_KEY or GROK_API_KEY env var)
              config: Optional provider-specific configuration
 
          Raises:
              AuthenticationError: If API key not provided
          """
-         api_key = api_key or os.getenv("GROK_API_KEY")
+         # Support both XAI_API_KEY (official) and GROK_API_KEY (legacy) for backward compatibility
+         api_key = api_key or os.getenv("XAI_API_KEY") or os.getenv("GROK_API_KEY")
          if not api_key:
              raise AuthenticationError("grok")
 
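Existing deployments keep working: XAI_API_KEY wins when both variables are set, and GROK_API_KEY remains a fallback. A minimal sketch (the stratifyai.providers.grok import path is an assumption, not shown in this diff):

import os

os.environ["XAI_API_KEY"] = "xai-..."  # official variable, consulted first
# A legacy GROK_API_KEY, if present, is only used when XAI_API_KEY is unset.

from stratifyai.providers.grok import GrokProvider  # import path assumed

provider = GrokProvider()  # resolves the key from XAI_API_KEY here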
@@ -87,7 +87,30 @@ class OpenAIProvider(BaseProvider):
          # Build OpenAI-specific request parameters
          messages = []
          for msg in request.messages:
-             message_dict = {"role": msg.role, "content": msg.content}
+             # Check if message contains image data
+             if msg.has_image():
+                 # Parse vision content
+                 text_content, image_data = msg.parse_vision_content()
+
+                 # Build vision message content array
+                 content_parts = []
+                 if text_content:
+                     content_parts.append({"type": "text", "text": text_content})
+
+                 if image_data:
+                     mime_type, base64_data = image_data
+                     # OpenAI expects data URL format
+                     image_url = f"data:{mime_type};base64,{base64_data}"
+                     content_parts.append({
+                         "type": "image_url",
+                         "image_url": {"url": image_url}
+                     })
+
+                 message_dict = {"role": msg.role, "content": content_parts}
+             else:
+                 # Regular text message
+                 message_dict = {"role": msg.role, "content": msg.content}
+
              # Add cache_control if present and model supports caching
              if msg.cache_control and self.supports_caching(request.model):
                  message_dict["cache_control"] = msg.cache_control
@@ -143,8 +166,16 @@ class OpenAIProvider(BaseProvider):
              # Normalize and return
              return self._normalize_response(raw_response.model_dump())
          except Exception as e:
+             error_str = str(e)
+             # Check for vision-related errors
+             if "image_url is only supported by certain models" in error_str or "Invalid content type" in error_str:
+                 raise ProviderAPIError(
+                     f"Vision not supported: The model '{request.model}' cannot process images. "
+                     f"Please use a vision-capable model like 'gpt-4o' or 'gpt-4o-mini'.",
+                     self.provider_name
+                 )
              raise ProviderAPIError(
-                 f"Chat completion failed: {str(e)}",
+                 f"Chat completion failed: {error_str}",
                  self.provider_name
              )
 
@@ -167,13 +198,29 @@ class OpenAIProvider(BaseProvider):
          if not self.validate_model(request.model):
              raise InvalidModelError(request.model, self.provider_name)
 
-         # Build request parameters
+         # Build request parameters with vision support
+         messages = []
+         for msg in request.messages:
+             if msg.has_image():
+                 # Parse and format vision content
+                 text_content, image_data = msg.parse_vision_content()
+                 content_parts = []
+                 if text_content:
+                     content_parts.append({"type": "text", "text": text_content})
+                 if image_data:
+                     mime_type, base64_data = image_data
+                     image_url = f"data:{mime_type};base64,{base64_data}"
+                     content_parts.append({
+                         "type": "image_url",
+                         "image_url": {"url": image_url}
+                     })
+                 messages.append({"role": msg.role, "content": content_parts})
+             else:
+                 messages.append({"role": msg.role, "content": msg.content})
+
          openai_params = {
              "model": request.model,
-             "messages": [
-                 {"role": msg.role, "content": msg.content}
-                 for msg in request.messages
-             ],
+             "messages": messages,
              "stream": True,
          }
 
@@ -207,8 +254,16 @@ class OpenAIProvider(BaseProvider):
                  if chunk.choices and chunk.choices[0].delta.content:
                      yield self._normalize_stream_chunk(chunk_dict)
          except Exception as e:
+             error_str = str(e)
+             # Check for vision-related errors
+             if "image_url is only supported by certain models" in error_str or "Invalid content type" in error_str:
+                 raise ProviderAPIError(
+                     f"Vision not supported: The model '{request.model}' cannot process images. "
+                     f"Please use a vision-capable model like 'gpt-4o' or 'gpt-4o-mini'.",
+                     self.provider_name
+                 )
              raise ProviderAPIError(
-                 f"Streaming chat completion failed: {str(e)}",
+                 f"Streaming chat completion failed: {error_str}",
                  self.provider_name
              )