solana-agent 28.2.0__py3-none-any.whl → 28.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- solana_agent/adapters/openai_adapter.py +241 -22
- solana_agent/client/solana_agent.py +4 -1
- solana_agent/interfaces/client/client.py +1 -0
- solana_agent/interfaces/providers/llm.py +12 -0
- solana_agent/interfaces/services/agent.py +1 -0
- solana_agent/interfaces/services/query.py +2 -1
- solana_agent/services/agent.py +64 -57
- solana_agent/services/query.py +29 -11
- {solana_agent-28.2.0.dist-info → solana_agent-28.3.0.dist-info}/METADATA +43 -6
- {solana_agent-28.2.0.dist-info → solana_agent-28.3.0.dist-info}/RECORD +13 -13
- {solana_agent-28.2.0.dist-info → solana_agent-28.3.0.dist-info}/LICENSE +0 -0
- {solana_agent-28.2.0.dist-info → solana_agent-28.3.0.dist-info}/WHEEL +0 -0
- {solana_agent-28.2.0.dist-info → solana_agent-28.3.0.dist-info}/entry_points.txt +0 -0
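In short, 28.3.0 adds image (vision) input support end-to-end: the OpenAI adapter gains a `generate_text_with_images` method with Pillow-based validation and GPT-4.1 patch-based token cost estimation, an optional `images` parameter is threaded through the client, interfaces, and services, and `pillow` becomes a runtime dependency.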
solana_agent/adapters/openai_adapter.py
CHANGED
@@ -5,9 +5,22 @@ These adapters implement the LLMProvider interface for different LLM services.
 """
 
 import logging
-
-
-
+import base64
+import io
+import math
+from typing import (
+    AsyncGenerator,
+    List,
+    Literal,
+    Optional,
+    Type,
+    TypeVar,
+    Dict,
+    Any,
+    Union,
+)
+from PIL import Image
+from openai import AsyncOpenAI, OpenAIError
 from pydantic import BaseModel
 import instructor
 from instructor import Mode
@@ -21,12 +34,23 @@ logger = logging.getLogger(__name__)
 T = TypeVar("T", bound=BaseModel)
 
 DEFAULT_CHAT_MODEL = "gpt-4.1"
+DEFAULT_VISION_MODEL = "gpt-4.1"
 DEFAULT_PARSE_MODEL = "gpt-4.1-nano"
 DEFAULT_EMBEDDING_MODEL = "text-embedding-3-large"
 DEFAULT_EMBEDDING_DIMENSIONS = 3072
 DEFAULT_TRANSCRIPTION_MODEL = "gpt-4o-mini-transcribe"
 DEFAULT_TTS_MODEL = "tts-1"
 
+# Image constants
+SUPPORTED_IMAGE_FORMATS = {"PNG", "JPEG", "WEBP", "GIF"}
+MAX_IMAGE_SIZE_MB = 20
+MAX_TOTAL_IMAGE_SIZE_MB = 50
+MAX_IMAGE_COUNT = 500
+GPT41_PATCH_SIZE = 32
+GPT41_MAX_PATCHES = 1536
+GPT41_MINI_MULTIPLIER = 1.62
+GPT41_NANO_MULTIPLIER = 2.46
+
 
 class OpenAIAdapter(LLMProvider):
     """OpenAI implementation of LLMProvider with web search capabilities."""
@@ -39,13 +63,14 @@ class OpenAIAdapter(LLMProvider):
         try:
             logfire.configure(token=logfire_api_key)
             self.logfire = True
-            logger.info("Logfire configured successfully.")
+            logger.info("Logfire configured successfully.")
         except Exception as e:
-            logger.error(f"Failed to configure Logfire: {e}")
+            logger.error(f"Failed to configure Logfire: {e}")
             self.logfire = False
 
         self.parse_model = DEFAULT_PARSE_MODEL
         self.text_model = DEFAULT_CHAT_MODEL
+        self.vision_model = DEFAULT_VISION_MODEL  # Add vision model attribute
         self.transcription_model = DEFAULT_TRANSCRIPTION_MODEL
         self.tts_model = DEFAULT_TTS_MODEL
         self.embedding_model = DEFAULT_EMBEDDING_MODEL
@@ -139,20 +164,17 @@ class OpenAIAdapter(LLMProvider):
         base_url: Optional[str] = None,
         model: Optional[str] = None,
     ) -> str:  # pragma: no cover
-        """Generate text from OpenAI models as a single string."""
+        """Generate text from OpenAI models as a single string (no images)."""
         messages = []
         if system_prompt:
             messages.append({"role": "system", "content": system_prompt})
         messages.append({"role": "user", "content": prompt})
 
-        # Prepare request parameters - stream is always False now
         request_params = {
             "messages": messages,
-            "stream": False,  # Hardcoded to False
             "model": model or self.text_model,
         }
 
-        # Determine client based on provided api_key/base_url
        if api_key and base_url:
            client = AsyncOpenAI(api_key=api_key, base_url=base_url)
        else:
@@ -162,24 +184,221 @@
             logfire.instrument_openai(client)
 
         try:
-            # Make the non-streaming API call
             response = await client.chat.completions.create(**request_params)
-
-            # Handle non-streaming response
             if response.choices and response.choices[0].message.content:
-
-                return full_text  # Return the complete string
+                return response.choices[0].message.content
             else:
-                logger.warning(
-
-
-
-
+                logger.warning("Received non-streaming response with no content.")
+                return ""
+        except OpenAIError as e:  # Catch specific OpenAI errors
+            logger.error(f"OpenAI API error during text generation: {e}")
+            return f"I apologize, but I encountered an API error: {e}"
         except Exception as e:
-            # Log the exception and return an error message string
             logger.exception(f"Error in generate_text: {e}")
-
-
+            return f"I apologize, but I encountered an unexpected error: {e}"
+
+    def _calculate_gpt41_image_cost(self, width: int, height: int, model: str) -> int:
+        """Calculates the token cost for an image with GPT-4.1 models."""
+        patches_wide = math.ceil(width / GPT41_PATCH_SIZE)
+        patches_high = math.ceil(height / GPT41_PATCH_SIZE)
+        total_patches_needed = patches_wide * patches_high
+
+        if total_patches_needed > GPT41_MAX_PATCHES:
+            scale_factor = math.sqrt(GPT41_MAX_PATCHES / total_patches_needed)
+            new_width = math.floor(width * scale_factor)
+            new_height = math.floor(height * scale_factor)
+
+            final_patches_wide_scaled = math.ceil(new_width / GPT41_PATCH_SIZE)
+            final_patches_high_scaled = math.ceil(new_height / GPT41_PATCH_SIZE)
+            image_tokens = final_patches_wide_scaled * final_patches_high_scaled
+
+            # Ensure it doesn't exceed the cap due to ceiling operations after scaling
+            image_tokens = min(image_tokens, GPT41_MAX_PATCHES)
+
+            logger.debug(
+                f"Image scaled down. Original patches: {total_patches_needed}, New dims: ~{new_width}x{new_height}, Final patches: {image_tokens}"
+            )
+
+        else:
+            image_tokens = total_patches_needed
+            logger.debug(f"Image fits within patch limit. Patches: {image_tokens}")
+
+        # Apply model-specific multiplier
+        if "mini" in model:
+            total_tokens = math.ceil(image_tokens * GPT41_MINI_MULTIPLIER)
+        elif "nano" in model:
+            total_tokens = math.ceil(image_tokens * GPT41_NANO_MULTIPLIER)
+        else:  # Assume base gpt-4.1
+            total_tokens = image_tokens
+
+        logger.info(
+            f"Calculated token cost for image ({width}x{height}) with model '{model}': {total_tokens} tokens (base image tokens: {image_tokens})"
+        )
+        return total_tokens
+
+    async def generate_text_with_images(
+        self,
+        prompt: str,
+        images: List[Union[str, bytes]],
+        system_prompt: str = "",
+        detail: Literal["low", "high", "auto"] = "auto",
+    ) -> str:  # pragma: no cover
+        """Generate text from OpenAI models using text and image inputs."""
+        if not images:
+            logger.warning(
+                "generate_text_with_images called with no images. Falling back to generate_text."
+            )
+            return await self.generate_text(prompt, system_prompt)
+
+        target_model = self.vision_model
+        if "gpt-4.1" not in target_model:  # Basic check for vision model
+            logger.warning(
+                f"Model '{target_model}' might not support vision. Using it anyway."
+            )
+
+        content_list: List[Dict[str, Any]] = [{"type": "text", "text": prompt}]
+        total_image_bytes = 0
+        total_image_tokens = 0
+
+        if len(images) > MAX_IMAGE_COUNT:
+            logger.error(
+                f"Too many images provided ({len(images)}). Maximum is {MAX_IMAGE_COUNT}."
+            )
+            return f"Error: Too many images provided ({len(images)}). Maximum is {MAX_IMAGE_COUNT}."
+
+        for i, image_input in enumerate(images):
+            image_url_data: Dict[str, Any] = {"detail": detail}
+            image_bytes: Optional[bytes] = None
+            image_format: Optional[str] = None
+            width: Optional[int] = None
+            height: Optional[int] = None
+
+            try:
+                if isinstance(image_input, str):  # It's a URL
+                    logger.debug(f"Processing image URL: {image_input[:50]}...")
+                    image_url_data["url"] = image_input
+                    # Cannot easily validate size/format/dimensions or calculate cost for URLs
+                    logger.warning(
+                        "Cannot validate size/format or calculate token cost for image URLs."
+                    )
+
+                elif isinstance(image_input, bytes):  # It's image bytes
+                    logger.debug(
+                        f"Processing image bytes (size: {len(image_input)})..."
+                    )
+                    image_bytes = image_input
+                    size_mb = len(image_bytes) / (1024 * 1024)
+                    if size_mb > MAX_IMAGE_SIZE_MB:
+                        logger.error(
+                            f"Image {i + 1} size ({size_mb:.2f}MB) exceeds limit ({MAX_IMAGE_SIZE_MB}MB)."
+                        )
+                        return f"Error: Image {i + 1} size ({size_mb:.2f}MB) exceeds limit ({MAX_IMAGE_SIZE_MB}MB)."
+                    total_image_bytes += len(image_bytes)
+
+                    # Use Pillow to validate format and get dimensions
+                    try:
+                        img = Image.open(io.BytesIO(image_bytes))
+                        image_format = img.format
+                        width, height = img.size
+                        img.verify()  # Verify integrity
+                        # Re-open after verify
+                        img = Image.open(io.BytesIO(image_bytes))
+                        width, height = img.size  # Get dimensions again
+
+                        if image_format not in SUPPORTED_IMAGE_FORMATS:
+                            logger.error(
+                                f"Unsupported image format '{image_format}' for image {i + 1}."
+                            )
+                            return f"Error: Unsupported image format '{image_format}'. Supported formats: {SUPPORTED_IMAGE_FORMATS}."
+
+                        logger.debug(
+                            f"Image {i + 1}: Format={image_format}, Dimensions={width}x{height}"
+                        )
+
+                        # Calculate cost only if dimensions are available
+                        if width and height and "gpt-4.1" in target_model:
+                            total_image_tokens += self._calculate_gpt41_image_cost(
+                                width, height, target_model
+                            )
+
+                    except (IOError, SyntaxError) as img_err:
+                        logger.error(
+                            f"Invalid or corrupted image data for image {i + 1}: {img_err}"
+                        )
+                        return f"Error: Invalid or corrupted image data provided for image {i + 1}."
+                    except Exception as pillow_err:
+                        logger.error(
+                            f"Pillow error processing image {i + 1}: {pillow_err}"
+                        )
+                        return f"Error: Could not process image data for image {i + 1}."
+
+                    # Encode to Base64 Data URL
+                    mime_type = Image.MIME.get(image_format)
+                    if not mime_type:
+                        logger.warning(
+                            f"Could not determine MIME type for format {image_format}. Defaulting to image/jpeg."
+                        )
+                        mime_type = "image/jpeg"
+                    base64_image = base64.b64encode(image_bytes).decode("utf-8")
+                    image_url_data["url"] = f"data:{mime_type};base64,{base64_image}"
+
+                else:
+                    logger.error(
+                        f"Invalid image input type for image {i + 1}: {type(image_input)}"
+                    )
+                    return f"Error: Invalid image input type for image {i + 1}. Must be URL (str) or bytes."
+
+                content_list.append({"type": "image_url", "image_url": image_url_data})
+
+            except Exception as proc_err:
+                logger.error(
+                    f"Error processing image {i + 1}: {proc_err}", exc_info=True
+                )
+                return f"Error: Failed to process image {i + 1}."
+
+        total_size_mb = total_image_bytes / (1024 * 1024)
+        if total_size_mb > MAX_TOTAL_IMAGE_SIZE_MB:
+            logger.error(
+                f"Total image size ({total_size_mb:.2f}MB) exceeds limit ({MAX_TOTAL_IMAGE_SIZE_MB}MB)."
+            )
+            return f"Error: Total image size ({total_size_mb:.2f}MB) exceeds limit ({MAX_TOTAL_IMAGE_SIZE_MB}MB)."
+
+        messages: List[Dict[str, Any]] = []
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+        messages.append({"role": "user", "content": content_list})
+
+        request_params = {
+            "messages": messages,
+            "model": target_model,
+            # "max_tokens": 300  # Optional: Add max_tokens if needed
+        }
+
+        if self.logfire:
+            logfire.instrument_openai(self.client)
+
+        logger.info(
+            f"Sending request to '{target_model}' with {len(images)} images. Total calculated image tokens (approx): {total_image_tokens}"
+        )
+
+        try:
+            response = await self.client.chat.completions.create(**request_params)
+            if response.choices and response.choices[0].message.content:
+                # Log actual usage if available
+                if response.usage:
+                    logger.info(
+                        f"OpenAI API Usage: Prompt={response.usage.prompt_tokens}, Completion={response.usage.completion_tokens}, Total={response.usage.total_tokens}"
+                    )
+                return response.choices[0].message.content
+            else:
+                logger.warning("Received vision response with no content.")
+                return ""
+        except OpenAIError as e:  # Catch specific OpenAI errors
+            logger.error(f"OpenAI API error during vision request: {e}")
+            return f"I apologize, but I encountered an API error: {e}"
+        except Exception as e:
+            logger.exception(f"Error in generate_text_with_images: {e}")
+            return f"I apologize, but I encountered an unexpected error: {e}"
 
     async def parse_structured_output(
         self,
solana_agent/client/solana_agent.py
CHANGED
@@ -68,8 +68,9 @@ class SolanaAgent(SolanaAgentInterface):
             "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
         ] = "mp4",
         router: Optional[RoutingInterface] = None,
+        images: Optional[List[Union[str, bytes]]] = None,
     ) -> AsyncGenerator[Union[str, bytes], None]:  # pragma: no cover
-        """Process a user message and
+        """Process a user message (text or audio) and optional images, returning the response stream.
 
         Args:
             user_id: User ID
@@ -81,6 +82,7 @@ class SolanaAgent(SolanaAgentInterface):
             audio_output_format: Audio output format
             audio_input_format: Audio input format
             router: Optional routing service for processing
+            images: Optional list of image URLs (str) or image bytes.
 
         Returns:
             Async generator yielding response chunks (text strings or audio bytes)
@@ -88,6 +90,7 @@ class SolanaAgent(SolanaAgentInterface):
         async for chunk in self.query_service.process(
             user_id=user_id,
             query=message,
+            images=images,
             output_format=output_format,
             audio_voice=audio_voice,
             audio_instructions=audio_instructions,
solana_agent/interfaces/client/client.py
CHANGED
@@ -34,6 +34,7 @@ class SolanaAgent(ABC):
             "flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"
         ] = "mp4",
         router: Optional[RoutingInterface] = None,
+        images: Optional[List[Union[str, bytes]]] = None,
     ) -> AsyncGenerator[Union[str, bytes], None]:
         """Process a user message and return the response stream."""
         pass
solana_agent/interfaces/providers/llm.py
CHANGED
@@ -6,6 +6,7 @@ from typing import (
     Optional,
     Type,
     TypeVar,
+    Union,
 )
 
 from pydantic import BaseModel
@@ -91,3 +92,14 @@ class LLMProvider(ABC):
             A list of floats representing the embedding vector.
         """
         pass
+
+    @abstractmethod
+    async def generate_text_with_images(
+        self,
+        prompt: str,
+        images: List[Union[str, bytes]],
+        system_prompt: str = "",
+        detail: Literal["low", "high", "auto"] = "auto",
+    ) -> str:
+        """Generate text from the language model using images."""
+        pass
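Because the new `generate_text_with_images` is declared with `@abstractmethod`, any custom `LLMProvider` implementation outside this package must now define it to remain instantiable. A hypothetical minimal conforming stub (class name illustrative; the other abstract methods are omitted for brevity but would also need bodies):

```python
from typing import List, Literal, Union

from solana_agent.interfaces.providers.llm import LLMProvider


class MyProvider(LLMProvider):  # hypothetical subclass for illustration
    async def generate_text_with_images(
        self,
        prompt: str,
        images: List[Union[str, bytes]],
        system_prompt: str = "",
        detail: Literal["low", "high", "auto"] = "auto",
    ) -> str:
        # Fall back to text-only generation if the backing model has no vision support,
        # mirroring the fallback the OpenAI adapter uses when no images are passed.
        return await self.generate_text(prompt, system_prompt)
```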
solana_agent/interfaces/services/agent.py
CHANGED
@@ -44,6 +44,7 @@ class AgentService(ABC):
             "mp3", "opus", "aac", "flac", "wav", "pcm"
         ] = "aac",
         prompt: Optional[str] = None,
+        images: Optional[List[Union[str, bytes]]] = None,
     ) -> AsyncGenerator[Union[str, bytes], None]:
         """Generate a response from an agent."""
         pass
solana_agent/interfaces/services/query.py
CHANGED
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, AsyncGenerator, Dict, Literal, Optional, Union
+from typing import Any, AsyncGenerator, Dict, List, Literal, Optional, Union
 
 from solana_agent.interfaces.services.routing import RoutingService as RoutingInterface
 
@@ -34,6 +34,7 @@ class QueryService(ABC):
         ] = "mp4",
         prompt: Optional[str] = None,
         router: Optional[RoutingInterface] = None,
+        images: Optional[List[Union[str, bytes]]] = None,
     ) -> AsyncGenerator[Union[str, bytes], None]:
         """Process the user request and generate a response."""
         pass
solana_agent/services/agent.py
CHANGED
@@ -341,6 +341,7 @@ class AgentService(AgentServiceInterface):
         agent_name: str,
         user_id: str,
         query: Union[str, bytes],
+        images: Optional[List[Union[str, bytes]]] = None,
         memory_context: str = "",
         output_format: Literal["text", "audio"] = "text",
         audio_voice: Literal[
@@ -362,16 +363,13 @@ class AgentService(AgentServiceInterface):
         prompt: Optional[str] = None,
     ) -> AsyncGenerator[Union[str, bytes], None]:  # pragma: no cover
         """Generate a response, supporting multiple sequential tool calls with placeholder substitution.
-
-        Text responses are always generated as a single block.
-        Audio responses always buffer text before TTS.
+        Optionally accepts images for vision-capable models.
         """
         agent = next((a for a in self.agents if a.name == agent_name), None)
         if not agent:
             error_msg = f"Agent '{agent_name}' not found."
             logger.warning(error_msg)
             if output_format == "audio":
-                # Assuming tts returns an async generator
                 async for chunk in self.llm_provider.tts(
                     error_msg,
                     instructions=audio_instructions,
@@ -380,11 +378,11 @@ class AgentService(AgentServiceInterface):
             ):
                 yield chunk
             else:
-                yield error_msg
+                yield error_msg
             return
 
         logger.debug(
-            f"Generating response for agent '{agent_name}'. Output format: {output_format}."
+            f"Generating response for agent '{agent_name}'. Output format: {output_format}. Images provided: {bool(images)}."
         )
 
         try:
@@ -406,20 +404,40 @@ class AgentService(AgentServiceInterface):
             start_marker = "[TOOL]"
 
             logger.info(f"Generating initial response for agent '{agent_name}'...")
-
-
-
-
-
-
-
-
+
+            # --- CHOOSE LLM METHOD BASED ON IMAGE PRESENCE ---
+            if images:
+                # Use the new vision method if images are present
+                logger.info(
+                    f"Using generate_text_with_images for {len(images)} images."
+                )
+                # Ensure query is string for the text part
+                text_query = str(query) if isinstance(query, bytes) else query
+                initial_llm_response_buffer = (
+                    await self.llm_provider.generate_text_with_images(
+                        prompt=text_query,
+                        images=images,
+                        system_prompt=final_system_prompt,
+                    )
+                )
+            else:
+                # Use the standard text generation method
+                logger.info("Using generate_text (no images provided).")
+                initial_llm_response_buffer = await self.llm_provider.generate_text(
+                    prompt=str(query),
+                    system_prompt=final_system_prompt,
+                    api_key=self.api_key,
+                    base_url=self.base_url,
+                    model=self.model,
+                )
+            # --- END LLM METHOD CHOICE ---
 
             # Check for errors returned as string by the adapter
-            if isinstance(
-                initial_llm_response_buffer
-
-
+            if isinstance(initial_llm_response_buffer, str) and (
+                initial_llm_response_buffer.startswith(
+                    "I apologize, but I encountered an"
+                )
+                or initial_llm_response_buffer.startswith("Error:")
             ):
                 logger.error(
                     f"LLM provider failed during initial generation: {initial_llm_response_buffer}"
@@ -452,24 +470,23 @@ class AgentService(AgentServiceInterface):
             # --- Tool Execution Phase (if tools were detected) ---
             final_response_text = ""
             if tool_calls_detected:
+                # NOTE: If tools need to operate on image content, this logic needs significant changes.
+                # Assuming for now tools operate based on the text query or the LLM's understanding derived from images.
                 parsed_calls = self._parse_tool_calls(initial_llm_response_buffer)
 
                 if parsed_calls:
-                    #
-                    executed_tool_results = []
-                    # Map tool names to their string results for substitution
+                    # ... (existing sequential tool execution with substitution) ...
+                    executed_tool_results = []
                     tool_results_map: Dict[str, str] = {}
-
                     logger.info(
                         f"Executing {len(parsed_calls)} tools sequentially with substitution..."
                     )
                     for i, call in enumerate(parsed_calls):
+                        # ... (existing substitution logic) ...
                         tool_name_to_exec = call.get("name", "unknown")
                         logger.info(
                             f"Executing tool {i + 1}/{len(parsed_calls)}: {tool_name_to_exec}"
                         )
-
-                        # --- Substitute placeholders in parameters ---
                         try:
                             original_params = call.get("parameters", {})
                             substituted_params = self._substitute_placeholders(
@@ -479,20 +496,17 @@ class AgentService(AgentServiceInterface):
                             logger.info(
                                 f"Substituted parameters for tool '{tool_name_to_exec}': {substituted_params}"
                             )
-                            call["parameters"] = substituted_params
+                            call["parameters"] = substituted_params
                         except Exception as sub_err:
                             logger.error(
                                 f"Error substituting placeholders for tool '{tool_name_to_exec}': {sub_err}",
                                 exc_info=True,
                             )
-                            # Proceed with original params but log the error
 
-                        #
+                        # ... (existing tool execution call) ...
                         try:
                             result = await self._execute_single_tool(agent_name, call)
                             executed_tool_results.append(result)
-
-                            # --- Store successful result string for future substitutions ---
                             if result.get("status") == "success":
                                 tool_result_str = str(result.get("result", ""))
                                 tool_results_map[tool_name_to_exec] = tool_result_str
@@ -500,15 +514,13 @@ class AgentService(AgentServiceInterface):
                                     f"Stored result for '{tool_name_to_exec}' (length: {len(tool_result_str)})"
                                 )
                             else:
-                                # Store error message as result
                                 error_message = result.get("message", "Unknown error")
                                 tool_results_map[tool_name_to_exec] = (
                                     f"Error: {error_message}"
                                 )
                                 logger.warning(
-                                    f"Tool '{tool_name_to_exec}' failed, storing error message
+                                    f"Tool '{tool_name_to_exec}' failed, storing error message."
                                 )
-
                         except Exception as tool_exec_err:
                             logger.error(
                                 f"Exception during execution of tool {tool_name_to_exec}: {tool_exec_err}",
@@ -521,20 +533,15 @@ class AgentService(AgentServiceInterface):
                             }
                             executed_tool_results.append(error_result)
                             tool_results_map[tool_name_to_exec] = (
-                                f"Error: {str(tool_exec_err)}"
+                                f"Error: {str(tool_exec_err)}"
                             )
 
                     logger.info("Sequential tool execution with substitution complete.")
-                    # --- End Sequential Execution ---
 
-                    #
+                    # ... (existing formatting of tool results) ...
                     tool_results_text_parts = []
-                    for i, result in enumerate(
-
-                    ):  # Use the collected results
-                        tool_name = result.get(
-                            "tool_name", "unknown"
-                        )  # Name should be in the result dict now
+                    for i, result in enumerate(executed_tool_results):
+                        tool_name = result.get("tool_name", "unknown")
                         if (
                             isinstance(result, Exception)
                             or result.get("status") == "error"
@@ -556,8 +563,12 @@ class AgentService(AgentServiceInterface):
                     tool_results_context = "\n\n".join(tool_results_text_parts)
 
                     # --- Generate Final Response using Tool Results (No Streaming) ---
-
-
+                    # Include original query (text part) and mention images were provided if applicable
+                    original_query_context = f"Original Query: {str(query)}"
+                    if images:
+                        original_query_context += f" (with {len(images)} image(s))"
+
+                    follow_up_prompt = f"{original_query_context}\n\nRESULTS FROM TOOL CALLS:\n{tool_results_context}\n\nBased on the original query, any provided images, and the tool results, please provide the final response to the user."
                     follow_up_system_prompt_parts = [
                         self.get_agent_system_prompt(agent_name)
                     ]
@@ -571,7 +582,7 @@ class AgentService(AgentServiceInterface):
                             f"\nORIGINAL ADDITIONAL PROMPT:\n{prompt}"
                         )
                     follow_up_system_prompt_parts.append(
-                        f"\nCONTEXT: You previously decided to run {len(parsed_calls)} tool(s) sequentially
+                        f"\nCONTEXT: You previously decided to run {len(parsed_calls)} tool(s) sequentially. The results are provided above."
                     )
                     final_follow_up_system_prompt = "\n\n".join(
                         filter(None, follow_up_system_prompt_parts)
@@ -580,25 +591,25 @@ class AgentService(AgentServiceInterface):
                    logger.info(
                        "Generating final response incorporating tool results..."
                    )
-                    #
+                    # Use standard text generation for the final synthesis
                    synthesized_response_buffer = await self.llm_provider.generate_text(
                        prompt=follow_up_prompt,
                        system_prompt=final_follow_up_system_prompt,
                        api_key=self.api_key,
                        base_url=self.base_url,
-                        model=self.model
+                        model=self.model
+                        or self.llm_provider.text_model,  # Use text model for synthesis
                    )
 
-
-
-
-
-
+                    if isinstance(synthesized_response_buffer, str) and (
+                        synthesized_response_buffer.startswith(
+                            "I apologize, but I encountered an"
+                        )
+                        or synthesized_response_buffer.startswith("Error:")
                    ):
                        logger.error(
                            f"LLM provider failed during final generation: {synthesized_response_buffer}"
                        )
-                        # Yield the error and exit
                        if output_format == "audio":
                            async for chunk in self.llm_provider.tts(
                                synthesized_response_buffer,
@@ -617,13 +628,11 @@ class AgentService(AgentServiceInterface):
                            )
 
                else:
-                    # Tools detected but parsing failed
                    logger.warning(
                        "Tool markers detected, but no valid tool calls parsed. Treating initial response as final."
                    )
                    final_response_text = initial_llm_response_buffer
            else:
-                # No tools detected
                final_response_text = initial_llm_response_buffer
                logger.info("No tools detected. Using initial response as final.")
 
@@ -641,7 +650,7 @@ class AgentService(AgentServiceInterface):
                )
            except Exception as e:
                logger.error(
-                    f"Error applying output guardrail {guardrail.__class__.__name__}
+                    f"Error applying output guardrail {guardrail.__class__.__name__}: {e}"
                )
            if len(processed_final_text) != original_len:
                logger.info(
@@ -651,14 +660,12 @@ class AgentService(AgentServiceInterface):
            self.last_text_response = processed_final_text
 
            if output_format == "text":
-                # Yield the single final string
                if processed_final_text:
                    yield processed_final_text
                else:
                    logger.warning("Final processed text was empty.")
                    yield ""
            elif output_format == "audio":
-                # TTS still needs a generator
                text_for_tts = processed_final_text
                cleaned_audio_buffer = self._clean_for_audio(text_for_tts)
                logger.info(
solana_agent/services/query.py
CHANGED
@@ -22,9 +22,8 @@ from solana_agent.interfaces.services.knowledge_base import (
 )
 from solana_agent.interfaces.guardrails.guardrails import (
     InputGuardrail,
-)
+)
 
-# Service imports (assuming AgentService is the concrete implementation)
 from solana_agent.services.agent import AgentService
 from solana_agent.services.routing import RoutingService
 
@@ -58,12 +57,13 @@ class QueryService(QueryServiceInterface):
        self.memory_provider = memory_provider
        self.knowledge_base = knowledge_base
        self.kb_results_count = kb_results_count
-        self.input_guardrails = input_guardrails or []
+        self.input_guardrails = input_guardrails or []
 
    async def process(
        self,
        user_id: str,
        query: Union[str, bytes],
+        images: Optional[List[Union[str, bytes]]] = None,
        output_format: Literal["text", "audio"] = "text",
        audio_voice: Literal[
            "alloy",
@@ -92,6 +92,7 @@ class QueryService(QueryServiceInterface):
        Args:
            user_id: User ID
            query: Text query or audio bytes
+            images: Optional list of image URLs (str) or image bytes.
            output_format: Response format ("text" or "audio")
            audio_voice: Voice for TTS (text-to-speech)
            audio_instructions: Audio voice instructions
@@ -143,7 +144,14 @@ class QueryService(QueryServiceInterface):
            # --- End Apply Input Guardrails ---
 
            # --- 3. Handle Simple Greetings ---
-
+            # Simple greetings typically don't involve images
+            if not images and user_text.strip().lower() in [
+                "test",
+                "hello",
+                "hi",
+                "hey",
+                "ping",
+            ]:
                response = "Hello! How can I help you today?"
                logger.info("Handling simple greeting.")
                if output_format == "audio":
@@ -201,7 +209,7 @@ class QueryService(QueryServiceInterface):
            # --- 6. Route Query ---
            agent_name = "default"  # Fallback agent
            try:
-                # Use processed user_text for routing
+                # Use processed user_text for routing (images generally don't affect routing logic here)
                if router:
                    agent_name = await router.route_query(user_text)
                else:
@@ -225,12 +233,13 @@ class QueryService(QueryServiceInterface):
            logger.debug(f"Combined context length: {len(combined_context)}")
 
            # --- 8. Generate Response ---
-            # Pass the processed user_text to the agent service
+            # Pass the processed user_text and images to the agent service
            if output_format == "audio":
                async for audio_chunk in self.agent_service.generate_response(
                    agent_name=agent_name,
                    user_id=user_id,
                    query=user_text,  # Pass processed text
+                    images=images,
                    memory_context=combined_context,
                    output_format="audio",
                    audio_voice=audio_voice,
@@ -241,10 +250,11 @@ class QueryService(QueryServiceInterface):
                    yield audio_chunk
 
                # Store conversation using processed user_text
+                # Note: Storing images in history is not directly supported by current memory provider interface
                if self.memory_provider:
                    await self._store_conversation(
                        user_id=user_id,
-                        user_message=user_text,
+                        user_message=user_text,  # Store only text part of user query
                        assistant_message=self.agent_service.last_text_response,
                    )
            else:
@@ -253,6 +263,7 @@ class QueryService(QueryServiceInterface):
                    agent_name=agent_name,
                    user_id=user_id,
                    query=user_text,  # Pass processed text
+                    images=images,  # <-- Pass images
                    memory_context=combined_context,
                    output_format="text",
                    prompt=prompt,
@@ -261,10 +272,11 @@ class QueryService(QueryServiceInterface):
                    full_text_response += chunk
 
                # Store conversation using processed user_text
+                # Note: Storing images in history is not directly supported by current memory provider interface
                if self.memory_provider and full_text_response:
                    await self._store_conversation(
                        user_id=user_id,
-                        user_message=user_text,
+                        user_message=user_text,  # Store only text part of user query
                        assistant_message=full_text_response,
                    )
 
@@ -370,11 +382,15 @@ class QueryService(QueryServiceInterface):
                    if conv.get("timestamp")
                    else None
                )
+                # Assuming the stored format matches what _store_conversation saves
+                # (which currently only stores text messages)
                formatted_conversations.append(
                    {
                        "id": str(conv.get("_id")),
-                        "user_message": conv.get("user_message"),
-                        "assistant_message": conv.get(
+                        "user_message": conv.get("user_message"),  # Or how it's stored
+                        "assistant_message": conv.get(
+                            "assistant_message"
+                        ),  # Or how it's stored
                        "timestamp": timestamp,
                    }
                )
@@ -413,11 +429,13 @@ class QueryService(QueryServiceInterface):
 
        Args:
            user_id: User ID
-            user_message: User message (potentially processed by input guardrails)
+            user_message: User message (text part, potentially processed by input guardrails)
            assistant_message: Assistant message (potentially processed by output guardrails)
        """
        if self.memory_provider:
            try:
+                # Store only the text parts for now, as memory provider interface
+                # doesn't explicitly handle image data storage in history.
                await self.memory_provider.store(
                    user_id,
                    [
{solana_agent-28.2.0.dist-info → solana_agent-28.3.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: solana-agent
-Version: 28.2.0
+Version: 28.3.0
 Summary: AI Agents for Solana
 License: MIT
 Keywords: solana,solana ai,solana agent,ai,ai agent,ai agents
@@ -15,10 +15,11 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Dist: instructor (>=1.7.9,<2.0.0)
-Requires-Dist: llama-index-core (>=0.12.
+Requires-Dist: llama-index-core (>=0.12.32,<0.13.0)
 Requires-Dist: llama-index-embeddings-openai (>=0.3.1,<0.4.0)
 Requires-Dist: logfire (>=3.14.0,<4.0.0)
 Requires-Dist: openai (>=1.75.0,<2.0.0)
+Requires-Dist: pillow (>=11.2.1,<12.0.0)
 Requires-Dist: pinecone (>=6.0.2,<7.0.0)
 Requires-Dist: pydantic (>=2)
 Requires-Dist: pymongo (>=4.12.0,<5.0.0)
@@ -26,7 +27,7 @@ Requires-Dist: pypdf (>=5.4.0,<6.0.0)
 Requires-Dist: rich (>=13)
 Requires-Dist: scrubadub (>=2.0.1,<3.0.0)
 Requires-Dist: typer (>=0.15.2,<0.16.0)
-Requires-Dist: zep-cloud (>=2.10.
+Requires-Dist: zep-cloud (>=2.10.2,<3.0.0)
 Project-URL: Documentation, https://docs.solana-agent.com
 Project-URL: Homepage, https://solana-agent.com
 Project-URL: Repository, https://github.com/truemagic-coder/solana-agent
@@ -56,7 +57,7 @@ Build your AI agents in three lines of code!
 * Fast Responses
 * Solana Ecosystem Integration
 * Multi-Agent Swarm
-* Multi-Modal
+* Multi-Modal (Images & Audio & Text)
 * Conversational Memory & History
 * Internet Search
 * Intelligent Routing
@@ -80,7 +81,7 @@ Build your AI agents in three lines of code!
 * MCP tool usage with first-class support for [Zapier](https://zapier.com/mcp)
 * Integrated observability and tracing via [Pydantic Logfire](https://pydantic.dev/logfire)
 * Designed for a multi-agent swarm
-* Seamless
+* Seamless streaming with real-time multi-modal processing of text, audio, and images
 * Persistent memory that preserves context across all agent interactions
 * Quick Internet search to answer users' queries
 * Streamlined message history for all agent interactions
@@ -286,6 +287,42 @@ async for response in solana_agent.process("user123", audio_content, audio_input
     print(response, end="")
 ```
 
+### Image/Text Streaming
+
+```python
+from solana_agent import SolanaAgent
+
+config = {
+    "openai": {
+        "api_key": "your-openai-api-key",
+    },
+    "agents": [
+        {
+            "name": "vision_expert",
+            "instructions": "You are an expert at analyzing images and answering questions about them.",
+            "specialization": "Image analysis",
+        }
+    ],
+}
+
+solana_agent = SolanaAgent(config=config)
+
+# Example with an image URL
+image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+
+# Example reading image bytes from a file
+image_bytes = await image_file.read()
+
+# You can mix URLs and bytes in the list
+images_to_process = [
+    image_url,
+    image_bytes,
+]
+
+async for response in solana_agent.process("user123", "What is in this image? Describe the scene.", images=images_to_process):
+    print(response, end="")
+```
+
 ### Command Line Interface (CLI)
 
 Solana Agent includes a command-line interface (CLI) for text-based chat using a configuration file.
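One caveat on the image streaming example above: `image_file` is never defined in the snippet; it presumably stands for an already-opened async file object (for example a FastAPI `UploadFile`), since `await image_file.read()` only works on an async reader. A plain standard-library equivalent for obtaining the bytes would be:

```python
# Synchronous alternative for loading image bytes (sketch):
with open("photo.jpg", "rb") as f:
    image_bytes = f.read()
```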
@@ -585,7 +622,7 @@ config = {
            "rpc_url": "your-solana-rpc-url",
        },
    },
-    "
+    "agents": [
        {
            "name": "solana_expert",
            "instructions": "You are an expert Solana blockchain assistant. You always use the Solana tool to perform actions on the Solana blockchain.",
{solana_agent-28.2.0.dist-info → solana_agent-28.3.0.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
 solana_agent/__init__.py,sha256=g83qhMOCwcWL19V4CYbQwl0Ykpb0xn49OUh05i-pu3g,1001
 solana_agent/adapters/__init__.py,sha256=tiEEuuy0NF3ngc_tGEcRTt71zVI58v3dYY9RvMrF2Cg,204
 solana_agent/adapters/mongodb_adapter.py,sha256=0KWIa6kaFbUFvtKUzuV_0p0RFlPPGKrDVIEU2McVY3k,2734
-solana_agent/adapters/openai_adapter.py,sha256=
+solana_agent/adapters/openai_adapter.py,sha256=XnocNAV1nJGcjpRgOyMXnyDQSU8HvTx9zmb4pWtSb58,23432
 solana_agent/adapters/pinecone_adapter.py,sha256=XlfOpoKHwzpaU4KZnovO2TnEYbsw-3B53ZKQDtBeDgU,23847
 solana_agent/cli.py,sha256=FGvTIQmKLp6XsQdyKtuhIIfbBtMmcCCXfigNrj4bzMc,4704
 solana_agent/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-solana_agent/client/solana_agent.py,sha256
+solana_agent/client/solana_agent.py,sha256=-oVH_xGS9Al3csQ-IK9jlQhheutbfm69QBXmAc8Hmkw,10289
 solana_agent/domains/__init__.py,sha256=HiC94wVPRy-QDJSSRywCRrhrFfTBeHjfi5z-QfZv46U,168
 solana_agent/domains/agent.py,sha256=3Q1wg4eIul0CPpaYBOjEthKTfcdhf1SAiWc2R-IMGO8,2561
 solana_agent/domains/routing.py,sha256=1yR4IswGcmREGgbOOI6TKCfuM7gYGOhQjLkBqnZ-rNo,582
@@ -13,16 +13,16 @@ solana_agent/factories/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
 solana_agent/factories/agent_factory.py,sha256=kduhtCMAxiPmCW_wx-hGGlhehRRGt4OBKY8r-R-LZnI,13246
 solana_agent/guardrails/pii.py,sha256=FCz1IC3mmkr41QFFf5NaC0fwJrVkwFsxgyOCS2POO5I,4428
 solana_agent/interfaces/__init__.py,sha256=IQs1WIM1FeKP1-kY2FEfyhol_dB-I-VAe2rD6jrVF6k,355
-solana_agent/interfaces/client/client.py,sha256=
+solana_agent/interfaces/client/client.py,sha256=RURf6W3dSK4mlQ_8ZTLKmh5TIu5QNpXphp_cye5yhPE,3745
 solana_agent/interfaces/guardrails/guardrails.py,sha256=gZCQ1FrirW-mX6s7FoYrbRs6golsp-x269kk4kQiZzc,572
 solana_agent/interfaces/plugins/plugins.py,sha256=Rz52cWBLdotwf4kV-2mC79tRYlN29zHSu1z9-y1HVPk,3329
 solana_agent/interfaces/providers/data_storage.py,sha256=Y92Cq8BtC55VlsYLD7bo3ofqQabNnlg7Q4H1Q6CDsLU,1713
-solana_agent/interfaces/providers/llm.py,sha256=
+solana_agent/interfaces/providers/llm.py,sha256=FbK6HNMBOIONPE-ljPRElkO2fmFbkzWEo4KuYfcDEFE,2727
 solana_agent/interfaces/providers/memory.py,sha256=h3HEOwWCiFGIuFBX49XOv1jFaQW3NGjyKPOfmQloevk,1011
 solana_agent/interfaces/providers/vector_storage.py,sha256=XPYzvoWrlDVFCS9ItBmoqCFWXXWNYY-d9I7_pvP7YYk,1561
-solana_agent/interfaces/services/agent.py,sha256=
+solana_agent/interfaces/services/agent.py,sha256=MgLudTwzCzzzSR6PsVTB-w5rhGDHB5B81TGjo2z3G-A,2152
 solana_agent/interfaces/services/knowledge_base.py,sha256=HsU4fAMc_oOUCqCX2z76_IbAtbTNTyvffHZ49J0ynSQ,2092
-solana_agent/interfaces/services/query.py,sha256=
+solana_agent/interfaces/services/query.py,sha256=eLMMwc8hwHHjxFxlvVvkZfoQi8cSgQycWJbYAVphl9E,1632
 solana_agent/interfaces/services/routing.py,sha256=Qbn3-DQGVSQKaegHDekSFmn_XCklA0H2f0XUx9-o3wA,367
 solana_agent/plugins/__init__.py,sha256=coZdgJKq1ExOaj6qB810i3rEhbjdVlrkN76ozt_Ojgo,193
 solana_agent/plugins/manager.py,sha256=mO_dKSVJ8GToD3wZflMcpKDEBXRoaaMRtY267HENCI0,5542
@@ -32,12 +32,12 @@ solana_agent/plugins/tools/auto_tool.py,sha256=uihijtlc9CCqCIaRcwPuuN7o1SHIpWL2G
 solana_agent/repositories/__init__.py,sha256=fP83w83CGzXLnSdq-C5wbw9EhWTYtqE2lQTgp46-X_4,163
 solana_agent/repositories/memory.py,sha256=e-27ju6wmurxSxULzr_uDHxxdnvw8KrJt9NWyvAz-i4,7684
 solana_agent/services/__init__.py,sha256=iko0c2MlF8b_SA_nuBGFllr2E3g_JowOrOzGcnU9tkA,162
-solana_agent/services/agent.py,sha256=
+solana_agent/services/agent.py,sha256=QoeQq_OEWyLdBS0FPa-lXm5qiE0RnRfrCKiFTfOSGE0,42369
 solana_agent/services/knowledge_base.py,sha256=D4QNGC3Z8E7iX-CEGpRks0lW4wWJt-WorO3J8mu6ayU,35318
-solana_agent/services/query.py,sha256=
+solana_agent/services/query.py,sha256=ENUfs4WSTpODMRXppDVW-Y3li9jYn8pOfQIHIPerUdQ,18498
 solana_agent/services/routing.py,sha256=C5Ku4t9TqvY7S8wlUPMTC04HCrT4Ib3E8Q8yX0lVU_s,7137
-solana_agent-28.
-solana_agent-28.
-solana_agent-28.
-solana_agent-28.
-solana_agent-28.
+solana_agent-28.3.0.dist-info/LICENSE,sha256=BnSRc-NSFuyF2s496l_4EyrwAP6YimvxWcjPiJ0J7g4,1057
+solana_agent-28.3.0.dist-info/METADATA,sha256=d0bjKGS6LRao_sJXWyCizuoN4aoGNRBoQDXht8HcqGQ,29305
+solana_agent-28.3.0.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+solana_agent-28.3.0.dist-info/entry_points.txt,sha256=-AuT_mfqk8dlZ0pHuAjx1ouAWpTRjpqvEUa6YV3lmc0,53
+solana_agent-28.3.0.dist-info/RECORD,,
{solana_agent-28.2.0.dist-info → solana_agent-28.3.0.dist-info}/LICENSE
File without changes
{solana_agent-28.2.0.dist-info → solana_agent-28.3.0.dist-info}/WHEEL
File without changes
{solana_agent-28.2.0.dist-info → solana_agent-28.3.0.dist-info}/entry_points.txt
File without changes