isa-model 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- isa_model/core/model_manager.py +69 -4
- isa_model/inference/ai_factory.py +335 -46
- isa_model/inference/billing_tracker.py +406 -0
- isa_model/inference/providers/base_provider.py +51 -4
- isa_model/inference/providers/ollama_provider.py +37 -18
- isa_model/inference/providers/openai_provider.py +65 -36
- isa_model/inference/providers/replicate_provider.py +42 -30
- isa_model/inference/services/audio/base_stt_service.py +21 -2
- isa_model/inference/services/audio/openai_realtime_service.py +353 -0
- isa_model/inference/services/audio/openai_stt_service.py +252 -0
- isa_model/inference/services/audio/openai_tts_service.py +48 -9
- isa_model/inference/services/audio/replicate_tts_service.py +239 -0
- isa_model/inference/services/base_service.py +36 -1
- isa_model/inference/services/embedding/openai_embed_service.py +223 -0
- isa_model/inference/services/llm/base_llm_service.py +88 -192
- isa_model/inference/services/llm/llm_adapter.py +459 -0
- isa_model/inference/services/llm/ollama_llm_service.py +111 -185
- isa_model/inference/services/llm/openai_llm_service.py +115 -360
- isa_model/inference/services/vision/helpers/image_utils.py +4 -3
- isa_model/inference/services/vision/ollama_vision_service.py +11 -3
- isa_model/inference/services/vision/openai_vision_service.py +275 -41
- isa_model/inference/services/vision/replicate_image_gen_service.py +233 -205
- {isa_model-0.3.0.dist-info → isa_model-0.3.2.dist-info}/METADATA +1 -1
- {isa_model-0.3.0.dist-info → isa_model-0.3.2.dist-info}/RECORD +26 -21
- {isa_model-0.3.0.dist-info → isa_model-0.3.2.dist-info}/WHEEL +0 -0
- {isa_model-0.3.0.dist-info → isa_model-0.3.2.dist-info}/top_level.txt +0 -0
isa_model/inference/services/vision/openai_vision_service.py

@@ -1,80 +1,314 @@
-from typing import Dict, Any, Union
+from typing import Dict, Any, Union, List, Optional, BinaryIO
+import base64
+import aiohttp
 from openai import AsyncOpenAI
 from tenacity import retry, stop_after_attempt, wait_exponential
-from isa_model.inference.services.
+from isa_model.inference.services.vision.base_vision_service import BaseVisionService
 from isa_model.inference.providers.base_provider import BaseProvider
-from .
+from isa_model.inference.billing_tracker import ServiceType
 import logging
 
 logger = logging.getLogger(__name__)
 
-class OpenAIVisionService(
-    """Vision
+class OpenAIVisionService(BaseVisionService):
+    """OpenAI Vision service using gpt-4.1-nano with vision capabilities"""
 
-    def __init__(self, provider: 'BaseProvider', model_name: str):
+    def __init__(self, provider: 'BaseProvider', model_name: str = "gpt-4.1-nano"):
         super().__init__(provider, model_name)
-
-
-
-
-
-
-
+
+        # Get full configuration from provider (including sensitive data)
+        provider_config = provider.get_full_config()
+
+        # Initialize AsyncOpenAI client with provider configuration
+        try:
+            if not provider_config.get("api_key"):
+                raise ValueError("OpenAI API key not found in provider configuration")
+
+            self._client = AsyncOpenAI(
+                api_key=provider_config["api_key"],
+                base_url=provider_config.get("base_url", "https://api.openai.com/v1"),
+                organization=provider_config.get("organization")
+            )
+
+            logger.info(f"Initialized OpenAIVisionService with model {self.model_name}")
+
+        except Exception as e:
+            logger.error(f"Failed to initialize OpenAI client: {e}")
+            raise ValueError(f"Failed to initialize OpenAI client. Check your API key configuration: {e}") from e
+
+        self.max_tokens = provider_config.get('max_tokens', 1000)
+        self.temperature = provider_config.get('temperature', 0.7)
 
     @property
     def client(self) -> AsyncOpenAI:
-        """
+        """Get the underlying OpenAI client"""
         return self._client
 
+    async def _download_image(self, image_url: str) -> bytes:
+        """Download image from URL"""
+        async with aiohttp.ClientSession() as session:
+            async with session.get(image_url) as response:
+                if response.status == 200:
+                    return await response.read()
+                else:
+                    raise ValueError(f"Failed to download image from {image_url}: {response.status}")
+
+    def _encode_image(self, image_path_or_data: Union[str, bytes, BinaryIO]) -> str:
+        """Encode image to base64"""
+        if isinstance(image_path_or_data, str):
+            # If it's a file path
+            with open(image_path_or_data, "rb") as image_file:
+                return base64.b64encode(image_file.read()).decode("utf-8")
+        elif hasattr(image_path_or_data, 'read'):
+            # If it's a file-like object (BinaryIO)
+            data = image_path_or_data.read()  # type: ignore
+            if isinstance(data, bytes):
+                return base64.b64encode(data).decode("utf-8")
+            else:
+                raise ValueError("File-like object did not return bytes")
+        else:
+            # If it's bytes data
+            return base64.b64encode(image_path_or_data).decode("utf-8")  # type: ignore
+
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_exponential(multiplier=1, min=4, max=10),
         reraise=True
     )
-    async def analyze_image(
-
+    async def analyze_image(
+        self,
+        image: Union[str, BinaryIO],
+        prompt: Optional[str] = None,
+        max_tokens: int = 1000
+    ) -> Dict[str, Any]:
+        """
+        Analyze image and provide description or answer questions
 
         Args:
-
-
+            image: Path to image file, URL, or image data
+            prompt: Optional text prompt/question about the image
+            max_tokens: Maximum tokens in response
 
         Returns:
-
+            Dict containing analysis results
         """
         try:
-            #
-            if isinstance(
-
-
-
+            # Handle different input types
+            if isinstance(image, str):
+                if image.startswith(('http://', 'https://')):
+                    # Download image from URL
+                    image_bytes = await self._download_image(image)
+                    base64_image = self._encode_image(image_bytes)
+                else:
+                    # File path
+                    base64_image = self._encode_image(image)
             else:
-
-
-
-
-
-
-
-
-
-
+                # BinaryIO or bytes data
+                if hasattr(image, 'read'):
+                    image_data = image.read()
+                else:
+                    image_data = image
+                base64_image = self._encode_image(image_data)
+
+            # Use default prompt if none provided
+            if prompt is None:
+                prompt = "Please describe what you see in this image in detail."
+
+            # Use the standard chat completions API with vision
+            messages = [
+                {
                     "role": "user",
                     "content": [
-                        {"type": "text", "text":
+                        {"type": "text", "text": prompt},
                         {
                             "type": "image_url",
                             "image_url": {
-                                "url": f"data:image/jpeg;base64,{
+                                "url": f"data:image/jpeg;base64,{base64_image}",
+                                "detail": "auto"
                             }
-                        }
-                    ]
-                }
-
+                        },
+                    ],
+                }
+            ]
+
+            response = await self._client.chat.completions.create(  # type: ignore
+                model=self.model_name,
+                messages=messages,  # type: ignore
+                max_tokens=max_tokens,
                 temperature=self.temperature
             )
 
-
+            # Track usage for billing
+            if response.usage:
+                self._track_usage(
+                    service_type=ServiceType.VISION,
+                    operation="image_analysis",
+                    input_tokens=response.usage.prompt_tokens,
+                    output_tokens=response.usage.completion_tokens,
+                    metadata={"prompt": prompt[:100], "model": self.model_name}
+                )
+
+            content = response.choices[0].message.content or ""
+
+            return {
+                "text": content,
+                "confidence": 1.0,  # OpenAI doesn't provide confidence scores
+                "detected_objects": [],  # Would need separate object detection
+                "metadata": {
+                    "model": self.model_name,
+                    "prompt": prompt,
+                    "tokens_used": response.usage.total_tokens if response.usage else 0
+                }
+            }
 
         except Exception as e:
             logger.error(f"Error in image analysis: {e}")
             raise
+
+    async def analyze_images(
+        self,
+        images: List[Union[str, BinaryIO]],
+        prompt: Optional[str] = None,
+        max_tokens: int = 1000
+    ) -> List[Dict[str, Any]]:
+        """Analyze multiple images"""
+        results = []
+        for image in images:
+            result = await self.analyze_image(image, prompt, max_tokens)
+            results.append(result)
+        return results
+
+    async def describe_image(
+        self,
+        image: Union[str, BinaryIO],
+        detail_level: str = "medium"
+    ) -> Dict[str, Any]:
+        """Generate detailed description of image"""
+        detail_prompts = {
+            "low": "Briefly describe what you see in this image.",
+            "medium": "Describe what you see in this image in detail, including objects, colors, and scene.",
+            "high": "Provide a comprehensive and detailed description of this image, including all visible objects, their positions, colors, textures, lighting, composition, and any text or symbols present."
+        }
+
+        prompt = detail_prompts.get(detail_level, detail_prompts["medium"])
+        result = await self.analyze_image(image, prompt, 1500)
+
+        return {
+            "description": result["text"],
+            "objects": [],  # Would need object detection API
+            "scene": result["text"],  # Use same description
+            "colors": [],  # Would need color analysis
+            "detail_level": detail_level,
+            "metadata": result["metadata"]
+        }
+
+    async def extract_text(self, image: Union[str, BinaryIO]) -> Dict[str, Any]:
+        """Extract text from image (OCR)"""
+        prompt = "Extract all text visible in this image. Provide only the text content, maintaining the original structure and formatting as much as possible."
+        result = await self.analyze_image(image, prompt, 1000)
+
+        return {
+            "text": result["text"],
+            "confidence": 1.0,
+            "bounding_boxes": [],  # OpenAI vision doesn't provide bounding boxes
+            "language": "unknown",  # Would need language detection
+            "metadata": result["metadata"]
+        }
+
+    async def detect_objects(
+        self,
+        image: Union[str, BinaryIO],
+        confidence_threshold: float = 0.5
+    ) -> Dict[str, Any]:
+        """Detect objects in image"""
+        prompt = "List all objects visible in this image. For each object, provide the object name and a brief description of its location in the image."
+        result = await self.analyze_image(image, prompt, 1000)
+
+        # Parse the response to extract object information
+        objects = []
+        lines = result["text"].split('\n')
+        for line in lines:
+            line = line.strip()
+            if line and not line.startswith(('In this image', 'The image shows', 'I can see')):
+                objects.append({
+                    "label": line,
+                    "confidence": 1.0  # OpenAI doesn't provide confidence scores
+                })
+
+        return {
+            "objects": objects,
+            "count": len(objects),
+            "bounding_boxes": [],  # Not available with current API
+            "metadata": result["metadata"]
+        }
+
+    async def classify_image(
+        self,
+        image: Union[str, BinaryIO],
+        categories: Optional[List[str]] = None
+    ) -> Dict[str, Any]:
+        """Classify image into categories"""
+        if categories:
+            category_list = ", ".join(categories)
+            prompt = f"Classify this image into one of these categories: {category_list}. Respond with only the most appropriate category name."
+        else:
+            prompt = "What category best describes this image? Provide a single category name."
+
+        result = await self.analyze_image(image, prompt, 100)
+        category = result["text"].strip()
+
+        return {
+            "category": category,
+            "confidence": 1.0,
+            "all_predictions": [{"category": category, "confidence": 1.0}],
+            "metadata": result["metadata"]
+        }
+
+    async def compare_images(
+        self,
+        image1: Union[str, BinaryIO],
+        image2: Union[str, BinaryIO]
+    ) -> Dict[str, Any]:
+        """Compare two images for similarity"""
+        # For now, analyze both images separately and compare descriptions
+        result1 = await self.analyze_image(image1, "Describe this image in detail.")
+        result2 = await self.analyze_image(image2, "Describe this image in detail.")
+
+        # Use LLM to compare the descriptions
+        comparison_prompt = f"Compare these two image descriptions and provide a similarity analysis:\n\nImage 1: {result1['text']}\n\nImage 2: {result2['text']}\n\nProvide: 1) A similarity score from 0.0 to 1.0, 2) Key differences, 3) Common elements."
+
+        comparison_result = await self._client.chat.completions.create(
+            model=self.model_name,
+            messages=[{"role": "user", "content": comparison_prompt}],
+            max_tokens=500,
+            temperature=0.3
+        )
+
+        comparison_text = comparison_result.choices[0].message.content or ""
+
+        return {
+            "similarity_score": 0.5,  # Would need better parsing to extract actual score
+            "differences": comparison_text,
+            "common_elements": comparison_text,
+            "metadata": {
+                "model": self.model_name,
+                "comparison_method": "description_based"
+            }
+        }
+
+    def get_supported_formats(self) -> List[str]:
+        """Get list of supported image formats"""
+        return ['jpg', 'jpeg', 'png', 'gif', 'webp']
+
+    def get_max_image_size(self) -> Dict[str, int]:
+        """Get maximum supported image dimensions"""
+        return {
+            "width": 2048,
+            "height": 2048,
+            "file_size_mb": 20
+        }
+
+    async def close(self):
+        """Clean up resources"""
+        if hasattr(self._client, 'close'):
+            await self._client.close()