isa-model 0.3.91__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228)
  1. isa_model/client.py +1166 -584
  2. isa_model/core/cache/redis_cache.py +410 -0
  3. isa_model/core/config/config_manager.py +282 -12
  4. isa_model/core/config.py +91 -1
  5. isa_model/core/database/__init__.py +1 -0
  6. isa_model/core/database/direct_db_client.py +114 -0
  7. isa_model/core/database/migration_manager.py +563 -0
  8. isa_model/core/database/migrations.py +297 -0
  9. isa_model/core/database/supabase_client.py +258 -0
  10. isa_model/core/dependencies.py +316 -0
  11. isa_model/core/discovery/__init__.py +19 -0
  12. isa_model/core/discovery/consul_discovery.py +190 -0
  13. isa_model/core/logging/__init__.py +54 -0
  14. isa_model/core/logging/influx_logger.py +523 -0
  15. isa_model/core/logging/loki_logger.py +160 -0
  16. isa_model/core/models/__init__.py +46 -0
  17. isa_model/core/models/config_models.py +625 -0
  18. isa_model/core/models/deployment_billing_tracker.py +430 -0
  19. isa_model/core/models/model_billing_tracker.py +60 -88
  20. isa_model/core/models/model_manager.py +66 -25
  21. isa_model/core/models/model_metadata.py +690 -0
  22. isa_model/core/models/model_repo.py +217 -55
  23. isa_model/core/models/model_statistics_tracker.py +234 -0
  24. isa_model/core/models/model_storage.py +0 -1
  25. isa_model/core/models/model_version_manager.py +959 -0
  26. isa_model/core/models/system_models.py +857 -0
  27. isa_model/core/pricing_manager.py +2 -249
  28. isa_model/core/repositories/__init__.py +9 -0
  29. isa_model/core/repositories/config_repository.py +912 -0
  30. isa_model/core/resilience/circuit_breaker.py +366 -0
  31. isa_model/core/security/secrets.py +358 -0
  32. isa_model/core/services/__init__.py +2 -4
  33. isa_model/core/services/intelligent_model_selector.py +479 -370
  34. isa_model/core/storage/hf_storage.py +2 -2
  35. isa_model/core/types.py +8 -0
  36. isa_model/deployment/__init__.py +5 -48
  37. isa_model/deployment/core/__init__.py +2 -31
  38. isa_model/deployment/core/deployment_manager.py +1278 -368
  39. isa_model/deployment/local/__init__.py +31 -0
  40. isa_model/deployment/local/config.py +248 -0
  41. isa_model/deployment/local/gpu_gateway.py +607 -0
  42. isa_model/deployment/local/health_checker.py +428 -0
  43. isa_model/deployment/local/provider.py +586 -0
  44. isa_model/deployment/local/tensorrt_service.py +621 -0
  45. isa_model/deployment/local/transformers_service.py +644 -0
  46. isa_model/deployment/local/vllm_service.py +527 -0
  47. isa_model/deployment/modal/__init__.py +8 -0
  48. isa_model/deployment/modal/config.py +136 -0
  49. isa_model/deployment/modal/deployer.py +894 -0
  50. isa_model/deployment/modal/services/__init__.py +3 -0
  51. isa_model/deployment/modal/services/audio/__init__.py +1 -0
  52. isa_model/deployment/modal/services/audio/isa_audio_chatTTS_service.py +520 -0
  53. isa_model/deployment/modal/services/audio/isa_audio_openvoice_service.py +758 -0
  54. isa_model/deployment/modal/services/audio/isa_audio_service_v2.py +1044 -0
  55. isa_model/deployment/modal/services/embedding/__init__.py +1 -0
  56. isa_model/deployment/modal/services/embedding/isa_embed_rerank_service.py +296 -0
  57. isa_model/deployment/modal/services/llm/__init__.py +1 -0
  58. isa_model/deployment/modal/services/llm/isa_llm_service.py +424 -0
  59. isa_model/deployment/modal/services/video/__init__.py +1 -0
  60. isa_model/deployment/modal/services/video/isa_video_hunyuan_service.py +423 -0
  61. isa_model/deployment/modal/services/vision/__init__.py +1 -0
  62. isa_model/deployment/modal/services/vision/isa_vision_ocr_service.py +519 -0
  63. isa_model/deployment/modal/services/vision/isa_vision_qwen25_service.py +709 -0
  64. isa_model/deployment/modal/services/vision/isa_vision_table_service.py +676 -0
  65. isa_model/deployment/modal/services/vision/isa_vision_ui_service.py +833 -0
  66. isa_model/deployment/modal/services/vision/isa_vision_ui_service_optimized.py +660 -0
  67. isa_model/deployment/models/org-org-acme-corp-tenant-a-service-llm-20250825-225822/tenant-a-service_modal_service.py +48 -0
  68. isa_model/deployment/models/org-test-org-123-prefix-test-service-llm-20250825-225822/prefix-test-service_modal_service.py +48 -0
  69. isa_model/deployment/models/test-llm-service-llm-20250825-204442/test-llm-service_modal_service.py +48 -0
  70. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-212906/test-monitoring-gpt2_modal_service.py +48 -0
  71. isa_model/deployment/models/test-monitoring-gpt2-llm-20250825-213009/test-monitoring-gpt2_modal_service.py +48 -0
  72. isa_model/deployment/storage/__init__.py +5 -0
  73. isa_model/deployment/storage/deployment_repository.py +824 -0
  74. isa_model/deployment/triton/__init__.py +10 -0
  75. isa_model/deployment/triton/config.py +196 -0
  76. isa_model/deployment/triton/configs/__init__.py +1 -0
  77. isa_model/deployment/triton/provider.py +512 -0
  78. isa_model/deployment/triton/scripts/__init__.py +1 -0
  79. isa_model/deployment/triton/templates/__init__.py +1 -0
  80. isa_model/inference/__init__.py +47 -1
  81. isa_model/inference/ai_factory.py +179 -16
  82. isa_model/inference/legacy_services/__init__.py +21 -0
  83. isa_model/inference/legacy_services/model_evaluation.py +637 -0
  84. isa_model/inference/legacy_services/model_service.py +573 -0
  85. isa_model/inference/legacy_services/model_serving.py +717 -0
  86. isa_model/inference/legacy_services/model_training.py +561 -0
  87. isa_model/inference/models/__init__.py +21 -0
  88. isa_model/inference/models/inference_config.py +551 -0
  89. isa_model/inference/models/inference_record.py +675 -0
  90. isa_model/inference/models/performance_models.py +714 -0
  91. isa_model/inference/repositories/__init__.py +9 -0
  92. isa_model/inference/repositories/inference_repository.py +828 -0
  93. isa_model/inference/services/audio/__init__.py +21 -0
  94. isa_model/inference/services/audio/base_realtime_service.py +225 -0
  95. isa_model/inference/services/audio/base_stt_service.py +184 -11
  96. isa_model/inference/services/audio/isa_tts_service.py +0 -0
  97. isa_model/inference/services/audio/openai_realtime_service.py +320 -124
  98. isa_model/inference/services/audio/openai_stt_service.py +53 -11
  99. isa_model/inference/services/base_service.py +17 -1
  100. isa_model/inference/services/custom_model_manager.py +277 -0
  101. isa_model/inference/services/embedding/__init__.py +13 -0
  102. isa_model/inference/services/embedding/base_embed_service.py +111 -8
  103. isa_model/inference/services/embedding/isa_embed_service.py +305 -0
  104. isa_model/inference/services/embedding/ollama_embed_service.py +15 -3
  105. isa_model/inference/services/embedding/openai_embed_service.py +2 -4
  106. isa_model/inference/services/embedding/resilient_embed_service.py +285 -0
  107. isa_model/inference/services/embedding/tests/test_embedding.py +222 -0
  108. isa_model/inference/services/img/__init__.py +2 -2
  109. isa_model/inference/services/img/base_image_gen_service.py +24 -7
  110. isa_model/inference/services/img/replicate_image_gen_service.py +84 -422
  111. isa_model/inference/services/img/services/replicate_face_swap.py +193 -0
  112. isa_model/inference/services/img/services/replicate_flux.py +226 -0
  113. isa_model/inference/services/img/services/replicate_flux_kontext.py +219 -0
  114. isa_model/inference/services/img/services/replicate_sticker_maker.py +249 -0
  115. isa_model/inference/services/img/tests/test_img_client.py +297 -0
  116. isa_model/inference/services/llm/__init__.py +10 -2
  117. isa_model/inference/services/llm/base_llm_service.py +361 -26
  118. isa_model/inference/services/llm/cerebras_llm_service.py +628 -0
  119. isa_model/inference/services/llm/helpers/llm_adapter.py +71 -12
  120. isa_model/inference/services/llm/helpers/llm_prompts.py +342 -0
  121. isa_model/inference/services/llm/helpers/llm_utils.py +321 -23
  122. isa_model/inference/services/llm/huggingface_llm_service.py +581 -0
  123. isa_model/inference/services/llm/local_llm_service.py +747 -0
  124. isa_model/inference/services/llm/ollama_llm_service.py +11 -3
  125. isa_model/inference/services/llm/openai_llm_service.py +670 -56
  126. isa_model/inference/services/llm/yyds_llm_service.py +10 -3
  127. isa_model/inference/services/vision/__init__.py +27 -6
  128. isa_model/inference/services/vision/base_vision_service.py +118 -185
  129. isa_model/inference/services/vision/blip_vision_service.py +359 -0
  130. isa_model/inference/services/vision/helpers/image_utils.py +19 -10
  131. isa_model/inference/services/vision/isa_vision_service.py +634 -0
  132. isa_model/inference/services/vision/openai_vision_service.py +19 -10
  133. isa_model/inference/services/vision/tests/test_ocr_client.py +284 -0
  134. isa_model/inference/services/vision/vgg16_vision_service.py +257 -0
  135. isa_model/serving/api/cache_manager.py +245 -0
  136. isa_model/serving/api/dependencies/__init__.py +1 -0
  137. isa_model/serving/api/dependencies/auth.py +194 -0
  138. isa_model/serving/api/dependencies/database.py +139 -0
  139. isa_model/serving/api/error_handlers.py +284 -0
  140. isa_model/serving/api/fastapi_server.py +240 -18
  141. isa_model/serving/api/middleware/auth.py +317 -0
  142. isa_model/serving/api/middleware/security.py +268 -0
  143. isa_model/serving/api/middleware/tenant_context.py +414 -0
  144. isa_model/serving/api/routes/analytics.py +489 -0
  145. isa_model/serving/api/routes/config.py +645 -0
  146. isa_model/serving/api/routes/deployment_billing.py +315 -0
  147. isa_model/serving/api/routes/deployments.py +475 -0
  148. isa_model/serving/api/routes/gpu_gateway.py +440 -0
  149. isa_model/serving/api/routes/health.py +32 -12
  150. isa_model/serving/api/routes/inference_monitoring.py +486 -0
  151. isa_model/serving/api/routes/local_deployments.py +448 -0
  152. isa_model/serving/api/routes/logs.py +430 -0
  153. isa_model/serving/api/routes/settings.py +582 -0
  154. isa_model/serving/api/routes/tenants.py +575 -0
  155. isa_model/serving/api/routes/unified.py +992 -171
  156. isa_model/serving/api/routes/webhooks.py +479 -0
  157. isa_model/serving/api/startup.py +318 -0
  158. isa_model/serving/modal_proxy_server.py +249 -0
  159. isa_model/utils/gpu_utils.py +311 -0
  160. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/METADATA +76 -22
  161. isa_model-0.4.3.dist-info/RECORD +193 -0
  162. isa_model/deployment/cloud/__init__.py +0 -9
  163. isa_model/deployment/cloud/modal/__init__.py +0 -10
  164. isa_model/deployment/cloud/modal/isa_vision_doc_service.py +0 -766
  165. isa_model/deployment/cloud/modal/isa_vision_table_service.py +0 -532
  166. isa_model/deployment/cloud/modal/isa_vision_ui_service.py +0 -406
  167. isa_model/deployment/cloud/modal/register_models.py +0 -321
  168. isa_model/deployment/core/deployment_config.py +0 -356
  169. isa_model/deployment/core/isa_deployment_service.py +0 -401
  170. isa_model/deployment/gpu_int8_ds8/app/server.py +0 -66
  171. isa_model/deployment/gpu_int8_ds8/scripts/test_client.py +0 -43
  172. isa_model/deployment/gpu_int8_ds8/scripts/test_client_os.py +0 -35
  173. isa_model/deployment/runtime/deployed_service.py +0 -338
  174. isa_model/deployment/services/__init__.py +0 -9
  175. isa_model/deployment/services/auto_deploy_vision_service.py +0 -538
  176. isa_model/deployment/services/model_service.py +0 -332
  177. isa_model/deployment/services/service_monitor.py +0 -356
  178. isa_model/deployment/services/service_registry.py +0 -527
  179. isa_model/eval/__init__.py +0 -92
  180. isa_model/eval/benchmarks.py +0 -469
  181. isa_model/eval/config/__init__.py +0 -10
  182. isa_model/eval/config/evaluation_config.py +0 -108
  183. isa_model/eval/evaluators/__init__.py +0 -18
  184. isa_model/eval/evaluators/base_evaluator.py +0 -503
  185. isa_model/eval/evaluators/llm_evaluator.py +0 -472
  186. isa_model/eval/factory.py +0 -531
  187. isa_model/eval/infrastructure/__init__.py +0 -24
  188. isa_model/eval/infrastructure/experiment_tracker.py +0 -466
  189. isa_model/eval/metrics.py +0 -798
  190. isa_model/inference/adapter/unified_api.py +0 -248
  191. isa_model/inference/services/helpers/stacked_config.py +0 -148
  192. isa_model/inference/services/img/flux_professional_service.py +0 -603
  193. isa_model/inference/services/img/helpers/base_stacked_service.py +0 -274
  194. isa_model/inference/services/others/table_transformer_service.py +0 -61
  195. isa_model/inference/services/vision/doc_analysis_service.py +0 -640
  196. isa_model/inference/services/vision/helpers/base_stacked_service.py +0 -274
  197. isa_model/inference/services/vision/ui_analysis_service.py +0 -823
  198. isa_model/scripts/inference_tracker.py +0 -283
  199. isa_model/scripts/mlflow_manager.py +0 -379
  200. isa_model/scripts/model_registry.py +0 -465
  201. isa_model/scripts/register_models.py +0 -370
  202. isa_model/scripts/register_models_with_embeddings.py +0 -510
  203. isa_model/scripts/start_mlflow.py +0 -95
  204. isa_model/scripts/training_tracker.py +0 -257
  205. isa_model/training/__init__.py +0 -74
  206. isa_model/training/annotation/annotation_schema.py +0 -47
  207. isa_model/training/annotation/processors/annotation_processor.py +0 -126
  208. isa_model/training/annotation/storage/dataset_manager.py +0 -131
  209. isa_model/training/annotation/storage/dataset_schema.py +0 -44
  210. isa_model/training/annotation/tests/test_annotation_flow.py +0 -109
  211. isa_model/training/annotation/tests/test_minio copy.py +0 -113
  212. isa_model/training/annotation/tests/test_minio_upload.py +0 -43
  213. isa_model/training/annotation/views/annotation_controller.py +0 -158
  214. isa_model/training/cloud/__init__.py +0 -22
  215. isa_model/training/cloud/job_orchestrator.py +0 -402
  216. isa_model/training/cloud/runpod_trainer.py +0 -454
  217. isa_model/training/cloud/storage_manager.py +0 -482
  218. isa_model/training/core/__init__.py +0 -23
  219. isa_model/training/core/config.py +0 -181
  220. isa_model/training/core/dataset.py +0 -222
  221. isa_model/training/core/trainer.py +0 -720
  222. isa_model/training/core/utils.py +0 -213
  223. isa_model/training/factory.py +0 -424
  224. isa_model-0.3.91.dist-info/RECORD +0 -138
  225. /isa_model/{core/storage/minio_storage.py → deployment/modal/services/audio/isa_audio_fish_service.py} +0 -0
  226. /isa_model/deployment/{services → modal/services/vision}/simple_auto_deploy_vision_service.py +0 -0
  227. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/WHEEL +0 -0
  228. {isa_model-0.3.91.dist-info → isa_model-0.4.3.dist-info}/top_level.txt +0 -0
isa_model/eval/metrics.py DELETED
@@ -1,798 +0,0 @@
- """
- Evaluation Metrics for ISA Model Framework
-
- This module provides various metrics for evaluating AI models:
- - LLM metrics: perplexity, BLEU, ROUGE, accuracy, etc.
- - Image metrics: FID, IS, LPIPS, etc.
- - Custom metrics and benchmark runners
- """
-
- import os
- import json
- import logging
- import numpy as np
- from typing import Dict, List, Any, Optional, Union
- from enum import Enum
- from abc import ABC, abstractmethod
-
- try:
-     from ..inference.ai_factory import AIFactory
-     AI_FACTORY_AVAILABLE = True
- except ImportError:
-     AI_FACTORY_AVAILABLE = False
-
- logger = logging.getLogger(__name__)
-
-
- class MetricType(str, Enum):
-     """Types of evaluation metrics."""
-     PERPLEXITY = "perplexity"
-     BLEU = "bleu"
-     ROUGE = "rouge"
-     ACCURACY = "accuracy"
-     F1_SCORE = "f1"
-     DIVERSITY = "diversity"
-     COHERENCE = "coherence"
-     FLUENCY = "fluency"
-     FID = "fid"
-     IS = "is"
-     LPIPS = "lpips"
-
-
- class BaseMetric(ABC):
-     """Base class for all metrics."""
-
-     @abstractmethod
-     def compute(self, predictions: List[str], references: List[str] = None, **kwargs) -> Dict[str, float]:
-         """Compute the metric."""
-         pass
-
-
- class LLMMetrics:
-     """
-     Metrics calculator for Language Models.
-
-     Supports various metrics including:
-     - Perplexity
-     - BLEU score
-     - ROUGE score
-     - Accuracy
-     - F1 score
-     - Generation quality metrics
-     """
-
-     def __init__(self):
-         self.available_metrics = [
-             MetricType.PERPLEXITY,
-             MetricType.BLEU,
-             MetricType.ROUGE,
-             MetricType.ACCURACY,
-             MetricType.F1_SCORE,
-             MetricType.DIVERSITY,
-             MetricType.COHERENCE,
-             MetricType.FLUENCY
-         ]
-
-         # Initialize AI factory if available
-         if AI_FACTORY_AVAILABLE:
-             try:
-                 self.ai_factory = AIFactory()
-             except Exception as e:
-                 logger.warning(f"Failed to initialize AIFactory: {e}")
-                 self.ai_factory = None
-         else:
-             self.ai_factory = None
-
-     def evaluate(
-         self,
-         model_path: str,
-         dataset: List[Dict[str, Any]],
-         metrics: List[str],
-         batch_size: int = 8,
-         provider: str = "ollama",
-         **kwargs
-     ) -> Dict[str, Any]:
-         """
-         Evaluate LLM on dataset with specified metrics.
-
-         Args:
-             model_path: Path to the model
-             dataset: Evaluation dataset
-             metrics: List of metrics to compute
-             batch_size: Batch size for evaluation
-             provider: Model provider
-             **kwargs: Additional parameters
-
-         Returns:
-             Dictionary with metric results
-         """
-         results = {
-             "model_path": model_path,
-             "num_samples": len(dataset),
-             "metrics": {}
-         }
-
-         # Generate predictions
-         predictions, references = self._generate_predictions(
-             model_path, dataset, batch_size, provider, **kwargs
-         )
-
-         # Compute each metric
-         for metric in metrics:
-             try:
-                 if metric == MetricType.PERPLEXITY:
-                     score = self._compute_perplexity(predictions, references)
-                 elif metric == MetricType.BLEU:
-                     score = self._compute_bleu(predictions, references)
-                 elif metric == MetricType.ROUGE:
-                     score = self._compute_rouge(predictions, references)
-                 elif metric == MetricType.ACCURACY:
-                     score = self._compute_accuracy(predictions, references)
-                 elif metric == MetricType.F1_SCORE:
-                     score = self._compute_f1(predictions, references)
-                 elif metric == MetricType.DIVERSITY:
-                     score = self._compute_diversity(predictions)
-                 elif metric == MetricType.COHERENCE:
-                     score = self._compute_coherence(predictions)
-                 elif metric == MetricType.FLUENCY:
-                     score = self._compute_fluency(predictions)
-                 else:
-                     logger.warning(f"Unknown metric: {metric}")
-                     continue
-
-                 results["metrics"][metric] = score
-                 logger.info(f"Computed {metric}: {score}")
-
-             except Exception as e:
-                 logger.error(f"Failed to compute {metric}: {e}")
-                 results["metrics"][metric] = {"error": str(e)}
-
-         return results
-
-     def evaluate_generation(
-         self,
-         model_path: str,
-         prompts: List[str],
-         reference_texts: List[str] = None,
-         metrics: List[str] = None,
-         provider: str = "ollama",
-         **kwargs
-     ) -> Dict[str, Any]:
-         """
-         Evaluate text generation quality.
-
-         Args:
-             model_path: Path to the model
-             prompts: Input prompts
-             reference_texts: Reference texts (optional)
-             metrics: Metrics to compute
-             provider: Model provider
-             **kwargs: Additional parameters
-
-         Returns:
-             Generation evaluation results
-         """
-         if metrics is None:
-             metrics = [MetricType.DIVERSITY, MetricType.COHERENCE, MetricType.FLUENCY]
-
-         # Generate texts
-         generated_texts = self._generate_texts(model_path, prompts, provider, **kwargs)
-
-         results = {
-             "model_path": model_path,
-             "num_prompts": len(prompts),
-             "metrics": {}
-         }
-
-         # Compute metrics
-         for metric in metrics:
-             try:
-                 if metric == MetricType.DIVERSITY:
-                     score = self._compute_diversity(generated_texts)
-                 elif metric == MetricType.COHERENCE:
-                     score = self._compute_coherence(generated_texts)
-                 elif metric == MetricType.FLUENCY:
-                     score = self._compute_fluency(generated_texts)
-                 elif metric == MetricType.BLEU and reference_texts:
-                     score = self._compute_bleu(generated_texts, reference_texts)
-                 elif metric == MetricType.ROUGE and reference_texts:
-                     score = self._compute_rouge(generated_texts, reference_texts)
-                 else:
-                     continue
-
-                 results["metrics"][metric] = score
-
-             except Exception as e:
-                 logger.error(f"Failed to compute {metric}: {e}")
-                 results["metrics"][metric] = {"error": str(e)}
-
-         return results
-
-     def _generate_predictions(
-         self,
-         model_path: str,
-         dataset: List[Dict[str, Any]],
-         batch_size: int,
-         provider: str,
-         **kwargs
-     ) -> tuple:
-         """Generate predictions from model using actual inference."""
-         predictions = []
-         references = []
-
-         if not self.ai_factory:
-             logger.warning("AIFactory not available, using placeholder predictions")
-             # Fallback to placeholder predictions
-             for item in dataset:
-                 if isinstance(item, dict):
-                     if "input" in item and "output" in item:
-                         predictions.append(f"Generated response for: {item['input']}")
-                         references.append(item["output"])
-                     elif "prompt" in item and "response" in item:
-                         predictions.append(f"Generated response for: {item['prompt']}")
-                         references.append(item["response"])
-             return predictions, references
-
-         try:
-             # Get LLM service
-             llm_service = self.ai_factory.get_llm(model_name=model_path, provider=provider)
-
-             # Process dataset in batches
-             for i in range(0, len(dataset), batch_size):
-                 batch = dataset[i:i + batch_size]
-                 batch_predictions = []
-                 batch_references = []
-
-                 for item in batch:
-                     if isinstance(item, dict):
-                         prompt = None
-                         reference = None
-
-                         # Extract prompt and reference based on data format
-                         if "input" in item and "output" in item:
-                             prompt = item["input"]
-                             reference = item["output"]
-                         elif "prompt" in item and "response" in item:
-                             prompt = item["prompt"]
-                             reference = item["response"]
-                         elif "question" in item and "answer" in item:
-                             prompt = item["question"]
-                             reference = item["answer"]
-                         elif "text" in item and "label" in item:
-                             prompt = item["text"]
-                             reference = str(item["label"])
-
-                         if prompt and reference:
-                             try:
-                                 # Generate prediction using actual model
-                                 response = await llm_service.ainvoke(prompt)
-
-                                 # Extract text from response
-                                 if hasattr(response, 'text'):
-                                     prediction = response.text
-                                 elif isinstance(response, dict) and 'text' in response:
-                                     prediction = response['text']
-                                 elif isinstance(response, str):
-                                     prediction = response
-                                 else:
-                                     prediction = str(response)
-
-                                 batch_predictions.append(prediction.strip())
-                                 batch_references.append(reference)
-
-                             except Exception as e:
-                                 logger.error(f"Failed to generate prediction for item: {e}")
-                                 # Use fallback prediction
-                                 batch_predictions.append(f"Error generating prediction: {str(e)}")
-                                 batch_references.append(reference)
-
-                 predictions.extend(batch_predictions)
-                 references.extend(batch_references)
-
-                 logger.info(f"Processed batch {i//batch_size + 1}/{(len(dataset) + batch_size - 1)//batch_size}")
-
-         except Exception as e:
-             logger.error(f"Failed to use AIFactory for predictions: {e}")
-             # Fallback to placeholder predictions
-             for item in dataset:
-                 if isinstance(item, dict):
-                     if "input" in item and "output" in item:
-                         predictions.append(f"Generated response for: {item['input']}")
-                         references.append(item["output"])
-                     elif "prompt" in item and "response" in item:
-                         predictions.append(f"Generated response for: {item['prompt']}")
-                         references.append(item["response"])
-
-         logger.info(f"Generated {len(predictions)} predictions")
-         return predictions, references
-
-     def _generate_texts(
-         self,
-         model_path: str,
-         prompts: List[str],
-         provider: str,
-         **kwargs
-     ) -> List[str]:
-         """Generate texts from prompts using actual model inference."""
-         generated_texts = []
-
-         if not self.ai_factory:
-             logger.warning("AIFactory not available, using placeholder text generation")
-             # Fallback to placeholder generation
-             for prompt in prompts:
-                 generated_texts.append(f"Generated response for: {prompt}")
-             return generated_texts
-
-         try:
-             # Get LLM service
-             llm_service = self.ai_factory.get_llm(model_name=model_path, provider=provider)
-
-             for prompt in prompts:
-                 try:
-                     # Generate text using actual model
-                     response = await llm_service.ainvoke(prompt)
-
-                     # Extract text from response
-                     if hasattr(response, 'text'):
-                         generated_text = response.text
-                     elif isinstance(response, dict) and 'text' in response:
-                         generated_text = response['text']
-                     elif isinstance(response, str):
-                         generated_text = response
-                     else:
-                         generated_text = str(response)
-
-                     generated_texts.append(generated_text.strip())
-
-                 except Exception as e:
-                     logger.error(f"Failed to generate text for prompt: {e}")
-                     # Use fallback generation
-                     generated_texts.append(f"Error generating text: {str(e)}")
-
-         except Exception as e:
-             logger.error(f"Failed to use AIFactory for text generation: {e}")
-             # Fallback to placeholder generation
-             for prompt in prompts:
-                 generated_texts.append(f"Generated response for: {prompt}")
-
-         return generated_texts
-
-     def _compute_perplexity(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-         """Compute perplexity score (simplified implementation)."""
-         # This is a placeholder - actual perplexity requires model probabilities
-         return {
-             "perplexity": np.random.uniform(10, 100), # Placeholder
-             "log_perplexity": np.random.uniform(2, 5)
-         }
-
-     def _compute_bleu(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-         """Compute BLEU score (simplified implementation)."""
-         try:
-             # Placeholder implementation - use actual BLEU calculation
-             # from nltk.translate.bleu_score import sentence_bleu
-             scores = []
-             for pred, ref in zip(predictions, references):
-                 # Simplified BLEU calculation
-                 pred_words = pred.lower().split()
-                 ref_words = ref.lower().split()
-
-                 # Simple overlap calculation (not actual BLEU)
-                 overlap = len(set(pred_words) & set(ref_words))
-                 total = len(set(pred_words) | set(ref_words))
-
-                 if total > 0:
-                     scores.append(overlap / total)
-                 else:
-                     scores.append(0.0)
-
-             return {
-                 "bleu": np.mean(scores),
-                 "bleu_std": np.std(scores)
-             }
-         except Exception as e:
-             logger.error(f"BLEU computation failed: {e}")
-             return {"bleu": 0.0, "error": str(e)}
-
-     def _compute_rouge(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-         """Compute ROUGE score (simplified implementation)."""
-         try:
-             rouge_1_scores = []
-             rouge_l_scores = []
-
-             for pred, ref in zip(predictions, references):
-                 pred_words = set(pred.lower().split())
-                 ref_words = set(ref.lower().split())
-
-                 # ROUGE-1 (unigram overlap)
-                 if len(ref_words) > 0:
-                     rouge_1 = len(pred_words & ref_words) / len(ref_words)
-                     rouge_1_scores.append(rouge_1)
-
-                 # Simplified ROUGE-L (longest common subsequence)
-                 rouge_l = len(pred_words & ref_words) / max(len(pred_words), len(ref_words), 1)
-                 rouge_l_scores.append(rouge_l)
-
-             return {
-                 "rouge_1": np.mean(rouge_1_scores),
-                 "rouge_l": np.mean(rouge_l_scores),
-                 "rouge_1_std": np.std(rouge_1_scores),
-                 "rouge_l_std": np.std(rouge_l_scores)
-             }
-         except Exception as e:
-             logger.error(f"ROUGE computation failed: {e}")
-             return {"rouge_1": 0.0, "rouge_l": 0.0, "error": str(e)}
-
-     def _compute_accuracy(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-         """Compute accuracy score."""
-         try:
-             correct = 0
-             total = len(predictions)
-
-             for pred, ref in zip(predictions, references):
-                 if pred.strip().lower() == ref.strip().lower():
-                     correct += 1
-
-             accuracy = correct / total if total > 0 else 0.0
-
-             return {
-                 "accuracy": accuracy,
-                 "correct": correct,
-                 "total": total
-             }
-         except Exception as e:
-             logger.error(f"Accuracy computation failed: {e}")
-             return {"accuracy": 0.0, "error": str(e)}
-
-     def _compute_f1(self, predictions: List[str], references: List[str]) -> Dict[str, float]:
-         """Compute F1 score (simplified implementation)."""
-         try:
-             f1_scores = []
-
-             for pred, ref in zip(predictions, references):
-                 pred_words = set(pred.lower().split())
-                 ref_words = set(ref.lower().split())
-
-                 if len(pred_words) == 0 and len(ref_words) == 0:
-                     f1_scores.append(1.0)
-                 elif len(pred_words) == 0 or len(ref_words) == 0:
-                     f1_scores.append(0.0)
-                 else:
-                     intersection = len(pred_words & ref_words)
-                     precision = intersection / len(pred_words)
-                     recall = intersection / len(ref_words)
-
-                     if precision + recall > 0:
-                         f1 = 2 * (precision * recall) / (precision + recall)
-                         f1_scores.append(f1)
-                     else:
-                         f1_scores.append(0.0)
-
-             return {
-                 "f1": np.mean(f1_scores),
-                 "f1_std": np.std(f1_scores)
-             }
-         except Exception as e:
-             logger.error(f"F1 computation failed: {e}")
-             return {"f1": 0.0, "error": str(e)}
-
-     def _compute_diversity(self, texts: List[str]) -> Dict[str, float]:
-         """Compute diversity metrics."""
-         try:
-             # Distinct-1 and Distinct-2
-             all_unigrams = []
-             all_bigrams = []
-
-             for text in texts:
-                 words = text.lower().split()
-                 all_unigrams.extend(words)
-
-                 # Create bigrams
-                 for i in range(len(words) - 1):
-                     all_bigrams.append((words[i], words[i + 1]))
-
-             distinct_1 = len(set(all_unigrams)) / len(all_unigrams) if all_unigrams else 0
-             distinct_2 = len(set(all_bigrams)) / len(all_bigrams) if all_bigrams else 0
-
-             return {
-                 "distinct_1": distinct_1,
-                 "distinct_2": distinct_2,
-                 "vocab_size": len(set(all_unigrams))
-             }
-         except Exception as e:
-             logger.error(f"Diversity computation failed: {e}")
-             return {"distinct_1": 0.0, "distinct_2": 0.0, "error": str(e)}
-
-     def _compute_coherence(self, texts: List[str]) -> Dict[str, float]:
-         """Compute coherence score (simplified implementation)."""
-         try:
-             # Simplified coherence based on sentence length consistency
-             coherence_scores = []
-
-             for text in texts:
-                 sentences = text.split('.')
-                 if len(sentences) > 1:
-                     lengths = [len(s.split()) for s in sentences if s.strip()]
-                     if lengths:
-                         # Coherence as inverse of length variance
-                         coherence = 1.0 / (1.0 + np.var(lengths))
-                         coherence_scores.append(coherence)
-                     else:
-                         coherence_scores.append(0.5)
-                 else:
-                     coherence_scores.append(0.5)
-
-             return {
-                 "coherence": np.mean(coherence_scores),
-                 "coherence_std": np.std(coherence_scores)
-             }
-         except Exception as e:
-             logger.error(f"Coherence computation failed: {e}")
-             return {"coherence": 0.5, "error": str(e)}
-
-     def _compute_fluency(self, texts: List[str]) -> Dict[str, float]:
-         """Compute fluency score (simplified implementation)."""
-         try:
-             fluency_scores = []
-
-             for text in texts:
-                 # Simplified fluency based on word count and sentence structure
-                 words = text.split()
-                 sentences = text.split('.')
-
-                 if len(words) > 0 and len(sentences) > 0:
-                     avg_words_per_sentence = len(words) / len(sentences)
-                     # Fluency based on reasonable sentence length (5-20 words)
-                     if 5 <= avg_words_per_sentence <= 20:
-                         fluency = 1.0
-                     else:
-                         fluency = max(0.0, 1.0 - abs(avg_words_per_sentence - 12.5) / 12.5)
-
-                     fluency_scores.append(fluency)
-                 else:
-                     fluency_scores.append(0.0)
-
-             return {
-                 "fluency": np.mean(fluency_scores),
-                 "fluency_std": np.std(fluency_scores)
-             }
-         except Exception as e:
-             logger.error(f"Fluency computation failed: {e}")
-             return {"fluency": 0.0, "error": str(e)}
-
-
- class ImageMetrics:
-     """
-     Metrics calculator for Image Generation Models.
-
-     Supports metrics including:
-     - FID (Fréchet Inception Distance)
-     - IS (Inception Score)
-     - LPIPS (Learned Perceptual Image Patch Similarity)
-     """
-
-     def __init__(self):
-         self.available_metrics = [
-             MetricType.FID,
-             MetricType.IS,
-             MetricType.LPIPS
-         ]
-
-     def evaluate(
-         self,
-         model_path: str,
-         test_images_dir: str,
-         reference_images_dir: Optional[str] = None,
-         metrics: List[str] = None,
-         **kwargs
-     ) -> Dict[str, Any]:
-         """
-         Evaluate image generation model.
-
-         Args:
-             model_path: Path to the image model
-             test_images_dir: Directory with test images
-             reference_images_dir: Directory with reference images
-             metrics: Metrics to compute
-             **kwargs: Additional parameters
-
-         Returns:
-             Image evaluation results
-         """
-         if metrics is None:
-             metrics = [MetricType.FID, MetricType.IS]
-
-         results = {
-             "model_path": model_path,
-             "test_images_dir": test_images_dir,
-             "reference_images_dir": reference_images_dir,
-             "metrics": {}
-         }
-
-         for metric in metrics:
-             try:
-                 if metric == MetricType.FID:
-                     score = self._compute_fid(test_images_dir, reference_images_dir)
-                 elif metric == MetricType.IS:
-                     score = self._compute_is(test_images_dir)
-                 elif metric == MetricType.LPIPS:
-                     score = self._compute_lpips(test_images_dir, reference_images_dir)
-                 else:
-                     logger.warning(f"Unknown image metric: {metric}")
-                     continue
-
-                 results["metrics"][metric] = score
-                 logger.info(f"Computed {metric}: {score}")
-
-             except Exception as e:
-                 logger.error(f"Failed to compute {metric}: {e}")
-                 results["metrics"][metric] = {"error": str(e)}
-
-         return results
-
-     def _compute_fid(self, test_dir: str, reference_dir: Optional[str]) -> Dict[str, float]:
-         """Compute FID score (placeholder implementation)."""
-         # This is a placeholder - actual FID requires complex neural network computations
-         logger.warning("FID computation not fully implemented - returning placeholder")
-         return {
-             "fid": np.random.uniform(20, 100), # Placeholder
-             "note": "Placeholder implementation"
-         }
-
-     def _compute_is(self, images_dir: str) -> Dict[str, float]:
-         """Compute Inception Score (placeholder implementation)."""
-         # This is a placeholder - actual IS requires Inception network
-         logger.warning("IS computation not fully implemented - returning placeholder")
-         return {
-             "is_mean": np.random.uniform(2, 10), # Placeholder
-             "is_std": np.random.uniform(0.1, 1.0),
-             "note": "Placeholder implementation"
-         }
-
-     def _compute_lpips(self, test_dir: str, reference_dir: Optional[str]) -> Dict[str, float]:
-         """Compute LPIPS score (placeholder implementation)."""
-         # This is a placeholder - actual LPIPS requires perceptual loss networks
-         logger.warning("LPIPS computation not fully implemented - returning placeholder")
-         return {
-             "lpips": np.random.uniform(0.1, 0.8), # Placeholder
-             "note": "Placeholder implementation"
-         }
-
-
- class BenchmarkRunner:
-     """
-     Runner for standard AI benchmarks.
-
-     Supports running various benchmarks and collecting results.
-     """
-
-     def __init__(self):
-         self.supported_benchmarks = ["mmlu", "hellaswag", "arc", "gsm8k"]
-
-         # Initialize AI factory if available
-         if AI_FACTORY_AVAILABLE:
-             try:
-                 self.ai_factory = AIFactory()
-             except Exception as e:
-                 logger.warning(f"Failed to initialize AIFactory: {e}")
-                 self.ai_factory = None
-         else:
-             self.ai_factory = None
-
-     def run(
-         self,
-         benchmark,
-         model_path: str,
-         num_shots: int = 0,
-         max_samples: Optional[int] = None,
-         provider: str = "ollama",
-         **kwargs
-     ) -> Dict[str, Any]:
-         """
-         Run a benchmark evaluation.
-
-         Args:
-             benchmark: Benchmark instance
-             model_path: Path to the model
-             num_shots: Number of few-shot examples
-             max_samples: Maximum samples to evaluate
-             provider: Model provider
-             **kwargs: Additional parameters
-
-         Returns:
-             Benchmark results
-         """
-         logger.info(f"Running benchmark {benchmark.name} on {model_path}")
-
-         # Load benchmark data
-         test_data = benchmark.load_data(max_samples=max_samples)
-
-         # Run evaluation
-         results = {
-             "benchmark": benchmark.name,
-             "model_path": model_path,
-             "num_shots": num_shots,
-             "num_samples": len(test_data),
-             "results": {}
-         }
-
-         # Process each sample
-         correct = 0
-         total = 0
-
-         for sample in test_data:
-             try:
-                 # Format prompt using benchmark's method
-                 prompt = benchmark.format_prompt(sample)
-
-                 # Generate prediction using actual model
-                 prediction = self._generate_prediction(
-                     model_path, {"prompt": prompt}, num_shots, provider, **kwargs
-                 )
-
-                 # Check if correct
-                 is_correct = benchmark.evaluate_sample(sample, prediction)
-                 if is_correct:
-                     correct += 1
-                 total += 1
-
-             except Exception as e:
-                 logger.error(f"Failed to process sample: {e}")
-                 continue
-
-         # Calculate final score
-         accuracy = correct / total if total > 0 else 0.0
-
-         results["results"] = {
-             "accuracy": accuracy,
-             "correct": correct,
-             "total": total
-         }
-
-         logger.info(f"Benchmark completed: {accuracy:.3f} accuracy ({correct}/{total})")
-         return results
-
-     def _generate_prediction(
-         self,
-         model_path: str,
-         sample: Dict[str, Any],
-         num_shots: int,
-         provider: str,
-         **kwargs
-     ) -> str:
-         """Generate prediction for a sample using actual model inference."""
-         if not self.ai_factory:
-             logger.warning("AIFactory not available, using placeholder prediction")
-             return "A" # Placeholder answer
-
-         try:
-             # Get LLM service
-             llm_service = self.ai_factory.get_llm(model_name=model_path, provider=provider)
-
-             # Format the prompt (this should be done by the benchmark)
-             if hasattr(sample, 'get'):
-                 prompt = sample.get('prompt', str(sample))
-             else:
-                 prompt = str(sample)
-
-             # Generate prediction using actual model
-             response = llm_service.generate(
-                 prompt=prompt,
-                 max_tokens=kwargs.get("max_tokens", 50),
-                 temperature=kwargs.get("temperature", 0.0) # Low temperature for consistency
-             )
-
-             # Extract text from response
-             if hasattr(response, 'text'):
-                 prediction = response.text
-             elif isinstance(response, dict) and 'text' in response:
-                 prediction = response['text']
-             elif isinstance(response, str):
-                 prediction = response
-             else:
-                 prediction = str(response)
-
-             return prediction.strip()
-
-         except Exception as e:
-             logger.error(f"Failed to generate prediction: {e}")
-             return "A" # Fallback answer
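
For reference, the lexical scores in the removed module are plain token-set arithmetic rather than calls into a scoring library such as nltk or rouge-score. A minimal standalone sketch of that logic (plain functions lifted from the deleted _compute_f1 and _compute_diversity above; illustrative only, not part of either package version):

from typing import Dict, List, Tuple

def token_overlap_f1(prediction: str, reference: str) -> float:
    """Token-set F1, mirroring the deleted LLMMetrics._compute_f1."""
    pred_words = set(prediction.lower().split())
    ref_words = set(reference.lower().split())
    if not pred_words and not ref_words:
        return 1.0
    if not pred_words or not ref_words:
        return 0.0
    intersection = len(pred_words & ref_words)
    precision = intersection / len(pred_words)
    recall = intersection / len(ref_words)
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0

def distinct_n(texts: List[str]) -> Dict[str, float]:
    """Distinct-1/Distinct-2 diversity, mirroring LLMMetrics._compute_diversity."""
    unigrams: List[str] = []
    bigrams: List[Tuple[str, str]] = []
    for text in texts:
        words = text.lower().split()
        unigrams.extend(words)
        bigrams.extend(zip(words, words[1:]))
    return {
        "distinct_1": len(set(unigrams)) / len(unigrams) if unigrams else 0.0,
        "distinct_2": len(set(bigrams)) / len(bigrams) if bigrams else 0.0,
    }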