themis-eval 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
themis/__init__.py CHANGED
@@ -4,15 +4,37 @@ The primary interface is the `evaluate()` function:
 
     import themis
     report = themis.evaluate("math500", model="gpt-4", limit=100)
+
+Extension APIs for registering custom components:
+- themis.register_metric() - Register custom metrics
+- themis.register_dataset() - Register custom datasets
+- themis.register_provider() - Register custom model providers
+- themis.register_benchmark() - Register custom benchmark presets
 """
 
 from themis import config, core, evaluation, experiment, generation, project
 from themis._version import __version__
-from themis.api import evaluate
+from themis.api import evaluate, get_registered_metrics, register_metric
+from themis.datasets import register_dataset, list_datasets, is_dataset_registered
+from themis.presets import register_benchmark, list_benchmarks, get_benchmark_preset
+from themis.providers import register_provider
 
 __all__ = [
     # Main API
     "evaluate",
+    # Metrics
+    "register_metric",
+    "get_registered_metrics",
+    # Datasets
+    "register_dataset",
+    "list_datasets",
+    "is_dataset_registered",
+    # Benchmarks
+    "register_benchmark",
+    "list_benchmarks",
+    "get_benchmark_preset",
+    # Providers
+    "register_provider",
     # Submodules
     "config",
     "core",
themis/_version.py CHANGED
@@ -9,7 +9,7 @@ def _detect_version() -> str:
     try:
         return metadata.version("themis-eval")
     except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
-        return "0.2.0"  # Fallback for development
+        return "0.2.2"  # Fallback for development
 
 
 __version__ = _detect_version()
themis/api.py CHANGED
@@ -33,6 +33,7 @@ Example:
 
 from __future__ import annotations
 
+import logging
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Callable, Sequence
@@ -52,6 +53,67 @@ from themis.generation.runner import GenerationRunner
 from themis.generation.templates import PromptTemplate
 from themis.providers import create_provider
 
+# Import provider modules to ensure they register themselves
+try:
+    from themis.generation import clients  # noqa: F401 - registers fake provider
+    from themis.generation.providers import (
+        litellm_provider,  # noqa: F401
+        vllm_provider,  # noqa: F401
+    )
+except ImportError:
+    pass
+
+logger = logging.getLogger(__name__)
+
+
+# Module-level metrics registry for custom metrics
+_METRICS_REGISTRY: dict[str, type] = {}
+
+
+def register_metric(name: str, metric_cls: type) -> None:
+    """Register a custom metric for use in evaluate().
+
+    This allows users to add their own metrics to Themis without modifying
+    the source code. Registered metrics can be used by passing their names
+    to the `metrics` parameter in evaluate().
+
+    Args:
+        name: Metric name (used in evaluate(metrics=[name]))
+        metric_cls: Metric class implementing the Metric interface.
+            Must have a compute() method that takes prediction, references,
+            and metadata parameters.
+
+    Raises:
+        TypeError: If metric_cls is not a class
+        ValueError: If metric_cls doesn't implement the required interface
+
+    Example:
+        >>> from themis.evaluation.metrics import MyCustomMetric
+        >>> themis.register_metric("my_metric", MyCustomMetric)
+        >>> report = themis.evaluate("math500", model="gpt-4", metrics=["my_metric"])
+    """
+    if not isinstance(metric_cls, type):
+        raise TypeError(f"metric_cls must be a class, got {type(metric_cls)}")
+
+    # Validate that it implements the Metric interface
+    if not hasattr(metric_cls, "compute"):
+        raise ValueError(
+            f"{metric_cls.__name__} must implement compute() method. "
+            f"See themis.evaluation.metrics for examples."
+        )
+
+    _METRICS_REGISTRY[name] = metric_cls
+    logger.info(f"Registered custom metric: {name} -> {metric_cls.__name__}")
+
+
+def get_registered_metrics() -> dict[str, type]:
+    """Get all currently registered custom metrics.
+
+    Returns:
+        Dictionary mapping metric names to their classes
+    """
+    return _METRICS_REGISTRY.copy()
+
 
 def evaluate(
     benchmark_or_dataset: str | Sequence[dict[str, Any]],
@@ -123,6 +185,19 @@ def evaluate(
     >>> print(f"Accuracy: {report.evaluation_report.metrics['accuracy']:.2%}")
         Accuracy: 85.00%
     """
+    logger.info("=" * 60)
+    logger.info("Starting Themis evaluation")
+    logger.info(f"Model: {model}")
+    logger.info(f"Workers: {workers}")
+    logger.info(f"Temperature: {temperature}, Max tokens: {max_tokens}")
+    if "api_base" in kwargs:
+        logger.info(f"Custom API base: {kwargs['api_base']}")
+    if "api_key" in kwargs:
+        logger.info("API key: <provided>")
+    else:
+        logger.warning("⚠️ No api_key provided - may fail for custom API endpoints")
+    logger.info("=" * 60)
+
     # Import presets system (lazy import to avoid circular dependencies)
     from themis.presets import get_benchmark_preset, parse_model_name
 
@@ -131,11 +206,23 @@ def evaluate(
 
     if is_benchmark:
         benchmark_name = benchmark_or_dataset
+        logger.info(f"Loading benchmark: {benchmark_name}")
+
         # Get preset configuration
-        preset = get_benchmark_preset(benchmark_name)
+        try:
+            preset = get_benchmark_preset(benchmark_name)
+        except Exception as e:
+            logger.error(f"❌ Failed to get benchmark preset '{benchmark_name}': {e}")
+            raise
 
         # Load dataset using preset loader
-        dataset = preset.load_dataset(limit=limit)
+        logger.info(f"Loading dataset (limit={limit})...")
+        try:
+            dataset = preset.load_dataset(limit=limit)
+            logger.info(f"✅ Loaded {len(dataset)} samples from {benchmark_name}")
+        except Exception as e:
+            logger.error(f"❌ Failed to load dataset: {e}")
+            raise
 
         # Use preset prompt if not overridden
         if prompt is None:
@@ -158,11 +245,14 @@ def evaluate(
         dataset_id_field = preset.dataset_id_field
     else:
         # Custom dataset
+        logger.info("Using custom dataset")
         dataset = list(benchmark_or_dataset)
+        logger.info(f"Custom dataset has {len(dataset)} samples")
 
         # Limit dataset if requested
         if limit is not None:
             dataset = dataset[:limit]
+            logger.info(f"Limited to {len(dataset)} samples")
 
         # Use provided prompt or default
         if prompt is None:
@@ -188,7 +278,15 @@ def evaluate(
         dataset_id_field = "id"
 
     # Parse model name to get provider and options
-    provider_name, model_id, provider_options = parse_model_name(model, **kwargs)
+    logger.info(f"Parsing model configuration...")
+    try:
+        provider_name, model_id, provider_options = parse_model_name(model, **kwargs)
+        logger.info(f"Provider: {provider_name}")
+        logger.info(f"Model ID: {model_id}")
+        logger.debug(f"Provider options: {provider_options}")
+    except Exception as e:
+        logger.error(f"❌ Failed to parse model name '{model}': {e}")
+        raise
 
     # Create model spec
     model_spec = ModelSpec(
@@ -214,17 +312,31 @@ def evaluate(
     )
 
     # Create provider and router
-    provider = create_provider(provider_name, **provider_options)
+    logger.info(f"Creating provider '{provider_name}'...")
+    try:
+        provider = create_provider(provider_name, **provider_options)
+        logger.info(f"✅ Provider created successfully")
+    except KeyError as e:
+        logger.error(f"❌ Provider '{provider_name}' not registered. Available providers: fake, litellm, openai, anthropic, azure, bedrock, gemini, cohere, vllm")
+        logger.error(f"   This usually means the provider module wasn't imported.")
+        raise
+    except Exception as e:
+        logger.error(f"❌ Failed to create provider: {e}")
+        raise
+
     router = ProviderRouter({model_id: provider})
+    logger.debug(f"Router configured for model: {model_id}")
 
     # Create runner
-    runner = GenerationRunner(provider=router)
+    runner = GenerationRunner(provider=router, max_parallel=workers)
+    logger.info(f"Runner configured with {workers} parallel workers")
 
     # Create evaluation pipeline
     pipeline = EvaluationPipeline(
         extractor=extractor,
         metrics=metrics_list,
     )
+    logger.info(f"Evaluation metrics: {[m.name for m in metrics_list]}")
 
     # Determine storage location
     if storage is None:
@@ -235,11 +347,15 @@ def evaluate(
     # Generate run ID if not provided
     if run_id is None:
         run_id = f"run-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+    logger.info(f"Run ID: {run_id}")
+    logger.info(f"Storage: {storage_dir}")
+    logger.info(f"Resume: {resume}")
 
     # Create storage backend
     if isinstance(storage_dir, Path):
         from themis.experiment.storage import ExperimentStorage
         storage_backend = ExperimentStorage(storage_dir)
+        logger.debug(f"Storage backend created at {storage_dir}")
     else:
         # Cloud storage (to be implemented in Phase 3)
         raise NotImplementedError(
@@ -264,15 +380,34 @@ def evaluate(
     )
 
     # Run locally
-    report = orchestrator.run(
-        dataset=dataset,
-        max_samples=limit,
-        run_id=run_id,
-        resume=resume,
-        on_result=on_result,
-    )
+    logger.info("=" * 60)
+    logger.info("🚀 Starting experiment execution...")
+    logger.info("=" * 60)
 
-    return report
+    try:
+        report = orchestrator.run(
+            dataset=dataset,
+            max_samples=limit,
+            run_id=run_id,
+            resume=resume,
+            on_result=on_result,
+        )
+
+        logger.info("=" * 60)
+        logger.info("✅ Evaluation completed successfully!")
+        logger.info(f"   Total samples: {len(report.generation_results)}")
+        logger.info(f"   Successful: {report.metadata.get('successful_generations', 0)}")
+        logger.info(f"   Failed: {report.metadata.get('failed_generations', 0)}")
+        if report.evaluation_report.metrics:
+            logger.info(f"   Metrics: {list(report.evaluation_report.metrics.keys())}")
+        logger.info("=" * 60)
+
+        return report
+    except Exception as e:
+        logger.error("=" * 60)
+        logger.error(f"❌ Evaluation failed: {e}")
+        logger.error("=" * 60)
+        raise
 
 
 def _resolve_metrics(metric_names: list[str]) -> list:
@@ -298,8 +433,8 @@ def _resolve_metrics(metric_names: list[str]) -> list:
     except ImportError:
         nlp_available = False
 
-    # Metric registry
-    METRICS_REGISTRY = {
+    # Built-in metrics registry
+    BUILTIN_METRICS = {
         # Core metrics
         "exact_match": ExactMatch,
         "math_verify": MathVerifyAccuracy,
@@ -308,7 +443,7 @@ def _resolve_metrics(metric_names: list[str]) -> list:
 
     # Add NLP metrics if available
     if nlp_available:
-        METRICS_REGISTRY.update({
+        BUILTIN_METRICS.update({
            "bleu": BLEU,
            "rouge1": lambda: ROUGE(variant=ROUGEVariant.ROUGE_1),
            "rouge2": lambda: ROUGE(variant=ROUGEVariant.ROUGE_2),
@@ -321,6 +456,10 @@ def _resolve_metrics(metric_names: list[str]) -> list:
     # "pass_at_k": PassAtK,
     # "codebleu": CodeBLEU,
 
+    # Merge built-in and custom metrics
+    # Custom metrics can override built-in metrics
+    METRICS_REGISTRY = {**BUILTIN_METRICS, **_METRICS_REGISTRY}
+
     metrics = []
     for name in metric_names:
         if name not in METRICS_REGISTRY:
@@ -340,4 +479,4 @@ def _resolve_metrics(metric_names: list[str]) -> list:
     return metrics
 
 
-__all__ = ["evaluate"]
+__all__ = ["evaluate", "register_metric", "get_registered_metrics"]
themis/experiment/orchestrator.py CHANGED
@@ -2,10 +2,13 @@
 
 from __future__ import annotations
 
+import logging
 from datetime import datetime, timezone
 from typing import Callable, Sequence
 
 from themis.config.schema import IntegrationsConfig
+
+logger = logging.getLogger(__name__)
 from themis.core.entities import (
     EvaluationRecord,
     ExperimentFailure,
@@ -102,6 +105,8 @@ class ExperimentOrchestrator:
         Returns:
             ExperimentReport with generation results, evaluation, and metadata
         """
+        logger.info("Orchestrator: Initializing experiment run")
+
         # Initialize integrations
         self._integrations.initialize_run(
             {
@@ -112,13 +117,23 @@ class ExperimentOrchestrator:
         )
 
         # Prepare dataset
-        dataset_list = self._resolve_dataset(
-            dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
-        )
+        logger.info("Orchestrator: Loading dataset...")
+        try:
+            dataset_list = self._resolve_dataset(
+                dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
+            )
+            logger.info(f"Orchestrator: Dataset loaded ({len(dataset_list)} total samples)")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to load dataset: {e}")
+            raise
+
         selected_dataset = (
             dataset_list[:max_samples] if max_samples is not None else dataset_list
         )
         run_identifier = run_id or self._default_run_id()
+
+        logger.info(f"Orchestrator: Processing {len(selected_dataset)} samples")
+        logger.info(f"Orchestrator: Run ID = {run_identifier}")
 
         # Initialize run in storage (if storage exists and run doesn't exist)
         if self._cache.has_storage:
@@ -130,18 +145,30 @@ class ExperimentOrchestrator:
             self._cache.cache_dataset(run_identifier, dataset_list)
 
         # Expand dataset into generation tasks
-        tasks = list(self._plan.expand(selected_dataset))
+        logger.info("Orchestrator: Expanding dataset into generation tasks...")
+        try:
+            tasks = list(self._plan.expand(selected_dataset))
+            logger.info(f"Orchestrator: Created {len(tasks)} generation tasks")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to expand dataset: {e}")
+            raise
 
         # Build evaluation configuration for cache invalidation
         evaluation_config = self._build_evaluation_config()
 
         # Load cached results if resuming
+        if resume:
+            logger.info("Orchestrator: Loading cached results...")
         cached_records = (
             self._cache.load_cached_records(run_identifier) if resume else {}
         )
         cached_evaluations = (
             self._cache.load_cached_evaluations(run_identifier, evaluation_config) if resume else {}
         )
+        if resume and cached_records:
+            logger.info(f"Orchestrator: Found {len(cached_records)} cached generation records")
+        if resume and cached_evaluations:
+            logger.info(f"Orchestrator: Found {len(cached_evaluations)} cached evaluation records")
 
         # Process tasks: use cached or run new generations
         generation_results: list[GenerationRecord] = []
@@ -178,9 +205,18 @@ class ExperimentOrchestrator:
 
         # Run pending generation tasks
         if pending_tasks:
+            logger.info(f"Orchestrator: Running {len(pending_tasks)} generation tasks...")
+            completed = 0
             for record in self._runner.run(pending_tasks):
+                logger.debug(f"Orchestrator: Received generation record")
                 generation_results.append(record)
+                completed += 1
+
+                # Log progress every 10 samples or at key milestones
+                if completed % 10 == 0 or completed == len(pending_tasks):
+                    logger.info(f"Orchestrator: Generation progress: {completed}/{len(pending_tasks)} ({100*completed//len(pending_tasks)}%)")
 
+                logger.debug(f"Orchestrator: Processing record (cost tracking...)")
                 # Track cost for successful generations
                 if record.output and record.output.usage:
                     usage = record.output.usage
@@ -197,6 +233,7 @@ class ExperimentOrchestrator:
                         cost=cost,
                     )
 
+                logger.debug(f"Orchestrator: Processing record (error handling...)")
                 if record.error:
                     failures.append(
                         ExperimentFailure(
@@ -204,20 +241,35 @@ class ExperimentOrchestrator:
                             message=record.error.message,
                         )
                     )
+
+                logger.debug(f"Orchestrator: Processing record (caching...)")
                 cache_key = experiment_storage.task_cache_key(record.task)
                 if cache_results:
                     self._cache.save_generation_record(
                         run_identifier, record, cache_key
                     )
+
+                logger.debug(f"Orchestrator: Processing record (adding to pending...)")
                 pending_records.append(record)
                 pending_keys.append(cache_key)
+
+                logger.debug(f"Orchestrator: Processing record (callback...)")
                 if on_result:
                     on_result(record)
+                logger.debug(f"Orchestrator: Record processing complete")
 
         # Evaluate pending records
+        logger.info(f"Orchestrator: Preparing to evaluate {len(pending_records)} pending records...")
         if pending_records:
-            new_evaluation_report = self._evaluation.evaluate(pending_records)
+            logger.info(f"Orchestrator: Starting evaluation of {len(pending_records)} records...")
+            try:
+                new_evaluation_report = self._evaluation.evaluate(pending_records)
+                logger.info(f"Orchestrator: ✅ Evaluation complete - got {len(new_evaluation_report.records)} results")
+            except Exception as e:
+                logger.error(f"Orchestrator: ❌ Evaluation failed: {e}")
+                raise
         else:
+            logger.info("Orchestrator: No new records to evaluate (all cached)")
             new_evaluation_report = evaluation_pipeline.EvaluationReport(
                 metrics={}, failures=[], records=[]
             )
@@ -229,12 +281,16 @@ class ExperimentOrchestrator:
         )
 
         # Combine cached and new evaluations
+        logger.info("Orchestrator: Combining cached and new evaluations...")
         evaluation_report = self._combine_evaluations(
             cached_eval_records, new_evaluation_report
         )
+        logger.info(f"Orchestrator: Total evaluation records: {len(evaluation_report.records)}")
 
         # Get cost breakdown
         cost_breakdown = self._cost_tracker.get_breakdown()
+        if cost_breakdown.total_cost > 0:
+            logger.info(f"Orchestrator: Total cost: ${cost_breakdown.total_cost:.4f}")
 
         # Build metadata
         metadata = {
themis/experiment/storage.py CHANGED
@@ -184,7 +184,7 @@ class ExperimentStorage:
         # In-memory caches
         self._task_index: dict[str, set[str]] = {}
         self._template_index: dict[str, dict[str, str]] = {}
-        self._locks: dict[str, int] = {}  # fd for lock files
+        self._locks: dict[str, tuple[int, int]] = {}  # (fd, count) for reentrant locks
 
     def _init_database(self):
         """Initialize SQLite metadata database."""
@@ -253,34 +253,175 @@ class ExperimentStorage:
 
     @contextlib.contextmanager
     def _acquire_lock(self, run_id: str):
-        """Acquire exclusive lock for run directory."""
+        """Acquire exclusive lock for run directory with timeout (reentrant).
+
+        This lock is reentrant within the same thread to prevent deadlocks when
+        the same process acquires the lock multiple times (e.g., start_run()
+        followed by append_record()).
+
+        The lock uses OS-specific file locking:
+        - Unix/Linux/macOS: fcntl.flock with non-blocking retry
+        - Windows: msvcrt.locking
+        - Fallback: No locking (single-process mode)
+
+        Args:
+            run_id: Unique run identifier
+
+        Yields:
+            Context manager that holds the lock
+
+        Raises:
+            TimeoutError: If lock cannot be acquired within 30 seconds
+        """
+        import time
+
+        # Check if we already hold the lock (reentrant)
+        if run_id in self._locks:
+            lock_fd, count = self._locks[run_id]
+            self._locks[run_id] = (lock_fd, count + 1)
+            try:
+                yield
+            finally:
+                # Check if lock still exists (might have been cleaned up by another thread)
+                if run_id in self._locks:
+                    lock_fd, count = self._locks[run_id]
+                    if count > 1:
+                        self._locks[run_id] = (lock_fd, count - 1)
+                    else:
+                        # Last unlock - release the actual lock
+                        self._release_os_lock(lock_fd, run_id)
+            return
+
+        # First time acquiring lock for this run_id
         lock_path = self._get_run_dir(run_id) / ".lock"
         lock_path.parent.mkdir(parents=True, exist_ok=True)
 
-        # Open lock file
-        lock_fd = os.open(lock_path, os.O_CREAT | os.O_RDWR)
+        # Open lock file (OS-independent flags)
+        lock_fd = os.open(str(lock_path), os.O_CREAT | os.O_RDWR)
 
         try:
-            # Acquire exclusive lock (blocking)
-            if sys.platform == "win32":
-                # Windows file locking
-                msvcrt.locking(lock_fd, msvcrt.LK_LOCK, 1)
-            elif FCNTL_AVAILABLE:
-                # Unix file locking
-                fcntl.flock(lock_fd, fcntl.LOCK_EX)
-            # If neither available, proceed without locking (single-process only)
+            # Acquire exclusive lock with timeout
+            self._acquire_os_lock(lock_fd, run_id, lock_path, timeout=30)
 
-            self._locks[run_id] = lock_fd
+            self._locks[run_id] = (lock_fd, 1)
             yield
         finally:
-            # Release lock
-            if sys.platform == "win32":
+            # Release lock (only if this was the outermost lock)
+            if run_id in self._locks:
+                lock_fd, count = self._locks[run_id]
+                if count == 1:
+                    self._release_os_lock(lock_fd, run_id)
+                else:
+                    # Decrement count
+                    self._locks[run_id] = (lock_fd, count - 1)
+
+    def _acquire_os_lock(
+        self,
+        lock_fd: int,
+        run_id: str,
+        lock_path: Path,
+        timeout: int = 30
+    ) -> None:
+        """Acquire OS-specific file lock with timeout.
+
+        Args:
+            lock_fd: File descriptor for lock file
+            run_id: Run identifier (for error messages)
+            lock_path: Path to lock file (for error messages)
+            timeout: Timeout in seconds
+
+        Raises:
+            TimeoutError: If lock cannot be acquired within timeout
+        """
+        import time
+
+        if sys.platform == "win32":
+            # Windows file locking with retry
+            try:
+                import msvcrt
+            except ImportError:
+                # msvcrt not available - single-process mode
+                import logging
+                logger = logging.getLogger(__name__)
+                logger.debug("msvcrt not available. Single-process mode only.")
+                return
+
+            start_time = time.time()
+            while True:
+                try:
+                    msvcrt.locking(lock_fd, msvcrt.LK_NBLCK, 1)
+                    break  # Lock acquired
+                except OSError as e:
+                    # Lock is held by another thread/process (errno 13 Permission denied)
+                    if time.time() - start_time > timeout:
+                        try:
+                            os.close(lock_fd)
+                        except:
+                            pass
+                        raise TimeoutError(
+                            f"Failed to acquire lock for run {run_id} after {timeout}s on Windows. "
+                            f"This usually means another process is holding the lock or a previous process crashed. "
+                            f"Try deleting: {lock_path}"
+                        ) from e
+                    time.sleep(0.1)  # Wait 100ms before retry
+        elif FCNTL_AVAILABLE:
+            # Unix file locking with non-blocking retry
+            start_time = time.time()
+            while True:
+                try:
+                    fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                    break  # Lock acquired
+                except (IOError, OSError) as e:
+                    # Lock is held by another process
+                    if time.time() - start_time > timeout:
+                        try:
+                            os.close(lock_fd)
+                        except:
+                            pass
+                        raise TimeoutError(
+                            f"Failed to acquire lock for run {run_id} after {timeout}s. "
+                            f"This usually means another process is holding the lock or a previous process crashed. "
+                            f"Try: rm -f {lock_path}"
+                        ) from e
+                    time.sleep(0.1)  # Wait 100ms before retry
+        else:
+            # No locking available - single-process mode
+            # This is safe for single-process usage (most common case)
+            import logging
+            logger = logging.getLogger(__name__)
+            logger.debug(
+                f"File locking not available on this platform. "
+                f"Storage will work in single-process mode only."
+            )
+
+    def _release_os_lock(self, lock_fd: int, run_id: str) -> None:
+        """Release OS-specific file lock.
+
+        Args:
+            lock_fd: File descriptor to close
+            run_id: Run identifier (for cleanup)
+        """
+        # Release lock
+        if sys.platform == "win32":
+            try:
+                import msvcrt
                 msvcrt.locking(lock_fd, msvcrt.LK_UNLCK, 1)
-            elif FCNTL_AVAILABLE:
+            except (ImportError, OSError):
+                pass  # Lock may already be released
+        elif FCNTL_AVAILABLE:
+            try:
                 fcntl.flock(lock_fd, fcntl.LOCK_UN)
-
+            except (IOError, OSError):
+                pass  # Lock may already be released
+
+        # Close file descriptor
+        try:
             os.close(lock_fd)
-        self._locks.pop(run_id, None)
+        except OSError:
+            pass  # FD may already be closed
+
+        # Clean up tracking
+        self._locks.pop(run_id, None)
 
     def start_run(
         self,
@@ -456,16 +597,19 @@ class ExperimentStorage:
 
         try:
             if self._config.compression == "gzip":
+                # Close the fd first since gzip.open will open by path
+                os.close(temp_fd)
                 with gzip.open(temp_path, "wt", encoding="utf-8") as f:
                     f.write(json_line)
                     f.flush()
                     os.fsync(f.fileno())
             else:
+                # Use the fd directly
                 with open(temp_fd, "w", encoding="utf-8") as f:
                     f.write(json_line)
                     f.flush()
                     os.fsync(f.fileno())
-            os.close(temp_fd)
+            # fd is closed by context manager, don't close again
 
             # Get target path with compression
             target_path = (
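
The reentrancy added to `_acquire_lock()` can be seen with nested acquisitions. `_acquire_lock` is an internal context manager, so this sketch is purely illustrative of the (fd, count) bookkeeping described in the docstring above:

```python
from pathlib import Path
from themis.experiment.storage import ExperimentStorage

storage = ExperimentStorage(Path("./runs"))

# Nested acquisition within one thread no longer deadlocks: the inner call
# bumps the stored (fd, count) pair; only the outermost exit releases the
# OS-level lock via _release_os_lock().
with storage._acquire_lock("run-1"):
    with storage._acquire_lock("run-1"):
        pass  # count == 2 here; drops back to 1 on exit
```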
themis/generation/providers/litellm_provider.py CHANGED
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import logging
 import threading
 from dataclasses import dataclass
 from typing import Any, Dict
@@ -10,6 +11,8 @@ from themis.core import entities as core_entities
 from themis.interfaces import ModelProvider
 from themis.providers import register_provider
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class LiteLLMProvider(ModelProvider):
@@ -57,7 +60,22 @@ class LiteLLMProvider(ModelProvider):
             litellm.drop_params = self.drop_params
             if self.max_retries > 0:
                 litellm.num_retries = self.max_retries
+
+            logger.debug(f"LiteLLMProvider initialized:")
+            logger.debug(f"  api_base: {self.api_base or 'default'}")
+            logger.debug(f"  timeout: {self.timeout}s")
+            logger.debug(f"  max_retries: {self.max_retries}")
+            logger.debug(f"  n_parallel: {self.n_parallel}")
+
+            # Warn if api_base is set but no api_key
+            if self.api_base and not self.api_key:
+                logger.warning(
+                    "⚠️ LiteLLMProvider: api_base is set but api_key is not. "
+                    "This may cause authentication errors. "
+                    "Set api_key='dummy' for local servers."
+                )
         except ImportError as exc:
+            logger.error("❌ LiteLLM is not installed")
             raise RuntimeError(
                 "LiteLLM is not installed. Install via `pip install litellm` or "
                 "`uv add litellm` to use LiteLLMProvider."
@@ -70,6 +88,10 @@ class LiteLLMProvider(ModelProvider):
 
         messages = self._build_messages(task)
         completion_kwargs = self._build_completion_kwargs(task, messages)
+
+        logger.debug(f"LiteLLMProvider: Calling model={completion_kwargs.get('model')}")
+        if self.api_base:
+            logger.debug(f"LiteLLMProvider: Using custom api_base={self.api_base}")
 
         try:
             with self._semaphore:
@@ -131,6 +153,30 @@ class LiteLLMProvider(ModelProvider):
                 details["status_code"] = exc.status_code  # type: ignore
             if hasattr(exc, "llm_provider"):
                 details["llm_provider"] = exc.llm_provider  # type: ignore
+
+            # Log with helpful context
+            if "AuthenticationError" in error_type or "api_key" in error_message.lower():
+                logger.error(
+                    f"LiteLLMProvider: ❌ Authentication error for model {task.model.identifier}"
+                )
+                logger.error(
+                    f"  Error: {error_message[:200]}"
+                )
+                logger.error(
+                    f"  Hint: If using a custom api_base, ensure you also pass api_key='dummy'"
+                )
+            elif "Connection" in error_type or "timeout" in error_message.lower():
+                logger.error(
+                    f"LiteLLMProvider: ❌ Connection error for model {task.model.identifier}"
+                )
+                logger.error(f"  Error: {error_message[:200]}")
+                if self.api_base:
+                    logger.error(f"  Check that the server at {self.api_base} is running")
+            else:
+                logger.error(
+                    f"LiteLLMProvider: ❌ Generation failed for {task.model.identifier}: "
+                    f"{error_type}: {error_message[:200]}"
+                )
 
         return core_entities.GenerationRecord(
             task=task,
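
The new warning and error hints target a common misconfiguration: pointing `api_base` at a local OpenAI-compatible server without an API key. A sketch of the suggested call (the endpoint URL and model name are placeholders; `api_base` and `api_key` flow through `evaluate()`'s kwargs as shown earlier):

```python
import themis

# For a local OpenAI-compatible server, pass a dummy key as the log hint suggests.
report = themis.evaluate(
    "math500",
    model="openai/my-local-model",        # placeholder model name
    api_base="http://localhost:8000/v1",  # placeholder endpoint
    api_key="dummy",                      # avoids the authentication warning above
    limit=10,
)
```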
themis/generation/runner.py CHANGED
@@ -49,16 +49,32 @@ class GenerationRunner:
     ) -> Iterator[core_entities.GenerationRecord]:
         task_list = list(tasks)
         if not task_list:
+            logger.info("Runner: No tasks to execute")
             return
+
+        logger.info(f"Runner: Starting execution of {len(task_list)} tasks with {self._max_parallel} workers")
+
         if self._max_parallel <= 1:
-            for task in task_list:
+            logger.info("Runner: Using sequential execution (1 worker)")
+            for i, task in enumerate(task_list, 1):
+                logger.debug(f"Runner: Processing task {i}/{len(task_list)}")
                 yield self._execute_task(task)
             return
 
+        logger.info(f"Runner: Using parallel execution ({self._max_parallel} workers)")
         with ThreadPoolExecutor(max_workers=self._max_parallel) as executor:
             futures = [executor.submit(self._execute_task, task) for task in task_list]
+            completed = 0
             for future in futures:
-                yield future.result()
+                try:
+                    result = future.result()
+                    completed += 1
+                    if completed % max(1, len(task_list) // 10) == 0 or completed == len(task_list):
+                        logger.debug(f"Runner: Completed {completed}/{len(task_list)} tasks")
+                    yield result
+                except Exception as e:
+                    logger.error(f"Runner: Task execution failed: {e}")
+                    raise
 
     def _run_single_attempt(
         self, task: core_entities.GenerationTask
@@ -70,7 +86,7 @@ class GenerationRunner:
         for attempt in range(1, self._max_retries + 1):
             try:
                 logger.debug(
-                    "Starting generation for %s attempt %s/%s",
+                    "Runner: Starting generation for %s (attempt %s/%s)",
                     task_label,
                     attempt,
                     self._max_retries,
@@ -79,16 +95,16 @@ class GenerationRunner:
                 record.metrics["generation_attempts"] = attempt
                 if attempt_errors:
                     record.metrics.setdefault("retry_errors", attempt_errors)
-                logger.debug("Completed %s in %s attempt(s)", task_label, attempt)
+                logger.debug("Runner: ✅ Completed %s in %s attempt(s)", task_label, attempt)
                 return record
             except Exception as exc:  # pragma: no cover - defensive path
                 last_error = exc
                 logger.warning(
-                    "Attempt %s/%s for %s failed: %s",
+                    "Runner: ⚠️ Attempt %s/%s for %s failed: %s",
                     attempt,
                     self._max_retries,
                     task_label,
-                    exc,
+                    str(exc)[:100],  # Truncate long error messages
                 )
                 attempt_errors.append(
                     {
themis/presets/__init__.py CHANGED
@@ -4,7 +4,18 @@ This module provides automatic configuration for popular benchmarks,
 eliminating the need for manual setup of prompts, metrics, and extractors.
 """
 
-from themis.presets.benchmarks import get_benchmark_preset, list_benchmarks
+from themis.presets.benchmarks import (
+    BenchmarkPreset,
+    get_benchmark_preset,
+    list_benchmarks,
+    register_benchmark,
+)
 from themis.presets.models import parse_model_name
 
-__all__ = ["get_benchmark_preset", "list_benchmarks", "parse_model_name"]
+__all__ = [
+    "BenchmarkPreset",
+    "register_benchmark",
+    "get_benchmark_preset",
+    "list_benchmarks",
+    "parse_model_name",
+]
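
With `BenchmarkPreset` and `register_benchmark` now re-exported, preset lookup can be driven directly from this package. A short sketch using only names confirmed elsewhere in this diff (`load_dataset(limit=...)` and `dataset_id_field` appear in `themis/api.py` above):

```python
from themis.presets import get_benchmark_preset, list_benchmarks

print(list_benchmarks())                  # available preset names
preset = get_benchmark_preset("math500")
dataset = preset.load_dataset(limit=5)    # same loader evaluate() uses internally
print(preset.dataset_id_field, len(dataset))
```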
themis/utils/logging_utils.py CHANGED
@@ -5,6 +5,9 @@ from __future__ import annotations
 import logging
 from typing import Mapping
 
+from rich.logging import RichHandler
+from rich.traceback import install as install_rich_traceback
+
 TRACE_LEVEL = 5
 logging.addLevelName(TRACE_LEVEL, "TRACE")
 
@@ -28,12 +31,14 @@ _LEVELS: Mapping[str, int] = {
 
 def configure_logging(level: str = "info") -> None:
     """Configure root logging with human-friendly formatting."""
-
+    install_rich_traceback()
     numeric_level = _LEVELS.get(level.lower(), logging.INFO)
+
     logging.basicConfig(
         level=numeric_level,
-        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
-        datefmt="%H:%M:%S",
+        format="%(message)s",
+        datefmt="[%X]",
+        handlers=[RichHandler(rich_tracebacks=True, markup=True)],
         force=True,
     )
 
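
`configure_logging()` now routes all output through a `RichHandler` and installs rich tracebacks, so calling it once at startup is enough to get the formatted logs shown throughout this release:

```python
from themis.utils.logging_utils import configure_logging

# Level names come from the module's _LEVELS mapping, e.g. "info" or "debug".
configure_logging("debug")
```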
themis/utils/progress.py CHANGED
@@ -5,7 +5,16 @@ from __future__ import annotations
 from contextlib import AbstractContextManager
 from typing import Any, Callable
 
-from tqdm import tqdm
+from rich.progress import (
+    BarColumn,
+    MofNCompleteColumn,
+    Progress,
+    SpinnerColumn,
+    TaskProgressColumn,
+    TextColumn,
+    TimeElapsedColumn,
+    TimeRemainingColumn,
+)
 
 
 class ProgressReporter(AbstractContextManager["ProgressReporter"]):
@@ -21,7 +30,8 @@ class ProgressReporter(AbstractContextManager["ProgressReporter"]):
         self._description = description
         self._unit = unit
         self._leave = leave
-        self._pbar: tqdm | None = None
+        self._progress: Progress | None = None
+        self._task_id = None
 
     def __enter__(self) -> "ProgressReporter":
         self.start()
@@ -31,22 +41,31 @@ class ProgressReporter(AbstractContextManager["ProgressReporter"]):
         self.close()
 
     def start(self) -> None:
-        if self._pbar is None:
-            self._pbar = tqdm(
-                total=self._total,
-                desc=self._description,
-                unit=self._unit,
-                leave=self._leave,
+        if self._progress is None:
+            self._progress = Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                MofNCompleteColumn(),
+                TimeElapsedColumn(),
+                TimeRemainingColumn(),
+                transient=not self._leave,
+            )
+            self._progress.start()
+            self._task_id = self._progress.add_task(
+                self._description, total=self._total
             )
 
     def close(self) -> None:
-        if self._pbar is not None:
-            self._pbar.close()
-            self._pbar = None
+        if self._progress is not None:
+            self._progress.stop()
+            self._progress = None
+            self._task_id = None
 
     def increment(self, step: int = 1) -> None:
-        if self._pbar is not None:
-            self._pbar.update(step)
+        if self._progress is not None and self._task_id is not None:
+            self._progress.update(self._task_id, advance=step)
 
     def on_result(self, _record: Any) -> None:
         self.increment()
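
The rewritten `ProgressReporter` keeps the same public surface (`start()`, `close()`, `increment()`, `on_result()`), so it can still be wired to `evaluate()`'s `on_result` callback. A sketch — the constructor keywords are assumptions inferred from the `self._total`/`self._description` fields above:

```python
import themis
from themis.utils.progress import ProgressReporter

with ProgressReporter(total=100, description="Evaluating") as reporter:
    # on_result is invoked once per generation record, advancing the rich bar.
    themis.evaluate("math500", model="gpt-4", limit=100, on_result=reporter.on_result)
```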
themis_eval-0.2.0.dist-info/METADATA → themis_eval-0.2.2.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: themis-eval
-Version: 0.2.0
+Version: 0.2.2
 Summary: Lightweight evaluation platform for LLM experiments
 Author: Pittawat Taveekitworachai
 License: MIT
@@ -25,6 +25,7 @@ Requires-Dist: tabulate>=0.9.0
 Requires-Dist: tenacity>=9.1.2
 Requires-Dist: plotly>=6.5.0
 Requires-Dist: math-verify>=0.8.0
+Requires-Dist: rich>=14.2.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0; extra == "dev"
 Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
@@ -358,9 +359,9 @@ Themis is built on a clean, modular architecture:
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[Extending Backends](docs/EXTENDING_BACKENDS.md)** - Custom storage and execution
-- **[API Server](docs/API_SERVER.md)** - Web dashboard and REST API
-- **[Comparison Engine](docs/COMPARISON.md)** - Statistical testing guide
+- **[Extending Backends](docs/customization/backends.md)** - Custom storage and execution
+- **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
+- **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
 ---
 
@@ -388,7 +389,7 @@ result = evaluate(
 )
 ```
 
-See [EXTENDING_BACKENDS.md](docs/EXTENDING_BACKENDS.md) for details.
+See [docs/customization/backends.md](docs/customization/backends.md) for details.
 
 ### Distributed Execution
 
themis_eval-0.2.0.dist-info/RECORD → themis_eval-0.2.2.dist-info/RECORD
@@ -1,6 +1,6 @@
-themis/__init__.py,sha256=Pswn5ZiXyU5ANoknjdBLkqouZQdeWMm3DoUMVzU_j8M,543
-themis/_version.py,sha256=xRJB6N107oMsasuLYKaoIzuBo5Oe2hlK3-lGyTzxAC8,378
-themis/api.py,sha256=myHeMaWQMnyjCUAlr9P6cX2Awt50q1XGtyKDCimJgCg,12077
+themis/__init__.py,sha256=rQL3njf3i5lnAcmu0HuRzGGMELbA9xX21hzw4HrbIxw,1394
+themis/_version.py,sha256=y0Oqv0Je2udPmKCy5_D8Lib7GNLGxtLVp8b5WdavITg,378
+themis/api.py,sha256=flZTbU-jRcbv7oXcfRKG4hkZjASmWlT52A4PghKj9G0,17700
 themis/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 themis/backends/__init__.py,sha256=RWM5SnV5FrS_cVjpHHeZZM_b9CgqBu1rPS5DlT5YQTY,578
 themis/backends/execution.py,sha256=RAFuB9ri8TMil5PcnsisypKO2ViyLFXj08P_vjNYguU,6095
@@ -112,9 +112,9 @@ themis/experiment/export_csv.py,sha256=80w3gEGjeLjuiNq539rRP73k3MBtwrzJy90hgE91A
 themis/experiment/integration_manager.py,sha256=wTVTjDGcUkzz4tfnwSxa5nK1A4e2FKCPazDYGcdzYS8,3325
 themis/experiment/math.py,sha256=P2E9F_UKI7pb-aXepSztGdr_g309WEMe83zqg1nWO7A,6973
 themis/experiment/mcq.py,sha256=DDB99FHQsU_5vMIRDRhSZ7pReYvVf57wLmmo3OU_An4,6276
-themis/experiment/orchestrator.py,sha256=-6epspKnPoAJQPKzoNAxd54MrEX3lIhrKyqQ9dmD00A,16120
+themis/experiment/orchestrator.py,sha256=VeSasDmCXrYlrv1r47I698RUq14vEBR7c_uyZzM01hw,19304
 themis/experiment/pricing.py,sha256=fTM32yE3L8vahMP4sr1zr7dbp9zYCjiPN4D4VuZ8-q8,9346
-themis/experiment/storage.py,sha256=QS3fJD79bzgodM5x79yJ2A69O5hTL2r2ROAKSvtRnkI,49471
+themis/experiment/storage.py,sha256=ujGiQTeRPOfS8hYHB1a7F9t-dQnXquhqomI1vDjqmno,55250
 themis/experiment/visualization.py,sha256=dJYHrp3mntl8CPc5HPI3iKqPztVsddQB3ogRkd_FCNc,18473
 themis/generation/__init__.py,sha256=6KVwCQYMpPIsXNuWDZOGuqHkUkA45lbSacIFn8ZbD4s,36
 themis/generation/agentic_runner.py,sha256=armBQBk7qZDBEwT8HqjIWomYDQm57NfrP5CZJzay2uA,13669
@@ -123,18 +123,18 @@ themis/generation/clients.py,sha256=6apXCp_VNQosnpnmohTHOhHGXw-VZgsUyLds8MwtYUE,
 themis/generation/conversation_runner.py,sha256=kSZHwEvfqzxZ-eQYxmg5OkNZcgEHggZExjad6nBOeTM,7980
 themis/generation/plan.py,sha256=RmPIdefXkQMHYv5EWiilpx91I9a-svw31imvG0wV3fE,15961
 themis/generation/router.py,sha256=jZc0KFL483f8TrYtt9yxzFKs-T9CG2CoE2kfOQdHMEc,1082
-themis/generation/runner.py,sha256=iHTE5vSMWMYRrv4PEWMaZflF939nv1wWccK8V0e092c,8009
+themis/generation/runner.py,sha256=pH4Dw77qskMQk3yxEkaHYAl1PItTofI7OXdvevnFiCA,8984
 themis/generation/strategies.py,sha256=hjqaVkNycFxJWh_edJ7ilBl7HS6bL-8pYm24zTfoAvg,2975
 themis/generation/templates.py,sha256=ut_6akp8Y6Ey_9O3s64jDbwCB74pw62Zf8URlYcKHkA,2325
 themis/generation/turn_strategies.py,sha256=w33qhzpQbGTsfeOgOgMDovV0wEeXeNZUUBm5yZy1naw,10973
 themis/generation/types.py,sha256=MkJnZk6lMHmHzlJVEsuIC9ioRW8XhWcSk9AdDeb_aLE,338
-themis/generation/providers/litellm_provider.py,sha256=rlTuglIwhcvSakCo5G-ffgQtEHbCEX0ZeKk6M1MaWmU,8155
+themis/generation/providers/litellm_provider.py,sha256=tvLY8hrSjo4CnyWzccFp1PkXj8R2j8pda5irJiarWd8,10334
 themis/generation/providers/vllm_provider.py,sha256=0K4we6xDrRXlBXseC1ixLq2sJpRF4T8Ikv45dw-zNk4,4625
 themis/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 themis/integrations/huggingface.py,sha256=vrLwYwn65pU4W3FUe0ImCOZxKKlpRshDqMoLFsclB3E,2370
 themis/integrations/wandb.py,sha256=LJOPojjlqG05EIPxcjy3QmA15Gxgs1db3encDWVzYYw,2545
 themis/interfaces/__init__.py,sha256=78dNE_eHfFmb9hXNy5sLZ1jOTGWS8TzdVE_eiYQPFVc,5967
-themis/presets/__init__.py,sha256=hkoyODYiWFFSQAIKTpEbAIUuFIwTibBhzTOkiTbzhVQ,411
+themis/presets/__init__.py,sha256=w58fJcy4eNiE034qHO2xE5pp-H-4LNLXo5hLMuC7wIQ,533
 themis/presets/benchmarks.py,sha256=s9JxRogHwZs8oiuiI7Z7uiUBZXEp3gg7AQZnBvdGieA,12026
 themis/presets/models.py,sha256=c6-I_drHa4vMLIajSkCcrFbsJOsauFjY8fU1leBxZLg,5173
 themis/project/__init__.py,sha256=vgLv2nS62yz1XsFSFzFf7eIo6FyQJXpOY9OPRUcTQLQ,465
@@ -147,11 +147,11 @@ themis/server/app.py,sha256=OZ39gCC47AXVqZxroC_4KtIYBYx_rfpde7C25AF3EI0,11166
 themis/utils/api_generator.py,sha256=3oQ7mGZlFx2Dpm45pMg3rNIqNK2Smj05PjOMXp5RIkQ,10776
 themis/utils/cost_tracking.py,sha256=9_Z2iTfNaQse9G_bnqn4hme4T0fG2W-fxOLEDeF_3VI,11545
 themis/utils/dashboard.py,sha256=2yiIu9_oENglTde_J3G1d5cpQ5VtSnfbUvdliw5Og1E,13008
-themis/utils/logging_utils.py,sha256=YNSiDfO4LsciSzUhHF1aTVI5rkfnWiVbn1NcGjjmJuQ,1019
-themis/utils/progress.py,sha256=b3YwHKV5x3Cvr5rBukqifJimK3Si4CGY2fpN6a_ZySI,1434
+themis/utils/logging_utils.py,sha256=buC64X-xOu-2SZ0wVkz3nCXzYVGiqKbxK-8DGSGsAdM,1173
+themis/utils/progress.py,sha256=HS0-yVbRT7Ai9zRlsJcex_OKP6dUiKx1vOp_IsobiHM,2097
 themis/utils/tracing.py,sha256=VTeiRjcW_B5fOOoSeAp37nrmlwP1DiqPcoe6OtIQ7dk,8468
-themis_eval-0.2.0.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
-themis_eval-0.2.0.dist-info/METADATA,sha256=S4dy0AD2REsRtPfULUYMiYC2Zk8nWgz4BWjBBJz2gHU,15173
-themis_eval-0.2.0.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
-themis_eval-0.2.0.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
-themis_eval-0.2.0.dist-info/RECORD,,
+themis_eval-0.2.2.dist-info/licenses/LICENSE,sha256=K5FLE7iqn5-_6k1sf3IGy7w-Wx_Vdx3t0sOVJByNlF0,1076
+themis_eval-0.2.2.dist-info/METADATA,sha256=eOlF2Obimv_822azCt0vwhLaBz3CKsuvJPgDHMA3WFU,15235
+themis_eval-0.2.2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+themis_eval-0.2.2.dist-info/top_level.txt,sha256=QGIl4v-KB32upFS5UTXMJxHVX3vF7yBso82wJFI1Vbs,7
+themis_eval-0.2.2.dist-info/RECORD,,