wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff reflects the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release.

This version of wisent might be problematic.

Files changed (237)
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.1.dist-info/METADATA +67 -0
  215. wisent-0.5.1.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
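
The largest single addition in this release is the new device benchmarking module, wisent/core/agent/device_benchmarks.py (item 62 above), reproduced in full in the hunk below. For orientation, here is a minimal sketch of how its module-level helpers might be called; the import path is an assumption based on the file's location in this wheel (the module body itself imports from wisent_guard, so the actual package layout may differ):

# Illustrative sketch only; the helper functions are defined at the bottom of the hunk below.
# The import path is assumed from the file's location in this wheel.
from wisent.core.agent.device_benchmarks import (
    ensure_benchmark_exists,
    estimate_task_time,
    get_current_device_info,
)

# Runs the benchmark suite once per device and caches the results in
# device_benchmarks.json, keyed by a hash of the hardware/software fingerprint.
benchmark = ensure_benchmark_exists(force_rerun=False)
print(get_current_device_info())  # {"device_id": ..., "device_type": "cpu" / "cuda" / "mps"}

# Estimates scale linearly with quantity from the measured per-100 / per-example rates.
print(estimate_task_time("benchmark_eval", quantity=250))  # seconds for 250 eval examples
print(estimate_task_time("steering", quantity=10))         # seconds for 10 steered examples
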
wisent/core/agent/device_benchmarks.py
@@ -0,0 +1,685 @@
+ """
+ Device-specific performance benchmarking for wisent-guard.
+
+ This module runs quick performance tests on the current device to measure
+ actual execution times for different operations, then saves those estimates
+ for future budget calculations.
+ """
+
+ import json
+ import time
+ import os
+ import tempfile
+ import subprocess
+ import sys
+ from typing import Dict, Any, Optional, List
+ from dataclasses import dataclass, asdict
+ from pathlib import Path
+ import hashlib
+
+ import torch
+
+ from wisent_guard.core.utils.device import resolve_default_device
+
+
+ @dataclass
+ class DeviceBenchmark:
+     """Performance benchmark results for a specific device."""
+     device_id: str
+     device_type: str  # "cpu", "cuda", "mps", etc.
+     model_loading_seconds: float
+     benchmark_eval_seconds_per_100_examples: float
+     classifier_training_seconds_per_100_samples: float  # Actually measures full classifier creation time (per 100 classifiers)
+     data_generation_seconds_per_example: float
+     steering_seconds_per_example: float
+     benchmark_timestamp: float
+     python_version: str
+     platform_info: str
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for JSON serialization."""
+         return asdict(self)
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'DeviceBenchmark':
+         """Create from dictionary loaded from JSON."""
+         return cls(**data)
+
+
+ class DeviceBenchmarker:
+     """Runs performance benchmarks and manages device-specific estimates."""
+
+     def __init__(self, benchmarks_file: str = "device_benchmarks.json"):
+         self.benchmarks_file = benchmarks_file
+         self.cached_benchmark: Optional[DeviceBenchmark] = None
+
+     def get_device_id(self) -> str:
+         """Generate a unique ID for the current device configuration."""
+         import platform
+
+         # Create device fingerprint from hardware/software info
+         info_parts = [
+             platform.machine(),
+             platform.processor(),
+             platform.system(),
+             platform.release(),
+             sys.version,
+         ]
+
+         # Add GPU info if available
+         device_kind = resolve_default_device()
+         if device_kind == "cuda" and torch.cuda.is_available():
+             info_parts.append(f"cuda_{torch.cuda.get_device_name(torch.cuda.current_device())}")
+         elif device_kind == "mps":
+             info_parts.append("mps")
+
+         # Create hash of the combined info
+         combined = "|".join(str(part) for part in info_parts)
+         device_hash = hashlib.md5(combined.encode()).hexdigest()[:12]
+         return device_hash
+
+     def get_device_type(self) -> str:
+         """Detect the device type (cpu, cuda, mps, etc.)."""
+         return resolve_default_device()
+
+     def load_cached_benchmark(self) -> Optional[DeviceBenchmark]:
+         """Load cached benchmark results if they exist and are recent."""
+         if not os.path.exists(self.benchmarks_file):
+             return None
+
+         try:
+             with open(self.benchmarks_file, 'r') as f:
+                 data = json.load(f)
+
+             device_id = self.get_device_id()
+             if device_id not in data:
+                 return None
+
+             benchmark_data = data[device_id]
+             benchmark = DeviceBenchmark.from_dict(benchmark_data)
+
+             # Check if benchmark is recent (within 7 days)
+             current_time = time.time()
+             age_days = (current_time - benchmark.benchmark_timestamp) / (24 * 3600)
+
+             if age_days > 7:
+                 print(f" ⚠️ Cached benchmark is {age_days:.1f} days old, will re-run")
+                 return None
+
+             return benchmark
+
+         except Exception as e:
+             print(f" ⚠️ Error loading cached benchmark: {e}")
+             return None
+
+     def save_benchmark(self, benchmark: DeviceBenchmark) -> None:
+         """Save benchmark results to JSON file."""
+         try:
+             # Load existing data
+             existing_data = {}
+             if os.path.exists(self.benchmarks_file):
+                 with open(self.benchmarks_file, 'r') as f:
+                     existing_data = json.load(f)
+
+             # Update with new benchmark
+             existing_data[benchmark.device_id] = benchmark.to_dict()
+
+             # Save back to file
+             with open(self.benchmarks_file, 'w') as f:
+                 json.dump(existing_data, f, indent=2)
+
+             print(f" 💾 Saved benchmark results to {self.benchmarks_file}")
+
+         except Exception as e:
+             print(f" ❌ Error saving benchmark: {e}")
+
+     def run_model_loading_benchmark(self) -> float:
+         """Benchmark actual model loading time using the real model."""
+         print(" 📊 Benchmarking model loading...")
+
+         # Create actual model loading test script
+         test_script = '''
+ import time
+ import sys
+ sys.path.append('.')
+
+ start_time = time.time()
+ try:
+     from wisent_guard.core.model import Model
+     # Use the actual model that will be used in production
+     model = Model("meta-llama/Llama-3.1-8B-Instruct")
+     end_time = time.time()
+     print(f"BENCHMARK_RESULT:{end_time - start_time}")
+ except Exception as e:
+     print(f"BENCHMARK_ERROR:{e}")
+     raise
+ '''
+
+         try:
+             with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
+                 f.write(test_script)
+                 temp_script = f.name
+
+             # Run with 2-minute timeout
+             result = subprocess.run([
+                 sys.executable, temp_script
+             ], capture_output=True, text=True, timeout=120)
+
+             # Clean up
+             os.unlink(temp_script)
+
+             # Parse result
+             for line in result.stdout.split('\n'):
+                 if line.startswith('BENCHMARK_RESULT:'):
+                     loading_time = float(line.split(':')[1])
+                     print(f" Model loading: {loading_time:.1f}s")
+                     return loading_time
+
+         except Exception as e:
+             print(f" Error in model loading benchmark: {e}")
+             raise RuntimeError(f"Model loading benchmark failed: {e}")
+
+     def run_benchmark_eval_test(self) -> float:
+         """Benchmark evaluation performance using real CLI functionality."""
+         print(" 📊 Benchmarking evaluation performance...")
+         print(" 🔧 DEBUG: Creating evaluation test script...")
+
+         # Create evaluation test script using actual CLI
+         test_script = '''
+ import time
+ import sys
+ sys.path.append('.')
+
+ print("BENCHMARK_DEBUG: Starting evaluation benchmark")
+ start_time = time.time()
+ try:
+     print("BENCHMARK_DEBUG: Importing CLI...")
+     from wisent_guard.cli import run_task_pipeline
+     print("BENCHMARK_DEBUG: CLI imported successfully")
+
+     print("BENCHMARK_DEBUG: Running task pipeline...")
+     # Run actual evaluation with real model and minimal examples
+     run_task_pipeline(
+         task_name="truthfulqa_mc",
+         model_name="meta-llama/Llama-3.1-8B-Instruct",
+         layer="15",  # Required parameter
+         limit=3,  # Minimum examples for timing
+         steering_mode=False,  # No steering for baseline timing
+         verbose=False,
+         allow_small_dataset=True,
+         output_mode="likelihoods"
+     )
+     print("BENCHMARK_DEBUG: Task pipeline completed")
+
+     end_time = time.time()
+     total_time = end_time - start_time
+     print(f"BENCHMARK_DEBUG: Total time: {total_time}s for 3 examples")
+     # Scale to per-100-examples
+     time_per_100 = (total_time / 3) * 100
+     print(f"BENCHMARK_DEBUG: Scaled time per 100: {time_per_100}s")
+     print(f"BENCHMARK_RESULT:{time_per_100}")
+
+ except Exception as e:
+     print(f"BENCHMARK_ERROR:{e}")
+     import traceback
+     traceback.print_exc()
+     raise
+ '''
+
+         print(" 🔧 DEBUG: Writing test script to temporary file...")
+         try:
+             with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
+                 f.write(test_script)
+                 temp_script = f.name
+             print(f" 🔧 DEBUG: Test script written to {temp_script}")
+
+             print(" 🔧 DEBUG: Running evaluation subprocess...")
+             result = subprocess.run([
+                 sys.executable, temp_script
+             ], capture_output=True, text=True, timeout=120)  # 2-minute timeout
+
+             print(f" 🔧 DEBUG: Subprocess completed with return code: {result.returncode}")
+             print(f" 🔧 DEBUG: Stdout length: {len(result.stdout)} chars")
+             print(f" 🔧 DEBUG: Stderr length: {len(result.stderr)} chars")
+
+             if result.stderr:
+                 print(f" ⚠️ DEBUG: Stderr content:\n{result.stderr}")
+
+             os.unlink(temp_script)
+             print(" 🔧 DEBUG: Temporary script cleaned up")
+
+             # Parse result
+             print(" 🔧 DEBUG: Parsing output for BENCHMARK_RESULT...")
+             found_result = False
+             for line in result.stdout.split('\n'):
+                 print(f" 🔍 DEBUG: Output line: {repr(line)}")
+                 if line.startswith('BENCHMARK_RESULT:'):
+                     eval_time = float(line.split(':')[1])
+                     print(f" ✅ Evaluation: {eval_time:.1f}s per 100 examples")
+                     found_result = True
+                     return eval_time
+
+             if not found_result:
+                 print(" ❌ DEBUG: No BENCHMARK_RESULT found in output!")
+                 print(" 📜 DEBUG: Full stdout:")
+                 print(result.stdout)
+                 return None
+
+         except Exception as e:
+             print(f" ❌ Error in evaluation benchmark: {e}")
+             import traceback
+             traceback.print_exc()
+             return None
+
+     def run_classifier_training_test(self) -> float:
+         """Benchmark ACTUAL classifier training using real synthetic classifier creation."""
+         print(" 📊 Benchmarking classifier training...")
+         print(" 🔧 DEBUG: Creating classifier training test script...")
+
+         # Create test script that uses real synthetic classifier creation
+         test_script = '''
+ import time
+ import platform
+ import sys
+ import time
+ from pathlib import Path
+ from typing import Dict, Optional
+
+ start_time = time.time()  # overall script start; referenced below when computing total_time
+ try:
+     print("BENCHMARK_DEBUG: Importing required modules...")
+     from wisent_guard.core.model import Model
+     from wisent_guard.core.agent.diagnose.synthetic_classifier_option import create_classifier_from_trait_description
+     from wisent_guard.core.agent.budget import set_time_budget
+     import time
+     print("BENCHMARK_DEBUG: All modules imported successfully")
+
+     print("BENCHMARK_DEBUG: Starting classifier benchmark")
+
+     # Set a budget for the classifier creation
+     print("BENCHMARK_DEBUG: Setting time budget...")
+     set_time_budget(5.0)  # 5 minutes
+     print("BENCHMARK_DEBUG: Set time budget to 5.0 minutes")
+
+     # Load the actual model
+     print("BENCHMARK_DEBUG: Loading model...")
+     model_start = time.time()
+     model = Model("meta-llama/Llama-3.1-8B-Instruct")
+     model_time = time.time() - model_start
+     print(f"BENCHMARK_DEBUG: Model loaded in {model_time}s")
+
+     # Create ONE actual classifier using the real synthetic process
+     print("BENCHMARK_DEBUG: Creating classifier...")
+     classifier_start = time.time()
+     classifier = create_classifier_from_trait_description(
+         model=model,
+         trait_description="accuracy and truthfulness",
+         num_pairs=3  # Minimum needed for training
+     )
+     classifier_time = time.time() - classifier_start
+     print(f"BENCHMARK_DEBUG: Classifier created in {classifier_time}s")
+
+     end_time = time.time()
+     total_time = end_time - start_time
+     print(f"BENCHMARK_DEBUG: Total benchmark time: {total_time}s")
+
+     # This is time for ONE complete classifier creation
+     # Scale to "per 100 classifiers" for compatibility with existing code
+     time_per_100 = total_time * 100
+     print(f"BENCHMARK_DEBUG: Scaled time per 100 classifiers: {time_per_100}s")
+     print(f"BENCHMARK_RESULT:{time_per_100}")
+
+ except Exception as e:
+     print(f"BENCHMARK_ERROR:{e}")
+     import traceback
+     traceback.print_exc()
+     raise
+ '''
+
+         print(" 🔧 DEBUG: Writing classifier test script to temporary file...")
+         try:
+             with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
+                 f.write(test_script)
+                 temp_script = f.name
+             print(f" 🔧 DEBUG: Classifier test script written to {temp_script}")
+
+             print(" 🔧 DEBUG: Running classifier training subprocess (20 min timeout)...")
+             result = subprocess.run([
+                 sys.executable,
+                 temp_script,
+             ], capture_output=True, text=True, timeout=1200)
+
+             print(f" 🔧 DEBUG: Classifier subprocess completed with return code: {result.returncode}")
+             print(f" 🔧 DEBUG: Stdout length: {len(result.stdout)} chars")
+             print(f" 🔧 DEBUG: Stderr length: {len(result.stderr)} chars")
+
+             if result.stderr:
+                 print(f" ⚠️ DEBUG: Classifier stderr content:\n{result.stderr}")
+
+             os.unlink(temp_script)
+             print(" 🔧 DEBUG: Classifier temporary script cleaned up")
+
+             # Parse result
+             print(" 🔧 DEBUG: Parsing classifier output for BENCHMARK_RESULT...")
+             for line in result.stdout.split('\n'):
+                 print(f" 🔍 DEBUG: Classifier output line: {repr(line)}")
+                 if line.startswith('BENCHMARK_RESULT:'):
+                     training_time = float(line.split(':')[1])
+                     print(f" ✅ Classifier training: {training_time:.1f}s per 100 classifiers")
+                     return training_time
+
+             print(" ❌ DEBUG: No BENCHMARK_RESULT found in classifier output!")
+             print(" 📜 DEBUG: Full classifier stdout:")
+             print(result.stdout)
+             return None
+
+         except Exception as e:
+             print(f" ❌ Error in classifier training benchmark: {e}")
+             import traceback
+             traceback.print_exc()
+             return None
+
+     def run_steering_test(self) -> float:
+         """Benchmark steering performance using real CLI functionality."""
+         print(" 📊 Benchmarking steering performance...")
+
+         # Create steering test script using actual CLI
+         test_script = '''
+ import time
+ import sys
+ sys.path.append('.')
+
+ start_time = time.time()
+ try:
+     from wisent_guard.cli import run_task_pipeline
+
+     # Run actual steering with real model and minimal examples
+     run_task_pipeline(
+         task_name="truthfulqa_mc",
+         model_name="meta-llama/Llama-3.1-8B-Instruct",
+         limit=2,  # Minimum examples for timing
+         steering_mode=True,
+         steering_method="CAA",
+         steering_strength=1.0,
+         layer="15",
+         verbose=False,
+         allow_small_dataset=True,
+         output_mode="likelihoods"
+     )
+
+     end_time = time.time()
+     total_time = end_time - start_time
+     # Time per example
+     time_per_example = total_time / 2
+     print(f"BENCHMARK_RESULT:{time_per_example}")
+
+ except Exception as e:
+     print(f"BENCHMARK_ERROR:{e}")
+     raise
+ '''
+
+         try:
+             with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
+                 f.write(test_script)
+                 temp_script = f.name
+
+             result = subprocess.run([
+                 sys.executable,
+                 temp_script,
+             ], capture_output=True, text=True, timeout=300)
+
+             os.unlink(temp_script)
+
+             for line in result.stdout.split('\n'):
+                 if line.startswith('BENCHMARK_RESULT:'):
+                     steering_time = float(line.split(':')[1])
+                     print(f" Steering: {steering_time:.1f}s per example")
+                     return steering_time
+
+             print(" ❌ No BENCHMARK_RESULT found in steering output!")
+             print(result.stdout)
+             return None
+
+         except Exception as e:
+             print(f" Error in steering benchmark: {e}")
+             raise RuntimeError(f"Steering benchmark failed: {e}")
+
+     def run_data_generation_test(self) -> float:
+         """Benchmark data generation performance using real synthetic generation."""
+         print(" 📊 Benchmarking data generation...")
+
+         # Create data generation test script using actual synthetic pair generation
+         test_script = '''
+ import time
+ import sys
+ sys.path.append('.')
+
+ start_time = time.time()
+ try:
+     from wisent_guard.core.model import Model
+     from wisent_guard.core.contrastive_pairs.generate_synthetically import SyntheticContrastivePairGenerator
+
+     # Load the actual model
+     model = Model("meta-llama/Llama-3.1-8B-Instruct")
+
+     # Create generator and generate actual synthetic pairs
+     generator = SyntheticContrastivePairGenerator(model)
+
+     # Generate a small set of pairs for timing
+     pair_set = generator.generate_contrastive_pair_set(
+         trait_description="accuracy and truthfulness",
+         num_pairs=1,  # Minimum needed for estimation
+         name="benchmark_test"
+     )
+
+     end_time = time.time()
+     total_time = end_time - start_time
+
+     # Calculate time per generated pair (each pair has 2 responses)
+     num_generated_responses = len(pair_set.pairs) * 2
+     if num_generated_responses == 0:
+         raise RuntimeError("No pairs were generated during data generation benchmark")
+
+     time_per_example = total_time / num_generated_responses
+     print(f"BENCHMARK_RESULT:{time_per_example}")
+
+ except Exception as e:
+     print(f"BENCHMARK_ERROR:{e}")
+     raise
+ '''
+
+         try:
+             with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
+                 f.write(test_script)
+                 temp_script = f.name
+
+             result = subprocess.run([
+                 sys.executable, temp_script
+             ], capture_output=True, text=True, timeout=300)  # 5-minute timeout
+
+             os.unlink(temp_script)
+
+             # Parse result
+             for line in result.stdout.split('\n'):
+                 if line.startswith('BENCHMARK_RESULT:'):
+                     generation_time = float(line.split(':')[1])
+                     print(f" Data generation: {generation_time:.1f}s per example")
+                     return generation_time
+
+         except Exception as e:
+             print(f" Error in data generation benchmark: {e}")
+             raise RuntimeError(f"Data generation benchmark failed: {e}")
+
+     def run_full_benchmark(self, force_rerun: bool = False) -> DeviceBenchmark:
+         """Run complete device benchmark suite."""
+         # Check for cached results first
+         if not force_rerun:
+             cached = self.load_cached_benchmark()
+             if cached:
+                 print(f" ✅ Using cached benchmark results (device: {cached.device_id[:8]}...)")
+                 self.cached_benchmark = cached
+                 return cached
+
+         print("🚀 Running device performance benchmark...")
+         print(" This will take 1-2 minutes to measure your hardware performance")
+
+         import platform
+
+         device_id = self.get_device_id()
+         device_type = self.get_device_type()
+
+         print(f" 🖥️ Device ID: {device_id[:8]}... ({device_type})")
+
+         # Run all benchmarks with error handling
+         try:
+             model_loading = self.run_model_loading_benchmark()
+             if model_loading is None:
+                 print(f" ❌ Model loading benchmark returned None")
+                 raise RuntimeError("Model loading benchmark failed")
+         except Exception as e:
+             print(f" ❌ Model loading benchmark failed: {e}")
+             raise
+
+         try:
+             benchmark_eval = self.run_benchmark_eval_test()
+             if benchmark_eval is None:
+                 print(f" ⚠️ Evaluation benchmark returned None, using default value")
+                 benchmark_eval = 60.0  # Default 60 seconds per 100 examples
+         except Exception as e:
+             print(f" ❌ Evaluation benchmark failed: {e}")
+             benchmark_eval = 60.0  # Default fallback
+
+         try:
+             classifier_training = self.run_classifier_training_test()
+             if classifier_training is None:
+                 print(f" ⚠️ Classifier training benchmark returned None, using default value")
+                 classifier_training = 600.0  # Default 600 seconds per 100 classifiers
+         except Exception as e:
+             print(f" ❌ Classifier training benchmark failed: {e}")
+             classifier_training = 600.0  # Default fallback
+
+         try:
+             steering = self.run_steering_test()
+             if steering is None:
+                 print(f" ❌ Steering benchmark returned None")
+                 raise RuntimeError("Steering benchmark failed")
+         except Exception as e:
+             print(f" ❌ Steering benchmark failed: {e}")
+             raise
+
+         try:
+             data_generation = self.run_data_generation_test()
+             if data_generation is None:
+                 print(f" ❌ Data generation benchmark returned None")
+                 raise RuntimeError("Data generation benchmark failed")
+         except Exception as e:
+             print(f" ❌ Data generation benchmark failed: {e}")
+             raise
+
+         # Create benchmark result
+         benchmark = DeviceBenchmark(
+             device_id=device_id,
+             device_type=device_type,
+             model_loading_seconds=model_loading,
+             benchmark_eval_seconds_per_100_examples=benchmark_eval,
+             classifier_training_seconds_per_100_samples=classifier_training,
+             data_generation_seconds_per_example=data_generation,
+             steering_seconds_per_example=steering,
+             benchmark_timestamp=time.time(),
+             python_version=sys.version,
+             platform_info=platform.platform()
+         )
+
+         # Save results
+         self.save_benchmark(benchmark)
+         self.cached_benchmark = benchmark
+
+         print(" ✅ Benchmark complete!")
+         print(f" Model loading: {model_loading:.1f}s")
+         print(f" Evaluation: {benchmark_eval:.1f}s per 100 examples")
+         print(f" Classifier creation: {classifier_training:.1f}s per 100 classifiers")
+         print(f" Steering: {steering:.1f}s per example")
+         print(f" Generation: {data_generation:.1f}s per example")
+
+         return benchmark
+
+     def get_current_benchmark(self, auto_run: bool = True) -> Optional[DeviceBenchmark]:
+         """Get current device benchmark, optionally auto-running if needed."""
+         if self.cached_benchmark:
+             return self.cached_benchmark
+
+         cached = self.load_cached_benchmark()
+         if cached:
+             self.cached_benchmark = cached
+             return cached
+
+         if auto_run:
+             return self.run_full_benchmark()
+
+         return None
+
+     def estimate_task_time(self, task_type: str, quantity: int = 1) -> float:
+         """
+         Estimate time for a specific task type and quantity.
+
+         Args:
+             task_type: Type of task ("model_loading", "benchmark_eval", etc.)
+             quantity: Number of items (examples, samples, etc.)
+
+         Returns:
+             Estimated time in seconds
+         """
+         benchmark = self.get_current_benchmark()
+         if not benchmark:
+             raise RuntimeError(f"No benchmark available for device. Run benchmark first with: python -m wisent_guard.core.agent.budget benchmark")
+         else:
+             # Use actual benchmark results
+             if task_type == "model_loading":
+                 return benchmark.model_loading_seconds
+             elif task_type == "benchmark_eval":
+                 base_time = benchmark.benchmark_eval_seconds_per_100_examples
+                 return (base_time / 100.0) * quantity
+             elif task_type == "classifier_training":
+                 base_time = benchmark.classifier_training_seconds_per_100_samples  # Actually per 100 classifiers now
+                 return (base_time / 100.0) * quantity
+             elif task_type == "steering":
+                 return benchmark.steering_seconds_per_example * quantity
+             elif task_type == "data_generation":
+                 return benchmark.data_generation_seconds_per_example * quantity
+             else:
+                 raise ValueError(f"Unknown task type: {task_type}")
+
+
+ # Global benchmarker instance
+ _device_benchmarker = DeviceBenchmarker()
+
+
+ def get_device_benchmarker() -> DeviceBenchmarker:
+     """Get the global device benchmarker instance."""
+     return _device_benchmarker
+
+
+ def ensure_benchmark_exists(force_rerun: bool = False) -> DeviceBenchmark:
+     """Ensure device benchmark exists, running it if necessary."""
+     return _device_benchmarker.run_full_benchmark(force_rerun=force_rerun)
+
+
+ def estimate_task_time(task_type: str, quantity: int = 1) -> float:
+     """
+     Convenience function to estimate task time.
+
+     Args:
+         task_type: Type of task ("model_loading", "benchmark_eval", etc.)
+         quantity: Number of items
+
+     Returns:
+         Estimated time in seconds
+     """
+     return _device_benchmarker.estimate_task_time(task_type, quantity)
+
+
+ def get_current_device_info() -> Dict[str, str]:
+     """Get current device information."""
+     benchmarker = get_device_benchmarker()
+     return {
+         "device_id": benchmarker.get_device_id(),
+         "device_type": benchmarker.get_device_type()
+ }