wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of wisent might be problematic.

Files changed (237)
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.1.dist-info/METADATA +67 -0
  215. wisent-0.5.1.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
wisent/core/mixed_benchmark_sampler.py
@@ -0,0 +1,364 @@
+ """
+ Mixed Benchmark Sampler for tag-based random sampling across multiple benchmarks.
+
+ This module enables training and evaluation on random samples from multiple benchmarks
+ that share common tags (e.g., 'coding', 'reasoning', 'math').
+ """
+
+ import random
+ import logging
+ from typing import List, Dict, Any, Optional, Set, Tuple
+ from dataclasses import dataclass
+ from collections import defaultdict
+
+ # Suppress BigCode debug output
+ import builtins
+ _original_print = getattr(builtins, '_original_print', builtins.print)
+
+ def _quiet_print(*args, **kwargs):
+     """Filter out BigCode debug messages."""
+     message = ' '.join(str(arg) for arg in args)
+     if any(x in message for x in ['DEBUG', 'Available tasks:', 'ERROR extracting', 'bigcode_eval']):
+         return
+     _original_print(*args, **kwargs)
+
+ # Store original print and patch
+ builtins._original_print = builtins.print
+ builtins.print = _quiet_print
+
+ try:
+     from .lm_harness_integration.only_benchmarks import CORE_BENCHMARKS
+ except ImportError:
+     # Try alternative import path
+     import sys
+     import os
+     current_dir = os.path.dirname(os.path.abspath(__file__))
+     sys.path.insert(0, os.path.join(current_dir, "lm-harness-integration"))
+     from only_benchmarks import CORE_BENCHMARKS
+
+ from .contrastive_pairs import ContrastivePairSet
+ from .managed_cached_benchmarks import ManagedCachedBenchmarks, get_managed_cache
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class BenchmarkSample:
+     """A single sample from a benchmark."""
+     benchmark_name: str
+     sample_data: Dict[str, Any]
+     tags: List[str]
+
+
+ class MixedBenchmarkSampler:
+     """
+     Samples randomly from multiple benchmarks based on tags.
+
+     This creates more robust classifiers by training on diverse data
+     from multiple sources rather than a single benchmark.
+     """
+
+     def __init__(self, cache_dir: str = "./benchmark_cache"):
+         """
+         Initialize the mixed benchmark sampler.
+
+         Args:
+             cache_dir: Directory for cached benchmark data
+         """
+         self.cache_dir = cache_dir
+         self.managed_cache = get_managed_cache(cache_dir)
+         self._benchmark_registry = self._build_benchmark_registry()
+
+     def _build_benchmark_registry(self) -> Dict[str, List[str]]:
+         """Build a registry mapping tags to benchmark names."""
+         tag_to_benchmarks = defaultdict(list)
+
+         for benchmark_name, config in CORE_BENCHMARKS.items():
+             tags = config.get("tags", [])
+             for tag in tags:
+                 tag_to_benchmarks[tag].append(benchmark_name)
+
+         return dict(tag_to_benchmarks)
+
+     def get_benchmarks_by_tag(self, tag: str) -> List[str]:
+         """Get all benchmarks that have a specific tag."""
+         return self._benchmark_registry.get(tag, [])
+
+     def get_benchmarks_by_tags(self, tags: List[str], mode: str = "any") -> List[str]:
+         """
+         Get benchmarks that match the given tags.
+
+         Args:
+             tags: List of tags to match
+             mode: "any" (benchmark has at least one tag) or "all" (benchmark has all tags)
+
+         Returns:
+             List of benchmark names matching the criteria
+         """
+         if mode == "any":
+             # Get benchmarks that have ANY of the specified tags
+             matching_benchmarks = set()
+             for tag in tags:
+                 matching_benchmarks.update(self.get_benchmarks_by_tag(tag))
+             return list(matching_benchmarks)
+
+         elif mode == "all":
+             # Get benchmarks that have ALL of the specified tags
+             if not tags:
+                 return []
+
+             # Start with benchmarks that have the first tag
+             matching_benchmarks = set(self.get_benchmarks_by_tag(tags[0]))
+
+             # Intersect with benchmarks for each additional tag
+             for tag in tags[1:]:
+                 matching_benchmarks &= set(self.get_benchmarks_by_tag(tag))
+
+             return list(matching_benchmarks)
+
+         else:
+             raise ValueError(f"Invalid mode: {mode}. Use 'any' or 'all'")
+
+     def sample_mixed_dataset(
+         self,
+         tags: List[str],
+         total_samples: int,
+         split_ratio: float = 0.8,
+         random_seed: Optional[int] = None,
+         tag_mode: str = "any",
+         benchmark_weights: Optional[Dict[str, float]] = None
+     ) -> Tuple[List[BenchmarkSample], List[BenchmarkSample]]:
+         """
+         Sample a mixed dataset from benchmarks matching the given tags.
+
+         Args:
+             tags: Tags to filter benchmarks (e.g., ["coding", "python"])
+             total_samples: Total number of samples to collect
+             split_ratio: Train/test split ratio
+             random_seed: Random seed for reproducibility
+             tag_mode: "any" or "all" for tag matching
+             benchmark_weights: Optional weights for sampling probability per benchmark
+
+         Returns:
+             Tuple of (train_samples, test_samples)
+         """
+         if random_seed is not None:
+             random.seed(random_seed)
+
+         # Get matching benchmarks
+         matching_benchmarks = self.get_benchmarks_by_tags(tags, mode=tag_mode)
+
+         if not matching_benchmarks:
+             raise ValueError(f"No benchmarks found with tags {tags} (mode={tag_mode})")
+
+         logger.info(f"Found {len(matching_benchmarks)} benchmarks matching tags {tags}")
+         logger.info(f"Matching benchmarks: {matching_benchmarks[:10]}...") # Show first 10
+
+         # Collect all available samples from matching benchmarks
+         all_samples = []
+         benchmark_sample_counts = {}
+
+         # Skip benchmarks that require code execution permission
+         code_execution_benchmarks = {"apps", "ds1000", "mercury"}
+
+         for benchmark_name in matching_benchmarks:
+             # Skip benchmarks that require code execution for safety
+             if benchmark_name in code_execution_benchmarks:
+                 logger.info(f"Skipping {benchmark_name} (requires code execution permission)")
+                 continue
+
+             try:
+                 # Get samples from this benchmark
+                 samples_per_benchmark = max(10, total_samples // len(matching_benchmarks))
+
+                 cached_samples = self.managed_cache.get_task_samples(
+                     task_name=benchmark_name,
+                     limit=samples_per_benchmark,
+                     force_fresh=False
+                 )
+
+                 # Convert to BenchmarkSample objects
+                 for sample in cached_samples:
+                     benchmark_sample = BenchmarkSample(
+                         benchmark_name=benchmark_name,
+                         sample_data=sample,
+                         tags=CORE_BENCHMARKS[benchmark_name].get("tags", [])
+                     )
+                     all_samples.append(benchmark_sample)
+
+                 benchmark_sample_counts[benchmark_name] = len(cached_samples)
+
+             except Exception as e:
+                 logger.warning(f"Failed to load samples from {benchmark_name}: {e}")
+                 continue
+
+         if not all_samples:
+             raise ValueError(f"No samples could be loaded from any benchmark with tags {tags}")
+
+         logger.info(f"Collected {len(all_samples)} total samples from {len(benchmark_sample_counts)} benchmarks")
+         for benchmark, count in benchmark_sample_counts.items():
+             logger.debug(f" {benchmark}: {count} samples")
+
+         # Apply benchmark weights if provided
+         if benchmark_weights:
+             weighted_samples = []
+             for sample in all_samples:
+                 weight = benchmark_weights.get(sample.benchmark_name, 1.0)
+                 # Duplicate samples based on weight (simple approach)
+                 weighted_samples.extend([sample] * int(weight))
+             all_samples = weighted_samples
+
+         # Randomly sample and shuffle
+         if len(all_samples) > total_samples:
+             all_samples = random.sample(all_samples, total_samples)
+         else:
+             # If we have fewer samples than requested, use all and log warning
+             logger.warning(f"Only {len(all_samples)} samples available, requested {total_samples}")
+
+         random.shuffle(all_samples)
+
+         # Split into train/test
+         split_point = int(len(all_samples) * split_ratio)
+         train_samples = all_samples[:split_point]
+         test_samples = all_samples[split_point:]
+
+         # Log distribution
+         train_dist = defaultdict(int)
+         test_dist = defaultdict(int)
+
+         for sample in train_samples:
+             train_dist[sample.benchmark_name] += 1
+
+         for sample in test_samples:
+             test_dist[sample.benchmark_name] += 1
+
+         logger.info(f"Train set: {len(train_samples)} samples from {len(train_dist)} benchmarks")
+         logger.info(f"Test set: {len(test_samples)} samples from {len(test_dist)} benchmarks")
+
+         return train_samples, test_samples
+
+     def extract_contrastive_pairs_from_mixed_samples(
+         self,
+         samples: List[BenchmarkSample]
+     ) -> List[Dict[str, Any]]:
+         """
+         Extract contrastive pairs from mixed benchmark samples.
+
+         Args:
+             samples: List of BenchmarkSample objects
+
+         Returns:
+             List of contrastive pairs with question, correct_answer, incorrect_answer
+         """
+         contrastive_pairs = []
+
+         for sample in samples:
+             try:
+                 # Each sample already has normalized QA pair from managed cache
+                 qa_pair = sample.sample_data.get("normalized", {})
+
+                 if qa_pair and all(k in qa_pair for k in ["question", "correct_answer", "incorrect_answer"]):
+                     # Add benchmark source info
+                     qa_pair["source_benchmark"] = sample.benchmark_name
+                     qa_pair["tags"] = sample.tags
+                     contrastive_pairs.append(qa_pair)
+                 else:
+                     logger.warning(f"Invalid QA pair from {sample.benchmark_name}")
+
+             except Exception as e:
+                 logger.warning(f"Failed to extract pair from {sample.benchmark_name}: {e}")
+                 continue
+
+         logger.info(f"Extracted {len(contrastive_pairs)} contrastive pairs from mixed samples")
+
+         return contrastive_pairs
+
+     def create_mixed_contrastive_pair_set(
+         self,
+         tags: List[str],
+         total_samples: int,
+         name: Optional[str] = None,
+         **kwargs
+     ) -> ContrastivePairSet:
+         """
+         Create a ContrastivePairSet from mixed benchmark samples.
+
+         Args:
+             tags: Tags to filter benchmarks
+             total_samples: Number of samples to include
+             name: Name for the pair set (auto-generated if not provided)
+             **kwargs: Additional arguments for sample_mixed_dataset
+
+         Returns:
+             ContrastivePairSet ready for training
+         """
+         # Sample mixed dataset
+         train_samples, test_samples = self.sample_mixed_dataset(
+             tags=tags,
+             total_samples=total_samples,
+             **kwargs
+         )
+
+         # Extract contrastive pairs
+         all_samples = train_samples + test_samples
+         contrastive_pairs = self.extract_contrastive_pairs_from_mixed_samples(all_samples)
+
+         # Create name if not provided
+         if name is None:
+             name = f"mixed_{'_'.join(tags)}_{total_samples}_samples"
+
+         # Create ContrastivePairSet
+         return ContrastivePairSet.from_contrastive_pairs(
+             name=name,
+             contrastive_pairs=contrastive_pairs,
+             task_type="mixed_benchmark"
+         )
+
+
+ def sample_benchmarks_by_tag(
+     tag: str,
+     samples_per_benchmark: int = 10,
+     max_benchmarks: Optional[int] = None,
+     random_seed: Optional[int] = None
+ ) -> Dict[str, List[Dict[str, Any]]]:
+     """
+     Convenience function to sample from all benchmarks with a specific tag.
+
+     Args:
+         tag: Tag to filter benchmarks (e.g., "coding")
+         samples_per_benchmark: Number of samples from each benchmark
+         max_benchmarks: Maximum number of benchmarks to sample from
+         random_seed: Random seed for reproducibility
+
+     Returns:
+         Dictionary mapping benchmark names to their samples
+     """
+     sampler = MixedBenchmarkSampler()
+
+     # Get all benchmarks with the tag
+     benchmarks = sampler.get_benchmarks_by_tag(tag)
+
+     if max_benchmarks and len(benchmarks) > max_benchmarks:
+         if random_seed is not None:
+             random.seed(random_seed)
+         benchmarks = random.sample(benchmarks, max_benchmarks)
+
+     # Sample from each benchmark
+     results = {}
+     cache = get_managed_cache()
+
+     for benchmark_name in benchmarks:
+         try:
+             samples = cache.get_task_samples(
+                 task_name=benchmark_name,
+                 limit=samples_per_benchmark,
+                 force_fresh=False
+             )
+             results[benchmark_name] = samples
+             logger.info(f"Sampled {len(samples)} from {benchmark_name}")
+
+         except Exception as e:
+             logger.warning(f"Failed to sample from {benchmark_name}: {e}")
+             continue
+
+     return results
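
The sampler above is driven entirely by the tag metadata in CORE_BENCHMARKS plus the managed benchmark cache. As a rough sketch of the intended call flow (assumptions: the module is importable as wisent.core.mixed_benchmark_sampler, cached benchmark data is already available locally, and the "coding"/"reasoning" tags shown actually exist in the registry):

    from wisent.core.mixed_benchmark_sampler import MixedBenchmarkSampler

    sampler = MixedBenchmarkSampler(cache_dir="./benchmark_cache")

    # Benchmarks carrying either of the two tags
    print(sampler.get_benchmarks_by_tags(["coding", "reasoning"], mode="any"))

    # 200 samples drawn across the matching benchmarks, 80/20 train/test split
    train, test = sampler.sample_mixed_dataset(
        tags=["coding", "reasoning"],
        total_samples=200,
        split_ratio=0.8,
        random_seed=42,
    )

    # Or go straight to a ContrastivePairSet for classifier training
    pair_set = sampler.create_mixed_contrastive_pair_set(tags=["coding"], total_samples=100)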
wisent/core/model_config_manager.py
@@ -0,0 +1,330 @@
+ """
+ Model Configuration Manager for storing and retrieving optimal parameters per model.
+ """
+
+ import os
+ import json
+ import logging
+ from typing import Dict, Any, Optional, List
+ from datetime import datetime
+ import hashlib
+ import numpy as np
+
+
+ class NumpyEncoder(json.JSONEncoder):
+     """Custom JSON encoder to handle numpy types."""
+     def default(self, obj):
+         if isinstance(obj, (np.integer, np.int64)):
+             return int(obj)
+         if isinstance(obj, (np.floating, np.float64)):
+             return float(obj)
+         if isinstance(obj, np.ndarray):
+             return obj.tolist()
+         return super().default(obj)
+
+ logger = logging.getLogger(__name__)
+
+
+ class ModelConfigManager:
+     """Manages model-specific configuration files for optimal parameters."""
+
+     def __init__(self, config_dir: Optional[str] = None):
+         """
+         Initialize the ModelConfigManager.
+
+         Args:
+             config_dir: Directory to store config files. If None, uses default location.
+         """
+         if config_dir is None:
+             # Use ~/.wisent-guard/model_configs/ as default
+             home_dir = os.path.expanduser("~")
+             self.config_dir = os.path.join(home_dir, ".wisent-guard", "model_configs")
+         else:
+             self.config_dir = config_dir
+
+         # Create directory if it doesn't exist
+         os.makedirs(self.config_dir, exist_ok=True)
+
+     def _sanitize_model_name(self, model_name: str) -> str:
+         """
+         Convert model name to a safe filename.
+
+         Args:
+             model_name: Original model name (e.g., "meta-llama/Llama-3.1-8B-Instruct")
+
+         Returns:
+             Sanitized filename (e.g., "meta-llama_Llama-3.1-8B-Instruct")
+         """
+         # Replace problematic characters
+         sanitized = model_name.replace("/", "_").replace("\\", "_").replace(":", "_")
+         # Remove any other problematic characters
+         sanitized = "".join(c for c in sanitized if c.isalnum() or c in "._-")
+         return sanitized
+
+     def _get_config_path(self, model_name: str) -> str:
+         """Get the full path to the config file for a model."""
+         sanitized_name = self._sanitize_model_name(model_name)
+         return os.path.join(self.config_dir, f"{sanitized_name}.json")
+
+     def save_model_config(
+         self,
+         model_name: str,
+         classification_layer: int,
+         steering_layer: Optional[int] = None,
+         token_aggregation: str = "average",
+         detection_threshold: float = 0.6,
+         optimization_method: str = "manual",
+         optimization_metrics: Optional[Dict[str, Any]] = None,
+         task_specific_overrides: Optional[Dict[str, Dict[str, Any]]] = None
+     ) -> str:
+         """
+         Save optimal parameters for a model.
+
+         Args:
+             model_name: Name/path of the model
+             classification_layer: Optimal layer for classification
+             steering_layer: Optimal layer for steering (defaults to classification_layer)
+             token_aggregation: Token aggregation method
+             detection_threshold: Detection threshold
+             optimization_method: How these parameters were determined
+             optimization_metrics: Metrics from optimization process
+             task_specific_overrides: Task-specific parameter overrides
+
+         Returns:
+             Path to the saved config file
+         """
+         if steering_layer is None:
+             steering_layer = classification_layer
+
+         config_data = {
+             "model_name": model_name,
+             "created_date": datetime.now().isoformat(),
+             "optimization_method": optimization_method,
+             "optimal_parameters": {
+                 "classification_layer": classification_layer,
+                 "steering_layer": steering_layer,
+                 "token_aggregation": token_aggregation,
+                 "detection_threshold": detection_threshold
+             },
+             "task_specific_overrides": task_specific_overrides or {},
+             "optimization_metrics": optimization_metrics or {},
+             "config_version": "1.0"
+         }
+
+         config_path = self._get_config_path(model_name)
+
+         try:
+             with open(config_path, 'w') as f:
+                 json.dump(config_data, f, indent=2, cls=NumpyEncoder)
+
+             logger.info(f"✅ Model configuration saved: {config_path}")
+             logger.info(f" • Classification layer: {classification_layer}")
+             logger.info(f" • Steering layer: {steering_layer}")
+             logger.info(f" • Token aggregation: {token_aggregation}")
+             logger.info(f" • Detection threshold: {detection_threshold}")
+
+             return config_path
+
+         except Exception as e:
+             logger.error(f"❌ Failed to save model configuration: {e}")
+             raise
+
+     def load_model_config(self, model_name: str) -> Optional[Dict[str, Any]]:
+         """
+         Load optimal parameters for a model.
+
+         Args:
+             model_name: Name/path of the model
+
+         Returns:
+             Configuration dictionary if found, None otherwise
+         """
+         config_path = self._get_config_path(model_name)
+
+         if not os.path.exists(config_path):
+             return None
+
+         try:
+             with open(config_path, 'r') as f:
+                 config_data = json.load(f)
+
+             logger.debug(f"📄 Loaded model configuration: {config_path}")
+             return config_data
+
+         except Exception as e:
+             logger.warning(f"⚠️ Failed to load model configuration: {e}")
+             return None
+
+     def has_model_config(self, model_name: str) -> bool:
+         """Check if a model has a saved configuration."""
+         config_path = self._get_config_path(model_name)
+         return os.path.exists(config_path)
+
+     def update_model_config(self, model_name: str, config_data: Dict[str, Any]) -> str:
+         """
+         Update an existing model configuration.
+
+         Args:
+             model_name: Name/path of the model
+             config_data: Updated configuration dictionary
+
+         Returns:
+             Path to the saved config file
+         """
+         config_path = self._get_config_path(model_name)
+
+         # Update timestamp
+         config_data["updated_date"] = datetime.now().isoformat()
+
+         try:
+             with open(config_path, 'w') as f:
+                 json.dump(config_data, f, indent=2, cls=NumpyEncoder)
+
+             logger.info(f"✅ Model configuration updated: {config_path}")
+             return config_path
+
+         except Exception as e:
+             logger.error(f"❌ Failed to update model configuration: {e}")
+             raise
+
+     def get_optimal_parameters(
+         self,
+         model_name: str,
+         task_name: Optional[str] = None
+     ) -> Optional[Dict[str, Any]]:
+         """
+         Get optimal parameters for a model, with optional task-specific overrides.
+
+         Args:
+             model_name: Name/path of the model
+             task_name: Specific task name for overrides
+
+         Returns:
+             Dictionary of optimal parameters or None if no config exists
+         """
+         config = self.load_model_config(model_name)
+         if not config:
+             return None
+
+         # Start with base optimal parameters
+         optimal_params = config.get("optimal_parameters", {}).copy()
+
+         # Apply task-specific overrides if available
+         if task_name and "task_specific_overrides" in config:
+             task_overrides = config["task_specific_overrides"].get(task_name, {})
+             optimal_params.update(task_overrides)
+
+         return optimal_params
+
+     def get_optimal_sample_size(
+         self,
+         model_name: str,
+         task_name: str,
+         layer: int
+     ) -> Optional[int]:
+         """
+         Get optimal sample size for a specific task and layer.
+
+         Args:
+             model_name: Name/path of the model
+             task_name: Task name
+             layer: Layer index
+
+         Returns:
+             Optimal sample size or None if not found
+         """
+         config = self.load_model_config(model_name)
+         if not config:
+             return None
+
+         # Check if optimal_sample_sizes exists
+         if "optimal_sample_sizes" not in config:
+             return None
+
+         # Navigate the nested structure: optimal_sample_sizes[task][layer]
+         task_sizes = config["optimal_sample_sizes"].get(task_name, {})
+         sample_size = task_sizes.get(str(layer), None)
+
+         return sample_size
+
+     def list_model_configs(self) -> List[Dict[str, Any]]:
+         """
+         List all available model configurations.
+
+         Returns:
+             List of configuration summaries
+         """
+         configs = []
+
+         if not os.path.exists(self.config_dir):
+             return configs
+
+         for filename in os.listdir(self.config_dir):
+             if filename.endswith('.json'):
+                 try:
+                     config_path = os.path.join(self.config_dir, filename)
+                     with open(config_path, 'r') as f:
+                         config_data = json.load(f)
+
+                     summary = {
+                         "model_name": config_data.get("model_name", "unknown"),
+                         "created_date": config_data.get("created_date", "unknown"),
+                         "optimization_method": config_data.get("optimization_method", "unknown"),
+                         "classification_layer": config_data.get("optimal_parameters", {}).get("classification_layer"),
+                         "steering_layer": config_data.get("optimal_parameters", {}).get("steering_layer"),
+                         "config_file": filename
+                     }
+                     configs.append(summary)
+
+                 except Exception as e:
+                     logger.warning(f"⚠️ Failed to read config file {filename}: {e}")
+
+         return configs
+
+     def remove_model_config(self, model_name: str) -> bool:
+         """
+         Remove a model configuration.
+
+         Args:
+             model_name: Name/path of the model
+
+         Returns:
+             True if removed successfully, False otherwise
+         """
+         config_path = self._get_config_path(model_name)
+
+         if not os.path.exists(config_path):
+             logger.warning(f"⚠️ No configuration found for model: {model_name}")
+             return False
+
+         try:
+             os.remove(config_path)
+             logger.info(f"✅ Removed model configuration: {config_path}")
+             return True
+
+         except Exception as e:
+             logger.error(f"❌ Failed to remove model configuration: {e}")
+             return False
+
+
+ # Convenience functions for easy access
+ _default_manager = None
+
+ def get_default_manager() -> ModelConfigManager:
+     """Get the default ModelConfigManager instance."""
+     global _default_manager
+     if _default_manager is None:
+         _default_manager = ModelConfigManager()
+     return _default_manager
+
+ def save_model_config(model_name: str, **kwargs) -> str:
+     """Save model configuration using default manager."""
+     return get_default_manager().save_model_config(model_name, **kwargs)
+
+ def load_model_config(model_name: str) -> Optional[Dict[str, Any]]:
+     """Load model configuration using default manager."""
+     return get_default_manager().load_model_config(model_name)
+
+ def get_optimal_parameters(model_name: str, task_name: Optional[str] = None) -> Optional[Dict[str, Any]]:
+     """Get optimal parameters using default manager."""
+     return get_default_manager().get_optimal_parameters(model_name, task_name)
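
ModelConfigManager keeps one JSON file per model under ~/.wisent-guard/model_configs/, with the model name sanitized into the filename. A minimal usage sketch follows, assuming the module is importable as wisent.core.model_config_manager; the layer index, optimization method, and task name below are illustrative values, not package defaults:

    from wisent.core.model_config_manager import ModelConfigManager

    manager = ModelConfigManager()  # defaults to ~/.wisent-guard/model_configs/

    # Persist the parameters chosen for a model (values here are hypothetical)
    manager.save_model_config(
        model_name="meta-llama/Llama-3.1-8B-Instruct",
        classification_layer=15,
        token_aggregation="average",
        detection_threshold=0.6,
        optimization_method="optuna",
    )

    # Read them back later, with any task-specific overrides applied
    params = manager.get_optimal_parameters(
        "meta-llama/Llama-3.1-8B-Instruct", task_name="winogrande"
    )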