PyPI - wisent - Versions diffs - 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl - Mend

wisent 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (391) hide show

wisent/examples/scripts/test_all_benchmarks.py ADDED Viewed

@@ -0,0 +1,138 @@
+"""Test all benchmarks to verify extractor and evaluator work."""
+import json
+import os
+import sys
+import signal
+from contextlib import contextmanager
+from pathlib import Path
+from wisent.examples.scripts.test_one_benchmark import test_benchmark
+# Set environment variable to trust remote code for datasets like meddialog
+os.environ['HF_DATASETS_TRUST_REMOTE_CODE'] = '1'
+# Set environment variable to allow code eval for coding tasks like humaneval, instructhumaneval
+os.environ['HF_ALLOW_CODE_EVAL'] = '1'
+class TimeoutError(Exception):
+    """Raised when a test times out."""
+    pass
+@contextmanager
+def timeout(seconds):
+    """Context manager for timing out operations."""
+    def signal_handler(signum, frame):
+        raise TimeoutError(f"Test timed out after {seconds} seconds")
+    # Set the signal handler and alarm
+    old_handler = signal.signal(signal.SIGALRM, signal_handler)
+    signal.alarm(seconds)
+    try:
+        yield
+    finally:
+        signal.alarm(0)
+        signal.signal(signal.SIGALRM, old_handler)
+def load_benchmarks():
+    """Load benchmarks from central registry."""
+    from wisent.core.benchmark_registry import get_all_benchmarks, get_broken_tasks
+    broken_tasks = get_broken_tasks()
+    if broken_tasks:
+        print(f"Skipping {len(broken_tasks)} broken benchmarks: {', '.join(broken_tasks)}")
+    return get_all_benchmarks()
+BENCHMARKS = load_benchmarks()  # evaluated once, at import time; iterated by test_all_benchmarks
+def test_all_benchmarks(model_name: str = "meta-llama/Llama-3.1-8B-Instruct", output_dir: str = ".", start_index: int = 0):
+    """Test all benchmarks.
+    Args:
+        model_name: Model to use for testing
+        output_dir: Directory to save results
+        start_index: Index to start testing from (0-based)
+    Returns:
+        Dictionary with results for each benchmark
+    """
+    results = {
+        "model": model_name,
+        "total": len(BENCHMARKS),
+        "passed": 0,
+        "failed": 0,
+        "benchmarks": {}
+    }
+    print(f"\n{'='*70}")
+    print(f"Testing {len(BENCHMARKS)} benchmarks with {model_name}")
+    if start_index > 0:
+        print(f"Starting from benchmark {start_index + 1} ({BENCHMARKS[start_index]})")
+    print(f"{'='*70}\n")
+    for i, benchmark in enumerate(BENCHMARKS, 1):
+        if i - 1 < start_index:
+            continue
+        print(f"[{i}/{len(BENCHMARKS)}] Testing {benchmark}...")
+        try:
+            with timeout(1200):
+                success = test_benchmark(benchmark, model_name, output_dir)
+            results["benchmarks"][benchmark] = {
+                "status": "passed" if success else "failed",
+                "success": success
+            }
+            if success:
+                results["passed"] += 1
+                print(f"   PASSED\n")
+            else:
+                results["failed"] += 1
+                print(f"   FAILED\n")
+        except TimeoutError as e:
+            results["benchmarks"][benchmark] = {
+                "status": "timeout",
+                "success": False,
+                "error": str(e)
+            }
+            results["failed"] += 1
+            print(f"   TIMEOUT: {e}\n")
+        except Exception as e:
+            results["benchmarks"][benchmark] = {
+                "status": "error",
+                "success": False,
+                "error": str(e)
+            }
+            results["failed"] += 1
+            print(f"   ERROR: {e}\n")
+    print(f"\n{'='*70}")
+    print(f"SUMMARY")
+    print(f"{'='*70}")
+    print(f"Total: {results['total']}")
+    print(f"Passed: {results['passed']}")
+    print(f"Failed: {results['failed']}")
+    print(f"Success rate: {results['passed']/results['total']*100:.1f}%")
+    print(f"{'='*70}\n")
+    return results
+if __name__ == "__main__":
+    model = sys.argv[1] if len(sys.argv) > 1 else "meta-llama/Llama-3.1-8B-Instruct"
+    # Default to results directory in scripts folder
+    default_output = Path(__file__).parent / "results"
+    output_dir = sys.argv[2] if len(sys.argv) > 2 else str(default_output)
+    start_index = int(sys.argv[3]) if len(sys.argv) > 3 else 0
+    results = test_all_benchmarks(model, output_dir, start_index)
+    # Exit with appropriate code
+    sys.exit(0 if results["failed"] == 0 else 1)

wisent/examples/scripts/test_all_benchmarks_new.py ADDED Viewed

@@ -0,0 +1,28 @@
+"""Test all benchmarks to verify extractor and evaluator work."""
+import sys
+import signal
+from contextmanager import contextmanager
+from wisent.examples.scripts.test_one_benchmark import test_benchmark
+class TimeoutError(Exception):
+    """Raised when a test times out."""
+    pass
+@contextmanager
+def timeout(seconds):
+    """Context manager for timing out operations."""
+    def signal_handler(signum, frame):
+        raise TimeoutError(f"Test timed out after {seconds} seconds")
+    # Set the signal handler and alarm
+    old_handler = signal.signal(signal.SIGALRM, signal_handler)
+    signal.alarm(seconds)
+    try:
+        yield
+    finally:
+        signal.alarm(0)
+        signal.signal(signal.SIGALRM, old_handler)

wisent/examples/scripts/test_contrastive_pairs_all_supported.py ADDED Viewed

@@ -0,0 +1,230 @@
+"""Test contrastive pairs generation for all supported benchmarks.
+Generates example pairs for each benchmark and shows how they look
+with different extraction strategies.
+"""
+import json
+import signal
+import sys
+from pathlib import Path
+class TimeoutError(Exception):
+    pass
+def timeout_handler(signum, frame):
+    raise TimeoutError("Timeout")
+class MockTokenizer:
+    """Mock tokenizer for previewing extraction strategies."""
+    def apply_chat_template(self, messages, tokenize=False, add_generation_prompt=False):
+        if len(messages) == 1:
+            return f"<|user|>\n{messages[0]['content']}\n<|assistant|>\n"
+        elif len(messages) == 2:
+            return f"<|user|>\n{messages[0]['content']}\n<|assistant|>\n{messages[1]['content']}<|end|>"
+        return str(messages)
+    def __call__(self, text, add_special_tokens=False):
+        return {"input_ids": text.split()}
+def format_pair_with_strategies(pair, tokenizer):
+    """Format a contrastive pair with all extraction strategies.
+    Returns dict with raw data and formatted versions for each strategy.
+    """
+    from wisent.core.activations.extraction_strategy import (
+        ExtractionStrategy,
+        build_extraction_texts,
+    )
+    result = {
+        "raw": {
+            "prompt": pair.prompt,
+            "positive": pair.positive_response.model_response,
+            "negative": pair.negative_response.model_response,
+        },
+        "strategies": {}
+    }
+    strategies = [
+        "chat_last",
+        "chat_mean",
+        "mc_balanced",
+        "completion_last",
+        "completion_mean",
+        "mc_completion",
+    ]
+    for strategy_name in strategies:
+        try:
+            strategy = ExtractionStrategy(strategy_name)
+            # Build texts for positive response
+            if strategy in (ExtractionStrategy.MC_BALANCED, ExtractionStrategy.MC_COMPLETION):
+                pos_full, pos_answer, pos_prompt = build_extraction_texts(
+                    strategy,
+                    pair.prompt,
+                    pair.positive_response.model_response,
+                    tokenizer,
+                    other_response=pair.negative_response.model_response,
+                    is_positive=True,
+                    auto_convert_strategy=False,
+                )
+                neg_full, neg_answer, neg_prompt = build_extraction_texts(
+                    strategy,
+                    pair.prompt,
+                    pair.negative_response.model_response,
+                    tokenizer,
+                    other_response=pair.positive_response.model_response,
+                    is_positive=False,
+                    auto_convert_strategy=False,
+                )
+            else:
+                pos_full, pos_answer, pos_prompt = build_extraction_texts(
+                    strategy,
+                    pair.prompt,
+                    pair.positive_response.model_response,
+                    tokenizer,
+                    auto_convert_strategy=False,
+                )
+                neg_full, neg_answer, neg_prompt = build_extraction_texts(
+                    strategy,
+                    pair.prompt,
+                    pair.negative_response.model_response,
+                    tokenizer,
+                    auto_convert_strategy=False,
+                )
+            result["strategies"][strategy_name] = {
+                "positive": {
+                    "full_text": pos_full,
+                    "answer_token": pos_answer,
+                    "prompt_only": pos_prompt,
+                },
+                "negative": {
+                    "full_text": neg_full,
+                    "answer_token": neg_answer,
+                    "prompt_only": neg_prompt,
+                }
+            }
+        except Exception as e:
+            result["strategies"][strategy_name] = {"error": str(e)}
+    return result
+def test_all_benchmarks(timeout_per_task: int = 30, limit: int = 2):
+    """Test contrastive pairs generation for all supported benchmarks.
+    Args:
+        timeout_per_task: Timeout in seconds per benchmark
+        limit: Number of pairs to generate per benchmark
+    Returns:
+        Dictionary with results including example pairs with all strategies
+    """
+    from wisent.core.contrastive_pairs.lm_eval_pairs.lm_task_pairs_generation import build_contrastive_pairs
+    from wisent.core.benchmark_registry import get_all_benchmarks, get_broken_tasks
+    all_benchmarks = get_all_benchmarks()
+    broken = set(get_broken_tasks())
+    # Filter out broken benchmarks
+    benchmarks = [b for b in all_benchmarks if b not in broken]
+    print(f"Testing {len(benchmarks)} benchmarks (excluded {len(broken)} broken)")
+    print(f"Timeout per task: {timeout_per_task}s, limit: {limit} pairs")
+    print()
+    tokenizer = MockTokenizer()
+    results = {
+        "total": len(benchmarks),
+        "ok": 0,
+        "failed": 0,
+        "timeout": 0,
+        "benchmarks": {}
+    }
+    for i, benchmark in enumerate(benchmarks):
+        signal.signal(signal.SIGALRM, timeout_handler)
+        signal.alarm(timeout_per_task)
+        try:
+            pairs = build_contrastive_pairs(benchmark, limit=limit)
+            signal.alarm(0)
+            if pairs and len(pairs) > 0:
+                results["ok"] += 1
+                # Format pairs with all strategies
+                formatted_pairs = []
+                for pair in pairs:
+                    formatted_pairs.append(format_pair_with_strategies(pair, tokenizer))
+                results["benchmarks"][benchmark] = {
+                    "status": "ok",
+                    "num_pairs": len(pairs),
+                    "pairs": formatted_pairs
+                }
+                print(f"[{i+1}/{len(benchmarks)}] OK: {benchmark} - {len(pairs)} pairs")
+            else:
+                results["failed"] += 1
+                results["benchmarks"][benchmark] = {"status": "no_pairs", "num_pairs": 0}
+                print(f"[{i+1}/{len(benchmarks)}] FAIL: {benchmark} - no pairs returned")
+        except TimeoutError:
+            signal.alarm(0)
+            results["timeout"] += 1
+            results["benchmarks"][benchmark] = {"status": "timeout"}
+            print(f"[{i+1}/{len(benchmarks)}] TIMEOUT: {benchmark}")
+        except Exception as e:
+            signal.alarm(0)
+            results["failed"] += 1
+            error_msg = str(e)[:200]
+            results["benchmarks"][benchmark] = {"status": "error", "error": error_msg}
+            print(f"[{i+1}/{len(benchmarks)}] ERROR: {benchmark} - {error_msg[:100]}")
+    # Summary
+    print()
+    print("=" * 60)
+    print("SUMMARY")
+    print("=" * 60)
+    print(f"Total tested: {results['total']}")
+    print(f"OK: {results['ok']}")
+    print(f"Failed: {results['failed']}")
+    print(f"Timeout: {results['timeout']}")
+    print(f"Success rate: {results['ok']/results['total']*100:.1f}%")
+    print("=" * 60)
+    return results
+def main():
+    import argparse
+    parser = argparse.ArgumentParser(description="Test contrastive pairs for all supported benchmarks")
+    parser.add_argument("--timeout", "-t", type=int, default=30, help="Timeout per task in seconds (default: 30)")
+    parser.add_argument("--limit", "-l", type=int, default=2, help="Number of pairs per benchmark (default: 2)")
+    parser.add_argument("--output", "-o", type=str, required=True, help="Output JSON file for results")
+    args = parser.parse_args()
+    results = test_all_benchmarks(timeout_per_task=args.timeout, limit=args.limit)
+    with open(args.output, 'w') as f:
+        json.dump(results, f, indent=2, ensure_ascii=False)
+    print(f"\nResults saved to: {args.output}")
+    # Exit with error code if any failures
+    sys.exit(0 if results["failed"] == 0 and results["timeout"] == 0 else 1)
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()

wisent/examples/scripts/test_nonsense_baseline.py ADDED Viewed

@@ -0,0 +1,261 @@
+"""
+Test whether our activation extraction gives meaningful signal
+by comparing real contrastive pairs vs nonsense random pairs.
+If nonsense pairs give similar Cohen's d / separation as real pairs,
+then our signal is meaningless.
+"""
+import argparse
+import random
+import string
+import torch
+import numpy as np
+from typing import List, Tuple
+from sklearn.svm import LinearSVC
+from sklearn.preprocessing import StandardScaler
+from wisent.core.models.wisent_model import WisentModel
+from wisent.core.activations.extraction_strategy import ExtractionStrategy
+from wisent.core.activations.activations_collector import ActivationCollector
+from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+from wisent.core.contrastive_pairs.core.response import PositiveResponse, NegativeResponse
+WORD_LIST = [  # vocabulary sampled by generate_nonsense_text to build word salad
+    "water", "sumo", "half", "purple", "elephant", "calculator", "yesterday",
+    "moon", "basket", "thinking", "telephone", "mountain", "running", "quickly",
+    "tomorrow", "happiness", "keyboard", "window", "dancing", "coffee", "planet",
+    "singing", "computer", "orange", "flying", "bicycle", "dream", "ocean",
+    "pencil", "laughing", "cloud", "table", "walking", "music", "river", "chair",
+    "jumping", "sun", "book", "swimming", "star", "door", "cooking", "tree",
+    "writing", "sky", "flower", "playing", "rain", "paper", "sleeping", "green",
+    "seven", "under", "before", "strange", "ancient", "modern", "simple"
+]
+def generate_nonsense_text(length: int = None) -> str:
+    """Generate word salad - real words, no meaning."""
+    if length is None:
+        length = random.randint(3, 10)
+    words = random.choices(WORD_LIST, k=length)
+    return ' '.join(words)
+def generate_nonsense_pairs(n: int = 50) -> List[ContrastivePair]:
+    """Generate pairs with random nonsense text."""
+    pairs = []
+    for i in range(n):
+        prompt = generate_nonsense_text(10)
+        positive = generate_nonsense_text(15)
+        negative = generate_nonsense_text(15)
+        pairs.append(ContrastivePair(
+            prompt=prompt,
+            positive_response=PositiveResponse(model_response=positive),
+            negative_response=NegativeResponse(model_response=negative),
+        ))
+    return pairs
+def generate_real_pairs(n: int = 50) -> List[ContrastivePair]:
+    """Generate real contrastive pairs with semantic meaning."""
+    templates = [
+        ("Is the Earth flat?", "No, the Earth is approximately spherical.", "Yes, the Earth is flat."),
+        ("What is 2+2?", "4", "5"),
+        ("Is water wet?", "Yes, water is wet.", "No, water is not wet."),
+        ("What color is the sky?", "Blue", "Green"),
+        ("Is the sun a star?", "Yes, the sun is a star.", "No, the sun is a planet."),
+        ("What is the capital of France?", "Paris", "London"),
+        ("Is Python a programming language?", "Yes, Python is a programming language.", "No, Python is a snake."),
+        ("What is 10 * 5?", "50", "100"),
+        ("Is ice cold?", "Yes, ice is cold.", "No, ice is hot."),
+        ("What year did WW2 end?", "1945", "1939"),
+    ]
+    pairs = []
+    for i in range(n):
+        template = templates[i % len(templates)]
+        # Add some variation
+        variation = f" (instance {i})"
+        pairs.append(ContrastivePair(
+            prompt=template[0] + variation,
+            positive_response=PositiveResponse(model_response=template[1]),
+            negative_response=NegativeResponse(model_response=template[2]),
+        ))
+    return pairs
+def compute_cohens_d(pos_acts: np.ndarray, neg_acts: np.ndarray) -> float:
+    """Compute Cohen's d effect size."""
+    pos_mean = np.mean(pos_acts, axis=0)
+    neg_mean = np.mean(neg_acts, axis=0)
+    pos_var = np.var(pos_acts, axis=0)
+    neg_var = np.var(neg_acts, axis=0)
+    n1, n2 = len(pos_acts), len(neg_acts)
+    pooled_std = np.sqrt(((n1 - 1) * pos_var + (n2 - 1) * neg_var) / (n1 + n2 - 2))
+    pooled_std = np.mean(pooled_std)  # average across dimensions
+    if pooled_std < 1e-10:
+        return 0.0
+    diff = np.linalg.norm(pos_mean - neg_mean)
+    return diff / pooled_std
+def compute_linear_separability(pos_acts: np.ndarray, neg_acts: np.ndarray) -> float:
+    """Compute linear separability score using SVM."""
+    X = np.vstack([pos_acts, neg_acts])
+    y = np.array([1] * len(pos_acts) + [0] * len(neg_acts))
+    scaler = StandardScaler()
+    X_scaled = scaler.fit_transform(X)
+    svm = LinearSVC(max_iter=1000, dual=False)
+    svm.fit(X_scaled, y)
+    return svm.score(X_scaled, y)
+def collect_activations(
+    model: WisentModel,
+    pairs: List[ContrastivePair],
+    strategy: ExtractionStrategy,
+    layer: int,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Collect activations for positive and negative responses."""
+    collector = ActivationCollector(model)
+    pos_acts = []
+    neg_acts = []
+    for pair in pairs:
+        try:
+            # Collect both positive and negative using the collect method
+            result = collector.collect(pair, strategy=strategy)
+            # result is a ContrastivePair with activations
+            pos_layer_acts = result.positive_response.layers_activations
+            neg_layer_acts = result.negative_response.layers_activations
+            # Extract layer (keys are strings like '1', '2', etc, and 1-indexed)
+            layer_key = str(layer + 1)  # Convert to 1-indexed string
+            if pos_layer_acts is not None and neg_layer_acts is not None:
+                if layer_key in pos_layer_acts and layer_key in neg_layer_acts:
+                    pos_acts.append(pos_layer_acts[layer_key].cpu().numpy())
+                    neg_acts.append(neg_layer_acts[layer_key].cpu().numpy())
+        except Exception as e:
+            print(f"Error collecting pair: {e}")
+            continue
+    return np.array(pos_acts), np.array(neg_acts)
+def main():
+    parser = argparse.ArgumentParser(description="Test nonsense baseline vs real pairs")
+    parser.add_argument("--model", type=str, default="meta-llama/Llama-3.2-1B-Instruct")
+    parser.add_argument("--n-pairs", type=int, default=50)
+    parser.add_argument("--strategies", type=str, nargs="+",
+                        default=["chat_mean", "chat_max_norm", "chat_last"])
+    parser.add_argument("--layers", type=int, nargs="+", default=None,
+                        help="Layers to test. Default: [0, 25%, 50%, 75%, last]")
+    args = parser.parse_args()
+    print(f"Loading model: {args.model}")
+    model = WisentModel(args.model)
+    num_layers = model.num_layers
+    # Default layers if not specified
+    if args.layers is None:
+        args.layers = [
+            0,
+            num_layers // 4,
+            num_layers // 2,
+            3 * num_layers // 4,
+            num_layers - 1,
+        ]
+    print(f"Model has {num_layers} layers")
+    print(f"Testing layers: {args.layers}")
+    print(f"Testing strategies: {args.strategies}")
+    print(f"Pairs per condition: {args.n_pairs}")
+    print()
+    # Generate pairs
+    print("Generating pairs...")
+    real_pairs = generate_real_pairs(args.n_pairs)
+    nonsense_pairs = generate_nonsense_pairs(args.n_pairs)
+    results = []
+    for strategy_name in args.strategies:
+        strategy = ExtractionStrategy(strategy_name)
+        for layer in args.layers:
+            print(f"\n{'='*60}")
+            print(f"Strategy: {strategy_name}, Layer: {layer} ({100*layer/num_layers:.0f}%)")
+            print('='*60)
+            # Real pairs
+            print("  Collecting REAL pairs...")
+            real_pos, real_neg = collect_activations(model, real_pairs, strategy, layer)
+            if len(real_pos) < 10 or len(real_neg) < 10:
+                print("  WARNING: Too few activations collected for real pairs")
+                continue
+            real_cohens_d = compute_cohens_d(real_pos, real_neg)
+            real_linear = compute_linear_separability(real_pos, real_neg)
+            # Nonsense pairs
+            print("  Collecting NONSENSE pairs...")
+            nonsense_pos, nonsense_neg = collect_activations(model, nonsense_pairs, strategy, layer)
+            if len(nonsense_pos) < 10 or len(nonsense_neg) < 10:
+                print("  WARNING: Too few activations collected for nonsense pairs")
+                continue
+            nonsense_cohens_d = compute_cohens_d(nonsense_pos, nonsense_neg)
+            nonsense_linear = compute_linear_separability(nonsense_pos, nonsense_neg)
+            # Compare
+            print(f"\n  REAL pairs:     Cohen's d = {real_cohens_d:8.2f}, Linear = {real_linear:.3f}")
+            print(f"  NONSENSE pairs: Cohen's d = {nonsense_cohens_d:8.2f}, Linear = {nonsense_linear:.3f}")
+            print(f"  RATIO (real/nonsense): Cohen's d = {real_cohens_d/max(nonsense_cohens_d, 0.01):.2f}x")
+            if real_cohens_d > nonsense_cohens_d * 2:
+                verdict = "SIGNAL IS REAL"
+            elif real_cohens_d > nonsense_cohens_d * 1.2:
+                verdict = "WEAK SIGNAL"
+            else:
+                verdict = "NO SIGNAL (nonsense is similar!)"
+            print(f"  VERDICT: {verdict}")
+            results.append({
+                "strategy": strategy_name,
+                "layer": layer,
+                "layer_pct": 100 * layer / num_layers,
+                "real_cohens_d": real_cohens_d,
+                "real_linear": real_linear,
+                "nonsense_cohens_d": nonsense_cohens_d,
+                "nonsense_linear": nonsense_linear,
+                "ratio": real_cohens_d / max(nonsense_cohens_d, 0.01),
+                "verdict": verdict,
+            })
+    # Summary
+    print("\n" + "="*80)
+    print("SUMMARY")
+    print("="*80)
+    print(f"{'Strategy':<15} {'Layer':<10} {'Real d':<10} {'Nonsense d':<12} {'Ratio':<8} {'Verdict'}")
+    print("-"*80)
+    for r in results:
+        print(f"{r['strategy']:<15} {r['layer']:>3} ({r['layer_pct']:>3.0f}%)  "
+              f"{r['real_cohens_d']:>8.2f}   {r['nonsense_cohens_d']:>10.2f}   "
+              f"{r['ratio']:>6.2f}x  {r['verdict']}")
+if __name__ == "__main__":  # run the comparison only when executed as a script
+    main()

wisent 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl

wisent 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl