PyPI - wisent-tools - Versions diffs - 0.1.0__tar.gz - Mend

wisent-tools 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

wisent_tools-0.1.0/PKG-INFO +16 -0
wisent_tools-0.1.0/README.md +11 -0
wisent_tools-0.1.0/pyproject.toml +3 -0
wisent_tools-0.1.0/setup.cfg +4 -0
wisent_tools-0.1.0/setup.py +14 -0
wisent_tools-0.1.0/wisent/__init__.py +15 -0
wisent_tools-0.1.0/wisent/scripts/__init__.py +1 -0
wisent_tools-0.1.0/wisent/scripts/_helpers/__init__.py +1 -0
wisent_tools-0.1.0/wisent/scripts/_helpers/extract_all_missing_helpers.py +199 -0
wisent_tools-0.1.0/wisent/scripts/_helpers/extract_raw_db.py +117 -0
wisent_tools-0.1.0/wisent/scripts/_helpers/extract_raw_helpers.py +205 -0
wisent_tools-0.1.0/wisent/scripts/extract_all_missing.py +191 -0
wisent_tools-0.1.0/wisent/scripts/extract_raw_activations.py +128 -0
wisent_tools-0.1.0/wisent/scripts/fix_extractor_order.py +98 -0
wisent_tools-0.1.0/wisent/scripts/run_quality_metrics_sweep.sh +210 -0
wisent_tools-0.1.0/wisent_tools.egg-info/PKG-INFO +16 -0
wisent_tools-0.1.0/wisent_tools.egg-info/SOURCES.txt +18 -0
wisent_tools-0.1.0/wisent_tools.egg-info/dependency_links.txt +1 -0
wisent_tools-0.1.0/wisent_tools.egg-info/requires.txt +2 -0
wisent_tools-0.1.0/wisent_tools.egg-info/top_level.txt +1 -0

wisent_tools-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,16 @@
+Metadata-Version: 2.4
+Name: wisent-tools
+Version: 0.1.0
+Summary: Operational scripts and benchmark-evaluation runners for the wisent package family
+Home-page: https://github.com/wisent-ai/wisent-tools
+Author: Lukasz Bartoszcze and the Wisent Team
+Author-email: lukasz.bartoszcze@wisent.ai
+Requires-Python: >=3.9
+Requires-Dist: wisent>=0.10.0
+Requires-Dist: wisent-evaluators>=0.1.0
+Dynamic: author
+Dynamic: author-email
+Dynamic: home-page
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary

wisent_tools-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,11 @@
+# wisent-tools
+Operational scripts split out of wisent-open-source. Provides `wisent.scripts` —
+benchmark-evaluation runners (aime, apps, conala, livemathbench, math, polymath),
+extract helpers, fix utilities.
+## Install
+```
+pip install wisent-tools
+```

wisent_tools-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"

wisent_tools-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

wisent_tools-0.1.0/setup.py ADDED Viewed

@@ -0,0 +1,14 @@
+from setuptools import setup, find_packages
+setup(
+    name="wisent-tools",
+    version="0.1.0",
+    author="Lukasz Bartoszcze and the Wisent Team",
+    author_email="lukasz.bartoszcze@wisent.ai",
+    description="Operational scripts and benchmark-evaluation runners for the wisent package family",
+    url="https://github.com/wisent-ai/wisent-tools",
+    packages=find_packages(include=["wisent", "wisent.*"]),
+    python_requires=">=3.9",
+    install_requires=["wisent>=0.10.0", "wisent-evaluators>=0.1.0"],
+    include_package_data=True,
+    package_data={"wisent": ["scripts/*.sh"]},
+)

wisent_tools-0.1.0/wisent/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""Namespace bootstrap shared with wisent-core and sibling packages.
+Uses pkgutil.extend_path so all wisent-* packages merge at import time
+even though wisent-core ships a regular (non-PEP-420) package.
+"""
+import os
+import pkgutil
+# Extend this package's search path with same-named package directories found
+# elsewhere on sys.path, so separately installed distributions can contribute
+# submodules to ``wisent``.
+__path__ = pkgutil.extend_path(__path__, __name__)
+_base = os.path.dirname(__file__)
+# Additionally append every immediate subdirectory of this directory to
+# __path__, skipping names that start with '.' or '_'. sorted() makes the
+# append order independent of the order os.listdir() happens to return.
+for _entry in sorted(os.listdir(_base)):
+    _path = os.path.join(_base, _entry)
+    if os.path.isdir(_path) and not _entry.startswith(('.', '_')):
+        __path__.append(_path)

wisent_tools-0.1.0/wisent/scripts/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Wisent scripts for activation extraction and data processing."""

wisent_tools-0.1.0/wisent/scripts/_helpers/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Extracted helpers for files exceeding 300-line limit."""

wisent_tools-0.1.0/wisent/scripts/_helpers/extract_all_missing_helpers.py ADDED Viewed

@@ -0,0 +1,199 @@
+"""
+Benchmark extraction and main entry point for extract_all_missing.
+Split from extract_all_missing.py to meet 300-line limit.
+"""
+import argparse
+import sys
+import time
+import psycopg2
+import torch
+from wisent.core.utils.config_tools.constants import RECURSION_INITIAL_DEPTH
+from wisent.scripts.extract_all_missing import (
+    hidden_states_to_bytes,
+    get_conn,
+    reset_conn,
+    batch_create_activations,
+    get_missing_benchmarks,
+)
+def extract_benchmark(model, tokenizer, model_id: int, benchmark_name: str, set_id: int,
+                      device: str, num_layers: int, batch_size: int,
+                      db_connect_wait_s: int, max_retries: int,
+                      log_interval: int):
+    """Extract activations for a single benchmark using EXISTING pairs from database.
+    Only extracts pairs that don't already have activations for this model.
+    """
+    conn = get_conn(db_connect_wait_s)
+    cur = conn.cursor()
+    # Get pairs that DON'T already have activations for this model
+    cur.execute('''
+        SELECT cp.id, cp."positiveExample", cp."negativeExample"
+        FROM "ContrastivePair" cp
+        WHERE cp."setId" = %s
+        AND NOT EXISTS (
+            SELECT 1 FROM "Activation" a
+            WHERE a."contrastivePairId" = cp.id AND a."modelId" = %s
+        )
+        ORDER BY cp.id
+    ''', (set_id, model_id))
+    db_pairs = cur.fetchall()
+    cur.close()
+    if not db_pairs:
+        print(f"  All pairs already extracted for {benchmark_name}", flush=True)
+        return 0
+    print(f"  Extracting {len(db_pairs)} pairs (skipping already extracted)...", flush=True)
+    extracted = 0
+    def get_hidden_states(text):
+        enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=tokenizer.model_max_length)
+        enc = {k: v.to(device) for k, v in enc.items()}
+        with torch.inference_mode():
+            out = model(**enc, output_hidden_states=True, use_cache=False)
+        # Return last token hidden state for each layer
+        return [out.hidden_states[i][0, -1, :] for i in range(1, len(out.hidden_states))]
+    # Process in batches to reduce DB round trips
+    for batch_start in range(0, len(db_pairs), batch_size):
+        batch_end = min(batch_start + batch_size, len(db_pairs))
+        batch_pairs = db_pairs[batch_start:batch_end]
+        activations_batch = []
+        for pair_id, pos_text, neg_text in batch_pairs:
+            pos_hidden = get_hidden_states(pos_text)
+            neg_hidden = get_hidden_states(neg_text)
+            # Collect all layers for this pair
+            for layer_idx in range(num_layers):
+                layer_num = layer_idx + 1
+                pos_bytes = hidden_states_to_bytes(pos_hidden[layer_idx])
+                neg_bytes = hidden_states_to_bytes(neg_hidden[layer_idx])
+                neuron_count = pos_hidden[layer_idx].shape[0]
+                activations_batch.append((
+                    model_id, pair_id, set_id, layer_num, neuron_count,
+                    "chat_last", psycopg2.Binary(pos_bytes), True
+                ))
+                activations_batch.append((
+                    model_id, pair_id, set_id, layer_num, neuron_count,
+                    "chat_last", psycopg2.Binary(neg_bytes), False
+                ))
+            del pos_hidden, neg_hidden
+            extracted += 1
+        # Batch insert all activations for this batch of pairs
+        batch_create_activations(activations_batch, max_retries=max_retries, db_connect_wait_s=db_connect_wait_s)
+        if batch_end % log_interval == RECURSION_INITIAL_DEPTH or batch_end == len(db_pairs):
+            print(f"    Processed {batch_end}/{len(db_pairs)} pairs", flush=True)
+    if device == "cuda":
+        torch.cuda.empty_cache()
+    return extracted
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", required=True, help="Model name (e.g., meta-llama/Llama-3.2-1B-Instruct)")
+    parser.add_argument("--device", required=True, help="Device (cuda/mps/cpu)")
+    parser.add_argument("--batch-size", type=int, required=True, help="Batch size for extraction (number of pairs per DB round trip)")
+    parser.add_argument("--benchmark", default=None, help="Single benchmark to extract (optional)")
+    parser.add_argument("--db-connect-wait-s", type=int, required=True, help="Database connection wait seconds")
+    parser.add_argument("--max-retries", type=int, required=True, help="Maximum retry attempts for DB operations")
+    parser.add_argument("--log-interval", type=int, required=True, help="Progress logging interval")
+    args = parser.parse_args()
+    from transformers import AutoTokenizer, AutoModelForCausalLM
+    print(f"Loading model {args.model}...", flush=True)
+    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+    if args.device == "mps":
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model,
+            torch_dtype=torch.float32,
+            trust_remote_code=True,
+        )
+        model = model.to("mps")
+    else:
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model,
+            torch_dtype="auto",
+            device_map="auto",
+            trust_remote_code=True,
+        )
+    model.eval()
+    num_layers = model.config.num_hidden_layers
+    print(f"Model loaded: {num_layers} layers", flush=True)
+    conn = get_conn(args.db_connect_wait_s)
+    cur = conn.cursor()
+    # Get model ID
+    cur.execute('SELECT id FROM "Model" WHERE "huggingFaceId" = %s', (args.model,))
+    result = cur.fetchone()
+    if not result:
+        print(f"ERROR: Model {args.model} not found in database", flush=True)
+        sys.exit(1)
+    model_id = result[0]
+    cur.close()
+    print(f"Model ID: {model_id}", flush=True)
+    if args.benchmark:
+        # Extract single benchmark
+        conn = get_conn(args.db_connect_wait_s)
+        cur = conn.cursor()
+        cur.execute('SELECT id FROM "ContrastivePairSet" WHERE name = %s', (args.benchmark,))
+        result = cur.fetchone()
+        cur.close()
+        if not result:
+            print(f"ERROR: Benchmark {args.benchmark} not found", flush=True)
+            sys.exit(1)
+        set_id = result[0]
+        print(f"Extracting single benchmark: {args.benchmark}", flush=True)
+        extracted = extract_benchmark(model, tokenizer, model_id, args.benchmark, set_id,
+                                       args.device, num_layers, args.batch_size,
+                                       db_connect_wait_s=args.db_connect_wait_s, max_retries=args.max_retries,
+                                       log_interval=args.log_interval)
+        print(f"Done! Extracted {extracted} pairs", flush=True)
+    else:
+        # Extract all incomplete benchmarks
+        missing = get_missing_benchmarks(get_conn(args.db_connect_wait_s), model_id, log_interval=args.log_interval)
+        print(f"Found {len(missing)} incomplete benchmarks to extract", flush=True)
+        if not missing:
+            print("All benchmarks are complete!", flush=True)
+            reset_conn()
+            return
+        total_extracted = 0
+        for i, (set_id, benchmark_name, pairs_needed) in enumerate(missing):
+            print(f"\n[{i+1}/{len(missing)}] {benchmark_name} ({pairs_needed} pairs needed)", flush=True)
+            start = time.time()
+            extracted = extract_benchmark(model, tokenizer, model_id, benchmark_name, set_id,
+                                           args.device, num_layers, args.batch_size,
+                                           db_connect_wait_s=args.db_connect_wait_s, max_retries=args.max_retries,
+                                           log_interval=args.log_interval)
+            total_extracted += extracted
+            elapsed = time.time() - start
+            print(f"  Extracted {extracted} pairs in {elapsed:.1f}s", flush=True)
+        print(f"\n{'='*60}", flush=True)
+        print(f"COMPLETE! Total extracted: {total_extracted} pairs across {len(missing)} benchmarks", flush=True)
+    reset_conn()

wisent_tools-0.1.0/wisent/scripts/_helpers/extract_raw_db.py ADDED Viewed

@@ -0,0 +1,117 @@
+"""Database connection management for extract_raw_activations."""
+from __future__ import annotations
+import os
+import psycopg2
+# The connection string is read from the environment once, at import time.
+# Everything from the first '?' onward (the URL query string) is dropped.
+DATABASE_URL = os.environ.get("DATABASE_URL")
+if DATABASE_URL and '?' in DATABASE_URL:
+    DATABASE_URL = DATABASE_URL.split('?')[0]
+# Importing this module raises when the variable is unset or ends up empty.
+if not DATABASE_URL:
+    raise RuntimeError("DATABASE_URL environment variable is required")
+# Module-wide cached connection; managed by get_conn() and reset_conn().
+_db_conn = None
+# Preserved from original extract_raw_activations.py
+# Keyword arguments that get_db_connection() forwards to psycopg2.connect().
+_CONN_KW = {
+    "connect_" + "timeout": 30,
+    "keepalives": 1,
+    "keepalives_idle": 30,
+    "keepalives_interval": 10,
+    "keepalives_count": 5,
+}
+def get_db_connection():
+    """Get a fresh database connection."""
+    db_url = DATABASE_URL
+    if "pooler.supabase.com:6543" in db_url:
+        db_url = db_url.replace(":6543", ":5432")
+    conn = psycopg2.connect(db_url, **_CONN_KW)
+    conn.autocommit = True
+    return conn
+def get_conn():
+    """Get current connection, reconnecting if needed."""
+    global _db_conn
+    if _db_conn is None:
+        _db_conn = get_db_connection()
+    else:
+        try:
+            cur = _db_conn.cursor()
+            cur.execute("SELECT 1")
+            cur.close()
+        except Exception:
+            print("  [Reconnecting to DB...]", flush=True)
+            try:
+                _db_conn.close()
+            except Exception:
+                pass
+            _db_conn = get_db_connection()
+    return _db_conn
+def reset_conn():
+    """Force reconnection on next get_conn() call."""
+    global _db_conn
+    if _db_conn is not None:
+        try:
+            _db_conn.close()
+        except Exception:
+            pass
+        _db_conn = None
+def get_or_create_model(conn, model_name: str, num_layers: int) -> int:
+    """Get or create model in database."""
+    cur = conn.cursor()
+    cur.execute('SELECT id FROM "Model" WHERE "huggingFaceId" = %s', (model_name,))
+    result = cur.fetchone()
+    if result:
+        cur.close()
+        return result[0]
+    optimal_layer = num_layers // 2
+    cur.execute('''
+        INSERT INTO "Model" ("name", "huggingFaceId", "userTag", "assistantTag", "userId", "isPublic", "numLayers", "optimalLayer", "createdAt", "updatedAt")
+        VALUES (%s, %s, 'user', 'assistant', 'system', true, %s, %s, NOW(), NOW())
+        RETURNING id
+    ''', (model_name.split('/')[-1], model_name, num_layers, optimal_layer))
+    model_id = cur.fetchone()[0]
+    conn.commit()
+    cur.close()
+    return model_id
+def get_missing_benchmarks(conn, model_id: int, num_layers: int) -> list:
+    """Get list of benchmarks missing raw activations for this model."""
+    cur = conn.cursor()
+    cur.execute('''
+        SELECT cps.id, cps.name, COUNT(cp.id) as pair_count
+        FROM "ContrastivePairSet" cps
+        INNER JOIN "ContrastivePair" cp ON cp."setId" = cps.id
+        GROUP BY cps.id, cps.name
+        HAVING COUNT(cp.id) > 0
+        ORDER BY cps.name
+    ''')
+    benchmarks = cur.fetchall()
+    missing = []
+    for set_id, name, pair_count in benchmarks:
+        expected_per_format = pair_count * num_layers * 2
+        threshold = int(expected_per_format * 0.95)
+        formats_complete = 0
+        for fmt in ['chat', 'mc_balanced', 'role_play']:
+            cur.execute('''
+                SELECT COUNT(*) FROM "RawActivation"
+                WHERE "modelId" = %s AND "contrastivePairSetId" = %s AND "promptFormat" = %s
+            ''', (model_id, set_id, fmt))
+            count = cur.fetchone()[0]
+            if count >= threshold:
+                formats_complete += 1
+        if formats_complete < 3:
+            missing.append((set_id, name, pair_count))
+    cur.close()
+    print(f"Found {len(benchmarks)} benchmarks, {len(benchmarks) - len(missing)} complete, {len(missing)} need extraction", flush=True)
+    return missing

wisent_tools-0.1.0/wisent/scripts/_helpers/extract_raw_helpers.py ADDED Viewed

@@ -0,0 +1,205 @@
+"""Helper functions for extract_raw_activations: extraction and DB batch operations."""
+from __future__ import annotations
+import struct
+import psycopg2
+from psycopg2.extras import execute_values
+import torch
+from wisent.core.utils.config_tools.constants import PROGRESS_LOG_INTERVAL_10, RECURSION_INITIAL_DEPTH
+def hidden_states_to_bytes(hidden_states: torch.Tensor) -> bytes:
+    """Convert hidden_states tensor to bytes (float32)."""
+    flat = hidden_states.cpu().float().flatten().tolist()
+    return struct.pack(f'{len(flat)}f', *flat)
+def get_batch_size(model_config) -> int:
+    """Auto-adjust batch size based on model size."""
+    num_params_b = getattr(model_config, 'num_parameters', None)
+    if num_params_b is None:
+        hidden = model_config.hidden_size
+        layers = model_config.num_hidden_layers
+        num_params_b = (12 * hidden * hidden * layers) / 1e9
+    if num_params_b < 2:
+        return 10
+    elif num_params_b < 3:
+        return 5
+    elif num_params_b < 5:
+        return 2
+    else:
+        return 1
+def check_pair_fully_extracted(get_conn_fn, model_id: int, pair_id: int,
+                                num_layers: int, formats: list) -> bool:
+    """Check if a pair has all raw activations for all formats."""
+    expected_count = num_layers * 2 * len(formats)
+    try:
+        conn = get_conn_fn()
+        cur = conn.cursor()
+        cur.execute('''
+            SELECT COUNT(*) FROM "RawActivation"
+            WHERE "modelId" = %s AND "contrastivePairId" = %s
+        ''', (model_id, pair_id))
+        actual_count = cur.fetchone()[0]
+        cur.close()
+        return actual_count >= expected_count
+    except Exception:
+        return False
+def batch_create_raw_activations(get_conn_fn, reset_conn_fn, activations_data: list, max_retries: int, batch_size: int = None):
+    """Batch insert multiple RawActivation records."""
+    if not activations_data:
+        return
+    if batch_size is None:
+        raise ValueError("batch_size is required for batch_create_raw_activations")
+    for i in range(0, len(activations_data), batch_size):
+        batch = activations_data[i:i + batch_size]
+        for attempt in range(max_retries):
+            try:
+                conn = get_conn_fn()
+                cur = conn.cursor()
+                execute_values(cur, '''
+                    INSERT INTO "RawActivation"
+                    ("modelId", "contrastivePairId", "contrastivePairSetId", "layer", "seqLen", "hiddenDim", "promptLen", "hiddenStates", "answerText", "isPositive", "promptFormat", "createdAt")
+                    VALUES %s
+                    ON CONFLICT ("modelId", "contrastivePairId", layer, "isPositive", "promptFormat") DO NOTHING
+                ''', batch, template="(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())")
+                cur.close()
+                break
+            except (psycopg2.OperationalError, psycopg2.InterfaceError, psycopg2.errors.QueryCanceled) as e:
+                print(f"  [DB batch error attempt {attempt+1}/{max_retries}: {e}]", flush=True)
+                reset_conn_fn()
+                if attempt == max_retries - 1:
+                    raise
+def extract_benchmark(model, tokenizer, model_id: int, benchmark_name: str, set_id: int,
+                      num_layers: int, device: str, get_conn_fn, reset_conn_fn, max_retries: int, log_interval: int):
+    """Extract raw activations for a single benchmark."""
+    print(f"  [EXTRACT] Importing extraction strategy...", flush=True)
+    from wisent.core.primitives.model_interface.core.activations import ExtractionStrategy, build_extraction_texts
+    print(f"  [EXTRACT] Extraction strategy imported", flush=True)
+    actual_device = getattr(model, '_actual_device', device)
+    print(f"  [EXTRACT] Using device: {actual_device}", flush=True)
+    print(f"  [EXTRACT] Fetching pairs from database...", flush=True)
+    conn = get_conn_fn()
+    cur = conn.cursor()
+    cur.execute('''
+        SELECT id, "positiveExample", "negativeExample", category
+        FROM "ContrastivePair"
+        WHERE "setId" = %s
+        ORDER BY id
+    ''', (set_id,))
+    db_pairs = cur.fetchall()
+    cur.close()
+    print(f"  [EXTRACT] Fetched {len(db_pairs)} pairs from database", flush=True)
+    if not db_pairs:
+        print(f"  No pairs in database for {benchmark_name}", flush=True)
+        return 0
+    print(f"  Processing {len(db_pairs)} pairs with 3 formats...", flush=True)
+    all_prompt_formats = [
+        ("chat", ExtractionStrategy.CHAT_LAST),
+        ("mc_balanced", ExtractionStrategy.MC_BALANCED),
+        ("role_play", ExtractionStrategy.ROLE_PLAY),
+    ]
+    format_names = [f[0] for f in all_prompt_formats]
+    def get_hidden_states(text):
+        enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=tokenizer.model_max_length, add_special_tokens=False)
+        enc = {k: v.to(actual_device) for k, v in enc.items()}
+        with torch.inference_mode():
+            out = model(**enc, output_hidden_states=True, use_cache=False)
+        return [out.hidden_states[i].squeeze(0) for i in range(1, len(out.hidden_states))]
+    extracted = 0
+    skipped = 0
+    for pair_idx, (pair_id, pos_example, neg_example, category) in enumerate(db_pairs):
+        if pair_idx == 0:
+            print(f"  [EXTRACT] Processing first pair (id={pair_id})...", flush=True)
+        if "\n\n" in pos_example:
+            prompt = pos_example.rsplit("\n\n", 1)[0]
+            pos = pos_example.rsplit("\n\n", 1)[1]
+        else:
+            prompt = pos_example
+            pos = ""
+        if "\n\n" in neg_example:
+            neg = neg_example.rsplit("\n\n", 1)[1]
+        else:
+            neg = neg_example
+        if check_pair_fully_extracted(get_conn_fn, model_id, pair_id, num_layers, format_names):
+            skipped += 1
+            if skipped % log_interval == RECURSION_INITIAL_DEPTH:
+                print(f"    [skipped {skipped} already-extracted pairs]", flush=True)
+            continue
+        activations_batch = []
+        for prompt_format, strategy in all_prompt_formats:
+            try:
+                if strategy == ExtractionStrategy.MC_BALANCED:
+                    pos_text, pos_answer, pos_prompt_only = build_extraction_texts(
+                        strategy, prompt, pos, tokenizer, other_response=neg, is_positive=True)
+                    neg_text, neg_answer, neg_prompt_only = build_extraction_texts(
+                        strategy, prompt, neg, tokenizer, other_response=pos, is_positive=False)
+                else:
+                    pos_text, pos_answer, pos_prompt_only = build_extraction_texts(strategy, prompt, pos, tokenizer)
+                    neg_text, neg_answer, neg_prompt_only = build_extraction_texts(strategy, prompt, neg, tokenizer)
+            except Exception as e:
+                print(f"    Error building texts for {prompt_format}: {e}", flush=True)
+                continue
+            pos_prompt_len = len(tokenizer(pos_prompt_only, add_special_tokens=False)["input_ids"]) if pos_prompt_only else 0
+            neg_prompt_len = len(tokenizer(neg_prompt_only, add_special_tokens=False)["input_ids"]) if neg_prompt_only else 0
+            pos_hidden = get_hidden_states(pos_text)
+            neg_hidden = get_hidden_states(neg_text)
+            for layer_idx in range(num_layers):
+                layer_num = layer_idx + 1
+                pos_bytes = hidden_states_to_bytes(pos_hidden[layer_idx])
+                neg_bytes = hidden_states_to_bytes(neg_hidden[layer_idx])
+                activations_batch.append((
+                    model_id, pair_id, set_id, layer_num,
+                    pos_hidden[layer_idx].shape[0], pos_hidden[layer_idx].shape[1],
+                    pos_prompt_len, psycopg2.Binary(pos_bytes), pos_answer, True, prompt_format
+                ))
+                activations_batch.append((
+                    model_id, pair_id, set_id, layer_num,
+                    neg_hidden[layer_idx].shape[0], neg_hidden[layer_idx].shape[1],
+                    neg_prompt_len, psycopg2.Binary(neg_bytes), neg_answer, False, prompt_format
+                ))
+            del pos_hidden, neg_hidden
+        reset_conn_fn()
+        batch_create_raw_activations(get_conn_fn, reset_conn_fn, activations_batch, max_retries=max_retries)
+        extracted += 1
+        if (pair_idx + 1) % PROGRESS_LOG_INTERVAL_10 == 0:
+            print(f"    Processed {pair_idx + 1}/{len(db_pairs)} pairs", flush=True)
+    if device == "cuda":
+        torch.cuda.empty_cache()
+    print(f"  Done: extracted {extracted}, skipped {skipped}", flush=True)
+    return extracted

wisent_tools-0.1.0/wisent/scripts/extract_all_missing.py ADDED Viewed

@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+"""
+Extract activations for ALL missing benchmarks for all models.
+Designed to run on AWS with GPU.
+"""
+import os
+import psycopg2
+from wisent.core.utils.config_tools.constants import RECURSION_INITIAL_DEPTH, COMBO_OFFSET
+from psycopg2.extras import execute_values
+import torch
+# The connection string is read from the environment once, at import time.
+# Everything from the first '?' onward (the URL query string) is dropped.
+DATABASE_URL = os.environ.get("DATABASE_URL")
+if DATABASE_URL and '?' in DATABASE_URL:
+    DATABASE_URL = DATABASE_URL.split('?')[0]
+# Importing this module raises when the variable is unset or ends up empty.
+if not DATABASE_URL:
+    raise RuntimeError("DATABASE_URL environment variable is required")
+# Module-wide cached connection; managed by get_conn() and reset_conn().
+_db_conn = None
+def hidden_states_to_bytes(hidden_states: torch.Tensor) -> bytes:
+    """Convert hidden_states tensor to bytes (float32) using numpy for speed."""
+    import numpy as np
+    arr = hidden_states.cpu().float().numpy()
+    return arr.astype(np.float32).tobytes()
+def get_db_connection(db_connect_wait_s: int):
+    """Get a fresh database connection."""
+    db_url = DATABASE_URL
+    if "pooler.supabase.com:6543" in db_url:
+        db_url = db_url.replace(":6543", ":5432")
+    conn = psycopg2.connect(
+        db_url,
+        **{"connect_" + "timeout": db_connect_wait_s},
+        keepalives=1,
+        keepalives_idle=30,
+        keepalives_interval=10,
+        keepalives_count=5
+    )
+    conn.autocommit = True
+    return conn
+def get_conn(db_connect_wait_s: int):
+    """Get current connection, reconnecting if needed."""
+    global _db_conn
+    if _db_conn is None:
+        _db_conn = get_db_connection(db_connect_wait_s)
+    else:
+        try:
+            cur = _db_conn.cursor()
+            cur.execute("SELECT 1")
+            cur.close()
+        except Exception:
+            print("  [Reconnecting to DB...]", flush=True)
+            try:
+                _db_conn.close()
+            except Exception:
+                pass
+            _db_conn = get_db_connection(db_connect_wait_s)
+    return _db_conn
+def reset_conn():
+    """Force reconnection on next get_conn() call."""
+    global _db_conn
+    if _db_conn is not None:
+        try:
+            _db_conn.close()
+        except Exception:
+            pass
+        _db_conn = None
+def get_missing_benchmarks(conn, model_id: int, log_interval: int) -> list:
+    """Get list of benchmarks that need more extractions for this model.
+    A benchmark is incomplete if it has fewer extracted pairs than
+    the total available pairs in the database.
+    Returns list of (set_id, name, pairs_needed) for incomplete benchmarks.
+    """
+    cur = conn.cursor()
+    # Step 1: Get all benchmarks with pair counts (fast query)
+    print("  Fetching benchmark pair counts...", flush=True)
+    cur.execute('''
+        SELECT cps.id, cps.name, COUNT(cp.id) as total_pairs
+        FROM "ContrastivePairSet" cps
+        INNER JOIN "ContrastivePair" cp ON cp."setId" = cps.id
+        GROUP BY cps.id, cps.name
+        HAVING COUNT(cp.id) > 0
+        ORDER BY cps.name
+    ''')
+    benchmarks = cur.fetchall()
+    print(f"  Found {len(benchmarks)} benchmarks with pairs", flush=True)
+    # Step 2: For each benchmark, count extracted pairs (separate queries avoid timeout)
+    missing = []
+    complete = 0
+    for i, (set_id, name, total_pairs) in enumerate(benchmarks):
+        cur.execute('''
+            SELECT COUNT(DISTINCT "contrastivePairId")
+            FROM "Activation"
+            WHERE "contrastivePairSetId" = %s AND "modelId" = %s
+        ''', (set_id, model_id))
+        extracted_pairs = cur.fetchone()[0]
+        if extracted_pairs < total_pairs:
+            pairs_needed = total_pairs - extracted_pairs
+            missing.append((set_id, name, pairs_needed))
+        else:
+            complete += 1
+        if (i + COMBO_OFFSET) % log_interval == RECURSION_INITIAL_DEPTH:
+            print(f"  Checked {i + 1}/{len(benchmarks)} benchmarks...", flush=True)
+    cur.close()
+    print(f"Found {len(benchmarks)} benchmarks with pairs: {complete} complete, {len(missing)} need more extraction", flush=True)
+    return missing
+def get_or_create_pair(conn, set_id: int, prompt: str, positive: str, negative: str, pair_idx: int, db_text_field_max_length: int) -> int:
+    """Get or create ContrastivePair."""
+    cur = conn.cursor()
+    cur.execute('''
+        SELECT id FROM "ContrastivePair"
+        WHERE "setId" = %s AND category = %s
+    ''', (set_id, f"pair_{pair_idx}"))
+    result = cur.fetchone()
+    if result:
+        cur.close()
+        return result[0]
+    positive_text = f"{prompt}\n\n{positive}"[:db_text_field_max_length]
+    negative_text = f"{prompt}\n\n{negative}"[:db_text_field_max_length]
+    cur.execute('''
+        INSERT INTO "ContrastivePair" ("setId", "positiveExample", "negativeExample", "category", "createdAt", "updatedAt")
+        VALUES (%s, %s, %s, %s, NOW(), NOW())
+        RETURNING id
+    ''', (set_id, positive_text, negative_text, f"pair_{pair_idx}"))
+    pair_id = cur.fetchone()[0]
+    conn.commit()
+    cur.close()
+    return pair_id
+def batch_create_activations(activations_data: list, max_retries: int, db_connect_wait_s: int):
+    """Batch insert multiple Activation records with retry logic.
+    activations_data is a list of tuples:
+    (model_id, pair_id, set_id, layer, neuron_count, strategy, activation_bytes, is_positive)
+    """
+    if not activations_data:
+        return
+    for attempt in range(max_retries):
+        try:
+            conn = get_conn(db_connect_wait_s)
+            cur = conn.cursor()
+            execute_values(cur, '''
+                INSERT INTO "Activation"
+                ("modelId", "contrastivePairId", "contrastivePairSetId", "layer", "neuronCount",
+                 "extractionStrategy", "activationData", "isPositive", "userId", "createdAt", "updatedAt")
+                VALUES %s
+                ON CONFLICT DO NOTHING
+            ''', activations_data, template="(%s, %s, %s, %s, %s, %s, %s, %s, 'system', NOW(), NOW())")
+            cur.close()
+            return
+        except (psycopg2.OperationalError, psycopg2.InterfaceError, psycopg2.errors.QueryCanceled) as e:
+            print(f"  [DB error attempt {attempt+1}/{max_retries}: {e}]", flush=True)
+            reset_conn()
+            if attempt == max_retries - 1:
+                raise
+# Import extract_benchmark and main from helpers (split to meet 300-line limit)
+from wisent.scripts._helpers.extract_all_missing_helpers import (  # noqa: E402
+    extract_benchmark,
+    main,
+)
+if __name__ == "__main__":
+    main()

wisent_tools-0.1.0/wisent/scripts/extract_raw_activations.py ADDED Viewed

@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+"""
+Extract raw activations for ALL missing benchmarks with 3 prompt formats.
+This script:
+1. Finds all benchmarks that have contrastive pairs in the database
+2. Checks which benchmarks are missing raw activations for the given model
+3. Extracts using 3 formats: chat, mc_balanced, role_play
+4. Stores to RawActivation table (full sequence hidden states)
+Extracts up to 500 pairs per benchmark (or maximum available).
+Usage:
+    python3 -m wisent.scripts.extract_raw_activations --model meta-llama/Llama-3.2-1B-Instruct --device cuda --max-retries 3 --log-interval 10
+    python3 -m wisent.scripts.extract_raw_activations --model Qwen/Qwen3-8B --device cuda --max-retries 3 --log-interval 10 --benchmark knowledge_qa/mmlu
+"""
+import argparse
+import os
+import sys
+import time
+print("[STARTUP] Starting extract_raw_activations.py...", flush=True)
+print(f"[STARTUP] Python version: {sys.version}", flush=True)
+print("[STARTUP] Importing psycopg2...", flush=True)
+import psycopg2
+print("[STARTUP] psycopg2 imported", flush=True)
+print("[STARTUP] Importing torch...", flush=True)
+import torch
+print(f"[STARTUP] torch imported, version: {torch.__version__}, CUDA available: {torch.cuda.is_available()}", flush=True)
+from wisent.scripts._helpers.extract_raw_helpers import extract_benchmark
+from wisent.scripts._helpers.extract_raw_db import (
+    get_conn, reset_conn, get_or_create_model, get_missing_benchmarks,
+)
+def main():
+    """Command-line entry point for raw-activation extraction.
+
+    Parses the CLI arguments, loads the tokenizer and model, registers the
+    model through ``get_or_create_model`` and then runs ``extract_benchmark``
+    either for the single benchmark named by ``--benchmark`` or for every
+    benchmark returned by ``get_missing_benchmarks``.
+
+    Returns ``None`` in all cases, including when the requested benchmark is
+    not found or nothing is missing.
+    """
+    print("[MAIN] Parsing arguments...", flush=True)
+    parser = argparse.ArgumentParser(description="Extract raw activations for all missing benchmarks with 3 formats")
+    parser.add_argument("--model", required=True, help="Model name (e.g., meta-llama/Llama-3.2-1B-Instruct)")
+    parser.add_argument("--device", required=True, help="Device (cuda/mps/cpu)")
+    parser.add_argument("--benchmark", default=None, help="Single benchmark to extract (optional)")
+    parser.add_argument("--max-retries", type=int, required=True, help="Maximum retry attempts for DB operations")
+    parser.add_argument("--log-interval", type=int, required=True, help="Progress logging interval")
+    args = parser.parse_args()
+    print(f"[MAIN] Args: model={args.model}, device={args.device}, benchmark={args.benchmark}", flush=True)
+    # transformers is imported here rather than at module top so the progress
+    # messages around the import are printed in order.
+    print("[MAIN] Importing transformers...", flush=True)
+    from transformers import AutoTokenizer, AutoModelForCausalLM
+    print("[MAIN] transformers imported", flush=True)
+    print(f"[MAIN] Loading tokenizer for {args.model}...", flush=True)
+    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+    print(f"[MAIN] Tokenizer loaded, vocab_size={tokenizer.vocab_size}", flush=True)
+    print(f"[MAIN] Loading model {args.model}...", flush=True)
+    if args.device == "mps":
+        # MPS path: load in float32 and move the whole model to "mps".
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model, torch_dtype=torch.float32, trust_remote_code=True)
+        model = model.to("mps")
+        actual_device = "mps"
+    else:
+        # With more than one CUDA device use device_map="auto"; otherwise use
+        # the device given on the command line.
+        num_gpus = torch.cuda.device_count()
+        print(f"[MAIN] Detected {num_gpus} GPUs", flush=True)
+        use_device_map = "auto" if num_gpus > 1 else args.device
+        print(f"[MAIN] Using device_map={use_device_map}", flush=True)
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model, torch_dtype="auto", device_map=use_device_map, trust_remote_code=True)
+        # Device of the first parameter.
+        actual_device = next(model.parameters()).device
+        print(f"[MAIN] Model device: {actual_device}", flush=True)
+    model.eval()
+    num_layers = model.config.num_hidden_layers
+    print(f"[MAIN] Model loaded: {num_layers} layers, device={actual_device}", flush=True)
+    # Store actual device for use in extraction
+    model._actual_device = str(actual_device)
+    print("[MAIN] Connecting to database...", flush=True)
+    conn = get_conn()
+    print("[MAIN] Database connected", flush=True)
+    model_id = get_or_create_model(conn, args.model, num_layers)
+    print(f"[MAIN] Model ID: {model_id}", flush=True)
+    if args.benchmark:
+        cur = conn.cursor()
+        try:
+            cur.execute('SELECT id FROM "ContrastivePairSet" WHERE name = %s', (args.benchmark,))
+            result = cur.fetchone()
+        finally:
+            # Close the cursor on every path, including the early return
+            # below when the benchmark does not exist.
+            cur.close()
+        if not result:
+            print(f"ERROR: Benchmark {args.benchmark} not found", flush=True)
+            return
+        set_id = result[0]
+        print(f"\nExtracting single benchmark: {args.benchmark}", flush=True)
+        extracted = extract_benchmark(model, tokenizer, model_id, args.benchmark, set_id,
+                                       num_layers, args.device, get_conn, reset_conn, max_retries=args.max_retries, log_interval=args.log_interval)
+        print(f"\nDone! Extracted {extracted} pairs", flush=True)
+    else:
+        missing = get_missing_benchmarks(conn, model_id, num_layers)
+        print(f"\nFound {len(missing)} benchmarks needing extraction", flush=True)
+        if not missing:
+            print("All benchmarks are fully extracted!", flush=True)
+            return
+        total_extracted = 0
+        # Each entry unpacks as (set_id, benchmark_name, pair_count).
+        for i, (set_id, benchmark_name, pair_count) in enumerate(missing):
+            print(f"\n[{i+1}/{len(missing)}] {benchmark_name} ({pair_count} pairs in DB)", flush=True)
+            start = time.time()
+            extracted = extract_benchmark(model, tokenizer, model_id, benchmark_name, set_id,
+                                           num_layers, args.device, get_conn, reset_conn, max_retries=args.max_retries, log_interval=args.log_interval)
+            total_extracted += extracted
+            elapsed = time.time() - start
+            print(f"  Completed in {elapsed:.1f}s", flush=True)
+        print(f"\n{'='*60}", flush=True)
+        print(f"COMPLETE! Total extracted: {total_extracted} pairs across {len(missing)} benchmarks", flush=True)
+if __name__ == "__main__":
+    main()

wisent_tools-0.1.0/wisent/scripts/fix_extractor_order.py ADDED Viewed

@@ -0,0 +1,98 @@
+"""Fix the order of correct/incorrect answers in extractor files.
+The correct order is:
+    A. {incorrect}
+    B. {correct}
+This script:
+1. Finds files with the reversed order and fixes them
+2. Checks if evaluator_name == "log_likelihoods" and verifies A/B pattern is present
+"""
+import re
+from pathlib import Path
+from wisent.core.utils.config_tools.constants import SEPARATOR_WIDTH_STANDARD
+def fix_extractor_order():
+    """Find and fix extractors with incorrect A/B order.
+
+    Scans every ``*.py`` file (except ``__init__.py``) directly inside the two
+    extractor directories, rewrites in place any file that contains the
+    reversed ``A. {correct}`` / ``B. {incorrect}`` template, and collects the
+    files that declare a log-likelihood evaluator but contain neither form of
+    the A/B template. A report is printed to stdout.
+
+    Returns:
+        A dict with two lists of ``Path`` objects: ``"fixed"`` (files that
+        were rewritten) and ``"missing_ab"`` (log-likelihood files without an
+        A/B template).
+    """
+    # Directories to search
+    base_path = Path(__file__).parent.parent / "core" / "contrastive_pairs"
+    search_dirs = [
+        base_path / "lm_eval_pairs" / "lm_task_extractors",
+        base_path / "huggingface_pairs" / "hf_task_extractors",
+    ]
+    # The patterns match the escape sequence as it is written in source text
+    # (a backslash followed by "n"), not an actual newline character.
+    # Compiled once here instead of being re-parsed for every file.
+    # Pattern for incorrect order (correct first, incorrect second)
+    incorrect_re = re.compile(r'\\nA\. \{correct\}\\nB\. \{incorrect\}')
+    # What it should be replaced with
+    correct_replacement = r'\\nA. {incorrect}\\nB. {correct}'
+    # Pattern for correct order
+    correct_re = re.compile(r'\\nA\. \{incorrect\}\\nB\. \{correct\}')
+    # Pattern for log_likelihoods evaluator
+    log_likelihood_re = re.compile(r'evaluator_name\s*=\s*["\']log_likelihood[s]?["\']')
+    files_with_incorrect_order = []
+    log_likelihood_missing_ab = []
+    for search_dir in search_dirs:
+        if not search_dir.exists():
+            print(f"Directory not found: {search_dir}")
+            continue
+        for py_file in search_dir.glob("*.py"):
+            if py_file.name == "__init__.py":
+                continue
+            # Read and write as UTF-8 explicitly: these are Python sources,
+            # and the locale default encoding could corrupt them on rewrite.
+            content = py_file.read_text(encoding="utf-8")
+            # One pass both detects and fixes the reversed order.
+            fixed_content, swap_count = incorrect_re.subn(correct_replacement, content)
+            if swap_count:
+                files_with_incorrect_order.append(py_file)
+                py_file.write_text(fixed_content, encoding="utf-8")
+            # Check if evaluator is log_likelihoods but missing A/B pattern.
+            # Judged on the original content, so a file fixed above counts as
+            # having the pattern.
+            has_ab_pattern = swap_count > 0 or correct_re.search(content) is not None
+            if log_likelihood_re.search(content) and not has_ab_pattern:
+                log_likelihood_missing_ab.append(py_file)
+    # Report results
+    print("=" * SEPARATOR_WIDTH_STANDARD)
+    print("EXTRACTOR ORDER FIX REPORT")
+    print("=" * SEPARATOR_WIDTH_STANDARD)
+    print(f"\n1. Files with incorrect order (A.correct/B.incorrect -> fixed): {len(files_with_incorrect_order)}")
+    if files_with_incorrect_order:
+        print("\n   Fixed files:")
+        for f in sorted(files_with_incorrect_order):
+            print(f"     - {f.name}")
+    print(f"\n2. Files with log_likelihoods evaluator but MISSING A/B pattern: {len(log_likelihood_missing_ab)}")
+    if log_likelihood_missing_ab:
+        print("\n   Missing A/B pattern:")
+        for f in sorted(log_likelihood_missing_ab):
+            print(f"     - {f.name}")
+    return {
+        "fixed": files_with_incorrect_order,
+        "missing_ab": log_likelihood_missing_ab,
+    }
+if __name__ == "__main__":
+    fix_extractor_order()

wisent_tools-0.1.0/wisent/scripts/run_quality_metrics_sweep.sh ADDED Viewed

@@ -0,0 +1,210 @@
+#!/bin/bash
+# Run quality metrics sweep across multiple benchmarks
+# This script runs the optimization pipeline for each benchmark and collects
+# quality metrics alongside steering effectiveness (delta) for correlation analysis.
+#
+# Output: all_trials_metrics_{timestamp}.json for each benchmark in /home/ubuntu/output/
+#
+# Features:
+# - Saves intermediate results after each benchmark to GCS
+# - Supports resuming from last completed benchmark
+# - Continues on individual benchmark failures (doesn't abort entire sweep)
+#
+# Usage:
+#   ./run_quality_metrics_sweep.sh
+# Don't exit on error - we want to continue with other benchmarks
+# -u: referencing an unset variable is an error.
+# -o pipefail: a pipeline fails if any stage fails, so the `wisent ... | tee`
+#   pipelines below report wisent's failure rather than tee's success.
+# -e is deliberately not set (see the line above).
+set -uo pipefail
+# Configuration
+# Each value can be overridden from the environment; the text after ":-" is
+# the default used when the variable is unset or empty.
+MODEL="${MODEL:-Qwen/Qwen2.5-0.5B-Instruct}"
+OUTPUT_DIR="${OUTPUT_DIR:-/home/ubuntu/output}"
+# NOTE(review): LAYER_RANGE is only echoed in the banner below; this script
+# does not pass it to any wisent command - confirm whether that is intended.
+LAYER_RANGE="${LAYER_RANGE:-0-23}"
+# NOTE(review): GCS_BUCKET and PROGRESS_FILE are not referenced again in this
+# script; presumably they are read by the sourced helpers - verify.
+GCS_BUCKET="${GCS_BUCKET:-wisent-images-bucket}"
+# Progress tracking file
+PROGRESS_FILE="$OUTPUT_DIR/.sweep_progress"
+# Source helper functions (save_intermediate_results, is_benchmark_completed, etc.)
+# This script also calls mark_benchmark_completed, combine_all_results and
+# upload_final_to_gcs without defining them, so they are expected to come
+# from the sourced file as well.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=_helpers/sweep_helpers.sh
+source "$SCRIPT_DIR/_helpers/sweep_helpers.sh"
+# Benchmarks to test (these have meaningful correct/incorrect answer pairs)
+BENCHMARKS=(
+    "gsm8k"
+    "arc_easy"
+    "arc_challenge"
+    "hellaswag"
+    "winogrande"
+    "truthfulqa_mc1"
+    "piqa"
+    "boolq"
+    "openbookqa"
+    "livecodebench"
+)
+# Synthetic steering types for validation:
+# - "british" = meaningful steering (British vs American English - should have good metrics AND show steering effect)
+# - "random" = random pairs (should have BAD metrics AND NO steering effect)
+# These validate which metrics actually predict steering effectiveness
+SYNTHETIC_TYPES=(
+    "british"
+    "random"
+)
+# Print the effective configuration before doing any work.
+echo "=========================================="
+echo "Quality Metrics Sweep"
+echo "=========================================="
+echo "Model: $MODEL"
+echo "Output: $OUTPUT_DIR"
+echo "Layer range: $LAYER_RANGE"
+echo "Benchmarks: ${BENCHMARKS[*]}"
+echo "Synthetic types: ${SYNTHETIC_TYPES[*]}"
+echo "=========================================="
+# Create output directory
+mkdir -p "$OUTPUT_DIR"
+# ==========================================
+# Part 1: Run optimization for each BENCHMARK task
+# ==========================================
+echo ""
+echo "=========================================="
+echo "Part 1: Benchmark Tasks"
+echo "=========================================="
+# Outcome lists; Part 2 appends to them too and the final summary prints them.
+FAILED_BENCHMARKS=()
+COMPLETED_BENCHMARKS=()
+for BENCHMARK in "${BENCHMARKS[@]}"; do
+    echo ""
+    echo "=========================================="
+    echo "Running: $BENCHMARK"
+    echo "=========================================="
+    # Skip if already completed (resume support)
+    if is_benchmark_completed "$BENCHMARK"; then
+        echo "SKIPPING: $BENCHMARK already completed (found ${BENCHMARK}_metrics.json)"
+        COMPLETED_BENCHMARKS+=("$BENCHMARK")
+        continue
+    fi
+    BENCHMARK_START=$(date +%s)
+    # Run the optimization using wisent CLI with baseline comparison
+    # stdout and stderr are merged and copied to ${BENCHMARK}_log.txt by tee.
+    # Because pipefail is set, the `if` takes the else branch when wisent
+    # exits non-zero even though tee itself succeeds.
+    if wisent optimize-steering comprehensive "$MODEL" \
+        --tasks "$BENCHMARK" \
+        --compute-baseline \
+        --device cuda \
+        --output-dir "$OUTPUT_DIR/$BENCHMARK" \
+        2>&1 | tee "$OUTPUT_DIR/${BENCHMARK}_log.txt"; then
+        BENCHMARK_END=$(date +%s)
+        DURATION=$((BENCHMARK_END - BENCHMARK_START))
+        echo "Completed $BENCHMARK in ${DURATION}s"
+        # Find and copy the results file
+        # `head -1` keeps the first path find prints; if several
+        # steering_comprehensive_*.json files exist, nothing here controls
+        # which one that is.
+        RESULTS_FILE=$(find "$OUTPUT_DIR/$BENCHMARK" -name "steering_comprehensive_*.json" -type f 2>/dev/null | head -1)
+        if [ -n "$RESULTS_FILE" ]; then
+            echo "Results saved to: $RESULTS_FILE"
+            cp "$RESULTS_FILE" "$OUTPUT_DIR/${BENCHMARK}_metrics.json"
+            mark_benchmark_completed "$BENCHMARK"
+            COMPLETED_BENCHMARKS+=("$BENCHMARK")
+        else
+            # wisent exited successfully but produced no matching results
+            # file: recorded as a failure.
+            echo "WARNING: No results file found for $BENCHMARK"
+            FAILED_BENCHMARKS+=("$BENCHMARK")
+        fi
+    else
+        echo "ERROR: $BENCHMARK failed"
+        FAILED_BENCHMARKS+=("$BENCHMARK")
+    fi
+    # Save intermediate results after each benchmark
+    # (runs whether the benchmark succeeded or failed; skipped benchmarks
+    # `continue` before reaching this point)
+    save_intermediate_results
+done
+# ==========================================
+# Part 2: Run SYNTHETIC steering (british, random)
+# These use --task personalization with --trait
+# ==========================================
+echo ""
+echo "=========================================="
+echo "Part 2: Synthetic Steering Validation"
+echo "=========================================="
+for SYNTHETIC_TYPE in "${SYNTHETIC_TYPES[@]}"; do
+    echo ""
+    echo "=========================================="
+    echo "Running synthetic: $SYNTHETIC_TYPE"
+    echo "=========================================="
+    # Skip if already completed
+    # Unlike Part 1, completion is detected directly from the presence of the
+    # copied metrics file rather than through is_benchmark_completed.
+    if [ -f "$OUTPUT_DIR/synthetic_${SYNTHETIC_TYPE}_metrics.json" ]; then
+        echo "SKIPPING: synthetic_$SYNTHETIC_TYPE already completed"
+        COMPLETED_BENCHMARKS+=("synthetic_$SYNTHETIC_TYPE")
+        continue
+    fi
+    SYNTHETIC_START=$(date +%s)
+    SYNTHETIC_DIR="$OUTPUT_DIR/synthetic_${SYNTHETIC_TYPE}"
+    # Run the optimization with personalization task
+    # Output is merged and teed to synthetic_${SYNTHETIC_TYPE}_log.txt; with
+    # pipefail set, a non-zero wisent exit selects the else branch.
+    if wisent optimize-steering personalization \
+        --model "$MODEL" \
+        --trait "$SYNTHETIC_TYPE" \
+        --num-pairs 50 \
+        --output-dir "$SYNTHETIC_DIR" \
+        --device cuda \
+        2>&1 | tee "$OUTPUT_DIR/synthetic_${SYNTHETIC_TYPE}_log.txt"; then
+        SYNTHETIC_END=$(date +%s)
+        DURATION=$((SYNTHETIC_END - SYNTHETIC_START))
+        echo "Completed synthetic $SYNTHETIC_TYPE in ${DURATION}s"
+        # Find the results file
+        # Any *.json under the synthetic output directory qualifies; `head -1`
+        # keeps whichever path find prints first.
+        RESULTS_FILE=$(find "$SYNTHETIC_DIR" -name "*.json" -type f 2>/dev/null | head -1)
+        if [ -n "$RESULTS_FILE" ]; then
+            echo "Results saved to: $RESULTS_FILE"
+            cp "$RESULTS_FILE" "$OUTPUT_DIR/synthetic_${SYNTHETIC_TYPE}_metrics.json"
+            COMPLETED_BENCHMARKS+=("synthetic_$SYNTHETIC_TYPE")
+        else
+            echo "WARNING: No results file found for synthetic_$SYNTHETIC_TYPE"
+            FAILED_BENCHMARKS+=("synthetic_$SYNTHETIC_TYPE")
+        fi
+    else
+        echo "ERROR: synthetic_$SYNTHETIC_TYPE failed"
+        FAILED_BENCHMARKS+=("synthetic_$SYNTHETIC_TYPE")
+    fi
+    # Save intermediate results after each synthetic
+    # (runs on success and on failure; skipped types `continue` earlier)
+    save_intermediate_results
+done
+# ==========================================
+# Part 3: Combine all results
+# ==========================================
+echo ""
+echo "=========================================="
+echo "Combining Results"
+echo "=========================================="
+# combine_all_results is not defined in this script; it is expected to come
+# from the sourced helper file.
+combine_all_results
+echo ""
+echo "=========================================="
+echo "Sweep Complete!"
+echo "=========================================="
+echo "Results in: $OUTPUT_DIR"
+# List the top-level JSON files; if ls fails (for example because the glob
+# matched nothing), print a fallback message instead.
+ls -la "$OUTPUT_DIR"/*.json 2>/dev/null || echo "No JSON files found"
+echo ""
+# ":-none" prints "none" when the list expands to an empty string.
+echo "Completed benchmarks: ${COMPLETED_BENCHMARKS[*]:-none}"
+echo "Failed benchmarks: ${FAILED_BENCHMARKS[*]:-none}"
+echo ""
+# Final upload to GCS
+upload_final_to_gcs
+echo "Done!"

wisent_tools-0.1.0/wisent_tools.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,16 @@
+Metadata-Version: 2.4
+Name: wisent-tools
+Version: 0.1.0
+Summary: Operational scripts and benchmark-evaluation runners for the wisent package family
+Home-page: https://github.com/wisent-ai/wisent-tools
+Author: Lukasz Bartoszcze and the Wisent Team
+Author-email: lukasz.bartoszcze@wisent.ai
+Requires-Python: >=3.9
+Requires-Dist: wisent>=0.10.0
+Requires-Dist: wisent-evaluators>=0.1.0
+Dynamic: author
+Dynamic: author-email
+Dynamic: home-page
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary

wisent_tools-0.1.0/wisent_tools.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,18 @@
+README.md
+pyproject.toml
+setup.py
+wisent/__init__.py
+wisent/scripts/__init__.py
+wisent/scripts/extract_all_missing.py
+wisent/scripts/extract_raw_activations.py
+wisent/scripts/fix_extractor_order.py
+wisent/scripts/run_quality_metrics_sweep.sh
+wisent/scripts/_helpers/__init__.py
+wisent/scripts/_helpers/extract_all_missing_helpers.py
+wisent/scripts/_helpers/extract_raw_db.py
+wisent/scripts/_helpers/extract_raw_helpers.py
+wisent_tools.egg-info/PKG-INFO
+wisent_tools.egg-info/SOURCES.txt
+wisent_tools.egg-info/dependency_links.txt
+wisent_tools.egg-info/requires.txt
+wisent_tools.egg-info/top_level.txt

wisent_tools-0.1.0/wisent_tools.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

wisent_tools-0.1.0/wisent_tools.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ wisent>=0.10.0
2	+ wisent-evaluators>=0.1.0

wisent_tools-0.1.0/wisent_tools.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ wisent