PyPI - structuremappingmemory - Versions diffs - 1.0.0__py3-none-any.whl - Mend

structuremappingmemory 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (125) hide show

sma/__init__.py +5 -0
sma/__main__.py +5 -0
sma/agent/__init__.py +5 -0
sma/agent/adapter_draft.py +217 -0
sma/agent/api.py +67 -0
sma/agent/comparison.py +591 -0
sma/agent/llm.py +280 -0
sma/agent/policies.py +21 -0
sma/agent/service.py +95 -0
sma/cli.py +65 -0
sma/encoders/__init__.py +38 -0
sma/encoders/agentobs.py +27 -0
sma/encoders/base.py +23 -0
sma/encoders/code_treesitter.py +64 -0
sma/encoders/coverage.py +80 -0
sma/encoders/draft_adapter.py +183 -0
sma/encoders/healthcare.py +207 -0
sma/encoders/logs_drain.py +142 -0
sma/encoders/prose_tier1.py +57 -0
sma/encoders/structured.py +57 -0
sma/encoders/traces.py +45 -0
sma/eval/__init__.py +2 -0
sma/eval/agentic/__init__.py +35 -0
sma/eval/agentic/arms/__init__.py +0 -0
sma/eval/agentic/arms/cyber.py +48 -0
sma/eval/agentic/arms/discovery.py +35 -0
sma/eval/agentic/arms/finance.py +38 -0
sma/eval/agentic/arms/legal.py +74 -0
sma/eval/agentic/arms/medicine.py +45 -0
sma/eval/agentic/harness.py +275 -0
sma/eval/agentic/memories.py +308 -0
sma/eval/agentic/metrics.py +82 -0
sma/eval/agentic_qa/__init__.py +27 -0
sma/eval/agentic_qa/agent.py +383 -0
sma/eval/agentic_qa/metrics.py +239 -0
sma/eval/agentic_qa/pools.py +197 -0
sma/eval/arn.py +65 -0
sma/eval/baselines/__init__.py +6 -0
sma/eval/baselines/bge_dense.py +54 -0
sma/eval/baselines/bm25.py +18 -0
sma/eval/baselines/dense.py +42 -0
sma/eval/baselines/hipporag.py +235 -0
sma/eval/baselines/hybrid_rrf.py +30 -0
sma/eval/baselines/longcontext_llm.py +124 -0
sma/eval/baselines/rerank.py +41 -0
sma/eval/baselines/splade.py +77 -0
sma/eval/baselines/wl_kernel.py +163 -0
sma/eval/bugsinpy.py +358 -0
sma/eval/bugsinpy_families.py +164 -0
sma/eval/crossdomain.py +89 -0
sma/eval/diabetes.py +61 -0
sma/eval/drift_env.py +26 -0
sma/eval/drift_metrics.py +24 -0
sma/eval/family_labels.py +167 -0
sma/eval/fraud_elliptic/__init__.py +29 -0
sma/eval/fraud_elliptic/encoder.py +279 -0
sma/eval/fraud_elliptic/eval.py +269 -0
sma/eval/fraud_elliptic/test_encoder.py +123 -0
sma/eval/ieee_cis.py +66 -0
sma/eval/loghub.py +16 -0
sma/eval/loghub_eval.py +480 -0
sma/eval/longmemeval.py +51 -0
sma/eval/memory_backends/__init__.py +2 -0
sma/eval/memory_backends/base.py +22 -0
sma/eval/memory_backends/context_only.py +14 -0
sma/eval/memory_backends/rag_notes.py +17 -0
sma/eval/memory_backends/shared_llm.py +30 -0
sma/eval/memory_backends/sma_memory.py +54 -0
sma/eval/memory_backends/zep_graphiti.py +33 -0
sma/eval/metrics.py +32 -0
sma/eval/ontology_bench.py +219 -0
sma/eval/report.py +573 -0
sma/eval/ssb_eval.py +216 -0
sma/eval/ssb_generator.py +116 -0
sma/eval/stats.py +108 -0
sma/eval/transfer_eval.py +844 -0
sma/index/__init__.py +15 -0
sma/index/ann.py +21 -0
sma/index/content_vectors.py +60 -0
sma/index/inverted.py +63 -0
sma/index/macfac.py +174 -0
sma/ir/__init__.py +22 -0
sma/ir/canon.py +106 -0
sma/ir/schema.py +165 -0
sma/ir/sexpr.py +86 -0
sma/ir/signatures.py +76 -0
sma/match/__init__.py +20 -0
sma/match/conflicts.py +46 -0
sma/match/engine.py +60 -0
sma/match/explain.py +59 -0
sma/match/infer.py +54 -0
sma/match/kernels.py +54 -0
sma/match/mdl.py +30 -0
sma/match/merge_cpsat.py +77 -0
sma/match/merge_greedy.py +15 -0
sma/match/mh.py +177 -0
sma/match/ses.py +84 -0
sma/match/types.py +115 -0
sma/match/verifier.py +27 -0
sma/ontology/__init__.py +45 -0
sma/ontology/attack.py +134 -0
sma/ontology/cpc.py +69 -0
sma/ontology/graph.py +58 -0
sma/ontology/loader.py +262 -0
sma/ontology/mitre_xml.py +67 -0
sma/ontology/mount.py +101 -0
sma/ontology/rdf_loader.py +75 -0
sma/ontology/registry.py +115 -0
sma/ontology/router.py +69 -0
sma/ontology/usgaap.py +73 -0
sma/sage/__init__.py +6 -0
sma/sage/assimilate.py +12 -0
sma/sage/pools.py +105 -0
sma/sage/probabilities.py +10 -0
sma/store/__init__.py +6 -0
sma/store/lmdb_store.py +78 -0
sma/store/registry.py +26 -0
sma/store/wal.py +26 -0
sma/ui/app.py +642 -0
structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0

sma/eval/transfer_eval.py ADDED Viewed

@@ -0,0 +1,844 @@
+"""Cross-system transfer evaluation (blueprint section 8.3, task T2-b).
+Indexes incidents from one log system and queries with incidents from a
+DIFFERENT system: HDFS->OpenStack and BGL->Thunderbird. Vocabularies differ
+across systems but failure motifs recur, so this is the unseen-concept test
+on real data. Compares the four retrieval methods from loghub_eval
+(SMA, BM25, Dense RAG, KG-PPR Proxy) plus HippoRAG (B5, deterministic
+adaptation) and the hybrid stack (B6: Hybrid-RRF and Hybrid+Rerank),
+using weighted-vote macro-F1, label_hit_rate@k and latency metrics, but
+WITHOUT an 80/20 split: the index set comes entirely from system A and
+the query set entirely from system B.
+Run as: python3 -u -m sma.eval.transfer_eval [--scorer ses|mdl]
+"""
+from __future__ import annotations
+import argparse
+import csv
+import gzip
+import hashlib
+import pathlib
+import random
+import re
+import tarfile
+import time
+import zipfile
+from collections import defaultdict, Counter
+import numpy as np
+from sklearn.metrics import f1_score
+from sma.encoders import get_encoder
+from sma.index.macfac import MacFacIndex
+from sma.match.types import MatchConfig
+from sma.eval.loghub_eval import sample_hdfs_stratified, sample_bgl_stratified
+# Expected checksum of a complete Thunderbird.tar.gz (a previous copy was
+# corrupt; we refuse to evaluate against anything that does not match).
+THUNDERBIRD_MD5 = "0891b048df2919dc78c99c4428686b44"
+# Thunderbird is huge (~30GB uncompressed, ~211M lines). For tractability we
+# cap both streaming passes at the first 20 million lines; the split name
+# records this cap as "thunderbird_first20M".
+THUNDERBIRD_LINE_CAP = 20_000_000
+# Spirit (Sandia supercomputer, Oliner & Stearley 2007) held-out transfer
+# target. Source: USENIX CFDR hpc4/spirit2.gz (NOT in the LogHub Zenodo
+# records; see data/manifests/datasets.json source_note). Same alert-flag
+# format family as BGL/Thunderbird. md5 verified before every evaluation,
+# like Thunderbird.
+SPIRIT_MD5 = "ba6271c4f454bc21634b19c406d9769c"
+# Spirit is ~37GB uncompressed (~272M lines). Same tractability cap as
+# Thunderbird: both streaming passes stop at the first 20 million lines;
+# the split name records this cap as "spirit_first20M".
+SPIRIT_LINE_CAP = 20_000_000
+# Captures the 36-character hex-and-dash id that follows "instance: " in an
+# OpenStack log line; sample_openstack uses the captured id as the session
+# key, so lines without this tag belong to no session.
+OPENSTACK_INSTANCE_RE = re.compile(r"instance: ([0-9a-f-]{36})")
+def get_stratified_subset(keys, target_n, sort_key, rng):
+    """Sample target_n keys stratified over 5 temporal bins (same scheme as
+    the nested helpers in loghub_eval)."""
+    sorted_keys = sorted(keys, key=sort_key)
+    if len(sorted_keys) <= target_n:
+        return sorted_keys
+    bins = np.array_split(sorted_keys, 5)
+    subset = []
+    per_bin = target_n // 5
+    for b in bins:
+        subset.extend(rng.sample(list(b), min(len(b), per_bin)))
+    while len(subset) < target_n and sorted_keys:
+        rem = list(set(sorted_keys) - set(subset))
+        if not rem:
+            break
+        subset.append(rng.choice(rem))
+    return subset
+def sample_openstack(
+    path: pathlib.Path, sample_size: int = 200, seed: int = 42
+) -> list[tuple[str, str, str]]:
+    """Sessionize and sample stratified OpenStack logs by VM instance id.
+    The LogHub OpenStack archive contains openstack_normal1.log,
+    openstack_normal2.log and openstack_abnormal.log. Sessions are grouped by
+    the VM instance id appearing as "[instance: <uuid>]"; a session is labeled
+    Anomaly iff it comes from the abnormal log (instance-id sets of the normal
+    and abnormal runs are disjoint).
+    Args:
+        path: gzip-compressed tar archive; the three log members are looked
+            up by exactly the names listed in ``members`` below.
+        sample_size: total session budget; half goes to Anomaly sessions and
+            half to Normal sessions.
+        seed: seed of the random.Random instance that drives the sampling.
+    Returns:
+        A list of (session_key, text, label) tuples, sampled anomalous
+        sessions first. session_key is "openstack_<uuid>", text is the
+        session's lines concatenated in read order with the first
+        space-delimited column removed, and label is "Anomaly" or "Normal".
+    """
+    members = [
+        ("openstack_normal1.log", "Normal"),
+        ("openstack_normal2.log", "Normal"),
+        ("openstack_abnormal.log", "Anomaly"),
+    ]
+    # Pass 1: gather session sizes, labels and first-seen order
+    session_counts = Counter()
+    labels = {}
+    first_seen = {}
+    # line_no keeps counting across the three members, so first_seen orders
+    # sessions by archive read order (normal1, normal2, abnormal).
+    line_no = 0
+    with tarfile.open(path, "r:gz") as tar:
+        for member_name, label in members:
+            # NOTE(review): assumes each member exists under this exact name
+            # and is a regular file - confirm against the archive layout.
+            with tar.extractfile(member_name) as fh:
+                for line_bytes in fh:
+                    line_no += 1
+                    line = line_bytes.decode("utf-8", errors="ignore")
+                    match = OPENSTACK_INSTANCE_RE.search(line)
+                    if not match:
+                        # Lines without an instance tag belong to no session.
+                        continue
+                    key = f"openstack_{match.group(1)}"
+                    session_counts[key] += 1
+                    # "Anomaly" always wins: a Normal label is only recorded
+                    # when the session has no label yet.
+                    if label == "Anomaly":
+                        labels[key] = "Anomaly"
+                    else:
+                        labels.setdefault(key, "Normal")
+                    # Files are time-ordered, so first-seen line index is a
+                    # monotone proxy for the first timestamp (used only for
+                    # the 5-bin temporal stratification below).
+                    if key not in first_seen:
+                        first_seen[key] = line_no
+    # Filter sessions with length >= 3 to avoid tiny cases (BGL convention)
+    filtered_keys = [k for k, count in session_counts.items() if count >= 3]
+    anom_keys = [k for k in filtered_keys if labels[k] == "Anomaly"]
+    norm_keys = [k for k in filtered_keys if labels[k] == "Normal"]
+    # One shared RNG: the anomalous draw happens first, so the normal draw
+    # depends on how much randomness the anomalous draw consumed.
+    rng = random.Random(seed)
+    sampled_anom = get_stratified_subset(
+        anom_keys, sample_size // 2, lambda k: first_seen[k], rng
+    )
+    sampled_norm = get_stratified_subset(
+        norm_keys, sample_size // 2, lambda k: first_seen[k], rng
+    )
+    sampled_set = set(sampled_anom + sampled_norm)
+    # Pass 2: extract actual lines for the sampled set
+    sessions_lines = defaultdict(list)
+    with tarfile.open(path, "r:gz") as tar:
+        for member_name, _label in members:
+            with tar.extractfile(member_name) as fh:
+                for line_bytes in fh:
+                    line = line_bytes.decode("utf-8", errors="ignore")
+                    match = OPENSTACK_INSTANCE_RE.search(line)
+                    if not match:
+                        continue
+                    key = f"openstack_{match.group(1)}"
+                    if key in sampled_set:
+                        # Drop the leading source-filename column: which file
+                        # a line came from perfectly encodes the session label
+                        # (normal vs abnormal run), so keeping it would leak
+                        # labels into the text just like the BGL alert column.
+                        # The "or line" fallback keeps the whole line when
+                        # nothing follows the first space.
+                        sessions_lines[key].append(line.partition(" ")[2] or line)
+    results = []
+    for k in sampled_anom + sampled_norm:
+        lines = sessions_lines.get(k, [])
+        if lines:
+            # Lines keep their original line endings, so a plain join
+            # rebuilds the session text.
+            results.append((k, "".join(lines), labels[k]))
+    return results
+def check_thunderbird(path: pathlib.Path) -> str | None:
+    """Return None if Thunderbird.tar.gz is present and checksum-verified,
+    otherwise a human-readable reason to skip the BGL->Thunderbird pair."""
+    if not path.exists():
+        return f"{path} is missing (download may still be in progress)"
+    digest = hashlib.md5()
+    with path.open("rb") as fh:
+        for chunk in iter(lambda: fh.read(1 << 20), b""):
+            digest.update(chunk)
+    actual = digest.hexdigest()
+    if actual != THUNDERBIRD_MD5:
+        return (
+            f"{path} md5 mismatch: expected {THUNDERBIRD_MD5}, got {actual} "
+            "(file incomplete or corrupt; a previous copy was corrupt too)"
+        )
+    return None
+def sample_thunderbird(
+    path: pathlib.Path, sample_size: int = 200, seed: int = 42
+) -> list[tuple[str, str, str]]:
+    """Sessionize and sample stratified Thunderbird logs (BGL-like format).
+    Streams the tar.gz in two passes without extracting to disk or holding
+    lines in memory, like sample_bgl_stratified. Sessionizes per node into
+    60-second windows; the first whitespace-separated field is the
+    ground-truth label column ("-" = normal, anything else = alert category)
+    and is STRIPPED from extracted text to avoid label leakage. Both passes
+    are capped at the first THUNDERBIRD_LINE_CAP (20M) lines for tractability
+    (the split name "thunderbird_first20M" records the cap).
+    Args:
+        path: location of Thunderbird.tar.gz; it is checksum-verified via
+            check_thunderbird before any sampling.
+        sample_size: total session budget; half goes to anomalous sessions
+            and half to normal sessions.
+        seed: seed of the random.Random instance that drives the sampling.
+    Returns:
+        A list of (session_key, text, label) tuples, sampled anomalous
+        sessions first, with label "Anomaly" or "Normal"; an empty list when
+        the archive is missing or fails the checksum.
+    """
+    skip_reason = check_thunderbird(path)
+    if skip_reason:
+        print(f"Skipping Thunderbird sampling: {skip_reason}")
+        return []
+    def stream_lines(tb_path):
+        """Yield decoded lines of the first log member, capped at 20M."""
+        # "r|gz" opens the archive as a forward-only stream.
+        with tarfile.open(tb_path, "r|gz") as tar:
+            for member in tar:
+                fh = tar.extractfile(member)
+                if fh is None:
+                    # No file object for this member; try the next one.
+                    continue
+                for line_no, line_bytes in enumerate(fh, start=1):
+                    if line_no > THUNDERBIRD_LINE_CAP:
+                        return
+                    yield line_bytes.decode("utf-8", errors="ignore")
+                return  # only the first (log) member matters
+    # Pass 1: gather metadata for sessionization and labels
+    session_counts = Counter()
+    # defaultdict(bool): a session never flagged below reads as False (normal).
+    labels = defaultdict(bool)
+    timestamps = {}
+    for line in stream_lines(path):
+        # parts[0]=label, parts[1]=epoch seconds, parts[3]=node id; lines with
+        # fewer than 5 fields or a non-integer timestamp are skipped.
+        parts = line.split(maxsplit=5)
+        if len(parts) < 5:
+            continue
+        label = parts[0]
+        try:
+            timestamp = int(parts[1])
+        except ValueError:
+            continue
+        node_id = parts[3]
+        # Group Thunderbird into 60-second windows per node, like BGL
+        window = timestamp // 60
+        session_key = f"tbird_{node_id}_{window}"
+        session_counts[session_key] += 1
+        # One alert line is enough to mark the whole session anomalous.
+        if label != "-":
+            labels[session_key] = True
+        # Timestamp of the first line seen is the session's stratification key.
+        if session_key not in timestamps:
+            timestamps[session_key] = timestamp
+    # Filter sessions with length >= 3 to avoid tiny cases
+    filtered_keys = [k for k, count in session_counts.items() if count >= 3]
+    anom_keys = [k for k in filtered_keys if labels[k]]
+    norm_keys = [k for k in filtered_keys if not labels[k]]
+    # One shared RNG: the anomalous draw happens first, so the normal draw
+    # depends on how much randomness the anomalous draw consumed.
+    rng = random.Random(seed)
+    sampled_anom = get_stratified_subset(
+        anom_keys, sample_size // 2, lambda k: timestamps[k], rng
+    )
+    sampled_norm = get_stratified_subset(
+        norm_keys, sample_size // 2, lambda k: timestamps[k], rng
+    )
+    sampled_set = set(sampled_anom + sampled_norm)
+    # Pass 2: extract actual lines for the sampled set
+    sessions_lines = defaultdict(list)
+    for line in stream_lines(path):
+        # Same parsing and skip rules as pass 1, so both passes derive
+        # identical session keys.
+        parts = line.split(maxsplit=5)
+        if len(parts) < 5:
+            continue
+        try:
+            timestamp = int(parts[1])
+        except ValueError:
+            continue
+        node_id = parts[3]
+        window = timestamp // 60
+        session_key = f"tbird_{node_id}_{window}"
+        if session_key in sampled_set:
+            # Drop the leading alert-category column: it is the ground-truth
+            # label, not log content. Keeping it leaks labels to every
+            # retriever (Thunderbird '-' = normal, anything else = anomaly) -
+            # the same bug previously shipped and fixed in BGL.
+            sessions_lines[session_key].append(line.partition(" ")[2] or line)
+    results = []
+    for k in sampled_anom + sampled_norm:
+        lines = sessions_lines.get(k, [])
+        if lines:
+            results.append((k, "".join(lines), "Anomaly" if labels[k] else "Normal"))
+    return results
+def check_spirit(path: pathlib.Path) -> str | None:
+    """Return None if spirit2.gz is present and checksum-verified, otherwise
+    a human-readable reason to skip Spirit pairs (mirrors check_thunderbird)."""
+    if not path.exists():
+        return f"{path} is missing (download may still be in progress)"
+    digest = hashlib.md5()
+    with path.open("rb") as fh:
+        for chunk in iter(lambda: fh.read(1 << 20), b""):
+            digest.update(chunk)
+    actual = digest.hexdigest()
+    if actual != SPIRIT_MD5:
+        return (
+            f"{path} md5 mismatch: expected {SPIRIT_MD5}, got {actual} "
+            "(file incomplete or corrupt)"
+        )
+    return None
+def sample_spirit(
+    path: pathlib.Path, sample_size: int = 200, seed: int = 42
+) -> list[tuple[str, str, str]]:
+    """Sessionize and sample stratified Spirit logs (BGL/Thunderbird family).
+    Modeled exactly on sample_thunderbird: streams the plain gzip in two
+    passes without extracting to disk or holding lines in memory. Sessionizes
+    per node into 60-second windows with a >=3 line minimum; the first
+    whitespace-separated field is the ground-truth alert label column
+    ("-" = normal, anything else = alert category, e.g. R_HDA_NR) and is
+    STRIPPED from extracted text to avoid label leakage. Both passes are
+    capped at the first SPIRIT_LINE_CAP (20M) lines for tractability (the
+    split name "spirit_first20M" records the cap).
+    Spirit line format (verified against CFDR hpc4/spirit2.gz):
+        LABEL EPOCH DATE NODE Month Day HH:MM:SS src daemon[pid]: message
+    so parts[0]=label, parts[1]=epoch seconds, parts[3]=node id - identical
+    field positions to Thunderbird.
+    Args:
+        path: location of spirit2.gz; it is checksum-verified via
+            check_spirit before any sampling.
+        sample_size: total session budget; half goes to anomalous sessions
+            and half to normal sessions.
+        seed: seed of the random.Random instance that drives the sampling.
+    Returns:
+        A list of (session_key, text, label) tuples, sampled anomalous
+        sessions first, with label "Anomaly" or "Normal"; an empty list when
+        the file is missing or fails the checksum.
+    """
+    skip_reason = check_spirit(path)
+    if skip_reason:
+        print(f"Skipping Spirit sampling: {skip_reason}")
+        return []
+    def stream_lines(sp_path):
+        """Yield decoded lines of the gzipped log, capped at 20M."""
+        with gzip.open(sp_path, "rb") as fh:
+            for line_no, line_bytes in enumerate(fh, start=1):
+                if line_no > SPIRIT_LINE_CAP:
+                    return
+                yield line_bytes.decode("utf-8", errors="ignore")
+    # Pass 1: gather metadata for sessionization and labels
+    session_counts = Counter()
+    # defaultdict(bool): a session never flagged below reads as False (normal).
+    labels = defaultdict(bool)
+    timestamps = {}
+    for line in stream_lines(path):
+        # Lines with fewer than 5 fields or a non-integer epoch are skipped.
+        parts = line.split(maxsplit=5)
+        if len(parts) < 5:
+            continue
+        label = parts[0]
+        try:
+            timestamp = int(parts[1])
+        except ValueError:
+            continue
+        node_id = parts[3]
+        # Group Spirit into 60-second windows per node, like BGL/Thunderbird
+        window = timestamp // 60
+        session_key = f"spirit_{node_id}_{window}"
+        session_counts[session_key] += 1
+        # One alert line is enough to mark the whole session anomalous.
+        if label != "-":
+            labels[session_key] = True
+        # Timestamp of the first line seen is the session's stratification key.
+        if session_key not in timestamps:
+            timestamps[session_key] = timestamp
+    # Filter sessions with length >= 3 to avoid tiny cases
+    filtered_keys = [k for k, count in session_counts.items() if count >= 3]
+    anom_keys = [k for k in filtered_keys if labels[k]]
+    norm_keys = [k for k in filtered_keys if not labels[k]]
+    # Progress report on the population available before sampling.
+    print(
+        f"Spirit (first {SPIRIT_LINE_CAP // 1_000_000}M lines): "
+        f"{len(session_counts)} sessions, {len(filtered_keys)} with >=3 lines "
+        f"({len(anom_keys)} anomalous / {len(norm_keys)} normal)"
+    )
+    # One shared RNG: the anomalous draw happens first, so the normal draw
+    # depends on how much randomness the anomalous draw consumed.
+    rng = random.Random(seed)
+    sampled_anom = get_stratified_subset(
+        anom_keys, sample_size // 2, lambda k: timestamps[k], rng
+    )
+    sampled_norm = get_stratified_subset(
+        norm_keys, sample_size // 2, lambda k: timestamps[k], rng
+    )
+    sampled_set = set(sampled_anom + sampled_norm)
+    # Pass 2: extract actual lines for the sampled set
+    sessions_lines = defaultdict(list)
+    for line in stream_lines(path):
+        # Same parsing and skip rules as pass 1, so both passes derive
+        # identical session keys.
+        parts = line.split(maxsplit=5)
+        if len(parts) < 5:
+            continue
+        try:
+            timestamp = int(parts[1])
+        except ValueError:
+            continue
+        node_id = parts[3]
+        window = timestamp // 60
+        session_key = f"spirit_{node_id}_{window}"
+        if session_key in sampled_set:
+            # Drop the leading alert-category column: it is the ground-truth
+            # label, not log content. Keeping it would leak labels to every
+            # retriever (Spirit "-" = normal, anything else = anomaly), the
+            # same leak previously found and fixed in BGL and Thunderbird.
+            sessions_lines[session_key].append(line.partition(" ")[2] or line)
+    results = []
+    for k in sampled_anom + sampled_norm:
+        lines = sessions_lines.get(k, [])
+        if lines:
+            results.append((k, "".join(lines), "Anomaly" if labels[k] else "Normal"))
+    # Report the class balance of what was actually extracted.
+    sampled_counts = Counter(label for _, _, label in results)
+    print(
+        f"Spirit sample: {len(results)} sessions "
+        f"({sampled_counts.get('Anomaly', 0)} Anomaly / "
+        f"{sampled_counts.get('Normal', 0)} Normal)"
+    )
+    return results
+def run_transfer(
+    index_data: list[tuple[str, str, str]],
+    query_data: list[tuple[str, str, str]],
+    pair_name: str,
+    scorer: str = "ses",
+    normalization: str = "max",
+    per_query_rows: list[dict] | None = None,
+) -> list[dict]:
+    """Execute five-way cross-system transfer comparison.
+    Adapted from loghub_eval.run_evaluation but WITHOUT the 80/20 split:
+    index_data is the full index set (system A), query_data the full query
+    set (system B).
+    If ``per_query_rows`` is a list, one dict per (query, method) is appended
+    to it -- query_id, true/pred label (the macro-F1 inputs) and per-query
+    hit@{1,5,10} -- so callers such as scripts/confirmatory_battery.py can run
+    paired per-query statistics. Returned summary rows are unchanged.
+    """
+    split_name = f"{pair_name}[{scorer}]"
+    print(
+        f"\n--- Running transfer evaluation {split_name} "
+        f"({len(index_data)} index / {len(query_data)} query cases) ---"
+    )
+    # Parse and encode cases
+    log_encoder = get_encoder("logs")
+    print("Encoding index cases...")
+    index_cases = []
+    index_docs = []  # List of (case_id, text)
+    index_labels = {}
+    for sid, text, label in index_data:
+        case = log_encoder.encode(text, session_id=sid).case
+        index_cases.append(case)
+        index_docs.append((case.case_id, text))
+        index_labels[case.case_id] = label
+    print("Encoding query cases...")
+    query_cases = []
+    query_docs = []
+    query_labels = {}
+    for sid, text, label in query_data:
+        case = log_encoder.encode(text, session_id=sid).case
+        query_cases.append(case)
+        query_docs.append((case.case_id, text))
+        query_labels[case.case_id] = label
+    # Build indexes ONCE before the query loop
+    # 1. Build SMA MAC/FAC index
+    print(f"Building SMA Index (scorer={scorer})...")
+    sma_index = MacFacIndex(config=MatchConfig(scorer=scorer, normalization=normalization))
+    sma_index.build(index_cases)
+    # 2. Build BM25 Index
+    print("Building BM25 Index...")
+    from rank_bm25 import BM25Okapi
+    tokenized_index = [text.lower().split() for _, text in index_docs]
+    bm25_index = BM25Okapi(tokenized_index)
+    # 3. Build Dense RAG Index (SentenceTransformers)
+    print("Building Dense RAG Index (SentenceTransformers)...")
+    from sentence_transformers import SentenceTransformer, util
+    dense_model = SentenceTransformer('all-MiniLM-L6-v2')
+    index_texts = [text for _, text in index_docs]
+    index_embeddings = dense_model.encode(index_texts, convert_to_tensor=True, show_progress_bar=False)
+    # 4. Build KG-PPR Proxy index
+    print("Building KG-PPR Proxy Index...")
+    index_entity_counters = {
+        ic.case_id: Counter(e.name for e in ic.entities())
+        for ic in index_cases
+    }
+    # 5. Build HippoRAG index (B5: phrase graph + Personalized PageRank)
+    print("Building HippoRAG Index...")
+    from sma.eval.baselines.hipporag import HippoRAGRetriever
+    hipporag_index = HippoRAGRetriever()
+    hipporag_index.build(index_docs)
+    # 6. Enterprise hybrid stack (B6): RRF(BM25 + dense) and a cross-encoder
+    # rerank over the fused top-20 pool - the production RAG posture.
+    print("Loading cross-encoder reranker (Hybrid+Rerank)...")
+    from sma.eval.baselines.hybrid_rrf import rrf_fuse
+    from sma.eval.baselines.rerank import CrossEncoderReranker
+    reranker = CrossEncoderReranker()
+    index_text_by_id = dict(index_docs)
+    # Per-query ranked retrieval for each method, as (case_id, score) pairs.
+    def retrieve_sma(q_case, q_text):
+        # shortlist=40, fac_budget=20 keeps CPU latency bounded
+        results = sma_index.retrieve(q_case, k=10, shortlist=40, fac_budget=20)
+        return [(r.case_id, r.ses_n) for r in results]
+    def retrieve_bm25(q_case, q_text):
+        scores = bm25_index.get_scores(q_text.lower().split())
+        ranked = sorted(zip(doc_ids, scores), key=lambda row: (-row[1], row[0]))
+        return ranked[:10]
+    def retrieve_dense(q_case, q_text):
+        query_embedding = dense_model.encode(q_text, convert_to_tensor=True, show_progress_bar=False)
+        scores = util.cos_sim(query_embedding, index_embeddings)[0].cpu().tolist()
+        ranked = sorted(zip(doc_ids, scores), key=lambda row: (-row[1], row[0]))
+        return ranked[:10]
+    def retrieve_kg(q_case, q_text):
+        q_counter = Counter(e.name for e in q_case.entities())
+        ranked = sorted(
+            (
+                (ic_id, float(sum(min(v, counts.get(k, 0)) for k, v in q_counter.items())))
+                for ic_id, counts in index_entity_counters.items()
+            ),
+            key=lambda row: (-row[1], row[0]),
+        )
+        return ranked[:10]
+    def retrieve_hipporag(q_case, q_text):
+        return hipporag_index.retrieve(q_text, k=10)
+    def _bm25_ranked(q_text, k):
+        scores = bm25_index.get_scores(q_text.lower().split())
+        return sorted(zip(doc_ids, scores), key=lambda row: (-row[1], row[0]))[:k]
+    def _dense_ranked(q_text, k):
+        query_embedding = dense_model.encode(q_text, convert_to_tensor=True, show_progress_bar=False)
+        scores = util.cos_sim(query_embedding, index_embeddings)[0].cpu().tolist()
+        return sorted(zip(doc_ids, scores), key=lambda row: (-row[1], row[0]))[:k]
+    def retrieve_hybrid_rrf(q_case, q_text):
+        return rrf_fuse([_bm25_ranked(q_text, 20), _dense_ranked(q_text, 20)], top_k=10)
+    def retrieve_hybrid_rerank(q_case, q_text):
+        pool = rrf_fuse([_bm25_ranked(q_text, 20), _dense_ranked(q_text, 20)], top_k=20)
+        candidates = [(cid, index_text_by_id[cid]) for cid, _ in pool]
+        return reranker.rerank(q_text, candidates, top_k=10)
+    def weighted_vote(ranked, top=5):
+        voting = {"Anomaly": 0.0, "Normal": 0.0}
+        for case_id, score in ranked[:top]:
+            voting[index_labels[case_id]] += score
+        return max(voting, key=voting.get) if sum(voting.values()) > 0 else "Normal"
+    retrievers = {
+        "SMA": retrieve_sma,
+        "BM25": retrieve_bm25,
+        "Dense RAG": retrieve_dense,
+        "KG-PPR Proxy": retrieve_kg,
+        "HippoRAG": retrieve_hipporag,
+        "Hybrid-RRF": retrieve_hybrid_rrf,
+        "Hybrid+Rerank": retrieve_hybrid_rerank,
+    }
+    methods = list(retrievers)
+    metrics_by_method = {m: {"recalls": [], "preds": [], "latencies": []} for m in methods}
+    doc_ids = [doc_id for doc_id, _ in index_docs]
+    print("Starting retrieval runs...")
+    total_queries = len(query_cases)
+    for idx, (q_case, (q_case_id, q_text)) in enumerate(zip(query_cases, query_docs), start=1):
+        for method, retriever in retrievers.items():
+            t0 = time.perf_counter()
+            ranked = retriever(q_case, q_text)
+            elapsed_ms = (time.perf_counter() - t0) * 1000
+            data = metrics_by_method[method]
+            data["recalls"].append([case_id for case_id, _ in ranked])
+            data["latencies"].append(elapsed_ms)
+            data["preds"].append(weighted_vote(ranked))
+        if idx % 20 == 0 or idx == total_queries:
+            print(f"Processed {idx}/{total_queries} retrieval runs...")
+    # Calculate final metrics
+    transfer_rows = []
+    true_labels = [query_labels[c.case_id] for c in query_cases]
+    for m in methods:
+        data = metrics_by_method[m]
+        preds = data["preds"]
+        recalls = data["recalls"]
+        latencies = data["latencies"]
+        # F1 Score
+        f1 = f1_score(true_labels, preds, average="macro")
+        # label_hit_rate @ 1, 5, 10
+        r1_list = []
+        r5_list = []
+        r10_list = []
+        for q_idx, q_case in enumerate(query_cases):
+            q_label = query_labels[q_case.case_id]
+            ret_ids = recalls[q_idx]
+            # Find all relevant index cases for this query
+            relevant_ids = {ic.case_id for ic in index_cases if index_labels[ic.case_id] == q_label}
+            # Hit rate at k = count of retrieved relevant / min(relevant_ids, k)
+            def compute_hit_rate_k(k):
+                hits = len(set(ret_ids[:k]).intersection(relevant_ids))
+                denom = min(len(relevant_ids), k)
+                return hits / denom if denom > 0 else 0.0
+            r1_list.append(compute_hit_rate_k(1))
+            r5_list.append(compute_hit_rate_k(5))
+            r10_list.append(compute_hit_rate_k(10))
+            if per_query_rows is not None:
+                per_query_rows.append({
+                    "split": split_name,
+                    "method": m,
+                    "query_id": q_case.case_id,
+                    "true_label": q_label,
+                    "pred_label": preds[q_idx],
+                    "hit@1": r1_list[-1],
+                    "hit@5": r5_list[-1],
+                    "hit@10": r10_list[-1],
+                })
+        r1 = sum(r1_list) / len(r1_list)
+        r5 = sum(r5_list) / len(r5_list)
+        r10 = sum(r10_list) / len(r10_list)
+        # Latency p50, p95
+        p50 = np.percentile(latencies, 50)
+        p95 = np.percentile(latencies, 95)
+        transfer_rows.append({
+            "dataset": "LogHub",
+            "split": split_name,
+            "method": m,
+            "macro_f1": f"{f1:.4f}",
+            "label_hit_rate@1": f"{r1:.4f}",
+            "label_hit_rate@5": f"{r5:.4f}",
+            "label_hit_rate@10": f"{r10:.4f}",
+            "p50_ms": f"{p50:.3f}",
+            "p95_ms": f"{p95:.3f}"
+        })
+        # Print results
+        print(f"Method: {m}")
+        print(f"  Macro-F1: {f1:.4f}")
+        print(f"  label_hit_rate@1: {r1:.4f}, label_hit_rate@5: {r5:.4f}, label_hit_rate@10: {r10:.4f}")
+        print(f"  p50 Latency: {p50:.3f} ms, p95 Latency: {p95:.3f} ms")
+        # Diagnostic alerts for collapsed or suspiciously perfect runs
+        unique_preds = set(preds)
+        is_suspicious = (f1 == 0.0 or f1 == 1.0 or len(unique_preds) <= 1)
+        if is_suspicious:
+            reason = ""
+            if f1 == 0.0:
+                reason = "F1 is 0.0: Retrieval collapse or dataset imbalance"
+            elif f1 == 1.0:
+                reason = "F1 is 1.0: Suspiciously perfect classification - potential data leakage or indexing overlap"
+            elif len(unique_preds) <= 1:
+                reason = f"Retrieval collapse: predicted only '{list(unique_preds)[0]}' sessions"
+            transfer_rows.append({
+                "dataset": "DIAGNOSTIC",
+                "split": split_name,
+                "method": f"{m}_alert",
+                "macro_f1": reason,
+                "label_hit_rate@1": "ALERT",
+                "label_hit_rate@5": "ALERT",
+                "label_hit_rate@10": "ALERT",
+                "p50_ms": "0.000",
+                "p95_ms": "0.000"
+            })
+            print(f"  [DIAGNOSTIC ALERT] {reason}")
+    return transfer_rows
+def append_transfer_rows(
+    rows: list[dict], out_path: str | pathlib.Path = "reports/transfer_metrics.csv"
+) -> None:
+    """Append metric rows to a transfer metrics CSV (triage schema).
+    Defaults to reports/transfer_metrics.csv (the original behavior)."""
+    if not rows:
+        return
+    out_path = pathlib.Path(out_path)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    fieldnames = [
+        "dataset", "split", "method", "macro_f1",
+        "label_hit_rate@1", "label_hit_rate@5", "label_hit_rate@10",
+        "p50_ms", "p95_ms",
+    ]
+    write_header = not out_path.exists()
+    with out_path.open("a", encoding="utf-8", newline="") as fh:
+        writer = csv.DictWriter(fh, fieldnames=fieldnames)
+        if write_header:
+            writer.writeheader()
+        writer.writerows(rows)
+    print(f"Appended {len(rows)} rows to {out_path}")
+# Registry of samplable systems for --pairs. Each entry maps the system name
+# (as written in a "A->B" pair spec) to (archive filename, sampler, display
+# name used in the split string, optional integrity-check function).
+SYSTEMS = {
+    "HDFS": ("HDFS_v1.zip", sample_hdfs_stratified, "HDFS", None),
+    "BGL": ("BGL.zip", sample_bgl_stratified, "BGL", None),
+    "OpenStack": ("OpenStack.tar.gz", sample_openstack, "OpenStack", None),
+    "Thunderbird": (
+        "Thunderbird.tar.gz", sample_thunderbird, "thunderbird_first20M",
+        check_thunderbird,
+    ),
+    "Spirit": ("spirit2.gz", sample_spirit, "spirit_first20M", check_spirit),
+}
+def run_named_pairs(pairs_spec, scorer, seed, index_size, query_size, out_path,
+                    normalization="max", per_query_rows=None):
+    """Run a comma-separated list of "A->B" transfer pairs (e.g.
+    "BGL->Spirit,HDFS->Spirit") with an explicit seed, appending rows to
+    out_path. Additive entry point used by --pairs; the default (no --pairs)
+    code path in main() is unchanged. ``per_query_rows`` is threaded through
+    to run_transfer (see there); the summary rows are also returned."""
+    raw_dir = pathlib.Path("data/raw/loghub_raw")
+    all_rows = []
+    sample_cache = {}  # (system, size, seed) -> sampled sessions
+    def sample_system(name, size):
+        key = (name, size, seed)
+        if key in sample_cache:
+            return sample_cache[key]
+        filename, sampler, _display, check = SYSTEMS[name]
+        path = raw_dir / filename
+        if not path.exists():
+            print(f"Skipping {name}: {path} is missing. Run fetch_datasets.py first.")
+            data = []
+        else:
+            skip = check(path) if check else None
+            if skip:
+                print(f"Skipping {name}: {skip}")
+                data = []
+            else:
+                print(f"Sampling {name} sessions (size={size}, seed={seed})...")
+                data = sampler(path, sample_size=size, seed=seed)
+                counts = Counter(label for _, _, label in data)
+                print(
+                    f"{name} class counts: {counts.get('Anomaly', 0)} Anomaly / "
+                    f"{counts.get('Normal', 0)} Normal"
+                )
+        sample_cache[key] = data
+        return data
+    for pair in [p.strip() for p in pairs_spec.split(",") if p.strip()]:
+        if "->" not in pair:
+            print(f"Skipping malformed pair spec '{pair}' (expected 'A->B').")
+            continue
+        src, dst = (s.strip() for s in pair.split("->", 1))
+        if src not in SYSTEMS or dst not in SYSTEMS:
+            known = ", ".join(SYSTEMS)
+            print(f"Skipping pair '{pair}': unknown system (known: {known}).")
+            continue
+        index_data = sample_system(src, index_size)
+        query_data = sample_system(dst, query_size)
+        if not index_data or not query_data:
+            print(f"Skipping pair '{pair}': empty index or query sample.")
+            continue
+        pair_name = f"{SYSTEMS[src][2]}->{SYSTEMS[dst][2]}[seed{seed}]"
+        all_rows.extend(run_transfer(index_data, query_data, pair_name, scorer=scorer,
+                                     normalization=normalization, per_query_rows=per_query_rows))
+    append_transfer_rows(all_rows, out_path)
+    return all_rows
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Cross-system transfer evaluation (T2-b)")
+    parser.add_argument("--scorer", choices=["ses", "mdl", "surprisal"], default="ses")
+    parser.add_argument("--normalization", choices=["max", "min", "sqrt", "target"], default="max")
+    parser.add_argument("--index-size", type=int, default=800,
+                        help="stratified sessions to index from system A")
+    parser.add_argument("--query-size", type=int, default=200,
+                        help="stratified sessions to query from system B")
+    parser.add_argument("--pairs", default=None,
+                        help="comma-separated 'A->B' pairs to run instead of the "
+                             "default HDFS->OpenStack and BGL->Thunderbird pairs, "
+                             "e.g. 'BGL->Spirit,HDFS->Spirit'")
+    parser.add_argument("--seed", type=int, default=42,
+                        help="sampling seed threaded into both samplers")
+    parser.add_argument("--out", default="reports/transfer_metrics.csv",
+                        help="CSV path to append metric rows to")
+    args = parser.parse_args()
+    random.seed(args.seed)
+    if args.pairs:
+        run_named_pairs(
+            args.pairs, args.scorer, args.seed,
+            normalization=args.normalization,
+            index_size=args.index_size, query_size=args.query_size, out_path=args.out,
+        )
+        return
+    raw_dir = pathlib.Path("data/raw/loghub_raw")
+    hdfs_zip = raw_dir / "HDFS_v1.zip"
+    bgl_zip = raw_dir / "BGL.zip"
+    openstack_tar = raw_dir / "OpenStack.tar.gz"
+    thunderbird_tar = raw_dir / "Thunderbird.tar.gz"
+    all_rows = []
+    # Pair 1: HDFS -> OpenStack
+    if not hdfs_zip.exists():
+        print(f"Skipping HDFS->OpenStack: {hdfs_zip} is missing. Run fetch_datasets.py first.")
+    elif not openstack_tar.exists():
+        print(f"Skipping HDFS->OpenStack: {openstack_tar} is missing. Run fetch_datasets.py first.")
+    else:
+        print("Sampling HDFS sessions (index set)...")
+        hdfs_index = sample_hdfs_stratified(hdfs_zip, sample_size=args.index_size, seed=args.seed)
+        print("Sampling OpenStack sessions (query set)...")
+        openstack_query = sample_openstack(openstack_tar, sample_size=args.query_size, seed=args.seed)
+        all_rows.extend(
+            run_transfer(hdfs_index, openstack_query, "HDFS->OpenStack", scorer=args.scorer)
+        )
+    # Pair 2: BGL -> Thunderbird (first 20M lines, see THUNDERBIRD_LINE_CAP)
+    tbird_skip = check_thunderbird(thunderbird_tar)
+    if not bgl_zip.exists():
+        print(f"Skipping BGL->Thunderbird: {bgl_zip} is missing. Run fetch_datasets.py first.")
+    elif tbird_skip:
+        print(f"Skipping BGL->Thunderbird: {tbird_skip}")
+    else:
+        print("Sampling BGL sessions (index set)...")
+        bgl_index = sample_bgl_stratified(bgl_zip, sample_size=args.index_size, seed=args.seed)
+        print("Sampling Thunderbird sessions (query set, first 20M lines)...")
+        tbird_query = sample_thunderbird(thunderbird_tar, sample_size=args.query_size, seed=args.seed)
+        if tbird_query:
+            all_rows.extend(
+                run_transfer(bgl_index, tbird_query, "BGL->thunderbird_first20M", scorer=args.scorer)
+            )
+        else:
+            print("Skipping BGL->Thunderbird: no Thunderbird sessions sampled.")
+    append_transfer_rows(all_rows, args.out)
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()