mneme-core 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. mneme_core/__init__.py +4 -0
  2. mneme_core/__main__.py +8 -0
  3. mneme_core/approval.py +183 -0
  4. mneme_core/audit.py +82 -0
  5. mneme_core/bench/__init__.py +40 -0
  6. mneme_core/bench/hardware.py +110 -0
  7. mneme_core/bench/harness.py +258 -0
  8. mneme_core/bench/metrics.py +142 -0
  9. mneme_core/bench/synth.py +207 -0
  10. mneme_core/capability.py +131 -0
  11. mneme_core/cli.py +1437 -0
  12. mneme_core/compression/__init__.py +106 -0
  13. mneme_core/compression/config.py +100 -0
  14. mneme_core/compression/ledger.py +447 -0
  15. mneme_core/compression/llm.py +155 -0
  16. mneme_core/compression/pipeline.py +543 -0
  17. mneme_core/compression/prompts/compress-en.md +108 -0
  18. mneme_core/compression/staging.py +363 -0
  19. mneme_core/connectors.py +158 -0
  20. mneme_core/connectors_net.py +231 -0
  21. mneme_core/console.py +278 -0
  22. mneme_core/distill/__init__.py +74 -0
  23. mneme_core/distill/adaptive_topk.py +65 -0
  24. mneme_core/distill/audit.py +248 -0
  25. mneme_core/distill/compressed_format.py +90 -0
  26. mneme_core/distill/injection_dedup.py +101 -0
  27. mneme_core/distill/shell_compress.py +207 -0
  28. mneme_core/fts5/__init__.py +5 -0
  29. mneme_core/fts5/indexer.py +675 -0
  30. mneme_core/fts5/locale/__init__.py +6 -0
  31. mneme_core/fts5/locale/tr.py +120 -0
  32. mneme_core/injection.py +76 -0
  33. mneme_core/kg/__init__.py +42 -0
  34. mneme_core/kg/client.py +97 -0
  35. mneme_core/kg/episode_stage.py +190 -0
  36. mneme_core/kg/flush.py +40 -0
  37. mneme_core/kg/worker.py +326 -0
  38. mneme_core/modes.py +375 -0
  39. mneme_core/modes_cli.py +151 -0
  40. mneme_core/patterns.py +265 -0
  41. mneme_core/privacy.py +90 -0
  42. mneme_core/py.typed +1 -0
  43. mneme_core/retrieval/__init__.py +54 -0
  44. mneme_core/retrieval/dense.py +425 -0
  45. mneme_core/retrieval/planner.py +73 -0
  46. mneme_core/retrieval/rrf.py +412 -0
  47. mneme_core/retrieval/telemetry.py +83 -0
  48. mneme_core/security.py +181 -0
  49. mneme_core/security_bench.py +241 -0
  50. mneme_core/taint.py +115 -0
  51. mneme_core/telemetry/__init__.py +27 -0
  52. mneme_core/telemetry/writer.py +237 -0
  53. mneme_core/temporal/__init__.py +58 -0
  54. mneme_core/temporal/backend.py +173 -0
  55. mneme_core/temporal/claim.py +237 -0
  56. mneme_core/temporal/extract.py +279 -0
  57. mneme_core/temporal/graphiti_export.py +160 -0
  58. mneme_core/temporal/index.py +249 -0
  59. mneme_core/temporal/query.py +217 -0
  60. mneme_core/trajectory.py +287 -0
  61. mneme_core/vault/__init__.py +19 -0
  62. mneme_core/vault/atomic_write.py +73 -0
  63. mneme_core/vault/config.py +177 -0
  64. mneme_core/vault/file_lock.py +110 -0
  65. mneme_core/vault/frontmatter.py +344 -0
  66. mneme_core-2.0.0.dist-info/METADATA +75 -0
  67. mneme_core-2.0.0.dist-info/RECORD +69 -0
  68. mneme_core-2.0.0.dist-info/WHEEL +4 -0
  69. mneme_core-2.0.0.dist-info/entry_points.txt +5 -0
mneme_core/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ """mneme-core: vault-native memory engine for Claude Code."""
2
+
3
+ __version__ = "2.0.0"
4
+ __all__ = ["__version__"]
mneme_core/__main__.py ADDED
@@ -0,0 +1,8 @@
1
+ """Module execution entry point for ``python -m mneme_core``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .cli import main
6
+
7
+ if __name__ == "__main__":
8
+ main()
mneme_core/approval.py ADDED
@@ -0,0 +1,183 @@
1
+ """Human-approval gate for memory edits (conflict-resolution #4).
2
+
3
+ An agent *proposes* a memory edit; the proposal starts in ``PENDING`` status.
4
+ Edits in *durable* categories (IDENTITY, PREFERENCE, CLINICAL, LEGAL,
5
+ FINANCIAL) must receive explicit human approval before they may be applied.
6
+ Edits in the EPHEMERAL category (session notes, observations) may be applied
7
+ while still PENDING. A REJECTED proposal can never be applied.
8
+
9
+ All user-supplied content is passed through :func:`mneme_core.privacy.redact`
10
+ before being stored in the proposal. Proposal IDs are deterministic
11
+ (``uuid.uuid5``) so re-proposing identical inputs yields the same ID.
12
+
13
+ Pure, deterministic, no IO, no network, no clock.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import dataclasses
19
+ import uuid
20
+ from dataclasses import dataclass
21
+ from enum import Enum
22
+
23
+ from .privacy import redact
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Enumerations
27
+ # ---------------------------------------------------------------------------
28
+
29
# Fixed uuid5 namespace for proposal ids. This literal is the RFC 4122 *DNS*
# namespace (equal to ``uuid.NAMESPACE_DNS``), not the URL namespace
# (``6ba7b811-...``). The value must never change: every existing proposal_id
# is derived from it.
_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")  # == uuid.NAMESPACE_DNS
30
+
31
+
32
class ProposalStatus(str, Enum):  # noqa: UP042
    """Review state of a :class:`MemoryProposal`.

    Because of the ``str`` mixin every member *is* a string: it serialises to
    JSON as its value and compares equal to the plain literal, so no
    ``.value`` access is needed. The ``noqa: UP042`` silences Ruff's
    ``StrEnum`` (Python 3.11+) suggestion; this file spells its enums as
    ``str`` + ``Enum``.
    """

    PENDING = "PENDING"
    APPROVED = "APPROVED"
    REJECTED = "REJECTED"
44
+
45
+
46
class EditCategory(str, Enum):  # noqa: UP042
    """What kind of memory a proposed edit touches.

    * **EPHEMERAL** covers low-stakes session, topic and observation notes;
      such edits may be applied without explicit approval.
    * Every other member is a *durable* category and needs human approval.
    """

    EPHEMERAL = "EPHEMERAL"
    IDENTITY = "IDENTITY"
    PREFERENCE = "PREFERENCE"
    CLINICAL = "CLINICAL"
    LEGAL = "LEGAL"
    FINANCIAL = "FINANCIAL"
60
+
61
+
62
#: Categories that require explicit human approval before an edit may be applied.
#: Derived from the enum as "every member except EPHEMERAL" instead of being
#: listed by hand, so a category added to ``EditCategory`` later is gated by
#: default rather than silently applying without review. With the current
#: members this is IDENTITY, PREFERENCE, CLINICAL, LEGAL and FINANCIAL.
DURABLE_CATEGORIES: frozenset[EditCategory] = frozenset(
    category for category in EditCategory if category is not EditCategory.EPHEMERAL
)
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Proposal dataclass
75
+ # ---------------------------------------------------------------------------
76
+
77
+
78
@dataclass(frozen=True)
class MemoryProposal:
    """An immutable record of a proposed memory edit.

    Attributes
    ----------
    proposal_id:
        Deterministic ``uuid5`` string. As built by :func:`propose`, it is
        derived from ``action``, ``target_path``, ``category``, ``trust`` and
        the *redacted* content, so identical inputs always produce the same ID
        while the same path+content under a different category or proposer
        gets a different one.
    action:
        ``"create"`` | ``"update"`` | ``"delete"``.
    target_path:
        Vault-relative path of the note being created/modified/deleted.
    content:
        Proposed note content **after** redaction via
        :func:`mneme_core.privacy.redact`.
    category:
        Semantic category governing approval requirements.
    status:
        Current lifecycle status (PENDING / APPROVED / REJECTED).
    trust:
        Proposer identity string. The dataclass itself has no default;
        :func:`propose` supplies ``"agent"`` when the caller omits it.
    """

    proposal_id: str
    action: str
    target_path: str
    content: str
    category: EditCategory
    status: ProposalStatus
    trust: str
109
+
110
+
111
+ # ---------------------------------------------------------------------------
112
+ # Pure functions
113
+ # ---------------------------------------------------------------------------
114
+
115
+
116
def requires_human_approval(category: EditCategory) -> bool:
    """Tell whether edits in *category* must be approved by a human.

    The answer is simply membership in :data:`DURABLE_CATEGORIES`.
    """
    is_durable = category in DURABLE_CATEGORIES
    return is_durable
119
+
120
+
121
def propose(
    *,
    action: str,
    target_path: str,
    content: str,
    category: EditCategory,
    trust: str = "agent",
) -> MemoryProposal:
    """Create a new :class:`MemoryProposal` in PENDING status.

    Content is redacted via :func:`~mneme_core.privacy.redact` before being
    stored, and only the redacted text is kept on the proposal.

    The ``proposal_id`` is a deterministic ``uuid5`` over the NUL-joined seed
    ``action, target_path, category.value, trust, redacted_content``, so
    identical inputs always yield the same proposal ID.

    Parameters
    ----------
    action:
        Edit kind; stored as given (no validation is performed here).
    target_path:
        Path of the note the edit targets; stored as given.
    content:
        Raw proposed content; redacted before storage and before hashing.
    category:
        Category of the edit; part of the proposal identity.
    trust:
        Proposer identity string; part of the proposal identity.

    Returns
    -------
    MemoryProposal
        A frozen proposal whose status is ``ProposalStatus.PENDING``.
    """
    redacted = redact(content)
    # category and trust are part of proposal identity: an EPHEMERAL and a
    # CLINICAL proposal for the same path+content must NOT share a proposal_id,
    # else a downstream store keyed on it could alias the durable edit to an
    # already-applied ephemeral one and bypass the human-approval gate.
    # NOTE(review): fields are joined with NUL but not escaped, so inputs that
    # themselves contain NUL could in principle collide - confirm callers
    # never pass NUL in action/target_path/trust.
    seed = f"{action}\x00{target_path}\x00{category.value}\x00{trust}\x00{redacted}"
    proposal_id = str(uuid.uuid5(_NAMESPACE, seed))
    return MemoryProposal(
        proposal_id=proposal_id,
        action=action,
        target_path=target_path,
        content=redacted,
        category=category,
        status=ProposalStatus.PENDING,
        trust=trust,
    )
152
+
153
+
154
def approve(proposal: MemoryProposal) -> MemoryProposal:
    """Produce an APPROVED copy of *proposal*; the input is left untouched.

    Approving something that is already approved yields an equal object, so
    the call is idempotent. The status is overwritten unconditionally: no
    check of the current status is made here, so a REJECTED proposal passed
    in comes back APPROVED. Callers that want rejection to be final must
    guard before calling.
    """
    approved_copy = dataclasses.replace(
        proposal,
        status=ProposalStatus.APPROVED,
    )
    return approved_copy
161
+
162
+
163
def reject(proposal: MemoryProposal) -> MemoryProposal:
    """Produce a REJECTED copy of *proposal*; the input is left untouched.

    The status is overwritten regardless of what it was before.
    """
    rejected_copy = dataclasses.replace(
        proposal,
        status=ProposalStatus.REJECTED,
    )
    return rejected_copy
166
+
167
+
168
def can_apply(proposal: MemoryProposal) -> bool:
    """Return ``True`` iff the proposal may be applied to the vault.

    Rules (deterministic, pure):

    * An APPROVED proposal can always be applied.
    * A PENDING proposal may be applied only when its category does not
      require human approval (``requires_human_approval`` is ``False``).
    * A REJECTED proposal can **never** be applied.
    * Any other status value is refused. The gate fails closed, so a
      corrupted or unknown status is never treated as if it were PENDING.
    """
    status = proposal.status
    if status == ProposalStatus.APPROVED:
        return True
    if status == ProposalStatus.PENDING:
        # PENDING: allowed only for non-durable (ephemeral) categories.
        return not requires_human_approval(proposal.category)
    # REJECTED, or a status this gate does not recognise: never apply.
    return False
mneme_core/audit.py ADDED
@@ -0,0 +1,82 @@
1
+ """Read-only vault audit aggregator (the console surface, v1).
2
+
3
+ Produces a deterministic, read-only snapshot of the vault: note counts by
4
+ memory type plus a security-scan summary. This is the programmatic/text console
5
+ surface; a browser audit UI (timeline, graph explorer, token-ROI dashboard) is
6
+ deferred. The aggregator never writes and never raises on a single bad file.
7
+
8
+ No LLM, no network.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from dataclasses import dataclass
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ from .security import scan_vault, summarize
18
+ from .vault.frontmatter import parse
19
+
20
+
21
@dataclass(frozen=True)
class AuditReport:
    """Frozen snapshot of vault health and safety figures."""

    # Number of markdown notes that were counted.
    note_count: int
    # Note tally keyed by type label.
    type_counts: dict[str, int]
    # Security-scan summary mapping.
    security: dict[str, Any]
28
+
29
+
30
def _classify_note(text: str) -> str:
    """Return the type label for one note's text; never raises."""
    try:
        front, _body = parse(text)
        declared = front.type if front is not None else None
        return declared or "(none)"
    except Exception:  # noqa: BLE001 - audit must never crash on a bad note
        return "(unparseable)"


def build_audit(vault_root: Path) -> AuditReport:
    """Scan every ``*.md`` under *vault_root* and summarise it, read-only.

    Each readable note is tallied under its frontmatter ``type``. A note with
    no frontmatter or an empty type lands in ``"(none)"``; a note whose
    parsing raises lands in ``"(unparseable)"``. Paths that fail to resolve,
    resolve outside the vault root, or cannot be read are skipped and not
    counted. A security-scan summary of the same root is attached.
    """
    vault_real = vault_root.resolve()
    tally: dict[str, int] = {}
    seen = 0
    for candidate in sorted(vault_root.rglob("*.md")):
        try:
            target = candidate.resolve()
        except OSError:
            continue
        # Drop anything (e.g. a symlink) whose real location is outside the vault.
        if not target.is_relative_to(vault_real):
            continue
        try:
            body = candidate.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        seen += 1
        label = _classify_note(body)
        tally[label] = tally.get(label, 0) + 1

    return AuditReport(
        note_count=seen,
        type_counts=tally,
        security=summarize(scan_vault(vault_root)),
    )
64
+
65
+
66
def render_audit(report: AuditReport) -> str:
    """Format an :class:`AuditReport` as a newline-terminated text report.

    The output has a note total, a "By type" list sorted by type name (or a
    ``(no notes)`` placeholder), and a "Security" section with the flagged
    file count, the finding count and, when ``by_kind`` is a dict, one
    sorted line per finding kind.
    """
    counts = report.type_counts
    if counts:
        type_rows = [f"- {name}: {total}" for name, total in sorted(counts.items())]
    else:
        type_rows = ["- (no notes)"]

    security = report.security
    flagged_row = f"- files flagged: {security.get('files_flagged', 0)}"
    findings_row = f"- total findings: {security.get('total_findings', 0)}"
    kinds = security.get("by_kind", {})
    kind_rows = (
        [f" - {kind}: {total}" for kind, total in sorted(kinds.items())]
        if isinstance(kinds, dict)
        else []
    )

    rows = [
        "# Vault audit",
        "",
        f"Notes: {report.note_count}",
        "",
        "## By type",
        *type_rows,
        "",
        "## Security",
        flagged_row,
        findings_row,
        *kind_rows,
    ]
    return "\n".join(rows) + "\n"
@@ -0,0 +1,40 @@
1
+ """Benchmark helpers shared by the ``benchmarks/`` runner scripts.
2
+
3
+ Three focused submodules:
4
+
5
+ * :mod:`mneme_core.bench.metrics` - information-retrieval metric primitives
6
+ (nDCG@k, Recall@k, MRR) plus quantile helpers for latency distributions.
7
+ * :mod:`mneme_core.bench.synth` - deterministic synthetic corpus and query
8
+ generator. Tests and benchmarks both lean on this so they exercise the
9
+ same shape of data.
10
+ * :mod:`mneme_core.bench.hardware` - hardware/runtime probe that emits
11
+ ``hardware.json`` next to every benchmark result for reproducibility.
12
+
13
+ The submodules are intentionally small and side-effect-free. Bench scripts
14
+ own their own argparse, output, and CI guards.
15
+ """
16
+
17
+ from .hardware import HardwareSnapshot, capture_hardware
18
+ from .metrics import (
19
+ mean_reciprocal_rank,
20
+ ndcg_at_k,
21
+ percentiles,
22
+ recall_at_k,
23
+ )
24
+ from .synth import (
25
+ SyntheticCorpus,
26
+ SyntheticQuery,
27
+ build_synthetic_corpus,
28
+ )
29
+
30
+ __all__ = [
31
+ "HardwareSnapshot",
32
+ "SyntheticCorpus",
33
+ "SyntheticQuery",
34
+ "build_synthetic_corpus",
35
+ "capture_hardware",
36
+ "mean_reciprocal_rank",
37
+ "ndcg_at_k",
38
+ "percentiles",
39
+ "recall_at_k",
40
+ ]
@@ -0,0 +1,110 @@
1
+ """Hardware and runtime snapshot for reproducible benchmark output.
2
+
3
+ Every benchmark run drops a ``hardware.json`` next to its result so
4
+ readers can interpret the numbers in context. The probe is deliberately
5
+ shallow - just the fields a reader needs to decide whether two runs are
6
+ comparable.
7
+
8
+ Why not ``platform.uname()`` for everything: the fields below are the
9
+ ones that materially affect numbers in the v1.0 benchmark set. CPU
10
+ model and core count drive latency; OS family changes file-IO
11
+ behavior; Python and (when available) Node versions pin the runtime.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import os
18
+ import platform
19
+ import shutil
20
+ import subprocess
21
+ import sys
22
+ from dataclasses import asdict, dataclass
23
+ from pathlib import Path
24
+
25
+
26
@dataclass
class HardwareSnapshot:
    """Machine and runtime facts recorded alongside a benchmark run."""

    # Operating-system family name.
    os: str
    # Operating-system release string.
    os_release: str
    # CPU model description.
    cpu_model: str
    # Logical CPU count.
    cpu_count_logical: int
    # Python interpreter version.
    python_version: str
    # Node.js version string, or None when it could not be determined.
    node_version: str | None
    # Seed recorded for the benchmark run.
    mneme_bench_seed: int
37
+
38
+
39
def _detect_cpu_model() -> str:
    """Best-effort CPU model string across OSes; never raises, never empty.

    Linux: the first ``model name`` entry in ``/proc/cpuinfo`` that has a
    non-empty value. macOS: ``sysctl -n machdep.cpu.brand_string`` when
    ``sysctl`` is on PATH and prints something. Windows and every fallback:
    ``platform.processor()``, or ``"unknown"`` when that is empty too.
    """
    system = platform.system().lower()
    if system == "linux":
        try:
            # errors="replace": a stray non-UTF-8 byte must not abort the probe.
            cpuinfo = Path("/proc/cpuinfo").read_text(encoding="utf-8", errors="replace")
        except OSError:
            cpuinfo = ""
        for line in cpuinfo.splitlines():
            if not line.startswith("model name"):
                continue
            # partition() tolerates a line with no ":" (value becomes "").
            _key, _sep, value = line.partition(":")
            model = value.strip()
            if model:
                return model
    if system == "darwin" and shutil.which("sysctl") is not None:
        try:
            out = subprocess.run(  # noqa: S603,S607
                ["sysctl", "-n", "machdep.cpu.brand_string"],
                capture_output=True,
                text=True,
                check=False,
                timeout=2,
            )
        except (OSError, ValueError, subprocess.SubprocessError):
            # ValueError covers a UnicodeDecodeError from text-mode decoding.
            pass
        else:
            brand = out.stdout.strip()
            if out.returncode == 0 and brand:
                return brand
    return platform.processor() or "unknown"
69
+
70
+
71
def _detect_node_version() -> str | None:
    """Report what ``node --version`` prints, or ``None`` if unavailable.

    ``None`` is returned when ``node`` is not on PATH, when launching it
    fails or times out, or when it exits with a non-zero status.
    """
    if shutil.which("node") is None:
        return None
    try:
        proc = subprocess.run(  # noqa: S603,S607
            ["node", "--version"],
            capture_output=True,
            text=True,
            check=False,
            timeout=2,
        )
    except (OSError, subprocess.SubprocessError):
        return None
    return proc.stdout.strip() if proc.returncode == 0 else None
88
+
89
+
90
def capture_hardware(seed: int = 42) -> HardwareSnapshot:
    """Probe the current machine and runtime and return a snapshot.

    Missing values degrade to placeholders: ``0`` for an unknown CPU count
    and ``"unknown"`` for an empty OS name or release. *seed* is recorded
    verbatim as ``mneme_bench_seed``.
    """
    logical_cpus = os.cpu_count() or 0
    os_name = platform.system() or "unknown"
    os_release = platform.release() or "unknown"
    cpu_model = _detect_cpu_model()
    # sys.version starts with the bare version number; keep just that token.
    interpreter = sys.version.split()[0]
    node = _detect_node_version()
    return HardwareSnapshot(
        os=os_name,
        os_release=os_release,
        cpu_model=cpu_model,
        cpu_count_logical=logical_cpus,
        python_version=interpreter,
        node_version=node,
        mneme_bench_seed=seed,
    )
102
+
103
+
104
def write_hardware_json(snapshot: HardwareSnapshot, out_path: Path) -> None:
    """Write *snapshot* to *out_path* as 2-space-indented JSON plus a newline.

    Missing parent directories of *out_path* are created first.
    """
    parent_dir = out_path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(asdict(snapshot), indent=2)
    out_path.write_text(f"{payload}\n", encoding="utf-8")
@@ -0,0 +1,258 @@
1
+ """Evaluation runner, dataset adapters, and head-to-head comparator.
2
+
3
+ This module ships the runner infrastructure and format adapters only.
4
+ The datasets (LongMemEval, LoCoMo) and any competitor retrieve functions
5
+ (e.g. a claude-mem retrieve fn) are SUPPLIED BY THE OPERATOR at run time.
6
+ This module makes no superiority claim about any retrieval system.
7
+
8
+ Any public statement that one system "beats" or is "best" relative to
9
+ another requires an operator-run, published benchmark with full
10
+ experimental controls. The :func:`compare` function provides a purely
11
+ descriptive readout of the numbers produced in a single run.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from collections.abc import Callable, Iterable
17
+ from dataclasses import dataclass
18
+
19
+ from mneme_core.bench.metrics import (
20
+ mean_reciprocal_rank,
21
+ ndcg_at_k,
22
+ recall_at_k,
23
+ )
24
+
25
+ RetrieveIds = Callable[[str], list[str | int]]
26
+ """Query string -> ranked list of doc ids (best first)."""
27
+
28
+
29
@dataclass(frozen=True)
class EvalCase:
    """A single benchmark question paired with its gold document ids."""

    # Identifier used to label this case in reports.
    case_id: str
    # Query text handed to the retrieve function.
    query: str
    # Doc ids that a correct retrieval should return.
    relevant_ids: tuple[str | int, ...]
36
+
37
+
38
@dataclass(frozen=True)
class CaseResult:
    """Metric scores obtained for one case by one retrieve function."""

    # Identifier of the case these scores belong to.
    case_id: str
    # Recall@k score for the case.
    recall_at_k: float
    # Reciprocal-rank score for the case.
    reciprocal_rank: float
    # nDCG@k score for the case.
    ndcg_at_k: float
46
+
47
+
48
@dataclass(frozen=True)
class EvalReport:
    """Summary of one system's scores across a whole case set."""

    # Label of the evaluated system.
    system_name: str
    # Cut-off used for the @k metrics.
    k: int
    # Number of cases that were scored.
    n_cases: int
    # Mean of the per-case recall@k scores.
    mean_recall_at_k: float
    # Mean of the per-case reciprocal ranks.
    mean_mrr: float
    # Mean of the per-case nDCG@k scores.
    mean_ndcg_at_k: float
    # Individual case results.
    per_case: tuple[CaseResult, ...]
59
+
60
+
61
def run_eval(
    cases: Iterable[EvalCase],
    retrieve: RetrieveIds,
    *,
    system_name: str,
    k: int = 10,
) -> EvalReport:
    """Score *retrieve* on every case and aggregate the results.

    The retrieve function is invoked exactly once per case with the case's
    query. Per-case recall@k, reciprocal rank and nDCG@k come from
    :func:`~mneme_core.bench.metrics.recall_at_k`,
    :func:`~mneme_core.bench.metrics.mean_reciprocal_rank` (fed a single
    pair) and :func:`~mneme_core.bench.metrics.ndcg_at_k`. The report holds
    the arithmetic means of those scores; with no cases every mean is ``0.0``
    and ``per_case`` is empty. ``per_case`` keeps the input iteration order.
    """
    scored: list[CaseResult] = []
    for case in cases:
        ranking: list[str | int] = retrieve(case.query)
        gold: tuple[str | int, ...] = case.relevant_ids
        recall = recall_at_k(ranking, gold, k)
        reciprocal = mean_reciprocal_rank([(ranking, gold)])
        ndcg = ndcg_at_k(ranking, gold, k)
        scored.append(
            CaseResult(
                case_id=case.case_id,
                recall_at_k=recall,
                reciprocal_rank=reciprocal,
                ndcg_at_k=ndcg,
            )
        )

    total = len(scored)
    if total:
        recall_mean = sum(item.recall_at_k for item in scored) / total
        mrr_mean = sum(item.reciprocal_rank for item in scored) / total
        ndcg_mean = sum(item.ndcg_at_k for item in scored) / total
    else:
        # No cases: report zeros instead of dividing by zero.
        recall_mean = mrr_mean = ndcg_mean = 0.0

    return EvalReport(
        system_name=system_name,
        k=k,
        n_cases=total,
        mean_recall_at_k=recall_mean,
        mean_mrr=mrr_mean,
        mean_ndcg_at_k=ndcg_mean,
        per_case=tuple(scored),
    )
122
+
123
+
124
def load_longmemeval(records: Iterable[dict[str, object]]) -> list[EvalCase]:
    """Convert LongMemEval-style records into :class:`EvalCase` objects.

    Keys are tried in this order, first usable value wins:

    * query: ``"question"``, ``"query"``, ``"input"``. A record without a
      non-empty string under any of them is skipped.
    * case id: ``"question_id"``, ``"id"``, ``"case_id"``; falls back to
      ``"case-<i>"`` where ``<i>`` is the record's position in *records*.
    * relevant ids: ``"answer_session_ids"``, ``"relevant_ids"``,
      ``"evidence_ids"``, ``"gold_ids"``; empty when none holds a list.
      Entries that are neither ``str`` nor ``int`` are dropped.
    """
    query_keys = ("question", "query", "input")
    id_keys = ("question_id", "id", "case_id")
    gold_keys = ("answer_session_ids", "relevant_ids", "evidence_ids", "gold_ids")

    cases: list[EvalCase] = []
    for position, record in enumerate(records):
        question = _first_str(record, query_keys)
        if question is None:
            continue
        identifier = _first_str(record, id_keys)
        if not identifier:
            identifier = f"case-{position}"
        gold_raw = _first_list(record, gold_keys)
        gold: tuple[str | int, ...] = tuple(
            item for item in gold_raw if isinstance(item, (str, int))
        )
        cases.append(EvalCase(case_id=identifier, query=question, relevant_ids=gold))
    return cases
149
+
150
+
151
def load_locomo(records: Iterable[dict[str, object]]) -> list[EvalCase]:
    """Convert LoCoMo-style records into :class:`EvalCase` objects.

    Keys are tried in this order, first usable value wins:

    * query: ``"question"``, ``"query"``. A record without a non-empty
      string under either is skipped.
    * case id: ``"sample_id"``, ``"id"``; falls back to ``"case-<i>"`` where
      ``<i>`` is the record's position in *records*.
    * relevant ids: ``"evidence"``, ``"relevant_ids"``, ``"gold_ids"``;
      empty when none holds a list. Entries that are neither ``str`` nor
      ``int`` are dropped.
    """
    query_keys = ("question", "query")
    id_keys = ("sample_id", "id")
    gold_keys = ("evidence", "relevant_ids", "gold_ids")

    cases: list[EvalCase] = []
    for position, record in enumerate(records):
        question = _first_str(record, query_keys)
        if question is None:
            continue
        identifier = _first_str(record, id_keys)
        if not identifier:
            identifier = f"case-{position}"
        gold_raw = _first_list(record, gold_keys)
        gold: tuple[str | int, ...] = tuple(
            item for item in gold_raw if isinstance(item, (str, int))
        )
        cases.append(EvalCase(case_id=identifier, query=question, relevant_ids=gold))
    return cases
173
+
174
+
175
def head_to_head(
    cases: Iterable[EvalCase],
    systems: dict[str, RetrieveIds],
    *,
    k: int = 10,
) -> dict[str, EvalReport]:
    """Evaluate several systems against one shared list of cases.

    *cases* is consumed into a list once, so a one-shot iterable still gives
    every system the same sequence. Each entry of *systems* is passed to
    :func:`run_eval` under its own name; the result maps that name to its
    :class:`EvalReport`, in the iteration order of *systems*.
    """
    shared_cases: list[EvalCase] = list(cases)
    reports: dict[str, EvalReport] = {}
    for label, retrieve_fn in systems.items():
        reports[label] = run_eval(shared_cases, retrieve_fn, system_name=label, k=k)
    return reports
191
+
192
+
193
def compare(reports: dict[str, EvalReport]) -> dict[str, object]:
    """Lay out each system's mean metrics side by side and name the top scorer.

    The returned dict has three keys:

    * ``"systems"``: the system names, sorted.
    * ``"metrics"``: ``{"recall_at_k": {name: val}, "mrr": {...},
      "ndcg_at_k": {...}}``.
    * ``"leader_by_metric"``: for each metric, the name with the highest
      value (``""`` when there are no systems).

    IMPORTANT: ``leader_by_metric`` only DESCRIBES the numbers measured in
    this run. It is NOT a published superiority claim; any public
    "best / beats X" statement needs an operator-run, published benchmark
    with full experimental controls. When several systems share the maximum,
    the lexicographically smallest name is reported so the result is
    deterministic.
    """
    ordered = sorted(reports)
    table: dict[str, dict[str, float]] = {
        "recall_at_k": {name: reports[name].mean_recall_at_k for name in ordered},
        "mrr": {name: reports[name].mean_mrr for name in ordered},
        "ndcg_at_k": {name: reports[name].mean_ndcg_at_k for name in ordered},
    }

    def _top(scores: dict[str, float]) -> str:
        if not scores:
            return ""
        best = max(scores.values())
        tied = [name for name, score in scores.items() if score == best]
        tied.sort()
        return tied[0]

    leaders = {metric: _top(scores) for metric, scores in table.items()}
    return {
        "systems": ordered,
        "metrics": table,
        "leader_by_metric": leaders,
    }
236
+
237
+
238
+ # ---------------------------------------------------------------------------
239
+ # Internal helpers
240
+ # ---------------------------------------------------------------------------
241
+
242
+
243
def _first_str(rec: dict[str, object], keys: tuple[str, ...]) -> str | None:
    """Return the first non-empty string value found under any of *keys*.

    Keys are tried in the given order; values that are missing, empty, or
    not ``str`` are passed over. Returns ``None`` when nothing qualifies.
    A *rec* without a callable ``get`` (e.g. ``None`` or a list that slipped
    into a dataset) also yields ``None`` instead of raising, so dataset
    loaders skip the malformed record.
    """
    getter = getattr(rec, "get", None)
    if not callable(getter):
        return None
    for key in keys:
        val = getter(key)
        if isinstance(val, str) and val:
            return val
    return None
250
+
251
+
252
def _first_list(rec: dict[str, object], keys: tuple[str, ...]) -> list[object]:
    """Return the first list value found under any of *keys*, else ``[]``.

    Keys are tried in the given order and the stored list object itself is
    returned (not a copy), even when it is empty. A *rec* without a callable
    ``get`` (e.g. ``None``) yields ``[]`` instead of raising.
    """
    getter = getattr(rec, "get", None)
    if not callable(getter):
        return []
    for key in keys:
        val = getter(key)
        if isinstance(val, list):
            return val
    return []