mneme-core 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mneme_core/__init__.py +4 -0
- mneme_core/__main__.py +8 -0
- mneme_core/approval.py +183 -0
- mneme_core/audit.py +82 -0
- mneme_core/bench/__init__.py +40 -0
- mneme_core/bench/hardware.py +110 -0
- mneme_core/bench/harness.py +258 -0
- mneme_core/bench/metrics.py +142 -0
- mneme_core/bench/synth.py +207 -0
- mneme_core/capability.py +131 -0
- mneme_core/cli.py +1437 -0
- mneme_core/compression/__init__.py +106 -0
- mneme_core/compression/config.py +100 -0
- mneme_core/compression/ledger.py +447 -0
- mneme_core/compression/llm.py +155 -0
- mneme_core/compression/pipeline.py +543 -0
- mneme_core/compression/prompts/compress-en.md +108 -0
- mneme_core/compression/staging.py +363 -0
- mneme_core/connectors.py +158 -0
- mneme_core/connectors_net.py +231 -0
- mneme_core/console.py +278 -0
- mneme_core/distill/__init__.py +74 -0
- mneme_core/distill/adaptive_topk.py +65 -0
- mneme_core/distill/audit.py +248 -0
- mneme_core/distill/compressed_format.py +90 -0
- mneme_core/distill/injection_dedup.py +101 -0
- mneme_core/distill/shell_compress.py +207 -0
- mneme_core/fts5/__init__.py +5 -0
- mneme_core/fts5/indexer.py +675 -0
- mneme_core/fts5/locale/__init__.py +6 -0
- mneme_core/fts5/locale/tr.py +120 -0
- mneme_core/injection.py +76 -0
- mneme_core/kg/__init__.py +42 -0
- mneme_core/kg/client.py +97 -0
- mneme_core/kg/episode_stage.py +190 -0
- mneme_core/kg/flush.py +40 -0
- mneme_core/kg/worker.py +326 -0
- mneme_core/modes.py +375 -0
- mneme_core/modes_cli.py +151 -0
- mneme_core/patterns.py +265 -0
- mneme_core/privacy.py +90 -0
- mneme_core/py.typed +1 -0
- mneme_core/retrieval/__init__.py +54 -0
- mneme_core/retrieval/dense.py +425 -0
- mneme_core/retrieval/planner.py +73 -0
- mneme_core/retrieval/rrf.py +412 -0
- mneme_core/retrieval/telemetry.py +83 -0
- mneme_core/security.py +181 -0
- mneme_core/security_bench.py +241 -0
- mneme_core/taint.py +115 -0
- mneme_core/telemetry/__init__.py +27 -0
- mneme_core/telemetry/writer.py +237 -0
- mneme_core/temporal/__init__.py +58 -0
- mneme_core/temporal/backend.py +173 -0
- mneme_core/temporal/claim.py +237 -0
- mneme_core/temporal/extract.py +279 -0
- mneme_core/temporal/graphiti_export.py +160 -0
- mneme_core/temporal/index.py +249 -0
- mneme_core/temporal/query.py +217 -0
- mneme_core/trajectory.py +287 -0
- mneme_core/vault/__init__.py +19 -0
- mneme_core/vault/atomic_write.py +73 -0
- mneme_core/vault/config.py +177 -0
- mneme_core/vault/file_lock.py +110 -0
- mneme_core/vault/frontmatter.py +344 -0
- mneme_core-2.0.0.dist-info/METADATA +75 -0
- mneme_core-2.0.0.dist-info/RECORD +69 -0
- mneme_core-2.0.0.dist-info/WHEEL +4 -0
- mneme_core-2.0.0.dist-info/entry_points.txt +5 -0
mneme_core/__init__.py
ADDED
mneme_core/__main__.py
ADDED
mneme_core/approval.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Human-approval gate for memory edits (conflict-resolution #4).
|
|
2
|
+
|
|
3
|
+
An agent *proposes* a memory edit; the proposal starts in ``PENDING`` status.
|
|
4
|
+
Edits in *durable* categories (IDENTITY, PREFERENCE, CLINICAL, LEGAL,
|
|
5
|
+
FINANCIAL) must receive explicit human approval before they may be applied.
|
|
6
|
+
Edits in the EPHEMERAL category (session notes, observations) may be applied
|
|
7
|
+
while still PENDING. A REJECTED proposal can never be applied.
|
|
8
|
+
|
|
9
|
+
All user-supplied content is passed through :func:`mneme_core.privacy.redact`
|
|
10
|
+
before being stored in the proposal. Proposal IDs are deterministic
|
|
11
|
+
(``uuid.uuid5``) so re-proposing identical inputs yields the same ID.
|
|
12
|
+
|
|
13
|
+
Pure, deterministic, no IO, no network, no clock.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import dataclasses
|
|
19
|
+
import uuid
|
|
20
|
+
from dataclasses import dataclass
|
|
21
|
+
from enum import Enum
|
|
22
|
+
|
|
23
|
+
from .privacy import redact
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# Enumerations
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
_NAMESPACE = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8") # NAMESPACE_URL
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ProposalStatus(str, Enum):  # noqa: UP042
    """Where a :class:`MemoryProposal` currently sits in its lifecycle.

    Members are real ``str`` instances (the class mixes in ``str``), so a
    member compares equal to its plain string value and can be handed to a
    JSON encoder without reaching for ``.value``.

    The ``UP042`` suppression keeps Ruff from proposing ``StrEnum``
    (Python 3.11+); the ``str`` + ``Enum`` spelling is kept on purpose for
    consistency with the rest of the code base.
    """

    # A freshly created proposal that nobody has decided on yet.
    PENDING = "PENDING"
    # Explicitly accepted.
    APPROVED = "APPROVED"
    # Explicitly refused.
    REJECTED = "REJECTED"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class EditCategory(str, Enum):  # noqa: UP042
    """What kind of memory a proposed edit touches.

    * **EPHEMERAL** covers low-stakes session/topic/observation notes; such
      edits may be applied without anyone explicitly approving them.
    * Every other member is a *durable* category and needs human approval
      before it may be applied.

    Like :class:`ProposalStatus`, members are ``str`` instances equal to
    their own value.
    """

    EPHEMERAL = "EPHEMERAL"
    # --- durable categories -------------------------------------------------
    IDENTITY = "IDENTITY"
    PREFERENCE = "PREFERENCE"
    CLINICAL = "CLINICAL"
    LEGAL = "LEGAL"
    FINANCIAL = "FINANCIAL"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
#: The categories whose edits must be explicitly approved by a human before
#: they may be applied (every :class:`EditCategory` member except EPHEMERAL).
DURABLE_CATEGORIES: frozenset[EditCategory] = frozenset(
    (
        EditCategory.IDENTITY,
        EditCategory.PREFERENCE,
        EditCategory.CLINICAL,
        EditCategory.LEGAL,
        EditCategory.FINANCIAL,
    )
)
|
|
72
|
+
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
# Proposal dataclass
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass(frozen=True)
class MemoryProposal:
    """An immutable record of a proposed memory edit.

    Instances are frozen: a status change is expressed by building a new
    object (see ``approve`` / ``reject``), never by mutating this one.

    Attributes
    ----------
    proposal_id:
        Deterministic ``uuid5`` string. ``propose`` derives it from
        ``action``, ``target_path``, ``category``, ``trust`` and the
        *redacted* content, so identical inputs always produce the same ID
        while the same text proposed under a different category or trust
        gets a different one.
    action:
        By convention ``"create"`` | ``"update"`` | ``"delete"``; the value
        is stored as given and is not validated by this class.
    target_path:
        Vault-relative path of the note being created/modified/deleted.
    content:
        Proposed note content **after** redaction via
        :func:`mneme_core.privacy.redact`.
    category:
        Semantic category governing approval requirements.
    status:
        Current lifecycle status (PENDING / APPROVED / REJECTED).
    trust:
        Proposer identity string. The dataclass itself has no default;
        ``propose`` supplies ``"agent"`` when the caller gives none.
    """

    proposal_id: str
    action: str
    target_path: str
    content: str
    category: EditCategory
    status: ProposalStatus
    trust: str
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# ---------------------------------------------------------------------------
|
|
112
|
+
# Pure functions
|
|
113
|
+
# ---------------------------------------------------------------------------
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def requires_human_approval(category: EditCategory) -> bool:
    """Tell whether edits in *category* need explicit human sign-off.

    The answer is ``True`` exactly when *category* is a member of
    :data:`DURABLE_CATEGORIES`, and ``False`` otherwise.
    """
    is_durable = category in DURABLE_CATEGORIES
    return is_durable
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def propose(
    *,
    action: str,
    target_path: str,
    content: str,
    category: EditCategory,
    trust: str = "agent",
) -> MemoryProposal:
    """Create a new :class:`MemoryProposal` in PENDING status.

    *content* is passed through :func:`~mneme_core.privacy.redact` first, and
    only the redacted text is stored on the proposal.

    The ``proposal_id`` is a deterministic ``uuid5`` whose seed is the
    NUL-joined sequence::

        action, target_path, category.value, trust, redacted_content

    so identical inputs always yield the same ID, while changing any one of
    those five fields yields a different ID.

    Parameters
    ----------
    action:
        The edit verb, stored unchanged on the proposal.
    target_path:
        Path of the note the edit targets, stored unchanged.
    content:
        Raw proposed content; redacted before storage and before hashing.
    category:
        Category of the edit; its ``.value`` is part of the ID seed.
    trust:
        Proposer identity string; part of the ID seed.

    Returns
    -------
    MemoryProposal
        A frozen proposal whose ``status`` is ``ProposalStatus.PENDING``.
    """
    redacted = redact(content)
    # category and trust are part of proposal identity: an EPHEMERAL and a
    # CLINICAL proposal for the same path+content must NOT share a proposal_id,
    # else a downstream store keyed on it could alias the durable edit to an
    # already-applied ephemeral one and bypass the human-approval gate.
    seed = f"{action}\x00{target_path}\x00{category.value}\x00{trust}\x00{redacted}"
    proposal_id = str(uuid.uuid5(_NAMESPACE, seed))
    return MemoryProposal(
        proposal_id=proposal_id,
        action=action,
        target_path=target_path,
        content=redacted,
        category=category,
        status=ProposalStatus.PENDING,
        trust=trust,
    )
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def approve(proposal: MemoryProposal) -> MemoryProposal:
    """Produce an APPROVED copy of *proposal*.

    The input is left untouched; every field except ``status`` is carried
    over as-is. The current status is not inspected, so PENDING, APPROVED and
    REJECTED inputs all come back APPROVED; approving an already-approved
    proposal therefore yields an equal object.
    """
    changes = {"status": ProposalStatus.APPROVED}
    return dataclasses.replace(proposal, **changes)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def reject(proposal: MemoryProposal) -> MemoryProposal:
    """Produce a REJECTED copy of *proposal*.

    The input is left untouched; every field except ``status`` is carried
    over as-is. The current status is not inspected.
    """
    changes = {"status": ProposalStatus.REJECTED}
    return dataclasses.replace(proposal, **changes)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def can_apply(proposal: MemoryProposal) -> bool:
    """Decide whether *proposal* is allowed to be written to the vault.

    The decision depends only on ``proposal.status`` and
    ``proposal.category``:

    * APPROVED  -> always ``True``.
    * REJECTED  -> always ``False``.
    * anything else (i.e. PENDING) -> ``True`` only when the category does
      not require human approval, which means EPHEMERAL.
    """
    current = proposal.status
    if current == ProposalStatus.APPROVED:
        return True
    if current == ProposalStatus.REJECTED:
        return False
    # Still undecided: only non-durable edits may go through without sign-off.
    needs_sign_off = requires_human_approval(proposal.category)
    return not needs_sign_off
|
mneme_core/audit.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Read-only vault audit aggregator (the console surface, v1).
|
|
2
|
+
|
|
3
|
+
Produces a deterministic, read-only snapshot of the vault: note counts by
|
|
4
|
+
memory type plus a security-scan summary. This is the programmatic/text console
|
|
5
|
+
surface; a browser audit UI (timeline, graph explorer, token-ROI dashboard) is
|
|
6
|
+
deferred. The aggregator never writes and never raises on a single bad file.
|
|
7
|
+
|
|
8
|
+
No LLM, no network.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
from .security import scan_vault, summarize
|
|
18
|
+
from .vault.frontmatter import parse
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True)
class AuditReport:
    """Frozen snapshot of vault health and safety, as built by ``build_audit``."""

    # Number of markdown notes that were successfully read.
    note_count: int
    # Notes per frontmatter type label, including the "(none)" and
    # "(unparseable)" buckets.
    type_counts: dict[str, int]
    # Summary mapping produced by the security scan.
    security: dict[str, Any]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def build_audit(vault_root: Path) -> AuditReport:
    """Scan every ``*.md`` file under *vault_root* and summarise the vault.

    Files are visited in sorted path order. For each one:

    * a path that cannot be resolved, or that resolves to somewhere outside
      the resolved vault root (e.g. an escaping symlink), is skipped;
    * a file that cannot be read (``OSError``) is skipped;
    * otherwise it counts as a note and is bucketed by its frontmatter
      ``type``: ``"(none)"`` when there is no frontmatter or no type, and
      ``"(unparseable)"`` when parsing raises.

    The security summary of the whole vault is attached to the result.
    """
    vault_real = vault_root.resolve()
    per_type: dict[str, int] = {}
    readable_notes = 0

    for note_path in sorted(vault_root.rglob("*.md")):
        try:
            note_real = note_path.resolve()
        except OSError:
            continue
        # Refuse anything that lands outside the vault once links are followed.
        if not note_real.is_relative_to(vault_real):
            continue
        try:
            body = note_path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue

        readable_notes += 1
        try:
            front, _ = parse(body)
            label = front.type if (front is not None and front.type) else "(none)"
        except Exception:  # noqa: BLE001 - audit must never crash on a bad note
            label = "(unparseable)"
        per_type[label] = per_type.get(label, 0) + 1

    return AuditReport(
        note_count=readable_notes,
        type_counts=per_type,
        security=summarize(scan_vault(vault_root)),
    )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def render_audit(report: AuditReport) -> str:
    """Format *report* as a newline-terminated plain-text document.

    The output has a title, the note total, a "By type" list (sorted by type
    name, or a single "(no notes)" entry when there are none) and a
    "Security" section. Missing ``files_flagged`` / ``total_findings`` keys
    render as ``0``; the per-kind breakdown is emitted only when ``by_kind``
    is a dict.
    """
    out = ["# Vault audit", "", f"Notes: {report.note_count}", "", "## By type"]

    type_rows = [f"- {name}: {total}" for name, total in sorted(report.type_counts.items())]
    out += type_rows or ["- (no notes)"]

    summary = report.security
    out += [
        "",
        "## Security",
        f"- files flagged: {summary.get('files_flagged', 0)}",
        f"- total findings: {summary.get('total_findings', 0)}",
    ]

    kinds = summary.get("by_kind", {})
    if isinstance(kinds, dict):
        out.extend(f" - {kind}: {total}" for kind, total in sorted(kinds.items()))

    return "\n".join(out) + "\n"
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Benchmark helpers shared by the ``benchmarks/`` runner scripts.
|
|
2
|
+
|
|
3
|
+
Three focused submodules:
|
|
4
|
+
|
|
5
|
+
* :mod:`mneme_core.bench.metrics` - information-retrieval metric primitives
|
|
6
|
+
(nDCG@k, Recall@k, MRR) plus quantile helpers for latency distributions.
|
|
7
|
+
* :mod:`mneme_core.bench.synth` - deterministic synthetic corpus and query
|
|
8
|
+
generator. Tests and benchmarks both lean on this so they exercise the
|
|
9
|
+
same shape of data.
|
|
10
|
+
* :mod:`mneme_core.bench.hardware` - hardware/runtime probe that emits
|
|
11
|
+
``hardware.json`` next to every benchmark result for reproducibility.
|
|
12
|
+
|
|
13
|
+
The submodules are intentionally small and side-effect-free. Bench scripts
|
|
14
|
+
own their own argparse, output, and CI guards.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from .hardware import HardwareSnapshot, capture_hardware
|
|
18
|
+
from .metrics import (
|
|
19
|
+
mean_reciprocal_rank,
|
|
20
|
+
ndcg_at_k,
|
|
21
|
+
percentiles,
|
|
22
|
+
recall_at_k,
|
|
23
|
+
)
|
|
24
|
+
from .synth import (
|
|
25
|
+
SyntheticCorpus,
|
|
26
|
+
SyntheticQuery,
|
|
27
|
+
build_synthetic_corpus,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
# Names re-exported from the hardware, metrics and synth submodules.
__all__ = [
    "HardwareSnapshot",
    "SyntheticCorpus",
    "SyntheticQuery",
    "build_synthetic_corpus",
    "capture_hardware",
    "mean_reciprocal_rank",
    "ndcg_at_k",
    "percentiles",
    "recall_at_k",
]
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Hardware and runtime snapshot for reproducible benchmark output.
|
|
2
|
+
|
|
3
|
+
Every benchmark run drops a ``hardware.json`` next to its result so
|
|
4
|
+
readers can interpret the numbers in context. The probe is deliberately
|
|
5
|
+
shallow - just the fields a reader needs to decide whether two runs are
|
|
6
|
+
comparable.
|
|
7
|
+
|
|
8
|
+
Why not ``platform.uname()`` for everything: the fields below are the
|
|
9
|
+
ones that materially affect numbers in the v1.0 benchmark set. CPU
|
|
10
|
+
model and core count drive latency; OS family changes file-IO
|
|
11
|
+
behavior; Python and (when available) Node versions pin the runtime.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
import platform
|
|
19
|
+
import shutil
|
|
20
|
+
import subprocess
|
|
21
|
+
import sys
|
|
22
|
+
from dataclasses import asdict, dataclass
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class HardwareSnapshot:
    """Machine and runtime facts recorded alongside a benchmark run."""

    # Operating-system family and release strings.
    os: str
    os_release: str
    # Processor description and logical core count.
    cpu_model: str
    cpu_count_logical: int
    # Interpreter version, plus the Node.js version when one was detected.
    python_version: str
    node_version: str | None
    # Seed the benchmark run was started with.
    mneme_bench_seed: int
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _detect_cpu_model() -> str:
|
|
40
|
+
"""Best-effort CPU model string across OSes.
|
|
41
|
+
|
|
42
|
+
Linux: ``/proc/cpuinfo`` first non-empty ``model name``.
|
|
43
|
+
macOS: ``sysctl -n machdep.cpu.brand_string`` when ``sysctl`` is on
|
|
44
|
+
PATH. Windows and fallback: ``platform.processor()``.
|
|
45
|
+
"""
|
|
46
|
+
system = platform.system().lower()
|
|
47
|
+
if system == "linux":
|
|
48
|
+
try:
|
|
49
|
+
cpuinfo = Path("/proc/cpuinfo").read_text(encoding="utf-8")
|
|
50
|
+
for line in cpuinfo.splitlines():
|
|
51
|
+
if line.startswith("model name"):
|
|
52
|
+
return line.split(":", 1)[1].strip()
|
|
53
|
+
except OSError:
|
|
54
|
+
pass
|
|
55
|
+
if system == "darwin" and shutil.which("sysctl") is not None:
|
|
56
|
+
try:
|
|
57
|
+
out = subprocess.run( # noqa: S603,S607
|
|
58
|
+
["sysctl", "-n", "machdep.cpu.brand_string"],
|
|
59
|
+
capture_output=True,
|
|
60
|
+
text=True,
|
|
61
|
+
check=False,
|
|
62
|
+
timeout=2,
|
|
63
|
+
)
|
|
64
|
+
if out.returncode == 0:
|
|
65
|
+
return out.stdout.strip()
|
|
66
|
+
except (OSError, subprocess.SubprocessError):
|
|
67
|
+
pass
|
|
68
|
+
return platform.processor() or "unknown"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _detect_node_version() -> str | None:
|
|
72
|
+
"""Return the ``node --version`` output, or ``None`` when absent."""
|
|
73
|
+
if shutil.which("node") is None:
|
|
74
|
+
return None
|
|
75
|
+
try:
|
|
76
|
+
out = subprocess.run( # noqa: S603,S607
|
|
77
|
+
["node", "--version"],
|
|
78
|
+
capture_output=True,
|
|
79
|
+
text=True,
|
|
80
|
+
check=False,
|
|
81
|
+
timeout=2,
|
|
82
|
+
)
|
|
83
|
+
if out.returncode == 0:
|
|
84
|
+
return out.stdout.strip()
|
|
85
|
+
except (OSError, subprocess.SubprocessError):
|
|
86
|
+
return None
|
|
87
|
+
return None
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def capture_hardware(seed: int = 42) -> HardwareSnapshot:
    """Probe the current machine and runtime and bundle the result.

    Values the platform cannot supply degrade gracefully: empty OS strings
    become ``"unknown"``, an unknown core count becomes ``0`` and a missing
    Node.js yields ``None``. *seed* is recorded verbatim.
    """
    logical_cores = os.cpu_count() or 0
    probed = {
        "os": platform.system() or "unknown",
        "os_release": platform.release() or "unknown",
        "cpu_model": _detect_cpu_model(),
        "cpu_count_logical": logical_cores,
        # First whitespace-separated token of sys.version is the version number.
        "python_version": sys.version.split()[0],
        "node_version": _detect_node_version(),
        "mneme_bench_seed": seed,
    }
    return HardwareSnapshot(**probed)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def write_hardware_json(snapshot: HardwareSnapshot, out_path: Path) -> None:
    """Write *snapshot* to *out_path* as 2-space-indented JSON.

    Missing parent directories are created first. The file is UTF-8 encoded
    and ends with a single trailing newline.
    """
    target_dir = out_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(asdict(snapshot), indent=2)
    out_path.write_text(rendered + "\n", encoding="utf-8")
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
"""Evaluation runner, dataset adapters, and head-to-head comparator.
|
|
2
|
+
|
|
3
|
+
This module ships the runner infrastructure and format adapters only.
|
|
4
|
+
The datasets (LongMemEval, LoCoMo) and any competitor retrieve functions
|
|
5
|
+
(e.g. a claude-mem retrieve fn) are SUPPLIED BY THE OPERATOR at run time.
|
|
6
|
+
This module makes no superiority claim about any retrieval system.
|
|
7
|
+
|
|
8
|
+
Any public statement that one system "beats" or is "best" relative to
|
|
9
|
+
another requires an operator-run, published benchmark with full
|
|
10
|
+
experimental controls. The :func:`compare` function provides a purely
|
|
11
|
+
descriptive readout of the numbers produced in a single run.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from collections.abc import Callable, Iterable
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
|
|
19
|
+
from mneme_core.bench.metrics import (
|
|
20
|
+
mean_reciprocal_rank,
|
|
21
|
+
ndcg_at_k,
|
|
22
|
+
recall_at_k,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
RetrieveIds = Callable[[str], list[str | int]]
|
|
26
|
+
"""Query string -> ranked list of doc ids (best first)."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class EvalCase:
    """A single benchmark question together with its gold document ids."""

    # Identifier used to label this case in reports.
    case_id: str
    # Text handed to the retrieve function.
    query: str
    # Ids of the documents a correct retrieval should surface.
    relevant_ids: tuple[str | int, ...]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(frozen=True)
class CaseResult:
    """Scores obtained by one retrieve function on one evaluation case."""

    # Identifier of the case these scores belong to.
    case_id: str
    # Recall@k, reciprocal rank and nDCG@k for that case.
    recall_at_k: float
    reciprocal_rank: float
    ndcg_at_k: float
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass(frozen=True)
class EvalReport:
    """Summary of how one system scored across a whole set of cases."""

    # Label of the evaluated system and the cutoff used for the @k metrics.
    system_name: str
    k: int
    # How many cases were scored.
    n_cases: int
    # Arithmetic means of the per-case scores.
    mean_recall_at_k: float
    mean_mrr: float
    mean_ndcg_at_k: float
    # The individual case scores, in evaluation order.
    per_case: tuple[CaseResult, ...]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def run_eval(
    cases: Iterable[EvalCase],
    retrieve: RetrieveIds,
    *,
    system_name: str,
    k: int = 10,
) -> EvalReport:
    """Score *retrieve* against every case and aggregate the results.

    *retrieve* is invoked exactly once per case with the case's query. Each
    ranked result is scored with
    :func:`~mneme_core.bench.metrics.recall_at_k`,
    :func:`~mneme_core.bench.metrics.mean_reciprocal_rank` (applied to that
    single case) and :func:`~mneme_core.bench.metrics.ndcg_at_k`.

    The report's means are plain arithmetic means over the cases, and
    ``per_case`` preserves the iteration order of *cases*. With no cases at
    all the report has ``n_cases == 0``, all means ``0.0`` and an empty
    ``per_case``.
    """

    def _score(case: EvalCase) -> CaseResult:
        ranked: list[str | int] = retrieve(case.query)
        gold: tuple[str | int, ...] = case.relevant_ids
        recall = recall_at_k(ranked, gold, k)
        reciprocal = mean_reciprocal_rank([(ranked, gold)])
        ndcg = ndcg_at_k(ranked, gold, k)
        return CaseResult(
            case_id=case.case_id,
            recall_at_k=recall,
            reciprocal_rank=reciprocal,
            ndcg_at_k=ndcg,
        )

    scored = tuple(_score(case) for case in cases)
    total = len(scored)

    def _mean(field: str) -> float:
        # Guard the empty case set: there is nothing to divide by.
        if total == 0:
            return 0.0
        return sum(getattr(row, field) for row in scored) / total

    return EvalReport(
        system_name=system_name,
        k=k,
        n_cases=total,
        mean_recall_at_k=_mean("recall_at_k"),
        mean_mrr=_mean("reciprocal_rank"),
        mean_ndcg_at_k=_mean("ndcg_at_k"),
        per_case=scored,
    )
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def load_longmemeval(records: Iterable[dict[str, object]]) -> list[EvalCase]:
    """Convert LongMemEval-style records into :class:`EvalCase` objects.

    For each field the keys below are tried in order and the first usable
    value wins:

    * ``query``: ``"question"``, ``"query"``, ``"input"`` (non-empty string).
    * ``case_id``: ``"question_id"``, ``"id"``, ``"case_id"`` (non-empty
      string); ``"case-<i>"`` with the record's position otherwise.
    * ``relevant_ids``: ``"answer_session_ids"``, ``"relevant_ids"``,
      ``"evidence_ids"``, ``"gold_ids"`` (a list); empty when none is found.
      Only ``str`` and ``int`` entries are kept.

    A record without a usable query is skipped silently.
    """
    loaded: list[EvalCase] = []
    for position, record in enumerate(records):
        question = _first_str(record, ("question", "query", "input"))
        if question is None:
            continue
        ident = _first_str(record, ("question_id", "id", "case_id"))
        if not ident:
            ident = f"case-{position}"
        gold_raw = _first_list(
            record, ("answer_session_ids", "relevant_ids", "evidence_ids", "gold_ids")
        )
        gold: tuple[str | int, ...] = tuple(
            item for item in gold_raw if isinstance(item, (str, int))
        )
        loaded.append(EvalCase(case_id=ident, query=question, relevant_ids=gold))
    return loaded
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def load_locomo(records: Iterable[dict[str, object]]) -> list[EvalCase]:
    """Convert LoCoMo-style records into :class:`EvalCase` objects.

    For each field the keys below are tried in order and the first usable
    value wins:

    * ``query``: ``"question"``, ``"query"`` (non-empty string).
    * ``case_id``: ``"sample_id"``, ``"id"`` (non-empty string);
      ``"case-<i>"`` with the record's position otherwise.
    * ``relevant_ids``: ``"evidence"``, ``"relevant_ids"``, ``"gold_ids"``
      (a list); empty when none is found. Only ``str`` and ``int`` entries
      are kept.

    A record without a usable query is skipped silently.
    """
    loaded: list[EvalCase] = []
    for position, record in enumerate(records):
        question = _first_str(record, ("question", "query"))
        if question is None:
            continue
        ident = _first_str(record, ("sample_id", "id"))
        if not ident:
            ident = f"case-{position}"
        gold_raw = _first_list(record, ("evidence", "relevant_ids", "gold_ids"))
        gold: tuple[str | int, ...] = tuple(
            item for item in gold_raw if isinstance(item, (str, int))
        )
        loaded.append(EvalCase(case_id=ident, query=question, relevant_ids=gold))
    return loaded
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def head_to_head(
    cases: Iterable[EvalCase],
    systems: dict[str, RetrieveIds],
    *,
    k: int = 10,
) -> dict[str, EvalReport]:
    """Evaluate several systems on one shared list of cases.

    *cases* is consumed into a list up front, so a one-shot iterable is safe
    to pass and every system sees exactly the same sequence. The returned
    mapping goes from each key of *systems* to the :class:`EvalReport`
    produced by :func:`run_eval` for it, in the order of *systems*.
    """
    shared_cases: list[EvalCase] = list(cases)
    reports: dict[str, EvalReport] = {}
    for name, retrieve_fn in systems.items():
        reports[name] = run_eval(shared_cases, retrieve_fn, system_name=name, k=k)
    return reports
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def compare(reports: dict[str, EvalReport]) -> dict[str, object]:
    """Lay out each system's mean metrics side by side and name the top scorer.

    The result has three keys:

    * ``"systems"``: the system names, sorted.
    * ``"metrics"``: ``{"recall_at_k": {name: value}, "mrr": {...},
      "ndcg_at_k": {...}}``.
    * ``"leader_by_metric"``: for each metric, the name holding the highest
      value; among equal top scorers the lexicographically smallest name is
      chosen, and ``""`` is used when there are no systems.

    IMPORTANT: ``leader_by_metric`` merely describes the numbers measured in
    this run. It is NOT a published superiority claim; any public
    "best / beats X" statement needs an operator-run, published benchmark
    with full experimental controls.
    """
    ordered_names = sorted(reports)
    # Output metric key -> attribute on EvalReport that holds its mean.
    source_attr = {
        "recall_at_k": "mean_recall_at_k",
        "mrr": "mean_mrr",
        "ndcg_at_k": "mean_ndcg_at_k",
    }
    table = {
        metric: {name: getattr(reports[name], attr) for name in ordered_names}
        for metric, attr in source_attr.items()
    }

    def _top(scores: dict[str, float]) -> str:
        if not scores:
            return ""
        best = max(scores.values())
        tied = sorted(name for name, score in scores.items() if score == best)
        return tied[0]

    return {
        "systems": ordered_names,
        "metrics": table,
        "leader_by_metric": {metric: _top(scores) for metric, scores in table.items()},
    }
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
# ---------------------------------------------------------------------------
|
|
239
|
+
# Internal helpers
|
|
240
|
+
# ---------------------------------------------------------------------------
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _first_str(rec: dict[str, object], keys: tuple[str, ...]) -> str | None:
|
|
244
|
+
"""Return the first non-empty string value found under any of *keys*."""
|
|
245
|
+
for key in keys:
|
|
246
|
+
val = rec.get(key)
|
|
247
|
+
if isinstance(val, str) and val:
|
|
248
|
+
return val
|
|
249
|
+
return None
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _first_list(rec: dict[str, object], keys: tuple[str, ...]) -> list[object]:
|
|
253
|
+
"""Return the first list value found under any of *keys*, else ``[]``."""
|
|
254
|
+
for key in keys:
|
|
255
|
+
val = rec.get(key)
|
|
256
|
+
if isinstance(val, list):
|
|
257
|
+
return val
|
|
258
|
+
return []
|