structuremappingmemory 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. sma/__init__.py +5 -0
  2. sma/__main__.py +5 -0
  3. sma/agent/__init__.py +5 -0
  4. sma/agent/adapter_draft.py +217 -0
  5. sma/agent/api.py +67 -0
  6. sma/agent/comparison.py +591 -0
  7. sma/agent/llm.py +280 -0
  8. sma/agent/policies.py +21 -0
  9. sma/agent/service.py +95 -0
  10. sma/cli.py +65 -0
  11. sma/encoders/__init__.py +38 -0
  12. sma/encoders/agentobs.py +27 -0
  13. sma/encoders/base.py +23 -0
  14. sma/encoders/code_treesitter.py +64 -0
  15. sma/encoders/coverage.py +80 -0
  16. sma/encoders/draft_adapter.py +183 -0
  17. sma/encoders/healthcare.py +207 -0
  18. sma/encoders/logs_drain.py +142 -0
  19. sma/encoders/prose_tier1.py +57 -0
  20. sma/encoders/structured.py +57 -0
  21. sma/encoders/traces.py +45 -0
  22. sma/eval/__init__.py +2 -0
  23. sma/eval/agentic/__init__.py +35 -0
  24. sma/eval/agentic/arms/__init__.py +0 -0
  25. sma/eval/agentic/arms/cyber.py +48 -0
  26. sma/eval/agentic/arms/discovery.py +35 -0
  27. sma/eval/agentic/arms/finance.py +38 -0
  28. sma/eval/agentic/arms/legal.py +74 -0
  29. sma/eval/agentic/arms/medicine.py +45 -0
  30. sma/eval/agentic/harness.py +275 -0
  31. sma/eval/agentic/memories.py +308 -0
  32. sma/eval/agentic/metrics.py +82 -0
  33. sma/eval/agentic_qa/__init__.py +27 -0
  34. sma/eval/agentic_qa/agent.py +383 -0
  35. sma/eval/agentic_qa/metrics.py +239 -0
  36. sma/eval/agentic_qa/pools.py +197 -0
  37. sma/eval/arn.py +65 -0
  38. sma/eval/baselines/__init__.py +6 -0
  39. sma/eval/baselines/bge_dense.py +54 -0
  40. sma/eval/baselines/bm25.py +18 -0
  41. sma/eval/baselines/dense.py +42 -0
  42. sma/eval/baselines/hipporag.py +235 -0
  43. sma/eval/baselines/hybrid_rrf.py +30 -0
  44. sma/eval/baselines/longcontext_llm.py +124 -0
  45. sma/eval/baselines/rerank.py +41 -0
  46. sma/eval/baselines/splade.py +77 -0
  47. sma/eval/baselines/wl_kernel.py +163 -0
  48. sma/eval/bugsinpy.py +358 -0
  49. sma/eval/bugsinpy_families.py +164 -0
  50. sma/eval/crossdomain.py +89 -0
  51. sma/eval/diabetes.py +61 -0
  52. sma/eval/drift_env.py +26 -0
  53. sma/eval/drift_metrics.py +24 -0
  54. sma/eval/family_labels.py +167 -0
  55. sma/eval/fraud_elliptic/__init__.py +29 -0
  56. sma/eval/fraud_elliptic/encoder.py +279 -0
  57. sma/eval/fraud_elliptic/eval.py +269 -0
  58. sma/eval/fraud_elliptic/test_encoder.py +123 -0
  59. sma/eval/ieee_cis.py +66 -0
  60. sma/eval/loghub.py +16 -0
  61. sma/eval/loghub_eval.py +480 -0
  62. sma/eval/longmemeval.py +51 -0
  63. sma/eval/memory_backends/__init__.py +2 -0
  64. sma/eval/memory_backends/base.py +22 -0
  65. sma/eval/memory_backends/context_only.py +14 -0
  66. sma/eval/memory_backends/rag_notes.py +17 -0
  67. sma/eval/memory_backends/shared_llm.py +30 -0
  68. sma/eval/memory_backends/sma_memory.py +54 -0
  69. sma/eval/memory_backends/zep_graphiti.py +33 -0
  70. sma/eval/metrics.py +32 -0
  71. sma/eval/ontology_bench.py +219 -0
  72. sma/eval/report.py +573 -0
  73. sma/eval/ssb_eval.py +216 -0
  74. sma/eval/ssb_generator.py +116 -0
  75. sma/eval/stats.py +108 -0
  76. sma/eval/transfer_eval.py +844 -0
  77. sma/index/__init__.py +15 -0
  78. sma/index/ann.py +21 -0
  79. sma/index/content_vectors.py +60 -0
  80. sma/index/inverted.py +63 -0
  81. sma/index/macfac.py +174 -0
  82. sma/ir/__init__.py +22 -0
  83. sma/ir/canon.py +106 -0
  84. sma/ir/schema.py +165 -0
  85. sma/ir/sexpr.py +86 -0
  86. sma/ir/signatures.py +76 -0
  87. sma/match/__init__.py +20 -0
  88. sma/match/conflicts.py +46 -0
  89. sma/match/engine.py +60 -0
  90. sma/match/explain.py +59 -0
  91. sma/match/infer.py +54 -0
  92. sma/match/kernels.py +54 -0
  93. sma/match/mdl.py +30 -0
  94. sma/match/merge_cpsat.py +77 -0
  95. sma/match/merge_greedy.py +15 -0
  96. sma/match/mh.py +177 -0
  97. sma/match/ses.py +84 -0
  98. sma/match/types.py +115 -0
  99. sma/match/verifier.py +27 -0
  100. sma/ontology/__init__.py +45 -0
  101. sma/ontology/attack.py +134 -0
  102. sma/ontology/cpc.py +69 -0
  103. sma/ontology/graph.py +58 -0
  104. sma/ontology/loader.py +262 -0
  105. sma/ontology/mitre_xml.py +67 -0
  106. sma/ontology/mount.py +101 -0
  107. sma/ontology/rdf_loader.py +75 -0
  108. sma/ontology/registry.py +115 -0
  109. sma/ontology/router.py +69 -0
  110. sma/ontology/usgaap.py +73 -0
  111. sma/sage/__init__.py +6 -0
  112. sma/sage/assimilate.py +12 -0
  113. sma/sage/pools.py +105 -0
  114. sma/sage/probabilities.py +10 -0
  115. sma/store/__init__.py +6 -0
  116. sma/store/lmdb_store.py +78 -0
  117. sma/store/registry.py +26 -0
  118. sma/store/wal.py +26 -0
  119. sma/ui/app.py +642 -0
  120. structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
  121. structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
  122. structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
  123. structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
  124. structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
  125. structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
sma/eval/report.py ADDED
@@ -0,0 +1,573 @@
1
+ """Generate paper-facing CSVs and report.html from deterministic MVP/eval data.
2
+
3
+ Two modes:
4
+ - default: run the SSB fixture evals (and LogHub unless --skip-loghub), write
5
+ all CSVs, then render report.html from the on-disk artifacts.
6
+ - --html-only: skip all computation and re-render report.html from whatever
7
+ CSVs currently exist in the reports directory.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import csv
14
+ import datetime
15
+ import hashlib
16
+ import html
17
+ import json
18
+ import pathlib
19
+ from collections import defaultdict
20
+
21
+ from sma.eval.ssb_eval import evaluate_forced_choice, evaluate_library, evaluate_library_mac_prefilter
22
+ from sma.eval.ssb_generator import generate_triples
23
+ from sma.match.engine import match_cases
24
+
25
+
26
# Header (column order) of every CSV artifact, keyed by the artifact's file
# name. write_csv() resolves the header through ``CSV_SCHEMAS[path.name]``.
CSV_SCHEMAS = {
    "dataset_manifest.csv": ["dataset", "source", "file", "md5", "status"],
    "retrieval_runs.csv": [
        "run_id",
        "query_id",
        "rank",
        "case_id",
        "score",
        "ses_n",
        "u_bound",
        "certified",
    ],
    "mapping_runs.csv": [
        "run_id",
        "base_id",
        "target_id",
        "score",
        "ses_n",
        "n_correspondences",
        "gap",
    ],
    "triage_metrics.csv": [
        "dataset",
        "split",
        "method",
        "macro_f1",
        "label_hit_rate@1",
        "label_hit_rate@5",
        "label_hit_rate@10",
        "p50_ms",
        "p95_ms",
    ],
    "ssb_cases.csv": ["triple_id", "query_id", "analog_id", "distractor_id"],
    "ssb_metrics.csv": ["split", "r1", "mrr", "mapping_f1"],
    "inference_reviews.csv": ["case_id", "inference", "precision_label", "provenance"],
    "drift_runs.csv": ["variant", "horizon", "state_f1", "contradiction_rate"],
    "latency.csv": ["operation", "n_cases", "p50_ms", "p95_ms"],
    "ablation_metrics.csv": ["ablation", "metric", "value"],
    "sage_pool_stats.csv": ["pool_id", "n_generalizations", "n_outliers", "schema_f1"],
    "calibration.csv": ["parameter", "value", "ci_low", "ci_high", "source"],
    "cost_energy.csv": ["system", "llm_tokens", "cpu_seconds", "usd_estimate"],
}
41
+
42
+
43
def _md5_file(path: pathlib.Path) -> str:
    """Return the hex MD5 digest of the file at *path*.

    The file is read in 1 MiB pieces so it never has to fit in memory at once.
    """
    block_size = 1024 * 1024
    hasher = hashlib.md5()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(block_size)
            if piece == b"":
                # read() returns empty bytes only at end of file.
                break
            hasher.update(piece)
    return hasher.hexdigest()
49
+
50
+
51
def dataset_manifest_rows(
    manifest_path: str | pathlib.Path = "data/manifests/datasets.json",
    raw_root: str | pathlib.Path = "data/raw",
) -> list[dict]:
    """Build the ``dataset_manifest.csv`` rows from the JSON dataset manifest.

    The manifest maps a dataset name to a spec dict. For each listed file the
    local copy under ``raw_root/<dataset>/<file>`` is checked and one row is
    produced with one of these statuses:

    - ``missing``: the local file does not exist;
    - ``verified``: the local MD5 equals the manifest's ``md5``;
    - ``checksum_mismatch:<actual>``: the local MD5 differs;
    - ``downloaded_no_checksum``: the file exists but the manifest has no MD5.

    A spec without files but with a ``git`` entry yields a single
    ``git_manifested`` row. A spec with neither yields no row.

    Args:
        manifest_path: Path of the JSON manifest (read as UTF-8).
        raw_root: Directory holding one sub-directory per dataset.

    Returns:
        Row dicts with keys ``dataset``, ``source``, ``file``, ``md5``, ``status``.
    """

    def _row(dataset, source, filename, md5, status) -> dict:
        return {
            "dataset": dataset,
            "source": source,
            "file": filename,
            "md5": md5,
            "status": status,
        }

    specs = json.loads(pathlib.Path(manifest_path).read_text(encoding="utf-8"))
    base_dir = pathlib.Path(raw_root)
    out: list[dict] = []

    for name, entry in specs.items():
        # First truthy provenance field wins; "git" is the last resort.
        origin = (
            entry.get("doi")
            or entry.get("record_api")
            or entry.get("source")
            or entry.get("git", "")
        )
        listed = entry.get("files", {})

        if not listed and entry.get("git"):
            out.append(_row(name, origin, "", "", "git_manifested"))
            continue

        for fname, meta in listed.items():
            wanted = meta.get("md5", "")
            on_disk = base_dir / name / fname
            if on_disk.exists():
                if wanted:
                    got = _md5_file(on_disk)
                    verdict = "verified" if got == wanted else f"checksum_mismatch:{got}"
                else:
                    verdict = "downloaded_no_checksum"
            else:
                verdict = "missing"
            out.append(_row(name, origin, fname, wanted, verdict))

    return out
92
+
93
+
94
def write_csv(path: pathlib.Path, rows: list[dict]) -> None:
    """Write *rows* to *path* using the header registered for that file name.

    The header comes from ``CSV_SCHEMAS[path.name]``, so an unregistered file
    name raises ``KeyError``. Keys missing from a row are written as empty
    strings; keys not in the header are dropped.
    """
    header = CSV_SCHEMAS[path.name]
    with path.open("w", encoding="utf-8", newline="") as out:
        sink = csv.DictWriter(out, fieldnames=header)
        sink.writeheader()
        # Project every row onto the schema so extra keys never reach DictWriter.
        sink.writerows({col: record.get(col, "") for col in header} for record in rows)
101
+
102
+
103
def _ssb_case_rows(n: int, seed: int) -> list[dict]:
    """Return one ``ssb_cases.csv`` row per triple from ``generate_triples(n, seed=seed)``.

    Each row records the case ids of the triple's query, analog and distractor
    under a ``ssb_<seed>_<index>`` triple id.
    """
    return [
        {
            "triple_id": f"ssb_{seed}_{index}",
            "query_id": item.query.case_id,
            "analog_id": item.analog.case_id,
            "distractor_id": item.distractor.case_id,
        }
        for index, item in enumerate(generate_triples(n, seed=seed))
    ]
115
+
116
+
117
def _mapping_rows(n: int, seed: int) -> list[dict]:
    """Match each generated triple's analog against its query and return ``mapping_runs.csv`` rows.

    A single canonicalizer is built from all triples and passed to every
    ``match_cases`` call. Scores are formatted to six decimals; ``gap`` is left
    empty when the match reports no optimality gap.
    """
    from sma.eval.ssb_eval import ssb_config
    from sma.eval.ssb_generator import build_canonicalizer

    def _as_row(item) -> dict:
        # A fresh config per match, exactly as each call site did before.
        result = match_cases(item.analog, item.query, config=ssb_config(), canon=shared_canon)
        gap = result.optimality_gap
        return {
            "run_id": "ssb_forced_choice_oracle_mapping",
            "base_id": item.analog.case_id,
            "target_id": item.query.case_id,
            "score": f"{result.score:.6f}",
            "ses_n": f"{result.normalized_score:.6f}",
            "n_correspondences": len(result.correspondences),
            "gap": f"{gap:.6f}" if gap is not None else "",
        }

    generated = generate_triples(n, seed=seed)
    shared_canon = build_canonicalizer(generated)
    return [_as_row(item) for item in generated]
138
+
139
+
140
def run_fixture_eval(
    library_n: int = 12, mac_prefilter_n: int = 1000, include_loghub: bool = False
) -> dict[str, list[dict]]:
    """Run the SSB fixture evaluations and return rows for every CSV artifact.

    Args:
        library_n: Size passed to the library evaluation and the SSB case listing.
        mac_prefilter_n: Size passed to the MAC-prefilter evaluation.
        include_loghub: When true, also run the LogHub evaluation for the triage
            table. Otherwise, or if that run returns no rows, a single
            ``awaiting_run`` placeholder row is emitted.

    Returns:
        Mapping of CSV file name to its list of row dicts. Artifacts that have
        no real data yet carry placeholder rows.
    """

    def _latency_row(stats) -> dict:
        # All three evaluations report latency in the same shape.
        return {
            "operation": stats["operation"],
            "n_cases": stats["n_cases"],
            "p50_ms": f"{stats['p50_ms']:.3f}",
            "p95_ms": f"{stats['p95_ms']:.3f}",
        }

    forced = evaluate_forced_choice(12, seed=11)
    library = evaluate_library(library_n, seed=19, k=10, shortlist=library_n * 2, fac_budget=50)
    large_mac = evaluate_library_mac_prefilter(mac_prefilter_n, seed=23, k=10)

    retrieval_rows: list[dict] = forced.rows + library["sma_rows"] + large_mac["sma_rows"]
    ssb_case_rows = _ssb_case_rows(library_n, seed=19)
    mapping_rows = _mapping_rows(12, seed=11)

    triage_rows: list[dict] = []
    if include_loghub:
        # Imported lazily so the default run does not load the LogHub evaluation.
        from sma.eval.loghub_eval import run_loghub_eval

        triage_rows = run_loghub_eval()
    if not triage_rows:
        pending = "awaiting_run"
        triage_rows = [
            {
                "dataset": "LogHub",
                "split": "HDFS_MVP_diagnostic",
                "method": "SMA",
                "macro_f1": pending,
                "label_hit_rate@1": pending,
                "label_hit_rate@5": pending,
                "label_hit_rate@10": pending,
                "p50_ms": "0.000",
                "p95_ms": "0.000",
            }
        ]

    artifacts: dict[str, list[dict]] = {
        "ssb_cases.csv": ssb_case_rows,
        "retrieval_runs.csv": retrieval_rows,
        "mapping_runs.csv": mapping_rows,
        "ssb_metrics.csv": [forced.metrics] + library["metrics"] + large_mac["metrics"],
        "latency.csv": [
            _latency_row(forced.latency),
            _latency_row(library["latency"]),
            _latency_row(large_mac["latency"]),
        ],
        "dataset_manifest.csv": dataset_manifest_rows(),
        "triage_metrics.csv": triage_rows,
        "inference_reviews.csv": [
            {"case_id": "", "inference": "", "precision_label": "awaiting_human_review", "provenance": ""}
        ],
        "drift_runs.csv": [
            {"variant": "sma", "horizon": 20, "state_f1": "awaiting_full_protocol", "contradiction_rate": ""}
        ],
        "ablation_metrics.csv": [
            {"ablation": "gamma_0", "metric": "awaiting_full_eval", "value": ""}
        ],
        "sage_pool_stats.csv": [
            {"pool_id": "fixture", "n_generalizations": "", "n_outliers": "", "schema_f1": ""}
        ],
        "calibration.csv": [
            {"parameter": "gamma", "value": 0.25, "ci_low": "", "ci_high": "", "source": "draft_default"},
            {"parameter": "rho", "value": 0.5, "ci_low": "", "ci_high": "", "source": "draft_default"},
            {"parameter": "delta", "value": 2, "ci_low": "", "ci_high": "", "source": "draft_default"},
        ],
        "cost_energy.csv": [{"system": "SMA", "llm_tokens": 0, "cpu_seconds": "", "usd_estimate": 0}],
    }
    return artifacts
217
+
218
+
219
def _load_csv(path: pathlib.Path) -> list[dict]:
    """Read the CSV at *path* into a list of row dicts keyed by the header row.

    Returns an empty list when the file does not exist, so callers can treat a
    missing artifact as "no rows".
    """
    if not path.exists():
        return []
    # newline="" hands line endings to the csv module untranslated. Without it,
    # "\r\n" inside a quoted field is rewritten to "\n" on read. write_csv()
    # already opens its files this way.
    with path.open(encoding="utf-8", newline="") as fh:
        return list(csv.DictReader(fh))
224
+
225
+
226
def _table(rows: list[dict], columns: list[str] | None = None, limit: int | None = None) -> str:
    """Render *rows* as an HTML ``<table>`` with every cell and header escaped.

    Args:
        rows: Row dicts. An empty list renders a "missing artifact" notice.
        columns: Columns to show, in order. Defaults to the keys of the first row.
        limit: Maximum number of rows to render. ``None`` or ``0`` shows all
            rows. When rows are cut off, a "showing X of Y rows" note follows
            the table.

    Returns:
        The HTML fragment. A row gets ``class="alert"`` when its ``method``
        contains "alert" (case-insensitive) or its ``dataset`` is ``DIAGNOSTIC``.
    """
    if not rows:
        return '<p class="missing">artifact not present — run not yet executed</p>'
    columns = columns or list(rows[0].keys())
    shown = rows[:limit] if limit else rows

    def _cell(value) -> str:
        # csv.DictReader fills short rows with None; show those as empty cells
        # instead of the literal text "None".
        return html.escape("" if value is None else str(value))

    body = []
    for row in shown:
        is_alert = "alert" in str(row.get("method", "")).lower() or row.get("dataset") == "DIAGNOSTIC"
        cls = ' class="alert"' if is_alert else ""
        cells = "".join(f"<td>{_cell(row.get(c, ''))}</td>" for c in columns)
        body.append(f"<tr{cls}>{cells}</tr>")

    note = f'<p class="missing">showing {limit} of {len(rows)} rows</p>' if limit and len(rows) > limit else ""
    head = "".join(f"<th>{html.escape(c)}</th>" for c in columns)
    return "<table><thead><tr>" + head + "</tr></thead><tbody>" + "".join(body) + "</tbody></table>" + note
241
+
242
+
243
def _h3_summary(rows: list[dict]) -> tuple[str, str]:
    """Summarise the H3 study rows as two HTML tables: per LLM and per memory mode.

    Every row is counted for its ``llm``; rows whose ``llm`` is ``"deepseek"``
    are also counted for their ``mode``. The ``answerable`` and
    ``auto_abstained`` columns are true only when they hold the string
    ``"True"``. Each bucket tracks:

    - ``un`` / ``ua``: unanswerable cells, and how many of those abstained;
    - ``an`` / ``aa``: answerable cells, and how many of those did not abstain.

    Returns:
        ``(per_llm_table, per_mode_table)``, or two empty strings for no rows.
    """
    if not rows:
        return "", ""

    def _zeroed() -> dict[str, int]:
        return {"ua": 0, "un": 0, "aa": 0, "an": 0}

    by_llm: dict[str, dict[str, int]] = defaultdict(_zeroed)
    by_mode: dict[str, dict[str, int]] = defaultdict(_zeroed)

    for record in rows:
        did_abstain = record["auto_abstained"] == "True"
        is_answerable = record["answerable"] == "True"

        targets = [by_llm[record["llm"]]]
        if record["llm"] == "deepseek":
            targets.append(by_mode[record["mode"]])

        for counts in targets:
            if is_answerable:
                counts["an"] += 1
                if not did_abstain:
                    counts["aa"] += 1
            else:
                counts["un"] += 1
                if did_abstain:
                    counts["ua"] += 1

    def _summary_row(label_key: str, label: str, counts: dict[str, int]) -> dict:
        return {
            label_key: label,
            "abstained on unanswerable": f'{counts["ua"]}/{counts["un"]}',
            "answered answerable": f'{counts["aa"]}/{counts["an"]}',
        }

    llm_rows = [_summary_row("llm", name, counts) for name, counts in sorted(by_llm.items())]
    mode_rows = [
        _summary_row("mode (DeepSeek only)", name, counts) for name, counts in sorted(by_mode.items())
    ]
    return _table(llm_rows), _table(mode_rows)
268
+
269
+
270
# Stylesheet inlined into the <style> element of the page built by
# render_html(). Includes the `missing` and `alert` classes emitted by _table().
REPORT_CSS = """
body{font-family:Inter,system-ui,Arial,sans-serif;margin:0;color:#1f2933;background:#f8fafc;line-height:1.55}
.wrap{max-width:1080px;margin:0 auto;padding:36px 28px 80px}
h1{font-size:30px;margin:0 0 2px} h2{font-size:21px;margin:44px 0 8px;border-bottom:2px solid #dbeafe;padding-bottom:4px}
h3{font-size:15px;margin:22px 0 6px;color:#334155}
p,li{font-size:14px} .sub{color:#52606d;margin-top:2px}
table{border-collapse:collapse;width:100%;background:#fff;margin:10px 0 4px}
th,td{border:1px solid #d9e2ec;padding:6px 9px;font-size:12.5px;text-align:left;vertical-align:top;color:#1f2933}
th{background:#eef2f7;font-size:11.5px;text-transform:uppercase;letter-spacing:.04em;color:#334155}
tr.alert td{background:#fef2f2;color:#991b1b}
.win{background:#ecfdf5}
.verdict{border-left:4px solid #2563eb;background:#eff6ff;padding:10px 14px;margin:12px 0;font-size:14px}
.warn{border-left:4px solid #d97706;background:#fffbeb;padding:10px 14px;margin:12px 0;font-size:14px}
.missing{color:#9a3412;font-size:13px;font-style:italic}
.toc{background:#fff;border:1px solid #d9e2ec;border-radius:10px;padding:14px 22px;margin:18px 0}
.toc a{color:#1d4ed8;text-decoration:none;font-size:13.5px}
code{background:#eef2f7;padding:1px 5px;border-radius:4px;font-size:12.5px}
.kpis{display:grid;grid-template-columns:repeat(auto-fit,minmax(190px,1fr));gap:12px;margin:16px 0}
.kpi{background:#fff;border:1px solid #d9e2ec;border-radius:10px;padding:12px 14px}
.kpi .v{font-size:24px;font-weight:700;color:#1d4ed8} .kpi .l{font-size:11.5px;color:#52606d;text-transform:uppercase;letter-spacing:.04em}
"""
291
+
292
+
293
+ def render_html(reports_dir: pathlib.Path | str = "reports") -> str:
294
+ reports = pathlib.Path(reports_dir)
295
+ triage = _load_csv(reports / "triage_metrics.csv")
296
+ triage_mdl = _load_csv(reports / "triage_metrics_mdl.csv")
297
+ transfer = _load_csv(reports / "transfer_metrics.csv")
298
+ holdout = _load_csv(reports / "transfer_holdout_metrics.csv")
299
+ controls = _load_csv(reports / "transfer_controls_metrics.csv")
300
+ ladder = _load_csv(reports / "baseline_ladder_metrics.csv")
301
+ family = _load_csv(reports / "family_metrics.csv")
302
+ h3 = _load_csv(reports / "h3_mini_study.csv")
303
+ ssb = _load_csv(reports / "ssb_metrics.csv")
304
+ latency = _load_csv(reports / "latency.csv")
305
+ manifest = _load_csv(reports / "dataset_manifest.csv")
306
+ calibration = _load_csv(reports / "calibration.csv")
307
+ h3_llm_table, h3_mode_table = _h3_summary(h3)
308
+
309
+ try:
310
+ from sma.encoders.logs_drain import LogEncoder
311
+
312
+ encoder_version = LogEncoder.version
313
+ except Exception:
314
+ encoder_version = "unknown"
315
+ generated = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
316
+
317
+ # The narrative below is maintained alongside the experiments; every claim
318
+ # cites the artifact table rendered next to it.
319
+ return f"""<!doctype html>
320
+ <html><head><meta charset="utf-8"><title>SMA-1 Report &amp; Memory Map</title>
321
+ <style>{REPORT_CSS}</style></head><body><div class="wrap">
322
+
323
+ <h1>SMA-1: Structure-Mapping Agentic Memory — Report &amp; Memory Map</h1>
324
+ <p class="sub">Generated {generated} · logs encoder v{encoder_version} · scorer default <code>ses</code> (MDL as ablation, ADR-004) ·
325
+ all 11 test gates passing · ledger: <code>docs/STATUS.md</code> · design contract: <code>structure_mapping_agentic_memory_blueprint.md</code></p>
326
+
327
+ <div class="kpis">
328
+ <div class="kpi"><div class="v">0.955</div><div class="l">HDFS triage F1 (SMA, best of 4 methods)</div></div>
329
+ <div class="kpi"><div class="v">0.938</div><div class="l">Held-out transfer BGL&rarr;Spirit, 3-seed mean (dense: 0.36)</div></div>
330
+ <div class="kpi"><div class="v">46/50</div><div class="l">honest abstentions (DeepSeek verbalizer, H3)</div></div>
331
+ <div class="kpi"><div class="v">~2000&times;</div><div class="l">matcher speedup (5&nbsp;min &rarr; 181&nbsp;ms worst case)</div></div>
332
+ </div>
333
+
334
+ <div class="toc"><b>Contents</b><br>
335
+ <a href="#sys">1. What this system is</a> ·
336
+ <a href="#method">2. Methodology</a> ·
337
+ <a href="#within">3. Within-system results</a> ·
338
+ <a href="#scorer">4. Scorer ablation (SES vs MDL)</a> ·
339
+ <a href="#transfer">5. Cross-system transfer + held-out + controls</a> ·\n<a href="#ladder">5d. Baseline ladder</a> ·\n<a href="#family">5e. Family metric</a> ·
340
+ <a href="#h3">6. H3 honesty study</a> ·
341
+ <a href="#perf">7. Engineering record</a> ·
342
+ <a href="#caveats">8. Caveats &amp; open issues</a> ·
343
+ <a href="#next">9. Next steps</a> ·
344
+ <a href="#appendix">10. Appendix: raw artifacts</a></div>
345
+
346
+ <h2 id="sys">1. What this system is, and why</h2>
347
+ <p>SMA-1 is an agentic memory whose retrieval is governed by <b>structure mapping</b> (Gentner's SME: match
348
+ hypotheses &rarr; kernels &rarr; merge &rarr; structural evaluation), not word similarity. Raw artifacts enter memory only
349
+ through <b>deterministic Tier-0 encoders</b> (no LLM, no statistics in the extraction path — bit-identical output
350
+ forever); an LLM (local Qwen2.5-0.5B or DeepSeek API) sits strictly downstream and only <i>verbalizes</i> retrieved
351
+ evidence. The bet under test: <b>structure beats surface when vocabulary shifts</b> (new services, new systems,
352
+ renamed components), and provenance-disciplined retrieval prevents the confident-wrong answers that generative
353
+ memory produces. Four baselines mirror this in every experiment: BM25 (lexical), Dense RAG
354
+ (all-MiniLM-L6-v2 embeddings), a KG entity-overlap proxy, and context-only stuffing.</p>
355
+
356
+ <h2 id="method">2. Methodology (how every number below was produced)</h2>
357
+ <h3>Datasets &amp; sessionization</h3>
358
+ <ul>
359
+ <li><b>HDFS v1</b> (11.17M lines): one case per block id; first-occurrence timestamps from a full no-cap scan;
360
+ labels from the separate <code>anomaly_label.csv</code>.</li>
361
+ <li><b>BGL</b> (4.75M lines): 60-second windows per node, sessions &lt;3 lines discarded, two-pass stream-and-sample.
362
+ <b>Label-leak fix:</b> the leading alert-category column (the ground-truth label) is stripped from extracted text —
363
+ an early run leaked it to every retriever and was discarded.</li>
364
+ <li><b>Thunderbird</b> (re-downloaded, md5-verified): BGL-style sessionization, label column stripped, first 20M lines
365
+ (documented cap), streamed from tar.gz.</li>
366
+ <li><b>OpenStack</b>: sessions per VM instance uuid; normal/abnormal source files have disjoint instance sets; the
367
+ source-filename token (which encodes the label) is stripped.</li>
368
+ </ul>
369
+ <h3>Protocol</h3>
370
+ <ul>
371
+ <li>1,000 sessions per dataset (500 Anomaly / 500 Normal), stratified over 5 temporal bins, seed 42.</li>
372
+ <li>Within-system: 80/20 index/query split (seed 101). Cross-system: 800 index from system A, 200 queries from system B.</li>
373
+ <li>SMA retrieval budgets: MAC shortlist 40, FAC budget 20 (UI: 200/30). Label prediction = score-weighted vote of top-5.</li>
374
+ <li>Metrics: macro-F1 of the vote; <code>label_hit_rate@k</code> = retrieved-same-label / min(k, |relevant|); p50/p95 latency per query.
375
+ Automated diagnostic alerts flag collapse (single-class predictions), suspicious perfection, and F1=0.</li>
376
+ </ul>
377
+
378
+ <h2 id="within">3. Within-system results (LogHub MVP diagnostic)</h2>
379
+ <p><b>Why:</b> establishes the baseline regime where surface methods are expected to be strong (H2 is a parity claim,
380
+ not a win claim). <b>Result:</b> SMA dominates HDFS (+13.6 F1 pts over best baseline) because HDFS anomalies live in
381
+ event <i>patterns</i>; SMA trails on BGL where anomalous messages are lexically self-announcing ("KERNEL FATAL") —
382
+ reported as an honest H2 miss on BGL.</p>
383
+ {_table(triage, limit=12)}
384
+
385
+ <h2 id="scorer">4. Scorer ablation: SES vs MDL (ADR-004)</h2>
386
+ <p><b>What prompted it:</b> a live UI test showed SMA retrieving the right <i>class</i> (anomalies) but missing the
387
+ asked-for failure <i>family</i> (EOFException write-pipeline deaths): SES weights all matched relations roughly equally,
388
+ so abundant common matches swamp the one rare template that identifies the family. Lexical methods do rare-term
389
+ weighting implicitly; the blueprint's sanctioned answer is the MDL scorer (rare shared structure compresses more), and
390
+ ad-hoc IDF patching of SES is forbidden by the no-heuristic-weights mandate.</p>
391
+ <div class="verdict"><b>Verdict:</b> SES wins aggregate triage (HDFS 0.9549 vs 0.8933; BGL tied); MDL uniquely recovers
392
+ rare failure families (EOF family in top-5: SES 0/5 — even with verbatim log lines in the query — vs MDL 3/5 from prose).
393
+ Neither dominates &rarr; SES stays default, MDL is a first-class toggle in the UI and a reported ablation. Decision record:
394
+ <code>docs/ADR/004-scorer-ablation-ses-vs-mdl.md</code>.</div>
395
+ <h3>MDL run (same protocol as section 3)</h3>
396
+ {_table(triage_mdl, limit=12)}
397
+
398
+ <h2 id="transfer">5. Cross-system transfer — the H1 experiment</h2>
399
+ <p><b>Why this is the main event:</b> within one system, word-matching is genuinely strong; the hypothesis (H1) is that
400
+ structure transfers <i>across</i> systems where vocabulary doesn't. <b>Run 1 (encoder v0.1.x) was negative:</b> SMA
401
+ vote-collapsed (0.333) on BGL&rarr;Thunderbird while Dense RAG scored 0.741. The prescribed diagnostic decomposition
402
+ found the cause: across systems the encoder shared only <b>4 functor types</b> (before/count/component/logSession) —
403
+ all event types were content-hash template names, system-specific by construction. The matcher cannot map events the
404
+ encoder names incomparably. <b>Fix (encoder v0.2.0):</b> deterministic keyword-driven cross-system event classes
405
+ (timeoutEvent, ioEvent, kernelEvent, networkEvent, storageEvent, lifecycleEvent, failureEvent, ...) emitted alongside
406
+ the precise template hashes — rules are ordered data, zero statistics, fully Tier-0.</p>
407
+ <div class="verdict"><b>Run 2 (v0.2.0): BGL&rarr;Thunderbird — SMA 0.9093 macro-F1 (hit@1 0.9100) vs Dense RAG 0.7407,
408
+ BM25 0.5489, KG 0.3552.</b> From collapse to best-in-class by +17 F1 points, purely from giving structure a comparable
409
+ vocabulary: the H1 pattern. HDFS&rarr;OpenStack remains an all-methods wall (~coin-flip) under both encoders — OpenStack
410
+ anomalies appear to be missing-events rather than error-events, a task-design question.</div>
411
+ <h3>All transfer rows (run 1 = the preserved negative, then run 2)</h3>
412
+ {_table(transfer)}
413
+
414
+ <h3>5b. Held-out confirmation: Spirit (ontology frozen at tag ontology-v1 BEFORE download)</h3>
415
+ <p><b>Why:</b> the v2 ontology was written after observing the v1 failure — a reviewer would call it post-hoc.
416
+ The held-out protocol: hash-freeze the rules, then download an untouched system (Spirit, USENIX CFDR) and run
417
+ multi-seed. <b>Result:</b> BGL&rarr;Spirit SMA 0.9200/0.9650/0.9300 over seeds 42/7/19 (mean 0.938) vs Dense RAG
418
+ mean 0.356; MDL leg 0.9100. HDFS&rarr;Spirit fails (0.3775) like HDFS&rarr;OpenStack — transfer holds within the
419
+ infrastructure failure-physics family, not across app-vs-infra families (honest scope).</p>
420
+ {_table(holdout)}
421
+
422
+ <h3>5c. The decisive controls: is it the representation or the matcher?</h3>
423
+ <p><b>Why:</b> the ladder showed generic WL graph similarity on SMA's own extraction BEATS full SME within-system
424
+ (0.9799 vs 0.9549 on HDFS, at ~1ms). If WL also transferred, the matcher would be dead weight. <b>Result:</b> on the
425
+ identical BGL&rarr;Spirit sets, WL reaches only 0.6239 and the production stack (Hybrid+Rerank) 0.5947, vs SMA 0.9200.
426
+ Decomposition: representation necessary (v1 collapse), not sufficient (WL +0.27 over dense), SME alignment adds the
427
+ remaining +0.31. Design implication adopted: tiered retrieval — WL prefilter within-system, SME for cross-system,
428
+ provenance, and candidate inference.</p>
429
+ {_table(controls)}
430
+
431
+ <h2 id="ladder">5d. Production-RAG baseline ladder (within-system, seed 42)</h2>
432
+ <p>Hybrid RRF (BM25+BGE fusion), cross-encoder reranking, BGE-base, SPLADE, the WL-kernel control, and a
433
+ long-context frontier-LLM baseline (top-20 candidates stuffed into DeepSeek). HDFS: SMA 0.9549 beats the entire
434
+ ladder. BGL: hybrid/BGE/SPLADE saturate (~1.0) — lexically overt anomalies, reported honestly. Latency columns from
435
+ this batch ran CPU-contended and are not citable.</p>
436
+ {_table(ladder)}
437
+
438
+ <h2 id="family">5e. Failure-family metric (depth beyond binary triage)</h2>
439
+ <p><b>Why:</b> "retrieved an anomaly" is shallow; the enterprise question is whether retrieval surfaces the correct
440
+ <i>root-cause family</i> (EOFException-family vs replication-family vs kernel-MCE...). Families derived
441
+ deterministically (HDFS: failure-line signatures; BGL: alert-category column read from raw logs for ground truth
442
+ only). <b>Result:</b> HDFS family-hit@1 SMA-ses 0.9057 vs BM25 0.6226, dense 0.4906 — SMA finds the right family.
443
+ BGL: dense 0.9623 &gt; SMA 0.68 (alert families are lexically marked). ADR-004 revision: SES beats MDL on aggregate
444
+ family-hit (0.9057 vs 0.8396) — the EOF rare-family anecdote does not generalize; rare-family-stratified analysis
445
+ queued before any default change.</p>
446
+ {_table(family)}
447
+
448
+ <h2 id="h3">6. H3 honesty study (verifiable answers vs confabulation)</h2>
449
+ <p><b>Design:</b> 20 authored questions over the 5,000-session HDFS corpus — 10 answerable from session evidence,
450
+ 10 unanswerable (false premises, wrong domains, beyond-window outcomes) — &times; 5 memory modes &times; 2 verbalizers
451
+ = 200 cells. Mechanical abstention detection (regex, conservative); human rating columns left blank in
452
+ <code>reports/h3_mini_study.csv</code>. Every prompt carries the window-boundary caveat ("absence of an event in the
453
+ evidence is NOT evidence it did not happen").</p>
454
+ <div class="verdict"><b>Finding 1:</b> honesty is a property of the verbalizer — DeepSeek abstained on 46/50
455
+ unanswerable cells (10/10 under sma and kg); the local 0.5B abstained on 1/50, fabricating ZooKeeper crash-loops and
456
+ on-call response times. <b>Finding 2:</b> retrieval decides whether honesty is useful — DeepSeek also correctly declined
457
+ most <i>answerable</i> questions (15/50 answered; sma-SES 1/10) because prose-only top-5 evidence rarely contained the
458
+ asked-for family — the ADR-004 result appearing in a third independent instrument.</div>
459
+ {h3_llm_table}
460
+ <h3>DeepSeek cells per memory mode</h3>
461
+ {h3_mode_table}
462
+ <h3>LLM-judge pass (all 200 cells, audit trail in h3_judged.csv)</h3>
463
+ <p>Every answer judged against deterministically reconstructed evidence under a written rubric (correctness,
464
+ confabulation, unsupported-claim count, confidence flags). <b>DeepSeek: 99% judged-correct, 0 invented entities,
465
+ 0.02 mean unsupported claims/answer; local Qwen-0.5B: 2% correct, 18% confabulation, 0.32 unsupported claims,
466
+ 0/100 abstentions</b> — including invented statistics, fabricated log-line formats, and impossible dates. The
467
+ auto-abstention regex was validated against the judge (precision 0.96 / recall 0.87). Judge-found pipeline bug
468
+ fixed: a 400-char evidence cap was truncating exactly the anomaly lines questions ask about (now 900). The 40
469
+ SMA-mode rows carry reduced judge confidence (encoder changed post-study) — flagged for human spot-check.</p>
470
+
471
+ <h2 id="perf">7. Engineering record (what was fixed, why it matters)</h2>
472
+ <ul>
473
+ <li><b>Matcher hot path (~2000&times;):</b> <code>Kernel.bindings</code> was a property rebuilt (with full statement
474
+ re-serialization) on every access inside O(k&sup2;) merge loops &rarr; cached tables; MH seeding capped per canonical functor
475
+ group (U-ordered, identical statements first — the blueprint &sect;10.2 tripwire); root-MH-only kernels per &sect;2.2.
476
+ Worst-case 120-line session: &gt;5 min &rarr; 181 ms. Canonical battery unchanged (G2 green).</li>
477
+ <li><b>Certified retrieval fix:</b> the MAC/FAC early-stop compared a raw-score bound to the k-th raw score while ranking
478
+ by normalized ses_n &rarr; bound now converted to ses_n units (admissible since the normalizer &ge; query self-score);
479
+ the <code>certified</code> flag is honest under FAC budget truncation.</li>
480
+ <li><b>Encoder v0.1.1 &rarr; v0.2.0:</b> cause/enables now require antecedent-before-consequent (the Python contradicted its
481
+ own rules/logs.yaml); v0.2.0 added the cross-system event ontology (&sect;5 above).</li>
482
+ <li><b>Eval integrity:</b> BGL + Thunderbird label columns and the OpenStack source-filename token stripped (three
483
+ separate label-leak vectors closed); SSB generator vocabulary collisions fixed (per-triple namespaces);
484
+ content addressing is blake3-only (no silent hash fallback).</li>
485
+ <li><b>Orchestration:</b> chat-format local LLM calls with repeat-penalty (fixes looping); evidence prompts carry numbered
486
+ text only (no provenance hashes — small models parrot them) + the window caveat; DeepSeek backend via httpx with key in
487
+ git-ignored <code>.env</code>.</li>
488
+ </ul>
489
+ <h3>Latency snapshots</h3>
490
+ {_table(latency)}
491
+
492
+ <h2 id="caveats">8. Honest caveats &amp; open issues</h2>
493
+ <ul>
494
+ <li><b>Single seed everywhere.</b> No confidence intervals yet; the consolidated pre-freeze batch adds multi-seed +
495
+ paired bootstrap.</li>
496
+ <li><b>The transfer ontology is post-hoc.</b> Event-class rules were written after observing the run-1 failure (legitimate
497
+ pre-freeze diagnostics, not a preregistered result); needs held-out confirmation on new seeds/windows and a third pair.</li>
498
+ <li><b>BGL within-system is an honest H2 miss</b> (surface methods near-perfect on lexically overt anomalies).</li>
499
+ <li><b>HDFS&rarr;OpenStack wall:</b> no method beats coin-flip; investigate whether OpenStack anomaly semantics
500
+ (missing cleanup events) are visible to <i>any</i> retrieval formulation before re-attempting.</li>
501
+ <li><b>Short-session ses_n bias:</b> normalization favors small self-scores; revisit at calibration time.</li>
502
+ <li><b>SMA p95 latency</b> on large BGL windows (~2s) exceeds comfort; tripwire ladder not yet exhausted.</li>
503
+ <li><b>SSB far-vocabulary circularity:</b> the synthetic benchmark's <code>far_</code> renaming is known to the
504
+ canonicalizer; treat SSB as machinery verification only until real disjoint lexicons are implemented.</li>
505
+ <li><b>mdl_gain is an MVP</b> (within-target costs over matched functor types); corpus-level code-length costs are the
506
+ designed upgrade.</li>
507
+ <li><b>H3 auto-abstention is regex-based</b>; human rating pass pending (columns provided).</li>
508
+ </ul>
509
+
510
+ <h2 id="next">9. Next steps (agreed sequencing)</h2>
511
+ <ol>
512
+ <li><b>Consolidated pre-freeze eval batch:</b> hybrid RRF (+cross-encoder reranker) baselines, multi-seed transfer
513
+ confirmation including the MDL leg, third transfer pair, long-context B6 designed against the H3 findings.</li>
514
+ <li><b>Calibration then freeze</b> (&gamma;, &rho;, &delta;, &theta; on validation only), pre-registration tag, then single test-set runs.</li>
515
+ <li>LogHub-2k oracle template validation (G3 full gate); SME-v4 25-pair oracle battery (G2 full gate).</li>
516
+ <li>Drift protocol T5 (the agentic claim), BugsInPy (T3), ablation battery (&gamma;=0 first).</li>
517
+ </ol>
518
+
519
+ <h2 id="appendix">10. Appendix: remaining raw artifacts</h2>
520
+ <h3>Dataset manifest (checksums)</h3>
521
+ {_table(manifest)}
522
+ <h3>SSB fixture metrics</h3>
523
+ {_table(ssb)}
524
+ <h3>Calibration placeholders (pre-freeze)</h3>
525
+ {_table(calibration)}
526
+
527
+ </div></body></html>
528
+ """
529
+
530
+
531
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: optionally run the evaluation, then render the report.

    Unless ``--html-only`` is given, the fixture evaluation is run and each
    table it returns is written as a CSV file next to the report. The HTML
    report is then rendered from the report's directory and written to
    ``--out``; the output path is printed afterwards.

    Args:
        argv: Argument list handed to ``argparse``; ``None`` is passed through
            unchanged so argparse falls back to its own default.

    Returns:
        Always ``0``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--out", default="reports/report.html")

    # Integer-valued options: (flag, default, help text).
    sized_options = (
        (
            "--ssb-library-n",
            12,
            "Number of SSB triples for the FAC-backed full-library run. Default 12 gives 24 library cases.",
        ),
        (
            "--ssb-mac-prefilter-n",
            1000,
            "Number of SSB triples for the MAC-stage candidate-generation diagnostic.",
        ),
    )
    for flag, default_value, description in sized_options:
        cli.add_argument(flag, type=int, default=default_value, help=description)

    # Boolean switches: (flag, help text).
    switches = (
        (
            "--skip-loghub",
            "Skip the long-running HDFS/BGL LogHub evaluation (SSB fixtures only).",
        ),
        (
            "--html-only",
            "Re-render report.html from existing CSVs without running any evaluation.",
        ),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)

    options = cli.parse_args(argv)

    report_path = pathlib.Path(options.out)
    report_dir = report_path.parent
    report_dir.mkdir(parents=True, exist_ok=True)

    if not options.html_only:
        csv_tables = run_fixture_eval(
            library_n=options.ssb_library_n,
            mac_prefilter_n=options.ssb_mac_prefilter_n,
            include_loghub=not options.skip_loghub,
        )
        # Each key is used as a file name inside the report directory.
        for file_name, table_rows in csv_tables.items():
            write_csv(report_dir / file_name, table_rows)

    html = render_html(report_dir)
    report_path.write_text(html, encoding="utf-8")
    print(report_path)
    return 0
570
+
571
+
572
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)