structuremappingmemory 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. sma/__init__.py +5 -0
  2. sma/__main__.py +5 -0
  3. sma/agent/__init__.py +5 -0
  4. sma/agent/adapter_draft.py +217 -0
  5. sma/agent/api.py +67 -0
  6. sma/agent/comparison.py +591 -0
  7. sma/agent/llm.py +280 -0
  8. sma/agent/policies.py +21 -0
  9. sma/agent/service.py +95 -0
  10. sma/cli.py +65 -0
  11. sma/encoders/__init__.py +38 -0
  12. sma/encoders/agentobs.py +27 -0
  13. sma/encoders/base.py +23 -0
  14. sma/encoders/code_treesitter.py +64 -0
  15. sma/encoders/coverage.py +80 -0
  16. sma/encoders/draft_adapter.py +183 -0
  17. sma/encoders/healthcare.py +207 -0
  18. sma/encoders/logs_drain.py +142 -0
  19. sma/encoders/prose_tier1.py +57 -0
  20. sma/encoders/structured.py +57 -0
  21. sma/encoders/traces.py +45 -0
  22. sma/eval/__init__.py +2 -0
  23. sma/eval/agentic/__init__.py +35 -0
  24. sma/eval/agentic/arms/__init__.py +0 -0
  25. sma/eval/agentic/arms/cyber.py +48 -0
  26. sma/eval/agentic/arms/discovery.py +35 -0
  27. sma/eval/agentic/arms/finance.py +38 -0
  28. sma/eval/agentic/arms/legal.py +74 -0
  29. sma/eval/agentic/arms/medicine.py +45 -0
  30. sma/eval/agentic/harness.py +275 -0
  31. sma/eval/agentic/memories.py +308 -0
  32. sma/eval/agentic/metrics.py +82 -0
  33. sma/eval/agentic_qa/__init__.py +27 -0
  34. sma/eval/agentic_qa/agent.py +383 -0
  35. sma/eval/agentic_qa/metrics.py +239 -0
  36. sma/eval/agentic_qa/pools.py +197 -0
  37. sma/eval/arn.py +65 -0
  38. sma/eval/baselines/__init__.py +6 -0
  39. sma/eval/baselines/bge_dense.py +54 -0
  40. sma/eval/baselines/bm25.py +18 -0
  41. sma/eval/baselines/dense.py +42 -0
  42. sma/eval/baselines/hipporag.py +235 -0
  43. sma/eval/baselines/hybrid_rrf.py +30 -0
  44. sma/eval/baselines/longcontext_llm.py +124 -0
  45. sma/eval/baselines/rerank.py +41 -0
  46. sma/eval/baselines/splade.py +77 -0
  47. sma/eval/baselines/wl_kernel.py +163 -0
  48. sma/eval/bugsinpy.py +358 -0
  49. sma/eval/bugsinpy_families.py +164 -0
  50. sma/eval/crossdomain.py +89 -0
  51. sma/eval/diabetes.py +61 -0
  52. sma/eval/drift_env.py +26 -0
  53. sma/eval/drift_metrics.py +24 -0
  54. sma/eval/family_labels.py +167 -0
  55. sma/eval/fraud_elliptic/__init__.py +29 -0
  56. sma/eval/fraud_elliptic/encoder.py +279 -0
  57. sma/eval/fraud_elliptic/eval.py +269 -0
  58. sma/eval/fraud_elliptic/test_encoder.py +123 -0
  59. sma/eval/ieee_cis.py +66 -0
  60. sma/eval/loghub.py +16 -0
  61. sma/eval/loghub_eval.py +480 -0
  62. sma/eval/longmemeval.py +51 -0
  63. sma/eval/memory_backends/__init__.py +2 -0
  64. sma/eval/memory_backends/base.py +22 -0
  65. sma/eval/memory_backends/context_only.py +14 -0
  66. sma/eval/memory_backends/rag_notes.py +17 -0
  67. sma/eval/memory_backends/shared_llm.py +30 -0
  68. sma/eval/memory_backends/sma_memory.py +54 -0
  69. sma/eval/memory_backends/zep_graphiti.py +33 -0
  70. sma/eval/metrics.py +32 -0
  71. sma/eval/ontology_bench.py +219 -0
  72. sma/eval/report.py +573 -0
  73. sma/eval/ssb_eval.py +216 -0
  74. sma/eval/ssb_generator.py +116 -0
  75. sma/eval/stats.py +108 -0
  76. sma/eval/transfer_eval.py +844 -0
  77. sma/index/__init__.py +15 -0
  78. sma/index/ann.py +21 -0
  79. sma/index/content_vectors.py +60 -0
  80. sma/index/inverted.py +63 -0
  81. sma/index/macfac.py +174 -0
  82. sma/ir/__init__.py +22 -0
  83. sma/ir/canon.py +106 -0
  84. sma/ir/schema.py +165 -0
  85. sma/ir/sexpr.py +86 -0
  86. sma/ir/signatures.py +76 -0
  87. sma/match/__init__.py +20 -0
  88. sma/match/conflicts.py +46 -0
  89. sma/match/engine.py +60 -0
  90. sma/match/explain.py +59 -0
  91. sma/match/infer.py +54 -0
  92. sma/match/kernels.py +54 -0
  93. sma/match/mdl.py +30 -0
  94. sma/match/merge_cpsat.py +77 -0
  95. sma/match/merge_greedy.py +15 -0
  96. sma/match/mh.py +177 -0
  97. sma/match/ses.py +84 -0
  98. sma/match/types.py +115 -0
  99. sma/match/verifier.py +27 -0
  100. sma/ontology/__init__.py +45 -0
  101. sma/ontology/attack.py +134 -0
  102. sma/ontology/cpc.py +69 -0
  103. sma/ontology/graph.py +58 -0
  104. sma/ontology/loader.py +262 -0
  105. sma/ontology/mitre_xml.py +67 -0
  106. sma/ontology/mount.py +101 -0
  107. sma/ontology/rdf_loader.py +75 -0
  108. sma/ontology/registry.py +115 -0
  109. sma/ontology/router.py +69 -0
  110. sma/ontology/usgaap.py +73 -0
  111. sma/sage/__init__.py +6 -0
  112. sma/sage/assimilate.py +12 -0
  113. sma/sage/pools.py +105 -0
  114. sma/sage/probabilities.py +10 -0
  115. sma/store/__init__.py +6 -0
  116. sma/store/lmdb_store.py +78 -0
  117. sma/store/registry.py +26 -0
  118. sma/store/wal.py +26 -0
  119. sma/ui/app.py +642 -0
  120. structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
  121. structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
  122. structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
  123. structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
  124. structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
  125. structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,197 @@
1
+ """Registered question pools for the Phase 5 LLM-QA "trustworthy specialist" phase.
2
+
3
+ This builds the three pre-registered question pools of
4
+ ``configs/preregistration_v2_llmqa.md`` from the flagship medicine arm
5
+ (HPO + ``phenotype.hpoa``), holding the index fixed so attribution stays clean:
6
+
7
+ * **answerable** — cases whose true disease IS indexed (the agent should answer + cite);
8
+ * **out-of-knowledge / novel** — cases whose true disease is HELD OUT of the index
9
+ (the agent should ABSTAIN *and* flag NOVEL). The held-out cases are, by
10
+ construction, both unanswerable and novel, so ``ook`` and ``novel`` are the
11
+ same list of :class:`QAItem`.
12
+
13
+ Each clinical case is a *hard* partial/imprecise observation generated exactly
14
+ like ``sma/eval/ontology_bench.run_arm``: sample a few of the disease's
15
+ phenotypes, climb 0-2 is-a levels (imprecision), and add a few noise terms, then
16
+ render the surviving terms as a natural-language presentation. Determinism: every
17
+ ``set`` is sorted to a list before use and the single RNG is explicitly seeded,
18
+ so identical ``(seed, n_index, ...)`` yields identical pools.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import random
24
+ from dataclasses import dataclass, field
25
+
26
+ from sma.ontology import MountedOntology
27
+
28
+
29
@dataclass
class QAItem:
    """One pre-registered LLM-QA case.

    Attributes:
        case_text: natural-language presentation shown to the agent.
        case_terms: ontology term ids (possibly climbed/noised) backing the
            presentation.
        gold_id: id of the true disease.
        gold_name: name of the true disease.
        answerable: True iff the gold disease is indexed.
        novel: True iff the gold disease was held out of the index.
    """

    case_text: str
    case_terms: frozenset[str]
    gold_id: str
    gold_name: str
    answerable: bool
    novel: bool
45
+
46
+
47
def _parse_hpoa(hpoa_path: str) -> dict[str, tuple[str, set[str]]]:
    """Parse ``phenotype.hpoa`` into ``disease_id -> (name, {hpo_term_id})``.

    Mirrors the flagship record construction (medicine arm /
    ``scripts/bench_ontology_suite.load_hpo_records``): skip header/comment
    lines, tab-split, keep only phenotypic-abnormality rows (column 10 == ``"P"``),
    and read column 0 as the disease id, column 1 as the disease name, column 3
    as the HPO term.

    Args:
        hpoa_path: path of the tab-separated ``phenotype.hpoa`` file (UTF-8).

    Returns:
        A dict keyed by disease id. Each value pairs the disease name seen on
        the FIRST kept row for that id with the set of HPO term ids collected
        from all of its kept rows.

    Raises:
        OSError: if the file cannot be opened.
    """
    rec: dict[str, tuple[str, set[str]]] = {}
    with open(hpoa_path, encoding="utf-8") as handle:
        for line in handle:
            # "#" lines are comments; "database_id" starts the column header.
            if line.startswith(("#", "database_id")):
                continue
            cols = line.rstrip("\n").split("\t")
            # Rows too short to carry column 10, or whose column 10 is not "P",
            # are ignored.
            if len(cols) < 11 or cols[10] != "P":
                continue
            # NOTE(review): column 2 is never inspected, so rows carrying a
            # qualifier there are kept like any other -- confirm this matches
            # the flagship loader.
            disease_id, disease_name, hpo_term = cols[0], cols[1], cols[3]
            # setdefault keeps the first-seen name; later rows only add terms.
            rec.setdefault(disease_id, (disease_name, set()))[1].add(hpo_term)
    return rec
68
+
69
+
70
def build_pools(
    mounted: MountedOntology,
    hpoa_path: str,
    *,
    seed: int = 7,
    n_index: int = 1500,
    n_answerable: int = 120,
    n_held: int = 120,
    n_calib: int = 60,
    min_ph: int = 7,
    max_ph: int = 30,
) -> dict:
    """Build the registered LLM-QA pools over the medicine (HPO) arm.

    ``mounted`` is the mounted HPO ontology; ``hpoa_path`` points at
    ``phenotype.hpoa``. Returns a dict with keys:

    * ``"index_items"`` — ``list[IndexItem]`` for the INDEXED diseases (the
      shared knowledge every memory is built over);
    * ``"answerable"`` — ``n_answerable`` :class:`QAItem`\\ s drawn from INDEXED
      diseases (``answerable=True, novel=False``);
    * ``"ook"`` / ``"novel"`` — ``n_held`` :class:`QAItem`\\ s drawn from
      HELD-OUT diseases (``answerable=False, novel=True``); the same list is
      returned under both keys because held-out cases are both unanswerable and
      novel.
    * ``"calib_answerable"`` / ``"calib_ook"`` — up to ``n_calib``
      :class:`QAItem`\\ s each, drawn from the SPARE indexed / held-out diseases
      (disjoint from the test pools above). The driver scores these
      retrieval-only (no LLM spend) to calibrate the cite-or-abstain threshold
      without ever touching the test split. Empty when no spare diseases remain.

    Eligible diseases carry ``min_ph..max_ph`` phenotypes that are present in
    ``mounted.graph.terms``; their ids are sorted then shuffled under ``seed``.
    The first ``n_index`` are INDEXED; the remainder are HELD-OUT. The test pools
    are drawn first (so their cases are unchanged by ``n_calib``), then the
    calibration pools from the remaining ids.

    All randomness comes from one ``random.Random(seed)`` instance that is
    consumed in a fixed order (shuffle, then the four pools in the order they
    are built below), so reordering those calls changes every generated case.
    """
    # Local import keeps this module importable even while the sibling
    # agentic_qa.metrics is mid-construction (the package __init__ imports it).
    from sma.eval.agentic import IndexItem

    graph = mounted.graph

    def term_text(t: str) -> str:
        # Human-readable name of a term; falls back to the raw id when the term
        # is unknown to the graph or has an empty name.
        nm = graph.terms[t].name if t in graph.terms else ""
        return nm or t

    # term id -> tuple of its parent ids, used to climb is-a levels below.
    parents = {tid: tuple(term.parents) for tid, term in graph.terms.items()}
    parsed = _parse_hpoa(hpoa_path)

    # Eligibility: known phenotypes only, count in [min_ph, max_ph]. SORTED ids.
    known: dict[str, tuple[str, list[str]]] = {}
    for did in sorted(parsed):
        name, terms = parsed[did]
        present = sorted(t for t in terms if t in graph.terms)
        if min_ph <= len(present) <= max_ph:
            known[did] = (name, present)

    # Sort before shuffling so the permutation depends only on the seed and the
    # set of eligible ids, not on dict/set iteration order.
    eligible = sorted(known)
    rng = random.Random(seed)
    rng.shuffle(eligible)

    indexed_ids = eligible[:n_index]
    held_ids = eligible[n_index:]

    # Shared index: IndexItem(key=id, term_ids, text=space-joined term NAMES, meta).
    index_items = [
        IndexItem(
            key=did,
            term_ids=frozenset(known[did][1]),
            text=" ".join(term_text(t) for t in known[did][1]),
            meta={"name": known[did][0]},
        )
        for did in indexed_ids
    ]

    # Noise pool: every phenotype present across the INDEXED diseases (SORTED), so
    # injected distractors are in-vocabulary, matching the ontology_bench generator.
    noise_pool = sorted({t for did in indexed_ids for t in known[did][1]})

    def make_case(terms: list[str]) -> tuple[frozenset[str], str]:
        """Hard query: sample <=5 phenotypes, climb 0-2 is-a levels, +3 noise."""
        keep = rng.sample(terms, min(5, len(terms)))
        q: list[str] = []
        for t in keep:
            cur = t
            # The choice list makes 0 and 1 climbs twice as likely as 2.
            for _ in range(rng.choice([0, 0, 1, 1, 2])):
                ps = parents.get(cur)
                if ps:
                    # sorted() fixes the candidate order before the random pick;
                    # a term without parents stays where it is.
                    cur = rng.choice(sorted(ps))
            q.append(cur)
        if noise_pool:
            q += rng.sample(noise_pool, min(3, len(noise_pool)))
        # The text lists q in draw order (duplicates included); the returned
        # term set is the de-duplicated frozenset of the same ids.
        text = "Patient presents with: " + ", ".join(term_text(t) for t in q)
        return frozenset(q), text

    def qitems(ids: list[str], n: int, *, answerable: bool) -> list[QAItem]:
        # One QAItem per id, for at most the first n ids; `novel` is always the
        # negation of `answerable`.
        out: list[QAItem] = []
        for did in ids[:n]:
            name, terms = known[did]
            case_terms, case_text = make_case(terms)
            out.append(
                QAItem(
                    case_text=case_text,
                    case_terms=case_terms,
                    gold_id=did,
                    gold_name=name,
                    answerable=answerable,
                    novel=not answerable,
                )
            )
        return out

    # Test pools first: their RNG draws happen before any calibration draw.
    answerable = qitems(indexed_ids, n_answerable, answerable=True)
    novel = qitems(held_ids, n_held, answerable=False)
    # Calibration pools: SPARE ids beyond the test draws (disjoint), scored
    # retrieval-only to fit the abstention threshold without test-set leakage.
    calib_answerable = qitems(indexed_ids[n_answerable:], n_calib, answerable=True)
    calib_ook = qitems(held_ids[n_held:], n_calib, answerable=False)

    return {
        "index_items": index_items,
        "answerable": answerable,
        # "ook" and "novel" intentionally share one list object.
        "ook": novel,
        "novel": novel,
        "calib_answerable": calib_answerable,
        "calib_ook": calib_ook,
    }
sma/eval/arn.py ADDED
@@ -0,0 +1,65 @@
1
+ """ARN dataset helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import pathlib
7
+
8
+
9
+ DEFAULT_ARN_PATH = pathlib.Path(
10
+ "data/raw/arn/Analogical Reasoning on Narratives (ARN) dataset.xlsx - Sheet1.csv"
11
+ )
12
+
13
+ ARN_REQUIRED_COLUMNS = {
14
+ "id",
15
+ "proverb",
16
+ "query_narrative",
17
+ "first_choice",
18
+ "second_choice",
19
+ "distractor_similarity",
20
+ "analogy_level",
21
+ "correct_answer",
22
+ }
23
+
24
+
25
+ def validate_columns(columns: list[str]) -> bool:
26
+ return ARN_REQUIRED_COLUMNS.issubset(set(columns))
27
+
28
+
29
+ def load_arn_rows(path: str | pathlib.Path = DEFAULT_ARN_PATH, limit: int | None = None) -> list[dict[str, str]]:
30
+ source = pathlib.Path(path)
31
+ with source.open(encoding="utf-8-sig", newline="") as fh:
32
+ reader = csv.DictReader(fh)
33
+ if not reader.fieldnames or not validate_columns(reader.fieldnames):
34
+ raise ValueError(f"ARN CSV has unexpected columns: {reader.fieldnames}")
35
+ rows = []
36
+ for row in reader:
37
+ rows.append(dict(row))
38
+ if limit is not None and len(rows) >= limit:
39
+ break
40
+ return rows
41
+
42
+
43
def arn_choice_corpus(path: str | pathlib.Path = DEFAULT_ARN_PATH, limit: int = 12) -> tuple[str, str]:
    """Build a raw text corpus of ARN answer choices plus one suggested query.

    Each loaded row contributes two text blocks (first and second choice). A
    block carries a header line with the row id, the choice number and a
    ``correct``/``distractor`` label, then the proverb, the analogy level and
    the choice text. Blocks are separated by a blank line.

    Returns:
        ``(corpus_text, suggested_query)`` where the query is the first
        non-empty ``query_narrative`` among the loaded rows ("" if none).
    """
    choice_columns = (("1", "first_choice"), ("2", "second_choice"))
    suggested_query = ""
    blocks: list[str] = []
    for record in load_arn_rows(path, limit=limit):
        # Keep the first truthy narrative; later rows do not override it.
        suggested_query = suggested_query or record["query_narrative"]
        answer_key = record["correct_answer"].strip()
        for number, column in choice_columns:
            tag = "correct" if number == answer_key else "distractor"
            block_lines = [
                f"ARN id={record['id']} choice={number} label={tag}",
                f"proverb: {record['proverb']}",
                f"analogy_level: {record['analogy_level']}",
                record[column],
            ]
            blocks.append("\n".join(block_lines))
    corpus = "\n\n".join(blocks)
    return corpus, suggested_query
@@ -0,0 +1,6 @@
1
+ """Baseline retrieval implementations for the LogHub evaluation ladder.
2
+
3
+ Modules here are deliberately self-contained: each baseline owns its model
4
+ loading and scoring so `scripts/baseline_ladder.py` can compose them under the
5
+ exact protocol of `sma.eval.loghub_eval` without touching that file.
6
+ """
@@ -0,0 +1,54 @@
1
+ """Dense retrieval with BAAI/bge-base-en-v1.5 (blueprint B2's specified embedder).
2
+
3
+ CPU-only. Index embeddings are batch-encoded once; queries are encoded per
4
+ call so per-query latency includes the real encode cost (same convention as
5
+ the MiniLM dense baseline in sma.eval.loghub_eval).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import numpy as np
11
+
12
MODEL_NAME = "BAAI/bge-base-en-v1.5"
# Per the BGE model card, short queries in retrieval tasks should carry this
# instruction prefix; documents are encoded without it.
QUERY_PREFIX = "Represent this sentence for searching relevant passages: "


class BGEDenseRetriever:
    """Dense retriever: documents encoded once, queries encoded per call.

    Document and query embeddings are both requested L2-normalized, so the
    dot product used for scoring is the cosine similarity.
    """

    def __init__(self, model_name: str = MODEL_NAME, batch_size: int = 16):
        """Load ``model_name`` on CPU; ``batch_size`` is used when encoding documents."""
        # Imported lazily so importing this module does not require
        # sentence-transformers to be installed.
        from sentence_transformers import SentenceTransformer

        self.model = SentenceTransformer(model_name, device="cpu")
        self.batch_size = batch_size
        self.doc_ids: list[str] = []
        self.doc_matrix: np.ndarray | None = None

    def build(self, documents: list[tuple[str, str]]) -> None:
        """Encode ``documents`` (``(doc_id, text)`` pairs) and replace the index.

        An empty ``documents`` list clears the index without calling the model,
        so a later :meth:`retrieve` returns ``[]``.
        """
        self.doc_ids = [doc_id for doc_id, _ in documents]
        if not self.doc_ids:
            # Nothing to encode: encoding an empty list is unlikely to give a
            # 2-D matrix that retrieve() can multiply against -- TODO confirm.
            self.doc_matrix = None
            return
        texts = [text for _, text in documents]
        self.doc_matrix = self.model.encode(
            texts,
            batch_size=self.batch_size,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False,
        )

    def encode_query(self, query_text: str) -> np.ndarray:
        """Return the normalized embedding of ``QUERY_PREFIX + query_text``."""
        return self.model.encode(
            [QUERY_PREFIX + query_text],
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False,
        )[0]

    def retrieve(self, query_text: str, k: int = 10) -> list[tuple[str, float]]:
        """Return the top ``k`` ``(doc_id, score)`` pairs for ``query_text``.

        Results are ordered by descending score, ties broken by ascending
        doc id. Returns ``[]`` when no index has been built (or it is empty).
        """
        if self.doc_matrix is None:
            return []
        q = self.encode_query(query_text)
        scores = self.doc_matrix @ q  # cosine: both sides L2-normalized
        ranked = sorted(
            zip(self.doc_ids, map(float, scores)), key=lambda row: (-row[1], row[0])
        )
        return ranked[:k]
@@ -0,0 +1,18 @@
1
+ """Simple lexical baseline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections import Counter
6
+
7
+
8
def lexical_score(query: str, document: str) -> float:
    """Return the multiset token overlap between ``query`` and ``document``.

    Both texts are lower-cased and split on whitespace. Each query token
    contributes the smaller of its count in the query and in the document.
    The result is returned as a ``float`` to match the declared return type
    and the score type of the other baselines.
    """
    query_counts = Counter(query.lower().split())
    doc_counts = Counter(document.lower().split())
    # Counter intersection keeps min(count) for every shared token.
    return float(sum((query_counts & doc_counts).values()))
12
+
13
+
14
def rank_bm25_like(query: str, documents: list[tuple[str, str]], k: int = 10) -> list[tuple[str, float]]:
    """Rank ``documents`` by ``lexical_score`` against ``query``.

    Returns the first ``k`` ``(doc_id, score)`` pairs, highest score first,
    ties broken by ascending doc id.
    """
    scored = sorted(
        ((doc_id, lexical_score(query, text)) for doc_id, text in documents),
        key=lambda pair: (-pair[1], pair[0]),
    )
    return scored[:k]
18
+
@@ -0,0 +1,42 @@
1
+ """Dense-RAG style baseline with local deterministic TF-IDF fallback.
2
+
3
+ This is not the final sentence-transformer baseline from the paper plan. It is
4
+ the CPU-safe baseline used in MVP reports when no embedding model is installed.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from sklearn.feature_extraction.text import TfidfVectorizer
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+
12
+
13
def rank_tfidf_dense(query: str, documents: list[tuple[str, str]], k: int = 10) -> list[tuple[str, float]]:
    """Rank ``documents`` against one ``query`` by TF-IDF cosine similarity.

    The vectorizer (unigrams + bigrams, lower-cased) is fitted on the document
    texts together with the query. Returns the first ``k`` ``(doc_id, score)``
    pairs, highest similarity first, ties broken by ascending doc id; ``[]``
    for an empty corpus.
    """
    if not documents:
        return []
    doc_ids, corpus = zip(*documents)
    tfidf = TfidfVectorizer(ngram_range=(1, 2), lowercase=True).fit_transform([*corpus, query])
    # The query was appended last, so it is the final row of the matrix.
    query_row, doc_rows = tfidf[-1], tfidf[:-1]
    similarities = cosine_similarity(query_row, doc_rows).ravel()
    ranked = sorted(
        zip(doc_ids, map(float, similarities)),
        key=lambda pair: (-pair[1], pair[0]),
    )
    return ranked[:k]
24
+
25
+
26
def rank_tfidf_dense_batch(queries: list[str], documents: list[tuple[str, str]], k: int = 10) -> list[list[tuple[str, float]]]:
    """Rank ``documents`` against every query with one shared TF-IDF fit.

    The vectorizer (unigrams + bigrams, lower-cased) is fitted on the document
    texts only; queries are transformed with it. Returns one ranking per
    query, each the first ``k`` ``(doc_id, score)`` pairs ordered by
    descending score then ascending doc id. An empty corpus yields one empty
    ranking per query.
    """
    if not documents:
        return [[] for _ in queries]
    doc_ids = [doc_id for doc_id, _ in documents]
    corpus = [text for _, text in documents]
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True)
    doc_vectors = vectorizer.fit_transform(corpus)
    similarity = vectorizer.transform(queries) @ doc_vectors.T

    def top_k(row_index: int) -> list[tuple[str, float]]:
        # Densify a single query row at a time rather than the whole matrix.
        dense_scores = similarity.getrow(row_index).toarray().ravel()
        scored = sorted(
            zip(doc_ids, map(float, dense_scores)),
            key=lambda pair: (-pair[1], pair[0]),
        )
        return scored[:k]

    return [top_k(row_index) for row_index in range(similarity.shape[0])]
@@ -0,0 +1,235 @@
1
+ """HippoRAG-2-style KG retrieval comparator (blueprint B5, deterministic adaptation).
2
+
3
+ Adapted from HippoRAG / HippoRAG 2 (Gutierrez et al., NeurIPS'24 / ICML'25):
4
+ - a phrase graph over OpenIE triples with passage (document) nodes,
5
+ - synonym edges linking near-identical phrases,
6
+ - Personalized PageRank (damping 0.5, as published) with personalization
7
+ mass on query phrase nodes, weighted by node specificity 1/df
8
+ (HippoRAG's inverse passage-frequency seed weighting),
9
+ - documents scored by the PPR mass landing on their document nodes
10
+ (chosen over summing contained-entity mass: it is the published
11
+ HippoRAG 2 passage-node scoring and needs no extra idf heuristic).
12
+
13
+ Substitutions for a fair, deterministic, LLM-free comparison:
14
+ - LLM OpenIE is replaced by rule-based triple extraction: per log/code
15
+ line, regex-extracted entity-like tokens (block ids, IPs, hostnames,
16
+ paths, hex ids, dotted/CamelCase identifiers, content words) and a
17
+ fixed verb lexicon for relations; lines with no relation token fall
18
+ back to entity co-occurrence edges.
19
+ - The embedding-based synonym model is replaced by case/punctuation-
20
+ normalized string equality plus token-Jaccard >= 0.8 on split
21
+ identifiers.
22
+ No randomness anywhere: iteration is over sorted structures and PageRank
23
+ is the deterministic scipy power iteration.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import re
29
+ from collections import Counter, defaultdict
30
+
31
+ import networkx as nx
32
+
33
DAMPING = 0.5  # HippoRAG's published PPR damping factor
# Minimum token-Jaccard overlap for two phrases to be linked as synonyms.
JACCARD_SYNONYM = 0.8
# Cap on the number of entities kept from a single line.
MAX_ENTITIES_PER_LINE = 16

# Pattern priority matters: earlier patterns mask their spans so later,
# more generic ones do not re-extract fragments.
ENTITY_PATTERNS: list[re.Pattern[str]] = [
    re.compile(r"blk_-?\d+"),  # HDFS block ids
    re.compile(r"\b\d{1,3}(?:\.\d{1,3}){3}(?::\d+)?\b"),  # IPv4(:port)
    re.compile(r"\b0x[0-9a-fA-F]+\b|\b[0-9a-f]{8,}\b"),  # hex ids
    re.compile(r"(?<![\w/])/(?:[\w.\-]+/)+[\w.\-]+"),  # file paths
    re.compile(r"\b[A-Za-z_][\w$\-]*(?:\.[A-Za-z_][\w$\-]*)+\b"),  # dotted: classes, hostnames
    re.compile(r"\b[A-Z][a-z0-9]+(?:[A-Z][a-z0-9]+)+\b"),  # CamelCase (exceptions, services)
    re.compile(r"\b[A-Za-z][\w]*(?:[-_][\w]+)+\b"),  # snake/hyphen identifiers
]
# Plain alphabetic words (2+ letters), applied to whatever the entity
# patterns left unmasked.
WORD_PATTERN = re.compile(r"\b[A-Za-z]{2,}\b")

# Lower-cased words that are dropped entirely (function words, log levels).
STOPWORDS = frozenset(
    "the a an to of in on at for is was were be been being and or not with by "
    "from this that it its as are has have had while when then than but if "
    "else into over under after before during no none null true false via per "
    "info debug trace warn".split()
)
# Lower-cased words treated as relation tokens rather than entities.
RELATION_LEXICON = frozenset(
    "received receiving sent send sending terminating terminated starting "
    "started start stop stopping stopped failed failing fails fail connect "
    "connected connecting disconnect disconnected deleting deleted delete "
    "created creating create opened opening open closed closing close read "
    "reading write writing wrote allocated allocating exceeded aborted "
    "aborting retrying retried refused raised threw throw throws thrown "
    "caught calling called returned returning killed killing launched "
    "launching completed completing finished exited timed waiting blocked "
    "serving served added adding removed removing updated updating "
    "registered succeeded".split()
)
68
+
69
+
70
def _normalize(phrase: str) -> str:
    """Lower-case ``phrase`` and drop every character outside ``[0-9a-z]``."""
    kept_runs = re.split(r"[^0-9a-z]+", phrase.lower())
    return "".join(kept_runs)
72
+
73
+
74
def _phrase_tokens(phrase: str) -> frozenset[str]:
    """Return the lower-cased tokens of an identifier, for Jaccard comparison.

    A boundary is inserted wherever a lower-case letter or digit is followed
    by an upper-case letter; the text is then cut at every run of
    non-alphanumeric characters.
    """
    with_case_breaks = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", phrase)
    alnum_runs = re.findall(r"[0-9A-Za-z]+", with_case_breaks)
    return frozenset(run.lower() for run in alnum_runs)
78
+
79
+
80
def _line_triples(line: str) -> tuple[list[tuple[str, str, str]], list[tuple[str, str]], list[str]]:
    """Extract ``(triples, co-occurrence pairs, entities)`` from one line.

    Entities come from ``ENTITY_PATTERNS`` (applied in priority order) and
    from leftover words; relation tokens come from ``RELATION_LEXICON`` or
    from words of 5+ letters ending in ``ing``/``ed``. When the line has at
    least one relation, each pair of consecutive entities yields a triple and
    no pairs are produced; otherwise every pair of distinct entities is
    returned as a co-occurrence pair and no triples are produced.
    """
    tagged: list[tuple[int, str, str]] = []
    remaining = line
    for pattern in ENTITY_PATTERNS:
        tagged.extend(
            (hit.start(), "entity", hit.group(0)) for hit in pattern.finditer(remaining)
        )
        # Blank matches with equal-length whitespace so later offsets still
        # refer to positions in the original line.
        remaining = pattern.sub(lambda hit: " " * len(hit.group(0)), remaining)

    for hit in WORD_PATTERN.finditer(remaining):
        token = hit.group(0)
        folded = token.lower()
        if folded in STOPWORDS:
            continue
        verb_like = len(folded) >= 5 and folded.endswith(("ing", "ed"))
        if folded in RELATION_LEXICON or verb_like:
            tagged.append((hit.start(), "relation", folded))
        else:
            tagged.append((hit.start(), "entity", token))

    tagged.sort()
    ents = [(pos, text) for pos, kind, text in tagged if kind == "entity"]
    ents = ents[:MAX_ENTITIES_PER_LINE]
    rels = [(pos, text) for pos, kind, text in tagged if kind == "relation"]
    entities = [text for _, text in ents]

    if not rels:
        # No relation token on the line: fall back to co-occurrence pairs.
        pairs = [
            (left, right)
            for index, (_, left) in enumerate(ents)
            for _, right in ents[index + 1:]
            if left != right
        ]
        return [], pairs, entities

    fallback_relation = rels[0][1]
    triples: list[tuple[str, str, str]] = []
    for (left_pos, left), (right_pos, right) in zip(ents, ents[1:]):
        # Prefer the first relation located strictly between the two
        # entities; otherwise use the first relation of the line.
        relation = next(
            (name for pos, name in rels if left_pos < pos < right_pos),
            fallback_relation,
        )
        triples.append((left, relation, right))
    return triples, [], entities
114
+
115
+
116
def extract_phrases(text: str) -> list[str]:
    """Return the distinct entity phrases of ``text`` in first-appearance order.

    The text is processed line by line; a text without any line (for example
    the empty string) is processed as a single line.
    """
    # A dict keeps insertion order, giving an ordered de-duplication.
    first_seen: dict[str, None] = {}
    for line in text.splitlines() or [text]:
        for entity in _line_triples(line)[2]:
            first_seen.setdefault(entity, None)
    return list(first_seen)
127
+
128
+
129
class HippoRAGRetriever:
    """Phrase graph + Personalized PageRank retriever (see module docstring).

    The graph holds one node per document (``d::<doc_id>``) and one per entity
    phrase (``e::<phrase>``). Call :meth:`build` once, then :meth:`retrieve`
    per query.
    """

    def __init__(self, damping: float = DAMPING):
        """Create an empty retriever; ``damping`` is passed to PageRank as ``alpha``."""
        self.damping = damping
        self.graph = nx.Graph()
        self.doc_ids: list[str] = []
        # normalized phrase -> entity node names sharing that normalization
        self._norm_index: dict[str, list[str]] = {}

    @staticmethod
    def _doc_node(doc_id: str) -> str:
        """Graph node name for a document."""
        return f"d::{doc_id}"

    @staticmethod
    def _ent_node(phrase: str) -> str:
        """Graph node name for an entity phrase."""
        return f"e::{phrase}"

    def _bump(self, u: str, v: str, w: float) -> None:
        """Add ``w`` to the weight of edge ``u``-``v``, creating the edge if absent."""
        if self.graph.has_edge(u, v):
            self.graph[u][v]["weight"] += w
        else:
            self.graph.add_edge(u, v, weight=w)

    def build(self, documents: list[tuple[str, str]]) -> None:
        """Rebuild the graph from ``(doc_id, text)`` pairs, discarding any previous one."""
        self.graph = nx.Graph()
        self.doc_ids = [doc_id for doc_id, _ in documents]
        for doc_id, text in documents:
            dnode = self._doc_node(doc_id)
            self.graph.add_node(dnode, kind="doc")
            # How often each entity phrase was extracted from this document.
            doc_counts: Counter[str] = Counter()
            for line in text.splitlines() or [text]:
                triples, pairs, entities = _line_triples(line)
                doc_counts.update(entities)
                # The relation label is not stored; a triple only links its
                # subject and object entities.
                for s, _rel, o in triples:
                    if s != o:
                        self._bump(self._ent_node(s), self._ent_node(o), 1.0)
                for e1, e2 in pairs:
                    self._bump(self._ent_node(e1), self._ent_node(e2), 1.0)
            # Entity-document edges are weighted by the phrase's count in the
            # document; sorted iteration keeps insertion order deterministic.
            for phrase in sorted(doc_counts):
                enode = self._ent_node(phrase)
                self.graph.add_node(enode, kind="entity")
                self._bump(enode, dnode, float(doc_counts[phrase]))
        self._add_synonym_edges()
        # Lookup used by retrieve() to map query phrases onto entity nodes.
        self._norm_index = defaultdict(list)
        for node in sorted(self.graph.nodes):
            if node.startswith("e::"):
                self._norm_index[_normalize(node[3:])].append(node)
        # Freeze into a plain dict so later lookups cannot insert empty lists.
        self._norm_index = dict(self._norm_index)

    def _add_synonym_edges(self) -> None:
        """Link entity phrases that normalize identically or share most tokens."""
        phrases = sorted(n[3:] for n in self.graph.nodes if n.startswith("e::"))
        # Pass 1: phrases with the same normalized form are pairwise linked.
        by_norm: dict[str, list[str]] = defaultdict(list)
        for p in phrases:
            by_norm[_normalize(p)].append(p)
        for _norm, group in sorted(by_norm.items()):
            for i, p1 in enumerate(group):
                for p2 in group[i + 1:]:
                    self._bump(self._ent_node(p1), self._ent_node(p2), 1.0)
        # Token-Jaccard synonyms among multi-token identifiers sharing a token.
        token_sets = {p: _phrase_tokens(p) for p in phrases}
        by_token: dict[str, list[str]] = defaultdict(list)
        for p in phrases:
            if len(token_sets[p]) >= 2:
                for t in sorted(token_sets[p]):
                    by_token[t].append(p)
        # A pair sharing several tokens shows up in several groups; `compared`
        # makes sure it is scored (and bumped) only once.
        compared: set[tuple[str, str]] = set()
        for _token, group in sorted(by_token.items()):
            for i, p1 in enumerate(group):
                for p2 in group[i + 1:]:
                    key = (p1, p2)
                    # Pairs with equal normalization were already linked in pass 1.
                    if key in compared or _normalize(p1) == _normalize(p2):
                        continue
                    compared.add(key)
                    t1, t2 = token_sets[p1], token_sets[p2]
                    jac = len(t1 & t2) / len(t1 | t2)
                    if jac >= JACCARD_SYNONYM:
                        self._bump(self._ent_node(p1), self._ent_node(p2), 1.0)

    def _specificity(self, node: str) -> float:
        """Return ``1 / df`` where ``df`` is the number of document neighbours (min 1)."""
        df = sum(1 for nb in self.graph[node] if nb.startswith("d::"))
        return 1.0 / max(1, df)

    def retrieve(self, query_text: str, k: int = 10) -> list[tuple[str, float]]:
        """Return the top ``k`` ``(doc_id, score)`` pairs for ``query_text``.

        The score of a document is the PageRank value of its document node.
        Results are ordered by descending score, ties broken by ascending doc
        id. Returns ``[]`` when no documents have been indexed.
        """
        if not self.doc_ids:
            return []
        # Seed every entity node whose normalized phrase matches a query
        # phrase, weighted by its specificity.
        seeds: dict[str, float] = {}
        for phrase in extract_phrases(query_text):
            for node in self._norm_index.get(_normalize(phrase), []):
                seeds[node] = max(seeds.get(node, 0.0), self._specificity(node))
        # No matching phrase -> personalization=None (PageRank without seeds).
        personalization = dict(sorted(seeds.items())) if seeds else None
        scores = nx.pagerank(
            self.graph,
            alpha=self.damping,
            personalization=personalization,
            weight="weight",
        )
        ranked = sorted(
            ((doc_id, float(scores.get(self._doc_node(doc_id), 0.0))) for doc_id in self.doc_ids),
            key=lambda row: (-row[1], row[0]),
        )
        return ranked[:k]
230
+
231
+
232
def rank_hipporag(query_text: str, documents: list[tuple[str, str]], k: int = 10) -> list[tuple[str, float]]:
    """Build a fresh ``HippoRAGRetriever`` over ``documents`` and rank them for one query.

    The graph is rebuilt on every call; construct a ``HippoRAGRetriever``
    directly to reuse one index across several queries.
    """
    engine = HippoRAGRetriever()
    engine.build(documents)
    top_hits = engine.retrieve(query_text, k=k)
    return top_hits
@@ -0,0 +1,30 @@
1
+ """Reciprocal-rank fusion (blueprint B3: the strong practical RAG).
2
+
3
+ Fuses any number of ranked lists with the standard RRF constant k=60
4
+ (Cormack, Clarke & Buettcher 2009). Scores are sum(1 / (k + rank)) with
5
+ 1-based ranks; ties break on doc id for determinism.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ RRF_K = 60
11
+
12
+
13
+ def rrf_fuse(
14
+ rankings: list[list[tuple[str, float]]],
15
+ k: int = RRF_K,
16
+ top_k: int | None = None,
17
+ ) -> list[tuple[str, float]]:
18
+ """Fuse ranked (doc_id, score) lists by reciprocal rank.
19
+
20
+ Input scores are ignored; only rank order matters (that is the point of
21
+ RRF -- it is scale-free across heterogeneous retrievers).
22
+ """
23
+ fused: dict[str, float] = {}
24
+ for ranking in rankings:
25
+ for rank, (doc_id, _score) in enumerate(ranking, start=1):
26
+ fused[doc_id] = fused.get(doc_id, 0.0) + 1.0 / (k + rank)
27
+ ranked = sorted(fused.items(), key=lambda row: (-row[1], row[0]))
28
+ if top_k is not None:
29
+ ranked = ranked[:top_k]
30
+ return ranked