structuremappingmemory 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sma/__init__.py +5 -0
- sma/__main__.py +5 -0
- sma/agent/__init__.py +5 -0
- sma/agent/adapter_draft.py +217 -0
- sma/agent/api.py +67 -0
- sma/agent/comparison.py +591 -0
- sma/agent/llm.py +280 -0
- sma/agent/policies.py +21 -0
- sma/agent/service.py +95 -0
- sma/cli.py +65 -0
- sma/encoders/__init__.py +38 -0
- sma/encoders/agentobs.py +27 -0
- sma/encoders/base.py +23 -0
- sma/encoders/code_treesitter.py +64 -0
- sma/encoders/coverage.py +80 -0
- sma/encoders/draft_adapter.py +183 -0
- sma/encoders/healthcare.py +207 -0
- sma/encoders/logs_drain.py +142 -0
- sma/encoders/prose_tier1.py +57 -0
- sma/encoders/structured.py +57 -0
- sma/encoders/traces.py +45 -0
- sma/eval/__init__.py +2 -0
- sma/eval/agentic/__init__.py +35 -0
- sma/eval/agentic/arms/__init__.py +0 -0
- sma/eval/agentic/arms/cyber.py +48 -0
- sma/eval/agentic/arms/discovery.py +35 -0
- sma/eval/agentic/arms/finance.py +38 -0
- sma/eval/agentic/arms/legal.py +74 -0
- sma/eval/agentic/arms/medicine.py +45 -0
- sma/eval/agentic/harness.py +275 -0
- sma/eval/agentic/memories.py +308 -0
- sma/eval/agentic/metrics.py +82 -0
- sma/eval/agentic_qa/__init__.py +27 -0
- sma/eval/agentic_qa/agent.py +383 -0
- sma/eval/agentic_qa/metrics.py +239 -0
- sma/eval/agentic_qa/pools.py +197 -0
- sma/eval/arn.py +65 -0
- sma/eval/baselines/__init__.py +6 -0
- sma/eval/baselines/bge_dense.py +54 -0
- sma/eval/baselines/bm25.py +18 -0
- sma/eval/baselines/dense.py +42 -0
- sma/eval/baselines/hipporag.py +235 -0
- sma/eval/baselines/hybrid_rrf.py +30 -0
- sma/eval/baselines/longcontext_llm.py +124 -0
- sma/eval/baselines/rerank.py +41 -0
- sma/eval/baselines/splade.py +77 -0
- sma/eval/baselines/wl_kernel.py +163 -0
- sma/eval/bugsinpy.py +358 -0
- sma/eval/bugsinpy_families.py +164 -0
- sma/eval/crossdomain.py +89 -0
- sma/eval/diabetes.py +61 -0
- sma/eval/drift_env.py +26 -0
- sma/eval/drift_metrics.py +24 -0
- sma/eval/family_labels.py +167 -0
- sma/eval/fraud_elliptic/__init__.py +29 -0
- sma/eval/fraud_elliptic/encoder.py +279 -0
- sma/eval/fraud_elliptic/eval.py +269 -0
- sma/eval/fraud_elliptic/test_encoder.py +123 -0
- sma/eval/ieee_cis.py +66 -0
- sma/eval/loghub.py +16 -0
- sma/eval/loghub_eval.py +480 -0
- sma/eval/longmemeval.py +51 -0
- sma/eval/memory_backends/__init__.py +2 -0
- sma/eval/memory_backends/base.py +22 -0
- sma/eval/memory_backends/context_only.py +14 -0
- sma/eval/memory_backends/rag_notes.py +17 -0
- sma/eval/memory_backends/shared_llm.py +30 -0
- sma/eval/memory_backends/sma_memory.py +54 -0
- sma/eval/memory_backends/zep_graphiti.py +33 -0
- sma/eval/metrics.py +32 -0
- sma/eval/ontology_bench.py +219 -0
- sma/eval/report.py +573 -0
- sma/eval/ssb_eval.py +216 -0
- sma/eval/ssb_generator.py +116 -0
- sma/eval/stats.py +108 -0
- sma/eval/transfer_eval.py +844 -0
- sma/index/__init__.py +15 -0
- sma/index/ann.py +21 -0
- sma/index/content_vectors.py +60 -0
- sma/index/inverted.py +63 -0
- sma/index/macfac.py +174 -0
- sma/ir/__init__.py +22 -0
- sma/ir/canon.py +106 -0
- sma/ir/schema.py +165 -0
- sma/ir/sexpr.py +86 -0
- sma/ir/signatures.py +76 -0
- sma/match/__init__.py +20 -0
- sma/match/conflicts.py +46 -0
- sma/match/engine.py +60 -0
- sma/match/explain.py +59 -0
- sma/match/infer.py +54 -0
- sma/match/kernels.py +54 -0
- sma/match/mdl.py +30 -0
- sma/match/merge_cpsat.py +77 -0
- sma/match/merge_greedy.py +15 -0
- sma/match/mh.py +177 -0
- sma/match/ses.py +84 -0
- sma/match/types.py +115 -0
- sma/match/verifier.py +27 -0
- sma/ontology/__init__.py +45 -0
- sma/ontology/attack.py +134 -0
- sma/ontology/cpc.py +69 -0
- sma/ontology/graph.py +58 -0
- sma/ontology/loader.py +262 -0
- sma/ontology/mitre_xml.py +67 -0
- sma/ontology/mount.py +101 -0
- sma/ontology/rdf_loader.py +75 -0
- sma/ontology/registry.py +115 -0
- sma/ontology/router.py +69 -0
- sma/ontology/usgaap.py +73 -0
- sma/sage/__init__.py +6 -0
- sma/sage/assimilate.py +12 -0
- sma/sage/pools.py +105 -0
- sma/sage/probabilities.py +10 -0
- sma/store/__init__.py +6 -0
- sma/store/lmdb_store.py +78 -0
- sma/store/registry.py +26 -0
- sma/store/wal.py +26 -0
- sma/ui/app.py +642 -0
- structuremappingmemory-1.0.0.dist-info/METADATA +190 -0
- structuremappingmemory-1.0.0.dist-info/RECORD +125 -0
- structuremappingmemory-1.0.0.dist-info/WHEEL +5 -0
- structuremappingmemory-1.0.0.dist-info/entry_points.txt +2 -0
- structuremappingmemory-1.0.0.dist-info/licenses/LICENSE +204 -0
- structuremappingmemory-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""Registered question pools for the Phase 5 LLM-QA "trustworthy specialist" phase.
|
|
2
|
+
|
|
3
|
+
This builds the three pre-registered question pools of
|
|
4
|
+
``configs/preregistration_v2_llmqa.md`` from the flagship medicine arm
|
|
5
|
+
(HPO + ``phenotype.hpoa``), holding the index fixed so attribution stays clean:
|
|
6
|
+
|
|
7
|
+
* **answerable** — cases whose true disease IS indexed (the agent should answer + cite);
|
|
8
|
+
* **out-of-knowledge / novel** — cases whose true disease is HELD OUT of the index
|
|
9
|
+
(the agent should ABSTAIN *and* flag NOVEL). The held-out cases are, by
|
|
10
|
+
construction, both unanswerable and novel, so ``ook`` and ``novel`` are the
|
|
11
|
+
same list of :class:`QAItem`.
|
|
12
|
+
|
|
13
|
+
Each clinical case is a *hard* partial/imprecise observation generated exactly
|
|
14
|
+
like ``sma/eval/ontology_bench.run_arm``: sample a few of the disease's
|
|
15
|
+
phenotypes, climb 0-2 is-a levels (imprecision), and add a few noise terms, then
|
|
16
|
+
render the surviving terms as a natural-language presentation. Determinism: every
|
|
17
|
+
``set`` is sorted to a list before use and the single RNG is explicitly seeded,
|
|
18
|
+
so identical ``(seed, n_index, ...)`` yields identical pools.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import random
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
|
|
26
|
+
from sma.ontology import MountedOntology
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class QAItem:
    """A single pre-registered question for the LLM-QA evaluation.

    The agent is shown ``case_text``, a natural-language rendering of the
    ontology term ids stored in ``case_terms`` (which may have been climbed to
    ancestors and padded with noise terms). ``gold_id`` and ``gold_name``
    identify the true disease. ``answerable`` is True exactly when the gold
    disease is in the index; ``novel`` is True exactly when it was held out.
    """

    # Natural-language presentation handed to the agent.
    case_text: str
    # Ontology term ids behind the presentation.
    case_terms: frozenset[str]
    # Identifier of the true disease.
    gold_id: str
    # Human-readable name of the true disease.
    gold_name: str
    # True iff the gold disease is indexed.
    answerable: bool
    # True iff the gold disease was held out of the index.
    novel: bool
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _parse_hpoa(hpoa_path: str) -> dict[str, tuple[str, set[str]]]:
    """Parse ``phenotype.hpoa`` into ``disease_id -> (name, {hpo_term_id})``.

    Mirrors the flagship record construction (medicine arm /
    ``scripts/bench_ontology_suite.load_hpo_records``): skip header/comment
    lines, tab-split, keep only phenotypic-abnormality rows (column 10 == ``"P"``),
    and read column 0 as the disease id, column 1 as the disease name, column 3
    as the HPO term.

    Args:
        hpoa_path: Path of the tab-separated annotation file (read as UTF-8).

    Returns:
        A dict keyed by disease id. Each value pairs the disease name taken
        from the FIRST kept row for that id with the set of HPO term ids
        collected over all kept rows for that id.
    """
    rec: dict[str, tuple[str, set[str]]] = {}
    with open(hpoa_path, "r", encoding="utf-8") as handle:
        for line in handle:
            # Comment lines start with "#"; the column header starts with "database_id".
            if line.startswith(("#", "database_id")):
                continue
            p = line.rstrip("\n").split("\t")
            # Rows too short to have column 10, or with another aspect, are ignored.
            if len(p) < 11 or p[10] != "P":
                continue
            disease_id, disease_name, hpo_term = p[0], p[1], p[3]
            entry = rec.get(disease_id)
            if entry is None:
                # Only allocate the (name, set) pair the first time an id is seen.
                entry = rec[disease_id] = (disease_name, set())
            entry[1].add(hpo_term)
    return rec
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def build_pools(
    mounted: MountedOntology,
    hpoa_path: str,
    *,
    seed: int = 7,
    n_index: int = 1500,
    n_answerable: int = 120,
    n_held: int = 120,
    n_calib: int = 60,
    min_ph: int = 7,
    max_ph: int = 30,
) -> dict:
    """Build the registered LLM-QA pools over the medicine (HPO) arm.

    ``mounted`` is the mounted HPO ontology; ``hpoa_path`` points at
    ``phenotype.hpoa``. Returns a dict with keys:

    * ``"index_items"`` — ``list[IndexItem]`` for the INDEXED diseases (the
      shared knowledge every memory is built over);
    * ``"answerable"`` — ``n_answerable`` :class:`QAItem`\\ s drawn from INDEXED
      diseases (``answerable=True, novel=False``);
    * ``"ook"`` / ``"novel"`` — ``n_held`` :class:`QAItem`\\ s drawn from
      HELD-OUT diseases (``answerable=False, novel=True``); the same list is
      returned under both keys because held-out cases are both unanswerable and
      novel.
    * ``"calib_answerable"`` / ``"calib_ook"`` — up to ``n_calib``
      :class:`QAItem`\\ s each, drawn from the SPARE indexed / held-out diseases
      (disjoint from the test pools above). The driver scores these
      retrieval-only (no LLM spend) to calibrate the cite-or-abstain threshold
      without ever touching the test split. Empty when no spare diseases remain.

    Eligible diseases carry ``min_ph..max_ph`` phenotypes that are present in
    ``mounted.graph.terms``; their ids are sorted then shuffled under ``seed``.
    The first ``n_index`` are INDEXED; the remainder are HELD-OUT. The test pools
    are drawn first (so their cases are unchanged by ``n_calib``), then the
    calibration pools from the remaining ids.

    Every pool is produced by slicing, so a pool holds fewer items than
    requested whenever fewer ids are available.
    """
    # Local import keeps this module importable even while the sibling
    # agentic_qa.metrics is mid-construction (the package __init__ imports it).
    from sma.eval.agentic import IndexItem

    graph = mounted.graph

    def term_text(t: str) -> str:
        # Fall back to the raw term id when the term is unknown or has an empty name.
        nm = graph.terms[t].name if t in graph.terms else ""
        return nm or t

    # Snapshot of each term's parents, used by the is-a climb in make_case.
    parents = {tid: tuple(term.parents) for tid, term in graph.terms.items()}
    parsed = _parse_hpoa(hpoa_path)

    # Eligibility: known phenotypes only, count in [min_ph, max_ph]. SORTED ids.
    known: dict[str, tuple[str, list[str]]] = {}
    for did in sorted(parsed):
        name, terms = parsed[did]
        present = sorted(t for t in terms if t in graph.terms)
        if min_ph <= len(present) <= max_ph:
            known[did] = (name, present)

    eligible = sorted(known)
    # One RNG drives the shuffle and every draw in make_case, so the order of
    # the qitems(...) calls below determines which cases are generated.
    rng = random.Random(seed)
    rng.shuffle(eligible)

    indexed_ids = eligible[:n_index]
    held_ids = eligible[n_index:]

    # Shared index: IndexItem(key=id, term_ids, text=space-joined term NAMES, meta).
    index_items = [
        IndexItem(
            key=did,
            term_ids=frozenset(known[did][1]),
            text=" ".join(term_text(t) for t in known[did][1]),
            meta={"name": known[did][0]},
        )
        for did in indexed_ids
    ]

    # Noise pool: every phenotype present across the INDEXED diseases (SORTED), so
    # injected distractors are in-vocabulary, matching the ontology_bench generator.
    noise_pool = sorted({t for did in indexed_ids for t in known[did][1]})

    def make_case(terms: list[str]) -> tuple[frozenset[str], str]:
        """Hard query: sample <=5 phenotypes, climb 0-2 is-a levels, +3 noise."""
        keep = rng.sample(terms, min(5, len(terms)))
        q: list[str] = []
        for t in keep:
            cur = t
            # Number of climb steps is drawn from [0, 0, 1, 1, 2].
            for _ in range(rng.choice([0, 0, 1, 1, 2])):
                ps = parents.get(cur)
                # A term without recorded parents stays where it is for this step.
                if ps:
                    cur = rng.choice(sorted(ps))
            q.append(cur)
        if noise_pool:
            q += rng.sample(noise_pool, min(3, len(noise_pool)))
        # The text keeps every entry of q in order; the frozenset collapses duplicates.
        text = "Patient presents with: " + ", ".join(term_text(t) for t in q)
        return frozenset(q), text

    def qitems(ids: list[str], n: int, *, answerable: bool) -> list[QAItem]:
        # Builds one QAItem per id in ids[:n]; novel is always the negation of answerable.
        out: list[QAItem] = []
        for did in ids[:n]:
            name, terms = known[did]
            case_terms, case_text = make_case(terms)
            out.append(
                QAItem(
                    case_text=case_text,
                    case_terms=case_terms,
                    gold_id=did,
                    gold_name=name,
                    answerable=answerable,
                    novel=not answerable,
                )
            )
        return out

    answerable = qitems(indexed_ids, n_answerable, answerable=True)
    novel = qitems(held_ids, n_held, answerable=False)
    # Calibration pools: SPARE ids beyond the test draws (disjoint), scored
    # retrieval-only to fit the abstention threshold without test-set leakage.
    calib_answerable = qitems(indexed_ids[n_answerable:], n_calib, answerable=True)
    calib_ook = qitems(held_ids[n_held:], n_calib, answerable=False)

    return {
        "index_items": index_items,
        "answerable": answerable,
        "ook": novel,
        "novel": novel,
        "calib_answerable": calib_answerable,
        "calib_ook": calib_ook,
    }
|
sma/eval/arn.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""ARN dataset helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import pathlib
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Default location of the ARN CSV export (a relative path).
DEFAULT_ARN_PATH = pathlib.Path(
    "data/raw/arn/Analogical Reasoning on Narratives (ARN) dataset.xlsx - Sheet1.csv"
)

# Header names that must all be present in the CSV; validate_columns checks a
# subset relation, so additional columns are accepted.
ARN_REQUIRED_COLUMNS = {
    "id",
    "proverb",
    "query_narrative",
    "first_choice",
    "second_choice",
    "distractor_similarity",
    "analogy_level",
    "correct_answer",
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def validate_columns(columns: list[str]) -> bool:
    """Report whether every required ARN column name appears in ``columns``."""
    present = set(columns)
    return present.issuperset(ARN_REQUIRED_COLUMNS)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def load_arn_rows(path: str | pathlib.Path = DEFAULT_ARN_PATH, limit: int | None = None) -> list[dict[str, str]]:
    """Read ARN rows from a CSV file as plain dicts.

    Args:
        path: CSV file to read (decoded as ``utf-8-sig``, so a leading BOM is
            tolerated).
        limit: Maximum number of rows to return. ``None`` reads every row; a
            value of zero or less returns an empty list.

    Returns:
        The rows in file order, one ``dict`` per CSV record.

    Raises:
        ValueError: If the header is missing or lacks a required column. The
            header is validated even when ``limit`` yields no rows.
    """
    source = pathlib.Path(path)
    with source.open(encoding="utf-8-sig", newline="") as fh:
        reader = csv.DictReader(fh)
        if not reader.fieldnames or not validate_columns(reader.fieldnames):
            raise ValueError(f"ARN CSV has unexpected columns: {reader.fieldnames}")
        rows: list[dict[str, str]] = []
        # Check the limit before reading: testing it only after an append would
        # hand back one row for limit=0.
        if limit is not None and limit <= 0:
            return rows
        for row in reader:
            rows.append(dict(row))
            if limit is not None and len(rows) >= limit:
                break
    return rows
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def arn_choice_corpus(path: str | pathlib.Path = DEFAULT_ARN_PATH, limit: int = 12) -> tuple[str, str]:
    """Build a text corpus from ARN answer choices and pick a sample query.

    Each loaded row contributes two text blocks, one per answer choice. A block
    starts with an id/choice/label line, followed by the proverb, the analogy
    level, and the choice text. Blocks are separated by a blank line. The
    second element of the result is the first non-empty ``query_narrative``
    encountered (``""`` when no rows are loaded).
    """
    numbered_columns = (("1", "first_choice"), ("2", "second_choice"))
    sections: list[str] = []
    first_query = ""
    for record in load_arn_rows(path, limit=limit):
        first_query = first_query or record["query_narrative"]
        answer_key = record["correct_answer"].strip()
        for number, column_name in numbered_columns:
            verdict = "distractor" if number != answer_key else "correct"
            lines = [
                f"ARN id={record['id']} choice={number} label={verdict}",
                f"proverb: {record['proverb']}",
                f"analogy_level: {record['analogy_level']}",
                record[column_name],
            ]
            sections.append("\n".join(lines))
    return "\n\n".join(sections), first_query
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
"""Baseline retrieval implementations for the LogHub evaluation ladder.
|
|
2
|
+
|
|
3
|
+
Modules here are deliberately self-contained: each baseline owns its model
|
|
4
|
+
loading and scoring so `scripts/baseline_ladder.py` can compose them under the
|
|
5
|
+
exact protocol of `sma.eval.loghub_eval` without touching that file.
|
|
6
|
+
"""
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Dense retrieval with BAAI/bge-base-en-v1.5 (blueprint B2's specified embedder).
|
|
2
|
+
|
|
3
|
+
CPU-only. Index embeddings are batch-encoded once; queries are encoded per
|
|
4
|
+
call so per-query latency includes the real encode cost (same convention as
|
|
5
|
+
the MiniLM dense baseline in sma.eval.loghub_eval).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
MODEL_NAME = "BAAI/bge-base-en-v1.5"
# Per the BGE model card, short queries in retrieval tasks should carry this
# instruction prefix; documents are encoded without it.
QUERY_PREFIX = "Represent this sentence for searching relevant passages: "


class BGEDenseRetriever:
    """Dense retriever scoring documents by cosine similarity of embeddings.

    Documents are embedded once in :meth:`build`; each :meth:`retrieve` call
    embeds the query (with ``QUERY_PREFIX`` prepended) and ranks all indexed
    documents by dot product of the L2-normalized vectors.
    """

    def __init__(self, model_name: str = MODEL_NAME, batch_size: int = 16):
        """Load ``model_name`` on CPU; ``batch_size`` is used when encoding documents."""
        # Imported here rather than at module level, so importing this module
        # does not itself require sentence_transformers.
        from sentence_transformers import SentenceTransformer

        self.model = SentenceTransformer(model_name, device="cpu")
        self.batch_size = batch_size
        self.doc_ids: list[str] = []
        self.doc_matrix: np.ndarray | None = None

    def build(self, documents: list[tuple[str, str]]) -> None:
        """Embed ``(doc_id, text)`` pairs and replace any previous index.

        An empty ``documents`` list clears the index without calling the model.
        """
        self.doc_ids = [doc_id for doc_id, _ in documents]
        if not documents:
            # Nothing to encode; leave the retriever in its "no index" state so
            # retrieve() returns [] instead of multiplying an empty array.
            self.doc_matrix = None
            return
        texts = [text for _, text in documents]
        self.doc_matrix = self.model.encode(
            texts,
            batch_size=self.batch_size,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False,
        )

    def encode_query(self, query_text: str) -> np.ndarray:
        """Return the normalized embedding of ``QUERY_PREFIX + query_text``."""
        return self.model.encode(
            [QUERY_PREFIX + query_text],
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False,
        )[0]

    def retrieve(self, query_text: str, k: int = 10) -> list[tuple[str, float]]:
        """Return the top ``k`` ``(doc_id, score)`` pairs for ``query_text``.

        Results are ordered by descending score, ties broken by ascending doc
        id. Returns ``[]`` when nothing has been indexed.
        """
        if self.doc_matrix is None or not self.doc_ids:
            return []
        q = self.encode_query(query_text)
        scores = self.doc_matrix @ q  # cosine: both sides L2-normalized
        ranked = sorted(
            zip(self.doc_ids, map(float, scores)), key=lambda row: (-row[1], row[0])
        )
        return ranked[:k]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Simple lexical baseline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections import Counter
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def lexical_score(query: str, document: str) -> float:
    """Count whitespace-separated tokens shared by query and document.

    Both texts are lower-cased first. A token that occurs several times counts
    as often as it appears in both texts (the smaller of the two counts).
    """
    query_counts = Counter(query.lower().split())
    document_counts = Counter(document.lower().split())
    # Counter intersection keeps, per token, the minimum of the two counts.
    shared = query_counts & document_counts
    return sum(shared.values())
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def rank_bm25_like(query: str, documents: list[tuple[str, str]], k: int = 10) -> list[tuple[str, float]]:
    """Rank ``(doc_id, text)`` pairs by ``lexical_score`` against ``query``.

    Returns the first ``k`` ``(doc_id, score)`` pairs, highest score first,
    with equal scores ordered by ascending doc id.
    """
    scored = sorted(
        ((doc_id, lexical_score(query, text)) for doc_id, text in documents),
        key=lambda pair: (-pair[1], pair[0]),
    )
    return scored[:k]
|
|
18
|
+
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Dense-RAG style baseline with local deterministic TF-IDF fallback.
|
|
2
|
+
|
|
3
|
+
This is not the final sentence-transformer baseline from the paper plan. It is
|
|
4
|
+
the CPU-safe baseline used in MVP reports when no embedding model is installed.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
10
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def rank_tfidf_dense(query: str, documents: list[tuple[str, str]], k: int = 10) -> list[tuple[str, float]]:
    """Rank documents by TF-IDF cosine similarity to a single query.

    The vectorizer (unigrams and bigrams, lower-cased) is fitted on the
    document texts together with the query. Returns the first ``k``
    ``(doc_id, similarity)`` pairs, highest similarity first, ties ordered by
    ascending doc id; ``[]`` when there are no documents.
    """
    if not documents:
        return []
    doc_ids: list[str] = []
    corpus: list[str] = []
    for doc_id, text in documents:
        doc_ids.append(doc_id)
        corpus.append(text)
    # The query is appended as the final row so it shares the fitted vocabulary.
    tfidf = TfidfVectorizer(ngram_range=(1, 2), lowercase=True).fit_transform(corpus + [query])
    query_row = tfidf[-1]
    document_rows = tfidf[:-1]
    similarities = cosine_similarity(query_row, document_rows).ravel()
    scored = sorted(
        zip(doc_ids, map(float, similarities)),
        key=lambda pair: (-pair[1], pair[0]),
    )
    return scored[:k]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def rank_tfidf_dense_batch(queries: list[str], documents: list[tuple[str, str]], k: int = 10) -> list[list[tuple[str, float]]]:
    """Rank documents for several queries with one TF-IDF fit.

    The vectorizer (unigrams and bigrams, lower-cased) is fitted on the
    document texts only; queries are transformed with that fitted vocabulary
    and scored by the product of the query and document matrices. The result
    has one ranking per query, in query order. Each ranking holds the first
    ``k`` ``(doc_id, score)`` pairs, highest score first, ties ordered by
    ascending doc id. With no documents, every query gets an empty ranking.
    """
    if not documents:
        return [[] for _ in queries]
    doc_ids: list[str] = []
    corpus: list[str] = []
    for doc_id, text in documents:
        doc_ids.append(doc_id)
        corpus.append(text)
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True)
    doc_matrix = vectorizer.fit_transform(corpus)
    similarity = vectorizer.transform(queries) @ doc_matrix.T

    def ranking_for(row_idx: int) -> list[tuple[str, float]]:
        # Densify one query's row at a time rather than the whole product.
        dense_scores = similarity.getrow(row_idx).toarray().ravel()
        ordered = sorted(
            zip(doc_ids, map(float, dense_scores)),
            key=lambda pair: (-pair[1], pair[0]),
        )
        return ordered[:k]

    return [ranking_for(row_idx) for row_idx in range(similarity.shape[0])]
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
"""HippoRAG-2-style KG retrieval comparator (blueprint B5, deterministic adaptation).
|
|
2
|
+
|
|
3
|
+
Adapted from HippoRAG / HippoRAG 2 (Gutierrez et al., NeurIPS'24 / ICML'25):
|
|
4
|
+
- a phrase graph over OpenIE triples with passage (document) nodes,
|
|
5
|
+
- synonym edges linking near-identical phrases,
|
|
6
|
+
- Personalized PageRank (damping 0.5, as published) with personalization
|
|
7
|
+
mass on query phrase nodes, weighted by node specificity 1/df
|
|
8
|
+
(HippoRAG's inverse passage-frequency seed weighting),
|
|
9
|
+
- documents scored by the PPR mass landing on their document nodes
|
|
10
|
+
(chosen over summing contained-entity mass: it is the published
|
|
11
|
+
HippoRAG 2 passage-node scoring and needs no extra idf heuristic).
|
|
12
|
+
|
|
13
|
+
Substitutions for a fair, deterministic, LLM-free comparison:
|
|
14
|
+
- LLM OpenIE is replaced by rule-based triple extraction: per log/code
|
|
15
|
+
line, regex-extracted entity-like tokens (block ids, IPs, hostnames,
|
|
16
|
+
paths, hex ids, dotted/CamelCase identifiers, content words) and a
|
|
17
|
+
fixed verb lexicon for relations; lines with no relation token fall
|
|
18
|
+
back to entity co-occurrence edges.
|
|
19
|
+
- The embedding-based synonym model is replaced by case/punctuation-
|
|
20
|
+
normalized string equality plus token-Jaccard >= 0.8 on split
|
|
21
|
+
identifiers.
|
|
22
|
+
No randomness anywhere: iteration is over sorted structures and PageRank
|
|
23
|
+
is the deterministic scipy power iteration.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import re
|
|
29
|
+
from collections import Counter, defaultdict
|
|
30
|
+
|
|
31
|
+
import networkx as nx
|
|
32
|
+
|
|
33
|
+
DAMPING = 0.5  # HippoRAG's published PPR damping factor
# Minimum token-Jaccard overlap for two identifiers to get a synonym edge.
JACCARD_SYNONYM = 0.8
# Cap on entities kept per line (earliest positions win after sorting).
MAX_ENTITIES_PER_LINE = 16

# Pattern priority matters: earlier patterns mask their spans so later,
# more generic ones do not re-extract fragments.
ENTITY_PATTERNS: list[re.Pattern[str]] = [
    re.compile(r"blk_-?\d+"),  # HDFS block ids
    re.compile(r"\b\d{1,3}(?:\.\d{1,3}){3}(?::\d+)?\b"),  # IPv4(:port)
    re.compile(r"\b0x[0-9a-fA-F]+\b|\b[0-9a-f]{8,}\b"),  # hex ids
    re.compile(r"(?<![\w/])/(?:[\w.\-]+/)+[\w.\-]+"),  # file paths
    re.compile(r"\b[A-Za-z_][\w$\-]*(?:\.[A-Za-z_][\w$\-]*)+\b"),  # dotted: classes, hostnames
    re.compile(r"\b[A-Z][a-z0-9]+(?:[A-Z][a-z0-9]+)+\b"),  # CamelCase (exceptions, services)
    re.compile(r"\b[A-Za-z][\w]*(?:[-_][\w]+)+\b"),  # snake/hyphen identifiers
]
# Plain alphabetic words of two or more letters, applied to whatever text the
# entity patterns above left unmasked.
WORD_PATTERN = re.compile(r"\b[A-Za-z]{2,}\b")

# Lower-cased words that are dropped entirely (neither entity nor relation);
# includes common function words and log-level names.
STOPWORDS = frozenset(
    "the a an to of in on at for is was were be been being and or not with by "
    "from this that it its as are has have had while when then than but if "
    "else into over under after before during no none null true false via per "
    "info debug trace warn".split()
)
# Lower-cased verbs that are treated as relation tokens rather than entities.
RELATION_LEXICON = frozenset(
    "received receiving sent send sending terminating terminated starting "
    "started start stop stopping stopped failed failing fails fail connect "
    "connected connecting disconnect disconnected deleting deleted delete "
    "created creating create opened opening open closed closing close read "
    "reading write writing wrote allocated allocating exceeded aborted "
    "aborting retrying retried refused raised threw throw throws thrown "
    "caught calling called returned returning killed killing launched "
    "launching completed completing finished exited timed waiting blocked "
    "serving served added adding removed removing updated updating "
    "registered succeeded".split()
)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _normalize(phrase: str) -> str:
    """Lower-case ``phrase`` and keep only the characters ``0-9`` and ``a-z``."""
    kept_runs = re.findall(r"[0-9a-z]+", phrase.lower())
    return "".join(kept_runs)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _phrase_tokens(phrase: str) -> frozenset[str]:
    """Return the lower-cased token set of an identifier, for Jaccard comparison.

    A break is inserted wherever a lower-case letter or digit is directly
    followed by an upper-case letter; the result is then cut into runs of
    ASCII letters and digits.
    """
    separated = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", phrase)
    return frozenset(map(str.lower, re.findall(r"[0-9A-Za-z]+", separated)))
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _line_triples(line: str) -> tuple[list[tuple[str, str, str]], list[tuple[str, str]], list[str]]:
    """Extract (triples, co-occurrence pairs, all entities) from one line.

    Entity patterns are applied in priority order, each blanking out what it
    matched. Remaining plain words are then classified: stopwords are dropped,
    lexicon verbs and words of five or more letters ending in "ing"/"ed" become
    relations, everything else becomes an entity. When the line has at least
    one relation, consecutive entities are linked by the first relation lying
    between them (or the line's first relation if none does). Otherwise every
    pair of distinct entities is returned as a co-occurrence pair.
    """
    hits: list[tuple[int, str, str]] = []
    remaining = line
    for pattern in ENTITY_PATTERNS:
        hits.extend((match.start(), "entity", match.group(0)) for match in pattern.finditer(remaining))
        # Blank the matched spans (same length) so positions stay aligned.
        remaining = pattern.sub(lambda match: " " * len(match.group(0)), remaining)

    for match in WORD_PATTERN.finditer(remaining):
        word = match.group(0)
        lowered = word.lower()
        if lowered in STOPWORDS:
            continue
        verb_like = lowered in RELATION_LEXICON or (
            len(lowered) >= 5 and lowered.endswith(("ing", "ed"))
        )
        if verb_like:
            hits.append((match.start(), "relation", lowered))
        else:
            hits.append((match.start(), "entity", word))

    hits.sort()
    entity_hits = [(pos, text) for pos, kind, text in hits if kind == "entity"][:MAX_ENTITIES_PER_LINE]
    relation_hits = [(pos, text) for pos, kind, text in hits if kind == "relation"]

    triples: list[tuple[str, str, str]] = []
    pairs: list[tuple[str, str]] = []
    if relation_hits:
        fallback = relation_hits[0][1]
        for (left_pos, left), (right_pos, right) in zip(entity_hits, entity_hits[1:]):
            linking = next(
                (rel for rel_pos, rel in relation_hits if left_pos < rel_pos < right_pos),
                fallback,
            )
            triples.append((left, linking, right))
    else:
        for idx, (_, first) in enumerate(entity_hits):
            pairs.extend((first, second) for _, second in entity_hits[idx + 1:] if first != second)
    return triples, pairs, [text for _, text in entity_hits]
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def extract_phrases(text: str) -> list[str]:
    """Return the distinct entity phrases of ``text`` in first-seen order.

    The text is processed line by line (or as one line when it has no line
    breaks to split on), using the entity list produced by ``_line_triples``.
    """
    first_seen: dict[str, None] = {}
    for line in text.splitlines() or [text]:
        for phrase in _line_triples(line)[2]:
            # dict keys keep insertion order, so repeats do not move a phrase.
            first_seen.setdefault(phrase, None)
    return list(first_seen)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class HippoRAGRetriever:
    """Phrase graph + Personalized PageRank retriever (see module docstring).

    The graph is undirected and weighted. Document nodes are named
    ``d::<doc_id>`` and entity nodes ``e::<phrase>``. ``build`` constructs the
    graph; ``retrieve`` ranks documents by the PageRank score of their
    document nodes.
    """

    def __init__(self, damping: float = DAMPING):
        """Create an empty retriever; ``damping`` is passed to PageRank as ``alpha``."""
        self.damping = damping
        self.graph = nx.Graph()
        self.doc_ids: list[str] = []
        # Maps a normalized phrase to the entity nodes sharing that normalization.
        self._norm_index: dict[str, list[str]] = {}

    @staticmethod
    def _doc_node(doc_id: str) -> str:
        """Return the graph node name for a document id."""
        return f"d::{doc_id}"

    @staticmethod
    def _ent_node(phrase: str) -> str:
        """Return the graph node name for an entity phrase."""
        return f"e::{phrase}"

    def _bump(self, u: str, v: str, w: float) -> None:
        """Add ``w`` to the weight of edge ``u``-``v``, creating the edge if absent."""
        if self.graph.has_edge(u, v):
            self.graph[u][v]["weight"] += w
        else:
            self.graph.add_edge(u, v, weight=w)

    def build(self, documents: list[tuple[str, str]]) -> None:
        """Rebuild the graph from ``(doc_id, text)`` pairs, discarding any old graph."""
        self.graph = nx.Graph()
        self.doc_ids = [doc_id for doc_id, _ in documents]
        for doc_id, text in documents:
            dnode = self._doc_node(doc_id)
            self.graph.add_node(dnode, kind="doc")
            # Occurrence count of each entity phrase within this document.
            doc_counts: Counter[str] = Counter()
            for line in text.splitlines() or [text]:
                triples, pairs, entities = _line_triples(line)
                doc_counts.update(entities)
                # The relation label is not stored; a triple only strengthens
                # the edge between its two (distinct) entities.
                for s, _rel, o in triples:
                    if s != o:
                        self._bump(self._ent_node(s), self._ent_node(o), 1.0)
                for e1, e2 in pairs:
                    self._bump(self._ent_node(e1), self._ent_node(e2), 1.0)
            # Link each entity to its document, weighted by its count there.
            for phrase in sorted(doc_counts):
                enode = self._ent_node(phrase)
                self.graph.add_node(enode, kind="entity")
                self._bump(enode, dnode, float(doc_counts[phrase]))
        self._add_synonym_edges()
        self._norm_index = defaultdict(list)
        for node in sorted(self.graph.nodes):
            if node.startswith("e::"):
                self._norm_index[_normalize(node[3:])].append(node)
        # Freeze to a plain dict so later lookups cannot create empty entries.
        self._norm_index = dict(self._norm_index)

    def _add_synonym_edges(self) -> None:
        """Connect entity nodes whose phrases are treated as synonyms.

        Two rules apply: phrases with the same ``_normalize`` form, and
        multi-token phrases whose token sets have Jaccard overlap of at least
        ``JACCARD_SYNONYM``. Each qualifying pair gets weight 1.0 added.
        """
        phrases = sorted(n[3:] for n in self.graph.nodes if n.startswith("e::"))
        by_norm: dict[str, list[str]] = defaultdict(list)
        for p in phrases:
            by_norm[_normalize(p)].append(p)
        for _norm, group in sorted(by_norm.items()):
            for i, p1 in enumerate(group):
                for p2 in group[i + 1:]:
                    self._bump(self._ent_node(p1), self._ent_node(p2), 1.0)
        # Token-Jaccard synonyms among multi-token identifiers sharing a token.
        token_sets = {p: _phrase_tokens(p) for p in phrases}
        by_token: dict[str, list[str]] = defaultdict(list)
        for p in phrases:
            if len(token_sets[p]) >= 2:
                for t in sorted(token_sets[p]):
                    by_token[t].append(p)
        # Pairs already evaluated, so a pair sharing several tokens is scored once.
        compared: set[tuple[str, str]] = set()
        for _token, group in sorted(by_token.items()):
            for i, p1 in enumerate(group):
                for p2 in group[i + 1:]:
                    key = (p1, p2)
                    # Same-normalization pairs were already linked by the first rule.
                    if key in compared or _normalize(p1) == _normalize(p2):
                        continue
                    compared.add(key)
                    t1, t2 = token_sets[p1], token_sets[p2]
                    jac = len(t1 & t2) / len(t1 | t2)
                    if jac >= JACCARD_SYNONYM:
                        self._bump(self._ent_node(p1), self._ent_node(p2), 1.0)

    def _specificity(self, node: str) -> float:
        """Return ``1 / df``, where ``df`` is the number of document nodes adjacent to ``node`` (at least 1)."""
        df = sum(1 for nb in self.graph[node] if nb.startswith("d::"))
        return 1.0 / max(1, df)

    def retrieve(self, query_text: str, k: int = 10) -> list[tuple[str, float]]:
        """Return the top ``k`` ``(doc_id, score)`` pairs for ``query_text``.

        Query phrases are matched to entity nodes through their normalized
        form and seeded with their specificity. Documents are ordered by
        descending PageRank score of their document node, ties broken by
        ascending doc id. Returns ``[]`` when no documents are indexed.
        """
        if not self.doc_ids:
            return []
        seeds: dict[str, float] = {}
        for phrase in extract_phrases(query_text):
            for node in self._norm_index.get(_normalize(phrase), []):
                seeds[node] = max(seeds.get(node, 0.0), self._specificity(node))
        # With no matching seed, personalization=None is passed through
        # (presumably networkx then falls back to a uniform vector — confirm
        # against the nx.pagerank documentation).
        personalization = dict(sorted(seeds.items())) if seeds else None
        scores = nx.pagerank(
            self.graph,
            alpha=self.damping,
            personalization=personalization,
            weight="weight",
        )
        ranked = sorted(
            ((doc_id, float(scores.get(self._doc_node(doc_id), 0.0))) for doc_id in self.doc_ids),
            key=lambda row: (-row[1], row[0]),
        )
        return ranked[:k]
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def rank_hipporag(query_text: str, documents: list[tuple[str, str]], k: int = 10) -> list[tuple[str, float]]:
    """Build a fresh ``HippoRAGRetriever`` over ``documents`` and query it once.

    Returns whatever ``HippoRAGRetriever.retrieve`` yields for ``query_text``
    and ``k``.
    """
    engine = HippoRAGRetriever()
    engine.build(documents)
    top_documents = engine.retrieve(query_text, k=k)
    return top_documents
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Reciprocal-rank fusion (blueprint B3: the strong practical RAG).
|
|
2
|
+
|
|
3
|
+
Fuses any number of ranked lists with the standard RRF constant k=60
|
|
4
|
+
(Cormack, Clarke & Buettcher 2009). Scores are sum(1 / (k + rank)) with
|
|
5
|
+
1-based ranks; ties break on doc id for determinism.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
RRF_K = 60


def rrf_fuse(
    rankings: list[list[tuple[str, float]]],
    k: int = RRF_K,
    top_k: int | None = None,
) -> list[tuple[str, float]]:
    """Combine several ranked ``(doc_id, score)`` lists by reciprocal rank.

    Only the position of each entry matters; the incoming scores are never
    read, which keeps the fusion independent of each retriever's score scale.
    Every appearance of a document at 1-based position ``rank`` adds
    ``1 / (k + rank)`` to its fused score. The output is ordered by descending
    fused score, ties by ascending doc id, and cut to ``top_k`` entries when
    ``top_k`` is given.
    """
    totals: dict[str, float] = {}
    for ranking in rankings:
        for rank, (doc_id, _) in enumerate(ranking, 1):
            totals[doc_id] = totals.get(doc_id, 0.0) + 1.0 / (k + rank)
    ordered = list(totals.items())
    ordered.sort(key=lambda pair: (-pair[1], pair[0]))
    return ordered if top_k is None else ordered[:top_k]
|