mentar 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mentar/__init__.py +6 -0
- mentar/cli/__init__.py +1 -0
- mentar/cli/__main__.py +62 -0
- mentar/db/__init__.py +4 -0
- mentar/db/store.py +416 -0
- mentar/dialogue/__init__.py +4 -0
- mentar/engine/__init__.py +4 -0
- mentar/engine/bkt.py +99 -0
- mentar/engine/fringe.py +104 -0
- mentar/engine/probe_classify.py +79 -0
- mentar/eval/__init__.py +4 -0
- mentar/eval/verify_numeric.py +619 -0
- mentar/grounding/__init__.py +65 -0
- mentar/grounding/cache.py +127 -0
- mentar/grounding/reader.py +271 -0
- mentar/grounding/resolve.py +125 -0
- mentar/grounding/source_map.py +120 -0
- mentar/grounding/sources.py +267 -0
- mentar/grounding/wrapper.py +50 -0
- mentar/inference/__init__.py +7 -0
- mentar/safety/__init__.py +4 -0
- mentar/safety/escalation.py +316 -0
- mentar/tools/__init__.py +4 -0
- mentar/tools/validate_template.py +322 -0
- mentar-0.1.0.dev0.dist-info/METADATA +178 -0
- mentar-0.1.0.dev0.dist-info/RECORD +29 -0
- mentar-0.1.0.dev0.dist-info/WHEEL +5 -0
- mentar-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- mentar-0.1.0.dev0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Passage cache: memoize resolved passages by anchor URL.
|
|
2
|
+
|
|
3
|
+
Memoisation strategy:
|
|
4
|
+
- In-memory dict (primary): ~zero cost per turn after warm; lives for the
|
|
5
|
+
process lifetime. Because ZIM files are static per build, the cache is
|
|
6
|
+
deterministic and never stale within a session.
|
|
7
|
+
- Optional on-disk cache (``cache.dir``): pickled dict keyed by anchor URL.
|
|
8
|
+
Enabled via ``cfg.grounding.cache.enabled = true`` + ``cache.dir`` path.
|
|
9
|
+
On-disk cache is best-effort: any I/O failure is logged and ignored (never
|
|
10
|
+
crashes a turn — degradation contract).
|
|
11
|
+
|
|
12
|
+
Spec: docs/design/W7_grounding_reader.md (Cost row in module contract).
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import hashlib
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
import pickle
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Optional
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
# ── In-memory store ───────────────────────────────────────────────────────────
# Process-wide memo of passages, keyed by the exact key string handed to
# get()/put(); clear_memory() empties it.
_MEM_CACHE: dict[str, str] = {}

# Schema tag written into every on-disk record; get() only accepts records
# carrying this exact value.
_DISK_CACHE_VERSION = 1
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _cache_key(anchor: str) -> str:
    """Turn ``anchor`` into a fixed-length, filename-safe key.

    The key is the SHA-256 hex digest of the anchor's UTF-8 encoding, so the
    same anchor always maps to the same key.
    """
    digest = hashlib.sha256()
    digest.update(anchor.encode())
    return digest.hexdigest()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _disk_path(cache_dir: str | Path, anchor: str) -> Path:
    """Return the ``.pkl`` file inside ``cache_dir`` that stores ``anchor``.

    The file name is the hashed cache key of the anchor plus a ``.pkl``
    suffix; nothing is created on disk by this call.
    """
    filename = f"{_cache_key(anchor)}.pkl"
    return Path(cache_dir) / filename
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# ── Public API ────────────────────────────────────────────────────────────────
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def get(anchor: str, cfg: dict) -> Optional[str]:
    """Return the cached passage for ``anchor``, or ``None`` on a miss.

    The in-memory store is consulted first. If that misses and the on-disk
    cache is enabled with a directory configured, the pickled record for the
    anchor is read; a valid record also warms the in-memory store.

    Any problem with the on-disk record (missing or unreadable file, wrong
    version tag, non-string payload) is treated as a miss. Read failures are
    logged and never raised (degradation contract).

    Args:
        anchor: The anchor URL (cache key).
        cfg: The ``grounding:`` config block. A missing or null ``cache``
            entry means the on-disk cache is disabled.

    Returns:
        Cached passage string, or ``None``.
    """
    # 1. In-memory
    if anchor in _MEM_CACHE:
        logger.debug("cache.get: HIT (memory) anchor=%r", anchor)
        return _MEM_CACHE[anchor]

    # 2. On-disk
    # ``or {}`` tolerates an explicit ``cache: null`` in the config.
    cache_cfg = cfg.get("cache") or {}
    if not cache_cfg.get("enabled", False):
        return None

    cache_dir = _resolve_cache_dir(cache_cfg)
    if not cache_dir:
        return None

    disk_file = _disk_path(cache_dir, anchor)
    try:
        with disk_file.open("rb") as f:
            # SECURITY(review): pickle.load can execute arbitrary code from a
            # crafted file, so the cache directory must only be writable by
            # trusted users. Records hold plain strings; a text/JSON format
            # would remove this risk but changes the on-disk format.
            data = pickle.load(f)
    except FileNotFoundError:
        # Ordinary miss: nothing has been cached for this anchor yet.
        return None
    except Exception:
        logger.warning("cache.get: failed to read disk cache %s", disk_file, exc_info=True)
        return None

    if not isinstance(data, dict) or data.get("v") != _DISK_CACHE_VERSION:
        # Record written by a different cache version: treat as a miss.
        return None

    passage = data.get("passage")
    if not isinstance(passage, str):
        logger.warning("cache.get: ignoring malformed disk cache record %s", disk_file)
        return None

    _MEM_CACHE[anchor] = passage  # warm in-memory cache
    logger.debug("cache.get: HIT (disk) anchor=%r", anchor)
    return passage
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def put(anchor: str, passage: str, cfg: dict) -> None:
    """Store ``passage`` in the cache for ``anchor``.

    The passage is always recorded in the in-memory store. When the on-disk
    cache is enabled and a directory is configured, the passage is also
    pickled to that directory (created if needed). Disk failures are logged
    and swallowed so a write problem never breaks a turn.

    Args:
        anchor: The anchor URL (cache key).
        passage: The resolved passage string.
        cfg: The ``grounding:`` config block. A missing or null ``cache``
            entry means the on-disk cache is disabled.
    """
    _MEM_CACHE[anchor] = passage

    # ``or {}`` tolerates an explicit ``cache: null`` in the config.
    cache_cfg = cfg.get("cache") or {}
    if not cache_cfg.get("enabled", False):
        return

    cache_dir = _resolve_cache_dir(cache_cfg)
    if not cache_dir:
        return

    try:
        Path(cache_dir).mkdir(parents=True, exist_ok=True)
        disk_file = _disk_path(cache_dir, anchor)
        with disk_file.open("wb") as f:
            pickle.dump({"v": _DISK_CACHE_VERSION, "passage": passage}, f)
        logger.debug("cache.put: wrote disk cache %s", disk_file)
    except Exception:
        # Best-effort: the in-memory entry above is already in place.
        logger.warning("cache.put: failed to write disk cache", exc_info=True)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def clear_memory() -> None:
    """Drop every entry from the in-memory passage store.

    The store object itself is kept and emptied in place; on-disk records
    are not touched. Mainly useful in tests.
    """
    _MEM_CACHE.clear()
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _resolve_cache_dir(cache_cfg: dict) -> Optional[str]:
    """Expand environment references in ``cache_cfg["dir"]``.

    Supported forms:

    - ``${VAR:-default}``: the value of ``VAR`` if it is set and non-empty,
      otherwise ``default`` (shell semantics). ``os.path.expandvars`` does not
      understand this form and would leave it in the path verbatim, so it is
      handled explicitly here.
    - ``$VAR`` / ``${VAR}``: expanded by ``os.path.expandvars``; unknown
      variables are left unchanged.

    Args:
        cache_cfg: The ``cache`` sub-block of the grounding config.

    Returns:
        The expanded directory string, or ``None`` when no directory is
        configured or the expansion is empty.
    """
    import re  # local import: this module does not import ``re`` at file level

    raw_dir = cache_cfg.get("dir", "")
    if not raw_dir:
        return None

    def _substitute_default(match) -> str:
        value = os.environ.get(match.group(1))
        return value if value else match.group(2)

    # Resolve ${VAR:-default} first so that a default such as "$HOME/cache"
    # is itself expanded by expandvars below.
    expanded = re.sub(r"\$\{(\w+):-([^}]*)\}", _substitute_default, os.fspath(raw_dir))
    expanded = os.path.expandvars(expanded)
    return expanded if expanded else None
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
"""Thin owned libzim reader: open a ZIM archive and extract article text.
|
|
2
|
+
|
|
3
|
+
Responsibilities:
|
|
4
|
+
- open(zim_path) → ZimReader context
|
|
5
|
+
- get_by_url(anchor_url) → raw HTML bytes or None
|
|
6
|
+
- get_section(html_bytes, passage_hint) → plain text, hint-guided
|
|
7
|
+
|
|
8
|
+
Design notes:
|
|
9
|
+
- Uses libzim (runtime dep, pinned in pyproject.toml). No MCP server, no JSON-RPC.
|
|
10
|
+
- Search / lookup logic adapted from OpenZIM MCP (cameronrye/openzim-mcp, MIT) as
|
|
11
|
+
reference; re-implemented minimally for our anchor-resolution-only pilot path.
|
|
12
|
+
- Hermit-AI (AGPL) = ideas only; no code copied.
|
|
13
|
+
- This module is ~100-200 lines over libzim. It never interprets passage content —
|
|
14
|
+
it returns bytes/text verbatim; the prompt layer neutralises injections (SAFETY §1.5).
|
|
15
|
+
|
|
16
|
+
Spec: docs/design/W7_grounding_reader.md; SPEC §15 (layer-1 RAG).
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import html
|
|
22
|
+
import logging
|
|
23
|
+
import re
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Optional
|
|
26
|
+
from urllib.parse import unquote, urlparse
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
# ── HTML tag stripper ─────────────────────────────────────────────────────────
# Regex-only extraction: this path needs plain-text paragraphs, not a DOM, so
# a full HTML parser is deliberately not used.

# Any single tag, opening or closing.
_TAG_RE = re.compile(r"<[^>]+>")

# Three or more consecutive newlines.
_MULTI_BLANK_RE = re.compile(r"\n{3,}")

# A whole <h1>..<h6> element; group 1 is the heading's inner HTML.
_HEADING_RE = re.compile(r"<h[1-6][^>]*>(.*?)</h[1-6]>", re.IGNORECASE | re.DOTALL)

# A whole <p> element; group 1 is the paragraph's inner HTML.
_PARA_RE = re.compile(r"<p[^>]*>(.*?)</p>", re.IGNORECASE | re.DOTALL)

# Same shape as _HEADING_RE; used to locate the first heading of a page.
_SECTION_HEADING_RE = re.compile(
    r"<(?:h[1-6])[^>]*>(.*?)</(?:h[1-6])>",
    re.IGNORECASE | re.DOTALL,
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# ── Helpers ───────────────────────────────────────────────────────────────────
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _strip_html(raw: str) -> str:
    """Remove HTML markup and unescape entities; return plain text.

    ``<script>``/``<style>`` elements and HTML comments are dropped together
    with their content, because that content is code or markup rather than
    article text and a bare tag stripper would leak it into the passage (a
    comment containing ``>`` would otherwise be cut in the middle).
    Remaining tags are replaced by a space, entities are unescaped, runs of
    spaces/tabs collapse to one space, and three or more newlines collapse to
    a single blank line.

    Args:
        raw: HTML fragment or document as text.

    Returns:
        The visible text with leading/trailing whitespace removed.
    """
    # Non-text elements first, content included.
    text = re.sub(
        r"<(script|style)\b[^>]*>.*?</\1\s*>", " ", raw, flags=re.IGNORECASE | re.DOTALL
    )
    text = re.sub(r"<!--.*?-->", " ", text, flags=re.DOTALL)
    text = _TAG_RE.sub(" ", text)
    text = html.unescape(text)
    # Collapse whitespace runs to single spaces, but keep paragraph breaks.
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" *\n *", "\n", text)
    text = _MULTI_BLANK_RE.sub("\n\n", text)
    return text.strip()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _anchor_to_zim_path(anchor_url: str) -> str:
    """Convert a wiki anchor URL to the ZIM A-namespace path.

    Wiki URLs like ``https://en.vikidia.org/wiki/Fraction`` map to ZIM path
    ``A/Fraction``. The ``/wiki/`` (or ``/w/``) prefix is removed and the
    article slug kept; query string and fragment are ignored. The path is
    URL-decoded and spaces are converted to underscores (ZIM convention), so
    ``.../wiki/Unit%20fraction`` and ``.../wiki/Unit_fraction`` resolve to
    the same path.

    Args:
        anchor_url: Full wiki URL (a bare slug is accepted as well).

    Returns:
        E.g. ``A/Fraction`` or ``A/Unit_fraction``; ``""`` when the URL
        carries no article slug.
    """
    parsed = urlparse(anchor_url)
    path = unquote(parsed.path)  # "/wiki/Unit_fraction"
    # Remove leading /wiki/ or /w/ style prefix
    for prefix in ("/wiki/", "/w/"):
        if path.startswith(prefix):
            path = path[len(prefix):]
            break
    else:
        # No recognised prefix — strip leading slash
        path = path.lstrip("/")
    # Decoded "%20" yields spaces; the slug form uses underscores instead.
    path = path.replace(" ", "_")
    # ZIM stores article pages under A/ namespace (OpenZIM convention)
    return f"A/{path}" if path else ""
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _extract_lead_section(html_content: str) -> str:
    """Extract the lead (opening) paragraphs before the first section heading.

    Most wiki articles begin with an untitled lead section followed by
    ``<h2>``/``<h3>``/… section headings. All ``<p>`` elements located before
    the first ``<h2>``–``<h6>`` heading are collected. An ``<h1>`` is not
    treated as a section boundary: it normally holds the page title and sits
    *above* the lead, so cutting there would discard the lead entirely.

    Args:
        html_content: Article HTML as text.

    Returns:
        The lead paragraphs joined by blank lines. If the lead contains no
        non-empty ``<p>`` element, all text of the lead HTML is returned.
    """
    # Find position of first section heading (h2..h6; see docstring for h1).
    first_heading = re.search(
        r"<h[2-6][^>]*>.*?</h[2-6]>", html_content, flags=re.IGNORECASE | re.DOTALL
    )
    lead_html = html_content[: first_heading.start()] if first_heading else html_content

    # Collect paragraph text
    paras = [_strip_html(m.group(1)) for m in _PARA_RE.finditer(lead_html)]
    text = "\n\n".join(p for p in paras if p.strip())
    if not text:
        # Fallback: strip all tags from lead HTML
        text = _strip_html(lead_html)
    return text
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _extract_section_by_hint(html_content: str, passage_hint: str) -> str:
    """Return the text of the section that best matches ``passage_hint``.

    Deterministic procedure (no model call):
      1. Cut the HTML at every ``<h1>``–``<h6>`` element, giving an optional
         lead (text before the first heading) plus (heading, body) pairs.
      2. Score each candidate by how many hint words occur as substrings in
         its heading text; the lead has no heading and is scored on its body
         text instead.
      3. Take the highest-scoring candidate; on a tie the earliest one wins.
      4. Return that candidate's non-empty ``<p>`` texts joined by blank
         lines, or all of its text if it has no such paragraph.

    Args:
        html_content: Article HTML as text.
        passage_hint: Human description of the wanted section.

    Returns:
        Plain text of the selected section (may be empty).
    """
    # re.split with a capturing group yields [lead, heading1, body1, heading2, ...]
    pieces = re.split(
        r"(<h[1-6][^>]*>.*?</h[1-6]>)", html_content, flags=re.IGNORECASE | re.DOTALL
    )

    lead_html = pieces[0]
    sections: list[tuple[str, str]] = [("", lead_html)] if lead_html.strip() else []
    sections.extend(
        (_strip_html(heading_html).lower(), body_html)
        for heading_html, body_html in zip(pieces[1::2], pieces[2::2])
    )

    if not sections:
        return _strip_html(html_content)

    ignored_words = {"", "the", "a", "an", "of", "and"}
    hint_words = set(re.split(r"\W+", passage_hint.lower())) - ignored_words

    def _overlap(section: tuple[str, str]) -> int:
        """Count hint words found in the section's heading (or lead body)."""
        heading_text, body_html = section
        haystack = heading_text or _strip_html(body_html).lower()
        return sum(1 for word in hint_words if word in haystack)

    # max() keeps the first of several equally good candidates.
    _, best_body = max(sections, key=_overlap)

    paragraphs = []
    for para in _PARA_RE.finditer(best_body):
        cleaned = _strip_html(para.group(1))
        if cleaned.strip():
            paragraphs.append(cleaned)
    if paragraphs:
        return "\n\n".join(paragraphs)
    return _strip_html(best_body)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# ── Public ZimReader class ────────────────────────────────────────────────────
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class ZimReader:
    """Thin owned wrapper around ``libzim.reader.Archive``.

    Usage::

        reader = ZimReader(zim_path)          # raises FileNotFoundError if path absent
        html_bytes = reader.get_by_url("https://en.vikidia.org/wiki/Fraction")
        text = reader.get_section(html_bytes, "Opening section — fraction as part")
    """

    # Upper bound on redirect hops followed by get_by_url, so that a redirect
    # cycle inside an archive cannot hang a turn.
    _MAX_REDIRECT_HOPS = 16

    def __init__(self, zim_path: str | Path) -> None:
        """Open the ZIM archive.

        Args:
            zim_path: Path to the ``.zim`` file.

        Raises:
            FileNotFoundError: If ``zim_path`` does not exist.
            RuntimeError: If libzim cannot open the archive.
        """
        from libzim.reader import Archive  # deferred: libzim not in test-time import

        path = Path(zim_path)
        if not path.exists():
            raise FileNotFoundError(f"ZIM file not found: {path}")
        self._archive = Archive(path)
        self._zim_path = path
        logger.debug("ZimReader: opened %s (%d entries)", path.name, self._archive.all_entry_count)

    # ── Lookup ────────────────────────────────────────────────────────────────

    def get_by_url(self, anchor_url: str) -> Optional[bytes]:
        """Resolve a wiki anchor URL to raw HTML bytes from the ZIM archive.

        Tries, in order:
          1. ``A/<slug>`` path (ZIM A-namespace convention).
          2. The bare ``<slug>`` path (archives that store articles at the root).
          3. Title lookup as fallback (handles alternative capitalisations).

        Redirect entries are followed for at most ``_MAX_REDIRECT_HOPS`` hops;
        a longer chain (e.g. a redirect cycle) is reported as not found.

        Args:
            anchor_url: Full wiki URL, e.g. ``https://en.vikidia.org/wiki/Fraction``.

        Returns:
            Raw HTML bytes of the article, or ``None`` if not found.
        """
        zim_path = _anchor_to_zim_path(anchor_url)
        if not zim_path:
            logger.warning("get_by_url: could not derive ZIM path from anchor %r", anchor_url)
            return None

        slug = zim_path[2:]  # strip "A/"
        title = slug.replace("_", " ")

        # 1. Direct path lookup. Try the A/ namespace (older ZIM convention) AND the
        #    bare slug (modern libzim 3.x ZIMs store articles at the root, no A/ prefix).
        entry = self._lookup_path(zim_path) or self._lookup_path(slug)

        # 2. Title-based fallback using the slug as title (handles capitalisation variants)
        if entry is None:
            entry = self._lookup_title(title)

        if entry is None:
            logger.warning(
                "get_by_url: anchor %r not found in %s (tried path=%r, %r, title=%r)",
                anchor_url, self._zim_path.name, zim_path, slug, title,
            )
            return None

        # Follow redirects, bounded so a redirect loop cannot spin forever.
        hops = 0
        while entry.is_redirect:
            if hops >= self._MAX_REDIRECT_HOPS:
                logger.warning(
                    "get_by_url: more than %d redirects for anchor %r in %s — giving up",
                    self._MAX_REDIRECT_HOPS, anchor_url, self._zim_path.name,
                )
                return None
            entry = entry.get_redirect_entry()
            hops += 1

        item = entry.get_item()
        return bytes(item.content)

    def get_section(self, html_bytes: bytes, passage_hint: str = "") -> str:
        """Extract a plain-text passage from raw HTML bytes.

        The passage is guided by ``passage_hint`` (a human description, e.g.
        "Opening section — fraction as part of something"). With a blank hint
        the lead section is returned.

        This method returns the content **verbatim** after stripping HTML tags.
        It never interprets, executes, or filters passage content — that is the
        prompt layer's responsibility (SAFETY §1.5 / W2.3).

        Args:
            html_bytes: Raw HTML bytes from :meth:`get_by_url`.
            passage_hint: Human hint describing which section to prefer.

        Returns:
            Plain-text passage (may be empty if HTML contained no text).
        """
        # Undecodable bytes become U+FFFD instead of raising.
        html_content = html_bytes.decode("utf-8", errors="replace")
        if passage_hint.strip():
            return _extract_section_by_hint(html_content, passage_hint)
        return _extract_lead_section(html_content)

    # ── Internal helpers ──────────────────────────────────────────────────────

    def _lookup_path(self, zim_path: str):
        """Return an Entry by ZIM path, or None on KeyError."""
        try:
            return self._archive.get_entry_by_path(zim_path)
        except KeyError:
            return None

    def _lookup_title(self, title: str):
        """Return an Entry by title, or None on KeyError."""
        try:
            return self._archive.get_entry_by_title(title)
        except KeyError:
            return None

    def __repr__(self) -> str:
        return f"ZimReader({self._zim_path!r})"
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Pilot resolution path: node grounding block → passage string.
|
|
2
|
+
|
|
3
|
+
Pilot scope (anchor-resolution only):
|
|
4
|
+
node_grounding dict (source, anchor, passage_hint)
|
|
5
|
+
→ scope guard (source_map.resolve_zim)
|
|
6
|
+
→ cache lookup
|
|
7
|
+
→ ZimReader.get_by_url(anchor)
|
|
8
|
+
→ ZimReader.get_section(html_bytes, passage_hint)
|
|
9
|
+
→ cache put
|
|
10
|
+
→ plain-text passage (or "" on any failure)
|
|
11
|
+
|
|
12
|
+
No LLM title-prediction, no BM25, no embeddings — those are deferred to W7.5.
|
|
13
|
+
|
|
14
|
+
Degradation contract (SAFETY §1.5 / SPEC §15):
|
|
15
|
+
Every failure mode returns "" and logs a warning. This function NEVER raises
|
|
16
|
+
(the outer __init__.resolve_grounding has a belt-and-braces try/except too).
|
|
17
|
+
|
|
18
|
+
Spec: docs/design/W7_grounding_reader.md; SPEC §15 (layer-1 RAG).
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import logging
|
|
24
|
+
|
|
25
|
+
from mentar.grounding import cache as grounding_cache
|
|
26
|
+
from mentar.grounding.reader import ZimReader
|
|
27
|
+
from mentar.grounding.source_map import resolve_zim
|
|
28
|
+
from mentar.grounding.sources import materialize_zim
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
# One ZimReader per archive, shared for the life of the process and keyed by
# str() of the resolved ZIM path. Re-using an open reader keeps the per-turn
# hot path from re-opening the same archive (cheap, but not free).
_READER_POOL: dict[str, ZimReader] = {}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _get_reader(zim_path) -> ZimReader | None:
    """Fetch the pooled ZimReader for ``zim_path``, opening it on first use.

    Readers are pooled under ``str(zim_path)``. A reader that fails to open
    is not pooled, so a later call tries again.

    Returns:
        The reader, or ``None`` (after logging a warning) when the ZIM file is
        missing or cannot be opened.
    """
    pool_key = str(zim_path)
    try:
        return _READER_POOL[pool_key]
    except KeyError:
        pass  # not opened yet

    try:
        opened = ZimReader(zim_path)
    except FileNotFoundError:
        logger.warning("resolve: ZIM not found at %s — returning empty passage", zim_path)
        return None
    except Exception:
        logger.warning("resolve: failed to open ZIM %s", zim_path, exc_info=True)
        return None

    _READER_POOL[pool_key] = opened
    return opened
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def clear_reader_pool() -> None:
    """Forget every pooled ZimReader so the next lookup re-opens its archive.

    The pool object itself is kept and emptied in place. Mainly useful in
    tests.
    """
    _READER_POOL.clear()
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def resolve_grounding_inner(node_grounding: dict, cfg: dict) -> str:
    """Core resolution: node grounding block → plain-text passage or "".

    Called by ``mentar.grounding.resolve_grounding``; may raise on truly
    unexpected errors (the public API wraps this in a try/except).

    Steps: scope guard → cache lookup → materialize ZIM → open reader →
    fetch article HTML → extract passage → cache the result.

    The cache entry is keyed by the anchor *and* the passage hint. The
    extracted passage depends on the hint, so keying by anchor alone would
    hand the section cached for one node to every other node that points at
    the same article with a different hint. Nodes without a hint keep the
    plain anchor as key.

    Args:
        node_grounding: Dict with ``source``, ``anchor``, ``passage_hint``.
            Missing or null values are treated as empty strings.
        cfg: The ``grounding:`` section of the runtime config.

    Returns:
        Plain-text passage string, or "" on any recoverable failure.
    """
    # ``or ""`` also covers keys that are present but null.
    source: str = node_grounding.get("source") or ""
    anchor: str = node_grounding.get("anchor") or ""
    passage_hint: str = node_grounding.get("passage_hint") or ""

    if not source or not anchor:
        logger.warning(
            "resolve: missing source or anchor in node_grounding=%r — returning empty",
            node_grounding,
        )
        return ""

    # ── 1. Scope guard + ZIM location resolution ──────────────────────────────
    zim_location = resolve_zim(source, anchor, cfg)
    if zim_location is None:
        # Logged inside resolve_zim (scope error or unconfigured source)
        return ""

    # ── 2. Cache lookup ───────────────────────────────────────────────────────
    # Before materialization, so a cache hit never triggers an SMB copy.
    hint_key = passage_hint.strip()
    # A newline cannot occur inside a URL, so it separates the two parts
    # without ambiguity.
    cache_key = f"{anchor}\n{hint_key}" if hint_key else anchor
    cached = grounding_cache.get(cache_key, cfg)
    if cached is not None:
        logger.debug("resolve: cache hit for anchor=%r", anchor)
        return cached

    # ── 3. Materialize the ZIM to a local path (copies from SMB if needed) ─────
    zim_path = materialize_zim(zim_location, cfg)
    if zim_path is None:
        # Logged inside materialize_zim (missing file / SMB failure / no smbprotocol)
        return ""

    # ── 4. Open ZIM reader ────────────────────────────────────────────────────
    reader = _get_reader(zim_path)
    if reader is None:
        return ""

    # ── 5. Fetch article HTML ─────────────────────────────────────────────────
    html_bytes = reader.get_by_url(anchor)
    if html_bytes is None:
        # Logged inside get_by_url
        return ""

    # ── 6. Extract passage ────────────────────────────────────────────────────
    passage = reader.get_section(html_bytes, passage_hint)
    if not passage or not passage.strip():
        logger.warning("resolve: empty passage for anchor=%r passage_hint=%r", anchor, passage_hint)
        return ""

    # ── 7. Cache and return ───────────────────────────────────────────────────
    grounding_cache.put(cache_key, passage, cfg)
    return passage
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""Source-enum → ZIM-file mapping and anchor-host scope guard.
|
|
2
|
+
|
|
3
|
+
Responsibilities:
|
|
4
|
+
- Map a ``source`` enum value (vikidia | wikipedia_simple | wikibooks | …)
|
|
5
|
+
to the configured ZIM file path.
|
|
6
|
+
- Enforce the scope guard: a node's ``source`` must match the anchor's
|
|
7
|
+
hostname AND the configured ZIM for that source. A ``vikidia`` node must
|
|
8
|
+
never resolve out of the vikidia ZIM.
|
|
9
|
+
|
|
10
|
+
Spec: docs/design/W7_grounding_reader.md (Scope guard row in module contract).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from urllib.parse import urlparse
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
# ── Canonical host suffixes per source ───────────────────────────────────────
# For each declared source, the anchor's host must be one of these suffixes or
# a subdomain of one. That blocks cross-source roaming without tying a source
# to one particular subdomain.
_SOURCE_HOST_SUFFIXES: dict[str, tuple[str, ...]] = {
    "vikidia": ("vikidia.org",),
    "wikipedia_simple": ("simple.wikipedia.org",),
    "wikibooks": ("wikibooks.org",),
    # No network anchor for these two: an empty tuple relaxes the host guard.
    "parent_upload": (),
    "builtin": (),
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ScopeError(ValueError):
    """Signals that a node's declared source and its anchor host do not agree.

    Also raised by ``check_scope`` for a source name that is not known at all.
    """
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def get_zim_path(source: str, cfg: dict) -> str | None:
    """Look up the configured ZIM *location* for ``source``.

    The ``sources`` entry for a source is either a **structured spec**
    (``{project, lang, selection?, flavour?, pin?}`` — the newest matching
    file in ``zim_dir`` is picked automatically, latest ``YYYY-MM`` wins) or a
    plain **filename string** (used as-is). Depending on ``zim_dir`` the
    returned location may be a local path, a mounted-NAS path, or an SMB
    URL/UNC; :func:`mentar.grounding.sources.materialize_zim` turns it into a
    local path.

    Args:
        source: Source enum string from the curriculum node (e.g. ``"vikidia"``).
        cfg: The ``grounding:`` config block (``zim_dir``, ``sources`` sub-dict).
            An empty or missing ``zim_dir`` falls back to ``"."``.

    Returns:
        The joined location string if the source resolves to a file, else
        ``None`` (unconfigured source, or no file resolved for it).
    """
    from mentar.grounding.sources import join_location, resolve_filename

    sources_cfg = cfg.get("sources") or {}
    zim_dir = cfg.get("zim_dir", "") or "."

    spec = sources_cfg.get(source)
    if not spec:
        logger.debug("get_zim_path: source %r not in config.grounding.sources", source)
        return None

    filename = resolve_filename(spec, zim_dir, cfg)
    if filename:
        return join_location(zim_dir, filename)

    logger.warning("get_zim_path: no ZIM file resolved for source %r in %r", source, zim_dir)
    return None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def check_scope(source: str, anchor: str) -> None:
    """Verify that ``anchor``'s hostname matches ``source``'s expected host(s).

    The comparison uses the URL's *hostname* only: an explicit port
    (``en.vikidia.org:443``), userinfo (``user@en.vikidia.org``), letter case
    and a trailing dot do not affect the result. A host matches when it equals
    an expected suffix or is a subdomain of it.

    Args:
        source: Declared source enum (e.g. ``"vikidia"``).
        anchor: The anchor URL from the curriculum node.

    Raises:
        ScopeError: If ``source`` is unknown, the anchor cannot be parsed as a
            URL, or the anchor host does not match the expected source hosts.
    """
    suffixes = _SOURCE_HOST_SUFFIXES.get(source)
    if suffixes is None:
        # Unknown source — reject for safety
        raise ScopeError(
            f"Unknown source {source!r}; expected one of {sorted(_SOURCE_HOST_SUFFIXES)}"
        )
    if not suffixes:
        # Sources without a network anchor (parent_upload, builtin) — no URL check needed
        return

    try:
        # ``hostname`` is lower-cased and excludes port/userinfo, unlike ``netloc``.
        host = (urlparse(anchor).hostname or "").rstrip(".")
    except ValueError as exc:
        # urlparse rejects some malformed netlocs (e.g. unbalanced IPv6 brackets);
        # report that as a scope failure so callers need only catch ScopeError.
        raise ScopeError(
            f"Scope violation: source={source!r} but anchor {anchor!r} is not a valid URL"
        ) from exc

    if not any(host == s or host.endswith("." + s) for s in suffixes):
        raise ScopeError(
            f"Scope violation: source={source!r} but anchor host={host!r} "
            f"(expected host matching {suffixes})"
        )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def resolve_zim(source: str, anchor: str, cfg: dict) -> str | None:
    """Run the scope guard, then look up the ZIM *location* for ``source``.

    This is :func:`check_scope` followed by :func:`get_zim_path`. A scope
    violation is logged as a warning and converted into ``None`` rather than
    raised, so callers can follow the degradation contract.

    Args:
        source: Source enum string.
        anchor: Full wiki URL from the curriculum node.
        cfg: Grounding config block.

    Returns:
        The ZIM location string (local / mounted / SMB), or ``None`` on scope
        error / missing config.
    """
    try:
        check_scope(source, anchor)
    except ScopeError as scope_err:
        logger.warning("resolve_zim: %s — returning None (degradation path)", scope_err)
        return None
    else:
        return get_zim_path(source, cfg)
|