mentar 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,127 @@
1
+ """Passage cache: memoize resolved passages by anchor URL.
2
+
3
+ Memoisation strategy:
4
+ - In-memory dict (primary): ~zero cost per turn after warm; lives for the
5
+ process lifetime. Because ZIM files are static per build, the cache is
6
+ deterministic and never stale within a session.
7
+ - Optional on-disk cache (``cache.dir``): one pickle file per anchor, named by the SHA-256 of the anchor URL.
8
+ Enabled via ``cfg.grounding.cache.enabled = true`` + ``cache.dir`` path.
9
+ On-disk cache is best-effort: any I/O failure is logged and ignored (never
10
+ crashes a turn — degradation contract).
11
+
12
+ Spec: docs/design/W7_grounding_reader.md (Cost row in module contract).
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import hashlib
18
+ import logging
19
+ import os
20
+ import pickle
21
+ from pathlib import Path
22
+ from typing import Optional
23
+
24
# Module-scoped logger, named after this module's import path.
logger = logging.getLogger(__name__)

# ── In-memory store ───────────────────────────────────────────────────────────
# Primary cache tier: maps an anchor URL to its resolved passage text. Entries
# stay until the process exits or clear_memory() empties the dict.
_MEM_CACHE: dict[str, str] = dict()

# Schema tag written under the "v" key of every on-disk record; get() ignores
# any record whose tag differs from this value.
_DISK_CACHE_VERSION = 1
30
+
31
+
32
def _cache_key(anchor: str) -> str:
    """Derive the on-disk cache key for ``anchor``.

    The key is the hexadecimal SHA-256 digest of the anchor's UTF-8 bytes, so
    it has a fixed length and only contains ``0-9a-f`` — safe as a file name.
    """
    digest = hashlib.sha256()
    digest.update(anchor.encode())
    return digest.hexdigest()
35
+
36
+
37
def _disk_path(cache_dir: str | Path, anchor: str) -> Path:
    """Return the pickle file that backs ``anchor`` inside ``cache_dir``.

    The file name is the anchor's cache key followed by a ``.pkl`` suffix.
    """
    file_name = f"{_cache_key(anchor)}.pkl"
    return Path(cache_dir) / file_name
40
+
41
+
42
+ # ── Public API ────────────────────────────────────────────────────────────────
43
+
44
+
45
def get(anchor: str, cfg: dict) -> Optional[str]:
    """Return a cached passage for ``anchor``, or ``None`` if not cached.

    Checks in-memory first, then on-disk (if enabled). A disk hit is copied
    into the in-memory cache so later lookups skip the file system.

    The on-disk tier is best-effort: an unreadable, outdated or malformed
    record is logged (where unexpected) and treated as a miss; it never raises.

    Args:
        anchor: The anchor URL (cache key).
        cfg: The ``grounding:`` config block.

    Returns:
        Cached passage string, or ``None``.
    """
    # 1. In-memory
    if anchor in _MEM_CACHE:
        logger.debug("cache.get: HIT (memory) anchor=%r", anchor)
        return _MEM_CACHE[anchor]

    # 2. On-disk. A ``cache:`` key that is present but empty loads as None, so
    #    normalise to {} instead of calling .get() on None.
    cache_cfg = cfg.get("cache") or {}
    if not cache_cfg.get("enabled", False):
        return None

    cache_dir = _resolve_cache_dir(cache_cfg)
    if not cache_dir:
        return None

    disk_file = _disk_path(cache_dir, anchor)
    try:
        with disk_file.open("rb") as f:
            # SECURITY NOTE: pickle.load can execute arbitrary code embedded in
            # the file. This is only acceptable while the cache directory is
            # writable solely by this application; a data-only format such as
            # JSON would remove the risk (at the cost of a format migration).
            data = pickle.load(f)
    except FileNotFoundError:
        # Ordinary miss: nothing has been written for this anchor yet.
        return None
    except Exception:
        logger.warning("cache.get: failed to read disk cache %s", disk_file, exc_info=True)
        return None

    # Records written under a different schema version are silently skipped.
    if not (isinstance(data, dict) and data.get("v") == _DISK_CACHE_VERSION):
        return None

    passage = data.get("passage")
    if not isinstance(passage, str):
        # Right version tag but no usable payload — do not hand it to callers
        # or memoise it.
        logger.warning("cache.get: ignoring malformed disk cache %s", disk_file)
        return None

    _MEM_CACHE[anchor] = passage  # warm in-memory cache
    logger.debug("cache.get: HIT (disk) anchor=%r", anchor)
    return passage
85
+
86
+
87
def put(anchor: str, passage: str, cfg: dict) -> None:
    """Store ``passage`` in the cache for ``anchor``.

    The in-memory cache is always updated. When the on-disk tier is enabled
    and a cache directory is configured, the passage is also pickled to disk.
    Disk failures are logged and swallowed (best-effort tier).

    Args:
        anchor: The anchor URL (cache key).
        passage: The resolved passage string.
        cfg: The ``grounding:`` config block.
    """
    _MEM_CACHE[anchor] = passage

    # A ``cache:`` key that is present but empty loads as None; treat it as {}.
    cache_cfg = cfg.get("cache") or {}
    if not cache_cfg.get("enabled", False):
        return

    cache_dir = _resolve_cache_dir(cache_cfg)
    if not cache_dir:
        return

    try:
        Path(cache_dir).mkdir(parents=True, exist_ok=True)
        disk_file = _disk_path(cache_dir, anchor)
        # Write to a per-process temporary name and rename it into place, so a
        # concurrent reader (or a crash mid-write) never sees a truncated pickle.
        tmp_file = disk_file.with_name(f"{disk_file.name}.{os.getpid()}.tmp")
        try:
            with tmp_file.open("wb") as f:
                pickle.dump({"v": _DISK_CACHE_VERSION, "passage": passage}, f)
            os.replace(tmp_file, disk_file)
        except BaseException:
            # Do not leave a stray temp file behind when the write fails.
            try:
                tmp_file.unlink()
            except OSError:
                pass
            raise
        logger.debug("cache.put: wrote disk cache %s", disk_file)
    except Exception:
        logger.warning("cache.put: failed to write disk cache", exc_info=True)
113
+
114
+
115
def clear_memory() -> None:
    """Drop every entry from the in-process passage cache.

    The dict object is emptied in place; files written by the on-disk tier
    are not touched. Mainly intended for test isolation.
    """
    _MEM_CACHE.clear()
118
+
119
+
120
def _resolve_cache_dir(cache_cfg: dict) -> Optional[str]:
    """Return the configured cache directory with environment variables expanded.

    Supported references in ``cache_cfg["dir"]``:

    - ``${VAR:-default}`` — value of ``VAR``, or ``default`` when ``VAR`` is
      unset or empty (shell semantics). ``os.path.expandvars`` does not
      understand this form, so it is handled here explicitly.
    - ``$VAR`` / ``${VAR}`` — expanded by ``os.path.expandvars``; unknown
      variables are left unchanged.

    Args:
        cache_cfg: The ``cache`` sub-block of the grounding config.

    Returns:
        The expanded directory string, or ``None`` when no directory is
        configured or the expansion yields an empty string.
    """
    import re  # local import: this module has no file-level ``re`` import

    raw_dir = cache_cfg.get("dir", "")
    if not raw_dir:
        return None

    def _value_or_default(match):
        # ``or`` (not a plain .get default) so that an empty variable also
        # falls back to the default, as ``:-`` does in a shell.
        return os.environ.get(match.group(1)) or match.group(2)

    # Resolve ${VAR:-default} first; the default itself may contain $OTHER,
    # which the expandvars pass below then resolves.
    text = re.sub(
        r"\$\{([A-Za-z_][A-Za-z0-9_]*):-([^}]*)\}",
        _value_or_default,
        os.fspath(raw_dir),
    )
    expanded = os.path.expandvars(text)
    return expanded if expanded else None
@@ -0,0 +1,271 @@
1
+ """Thin owned libzim reader: open a ZIM archive and extract article text.
2
+
3
+ Responsibilities:
4
+ - open(zim_path) → ZimReader context
5
+ - get_by_url(anchor_url) → raw HTML bytes or None
6
+ - get_section(html_bytes, passage_hint) → plain text, hint-guided
7
+
8
+ Design notes:
9
+ - Uses libzim (runtime dep, pinned in pyproject.toml). No MCP server, no JSON-RPC.
10
+ - Search / lookup logic adapted from OpenZIM MCP (cameronrye/openzim-mcp, MIT) as
11
+ reference; re-implemented minimally for our anchor-resolution-only pilot path.
12
+ - Hermit-AI (AGPL) = ideas only; no code copied.
13
+ - This module is ~100-200 lines over libzim. It never interprets passage content —
14
+ it returns bytes/text verbatim; the prompt layer neutralises injections (SAFETY §1.5).
15
+
16
+ Spec: docs/design/W7_grounding_reader.md; SPEC §15 (layer-1 RAG).
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import html
22
+ import logging
23
+ import re
24
+ from pathlib import Path
25
+ from typing import Optional
26
+ from urllib.parse import unquote, urlparse
27
+
28
# Module-scoped logger, named after this module's import path.
logger = logging.getLogger(__name__)

# ── HTML tag stripper ─────────────────────────────────────────────────────────
# Regexes rather than a DOM parser: this path only needs plain-text paragraphs,
# so a full parser would be overkill.

# Any single tag, opening or closing.
_TAG_RE = re.compile(r"<[^>]+>")

# Three or more consecutive newlines (collapsed to one blank line by callers).
_MULTI_BLANK_RE = re.compile(r"\n{3,}")

# A complete heading element of any level; group 1 is the inner markup.
_HEADING_RE = re.compile(r"<h[1-6][^>]*>(.*?)</h[1-6]>", re.IGNORECASE | re.DOTALL)

# A complete paragraph element; group 1 is the inner markup.
_PARA_RE = re.compile(r"<p[^>]*>(.*?)</p>", re.IGNORECASE | re.DOTALL)

# Heading element used to locate section boundaries; group 1 is the inner markup.
_SECTION_HEADING_RE = re.compile(
    r"<(?:h[1-6])[^>]*>(.*?)</(?:h[1-6])>", re.IGNORECASE | re.DOTALL
)
42
+
43
+
44
+ # ── Helpers ───────────────────────────────────────────────────────────────────
45
+
46
+
47
def _strip_html(raw: str) -> str:
    """Turn an HTML fragment into plain text.

    Every tag is replaced by a space and character references are decoded.
    Runs of spaces/tabs then collapse to one space, spaces next to a newline
    are dropped, and three or more consecutive newlines shrink to a single
    blank line. The result is trimmed at both ends.
    """
    text = html.unescape(_TAG_RE.sub(" ", raw))
    # Order matters: squeeze horizontal whitespace before tidying line edges.
    for pattern, replacement in ((r"[ \t]+", " "), (r" *\n *", "\n")):
        text = re.sub(pattern, replacement, text)
    return _MULTI_BLANK_RE.sub("\n\n", text).strip()
56
+
57
+
58
def _anchor_to_zim_path(anchor_url: str) -> str:
    """Convert a wiki anchor URL to the ZIM A-namespace path.

    Wiki URLs like ``https://en.vikidia.org/wiki/Fraction`` map to ZIM path
    ``A/Fraction``. The ``/wiki/`` (or ``/w/``) prefix is removed and the
    article slug kept. The path is URL-decoded and spaces are written as
    underscores, so ``.../wiki/Unit%20fraction`` and ``.../wiki/Unit_fraction``
    yield the same result. Query string and fragment are ignored.

    Args:
        anchor_url: Full wiki URL of the article.

    Returns:
        e.g. ``A/Fraction`` or ``A/Unit_fraction``; ``""`` when the URL has no
        article slug or cannot be parsed.
    """
    try:
        parsed = urlparse(anchor_url)
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. an unbalanced "[");
        # report "no path" so the caller degrades instead of raising.
        return ""

    path = unquote(parsed.path)  # "/wiki/Unit_fraction"
    # Remove leading /wiki/ or /w/ style prefix
    for prefix in ("/wiki/", "/w/"):
        if path.startswith(prefix):
            path = path[len(prefix):]
            break
    else:
        # No recognised prefix — strip leading slash
        path = path.lstrip("/")

    # Slugs use underscores where titles use spaces; "%20" and "_" in a URL
    # therefore have to resolve to the same entry path.
    path = path.strip().replace(" ", "_")

    # ZIM stores article pages under A/ namespace (OpenZIM convention)
    return f"A/{path}" if path else ""
79
+
80
+
81
# First *section* heading. <h1> is deliberately excluded: when a page carries
# its title as an <h1> ahead of the opening paragraphs, treating it as the
# boundary would cut the lead off before any paragraph is reached.
_LEAD_END_RE = re.compile(r"<h[2-6][^>]*>.*?</h[2-6]>", re.IGNORECASE | re.DOTALL)


def _extract_lead_section(html_content: str) -> str:
    """Extract the lead (opening) paragraphs before the first section heading.

    Most wiki articles begin with an untitled lead section followed by
    headings. All ``<p>`` elements that appear before the first
    ``<h2>``…``<h6>`` heading are collected and joined with blank lines.

    Args:
        html_content: Decoded article HTML.

    Returns:
        The lead paragraphs as plain text. When the lead contains no non-empty
        ``<p>`` element, the tag-stripped lead markup is returned instead.
    """
    boundary = _LEAD_END_RE.search(html_content)
    lead_html = html_content if boundary is None else html_content[: boundary.start()]

    # _strip_html already trims, so an empty string means "no text".
    paragraphs = (_strip_html(m.group(1)) for m in _PARA_RE.finditer(lead_html))
    text = "\n\n".join(p for p in paragraphs if p)

    # Fallback: strip all tags from the lead HTML.
    return text or _strip_html(lead_html)
98
+
99
+
100
def _extract_section_by_hint(html_content: str, passage_hint: str) -> str:
    """Extract the section whose heading best matches ``passage_hint``.

    Strategy (deterministic, no model call):
      1. Split on ``<h2>``…``<h6>`` headings. ``<h1>`` is not a boundary, so a
         page-title heading stays inside the lead section.
      2. Score each section by how many hint words occur in its heading (for
         the untitled lead section: in its body text). Matching is substring
         containment, so ``fraction`` also matches ``fractions``.
      3. Return the best-matching section's paragraph text; ties go to the
         earliest section.
      4. If no hint word matches anything, the first section (normally the
         lead) is returned.

    Args:
        html_content: Decoded article HTML.
        passage_hint: Human description of the wanted section.

    Returns:
        Plain text of the chosen section (its ``<p>`` elements joined by blank
        lines, or the tag-stripped section when it has no paragraphs).
    """
    # One capture group → re.split returns an odd-length list:
    # [lead, heading1, body1, heading2, body2, ...]
    parts = re.split(
        r"(<h[2-6][^>]*>.*?</h[2-6]>)", html_content, flags=re.IGNORECASE | re.DOTALL
    )

    sections: list[tuple[str, str]] = []
    if parts[0].strip():
        sections.append(("", parts[0]))  # untitled lead section
    for heading_html, body_html in zip(parts[1::2], parts[2::2]):
        sections.append((_strip_html(heading_html).lower(), body_html))

    if not sections:
        return _strip_html(html_content)

    # Hint words minus a few stop words that would match almost anything.
    hint_words = set(re.split(r"\W+", passage_hint.lower())) - {"", "the", "a", "an", "of", "and"}

    best_score = -1
    best_body = sections[0][1]  # default to the first (lead) section
    for heading_text, body_html in sections:
        # Headed sections are judged by their heading only; the lead has no
        # heading, so its body text stands in.
        candidate = heading_text or _strip_html(body_html).lower()
        score = sum(1 for word in hint_words if word in candidate)
        if score > best_score:  # strict ">" keeps the earliest section on ties
            best_score = score
            best_body = body_html

    paragraphs = (_strip_html(m.group(1)) for m in _PARA_RE.finditer(best_body))
    text = "\n\n".join(p for p in paragraphs if p)
    return text or _strip_html(best_body)
153
+
154
+
155
+ # ── Public ZimReader class ────────────────────────────────────────────────────
156
+
157
+
158
class ZimReader:
    """Thin owned wrapper around ``libzim.reader.Archive``.

    Usage::

        reader = ZimReader(zim_path)  # raises FileNotFoundError if path absent
        html_bytes = reader.get_by_url("https://en.vikidia.org/wiki/Fraction")
        text = reader.get_section(html_bytes, "Opening section — fraction as part")
    """

    # Upper bound on redirect hops followed per lookup. Guards against a
    # redirect cycle in the archive, which would otherwise loop forever.
    _MAX_REDIRECT_HOPS = 16

    def __init__(self, zim_path: str | Path) -> None:
        """Open the ZIM archive.

        Args:
            zim_path: Path to the ``.zim`` file.

        Raises:
            FileNotFoundError: If ``zim_path`` does not exist.
            RuntimeError: If libzim cannot open the archive.
        """
        from libzim.reader import Archive  # deferred: libzim not in test-time import

        path = Path(zim_path)
        if not path.exists():
            raise FileNotFoundError(f"ZIM file not found: {path}")
        self._archive = Archive(path)
        self._zim_path = path
        logger.debug("ZimReader: opened %s (%d entries)", path.name, self._archive.all_entry_count)

    # ── Lookup ────────────────────────────────────────────────────────────────

    def get_by_url(self, anchor_url: str) -> Optional[bytes]:
        """Resolve a wiki anchor URL to raw HTML bytes from the ZIM archive.

        Tries:
          1. ``A/<slug>`` path (ZIM A-namespace convention), then the bare slug.
          2. Title lookup as fallback (handles alternative capitalisations).

        Redirect entries are followed for at most ``_MAX_REDIRECT_HOPS`` hops.

        Args:
            anchor_url: Full wiki URL, e.g. ``https://en.vikidia.org/wiki/Fraction``.

        Returns:
            Raw HTML bytes of the article, or ``None`` if the article is not
            found or its redirect chain does not terminate.
        """
        zim_path = _anchor_to_zim_path(anchor_url)
        if not zim_path:
            logger.warning("get_by_url: could not derive ZIM path from anchor %r", anchor_url)
            return None

        slug = zim_path[2:]  # strip "A/"
        title = slug.replace("_", " ")

        # 1. Direct path lookup. Try the A/ namespace (older ZIM convention) AND the
        #    bare slug (modern libzim 3.x ZIMs store articles at the root, no A/ prefix).
        entry = self._lookup_path(zim_path) or self._lookup_path(slug)

        # 2. Title-based fallback using the slug as title (handles capitalisation variants)
        if entry is None:
            entry = self._lookup_title(title)

        if entry is None:
            logger.warning(
                "get_by_url: anchor %r not found in %s (tried path=%r, %r, title=%r)",
                anchor_url, self._zim_path.name, zim_path, slug, title,
            )
            return None

        target = self._follow_redirects(entry)
        if target is None:
            logger.warning(
                "get_by_url: redirect chain for anchor %r in %s exceeds %d hops — giving up",
                anchor_url, self._zim_path.name, self._MAX_REDIRECT_HOPS,
            )
            return None

        return bytes(target.get_item().content)

    def get_section(self, html_bytes: bytes, passage_hint: str = "") -> str:
        """Extract a plain-text passage from raw HTML bytes.

        The passage is guided by ``passage_hint`` (a human description, e.g.
        "Opening section — fraction as part of something"). With an empty (or
        ``None``) hint the lead section is returned.

        This method returns the content **verbatim** after stripping HTML tags.
        It never interprets, executes, or filters passage content — that is the
        prompt layer's responsibility (SAFETY §1.5 / W2.3).

        Args:
            html_bytes: Raw HTML bytes from :meth:`get_by_url`.
            passage_hint: Human hint describing which section to prefer.

        Returns:
            Plain-text passage (may be empty if HTML contained no text).
        """
        # Undecodable bytes become U+FFFD instead of raising.
        html_content = html_bytes.decode("utf-8", errors="replace")
        hint = passage_hint or ""  # tolerate an explicit None from config data
        if hint.strip():
            return _extract_section_by_hint(html_content, hint)
        return _extract_lead_section(html_content)

    # ── Internal helpers ──────────────────────────────────────────────────────

    @classmethod
    def _follow_redirects(cls, entry):
        """Return the non-redirect entry behind ``entry``, or ``None`` if the
        chain is still redirecting after ``_MAX_REDIRECT_HOPS`` hops."""
        for _ in range(cls._MAX_REDIRECT_HOPS):
            if not entry.is_redirect:
                return entry
            entry = entry.get_redirect_entry()
        return None if entry.is_redirect else entry

    def _lookup_path(self, zim_path: str):
        """Return an Entry by ZIM path, or None on KeyError."""
        try:
            return self._archive.get_entry_by_path(zim_path)
        except KeyError:
            return None

    def _lookup_title(self, title: str):
        """Return an Entry by title, or None on KeyError."""
        try:
            return self._archive.get_entry_by_title(title)
        except KeyError:
            return None

    def __repr__(self) -> str:
        return f"ZimReader({self._zim_path!r})"
@@ -0,0 +1,125 @@
1
+ """Pilot resolution path: node grounding block → passage string.
2
+
3
+ Pilot scope (anchor-resolution only):
4
+ node_grounding dict (source, anchor, passage_hint)
5
+ → scope guard (source_map.resolve_zim)
6
+ → cache lookup
7
+ → ZimReader.get_by_url(anchor)
8
+ → ZimReader.get_section(html_bytes, passage_hint)
9
+ → cache put
10
+ → plain-text passage (or "" on any failure)
11
+
12
+ No LLM title-prediction, no BM25, no embeddings — those are deferred to W7.5.
13
+
14
+ Degradation contract (SAFETY §1.5 / SPEC §15):
15
+ Every failure mode returns "" and logs a warning. This function NEVER raises
16
+ (the outer __init__.resolve_grounding has a belt-and-braces try/except too).
17
+
18
+ Spec: docs/design/W7_grounding_reader.md; SPEC §15 (layer-1 RAG).
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import logging
24
+
25
+ from mentar.grounding import cache as grounding_cache
26
+ from mentar.grounding.reader import ZimReader
27
+ from mentar.grounding.source_map import resolve_zim
28
+ from mentar.grounding.sources import materialize_zim
29
+
30
# Module-scoped logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Open ZimReader instances, keyed by the string form of the ZIM path they were
# opened from. Reusing an instance avoids re-opening the same archive on every
# turn: opening is cheap but not free, and this is a hot path.
_READER_POOL: dict[str, ZimReader] = dict()
36
+
37
+
38
def _get_reader(zim_path) -> ZimReader | None:
    """Fetch the pooled ZimReader for ``zim_path``, opening the archive if needed.

    A reader that opens successfully is stored in the pool under
    ``str(zim_path)`` and reused afterwards. Failures are not pooled.

    Returns ``None`` (after logging a warning) when the ZIM file is missing or
    cannot be opened.
    """
    pool_key = str(zim_path)
    try:
        return _READER_POOL[pool_key]
    except KeyError:
        pass  # not opened yet — fall through and open it

    try:
        opened = ZimReader(zim_path)
    except FileNotFoundError:
        logger.warning("resolve: ZIM not found at %s — returning empty passage", zim_path)
        return None
    except Exception:
        logger.warning("resolve: failed to open ZIM %s", zim_path, exc_info=True)
        return None

    _READER_POOL[pool_key] = opened
    return opened
56
+
57
+
58
def clear_reader_pool() -> None:
    """Forget every pooled ZimReader so the next lookup re-opens its archive.

    The pool dict is emptied in place. Mainly intended for tests.
    """
    _READER_POOL.clear()
61
+
62
+
63
def resolve_grounding_inner(node_grounding: dict, cfg: dict) -> str:
    """Core resolution: node grounding block → plain-text passage or "".

    Called by ``mentar.grounding.resolve_grounding``; may raise on truly
    unexpected errors (the public API wraps this in a try/except).

    Pipeline: scope guard → cache lookup → materialize ZIM → open reader →
    fetch article HTML → extract passage → cache store.

    The cache entry is keyed by the anchor *and* the passage hint: the
    extracted passage depends on both, so two nodes that point at the same
    article with different hints must not share one cached passage. Nodes
    without a hint keep the bare anchor as their key.

    Args:
        node_grounding: Dict with ``source``, ``anchor``, ``passage_hint``.
        cfg: The ``grounding:`` section of the runtime config.

    Returns:
        Plain-text passage string, or "" on any recoverable failure.
    """
    # ``or ""`` also covers keys that are present but null in the node data.
    source: str = node_grounding.get("source") or ""
    anchor: str = node_grounding.get("anchor") or ""
    passage_hint: str = node_grounding.get("passage_hint") or ""

    if not source or not anchor:
        logger.warning(
            "resolve: missing source or anchor in node_grounding=%r — returning empty",
            node_grounding,
        )
        return ""

    # ── 1. Scope guard + ZIM location resolution ──────────────────────────────
    zim_location = resolve_zim(source, anchor, cfg)
    if zim_location is None:
        # Logged inside resolve_zim (scope error or unconfigured source)
        return ""

    # ── 2. Cache lookup ───────────────────────────────────────────────────────
    # Before materialization, so a cache hit never triggers an SMB copy.
    hint_key = passage_hint.strip()
    cache_key = f"{anchor} | hint={hint_key}" if hint_key else anchor
    cached = grounding_cache.get(cache_key, cfg)
    if cached is not None:
        logger.debug("resolve: cache hit for anchor=%r", anchor)
        return cached

    # ── 3. Materialize the ZIM to a local path (copies from SMB if needed) ─────
    zim_path = materialize_zim(zim_location, cfg)
    if zim_path is None:
        # Logged inside materialize_zim (missing file / SMB failure / no smbprotocol)
        return ""

    # ── 4. Open ZIM reader ────────────────────────────────────────────────────
    reader = _get_reader(zim_path)
    if reader is None:
        return ""

    # ── 5. Fetch article HTML ─────────────────────────────────────────────────
    html_bytes = reader.get_by_url(anchor)
    if html_bytes is None:
        # Logged inside get_by_url
        return ""

    # ── 6. Extract passage ────────────────────────────────────────────────────
    passage = reader.get_section(html_bytes, passage_hint)
    if not passage or not passage.strip():
        logger.warning("resolve: empty passage for anchor=%r passage_hint=%r", anchor, passage_hint)
        return ""

    # ── 7. Cache and return ───────────────────────────────────────────────────
    grounding_cache.put(cache_key, passage, cfg)
    return passage
@@ -0,0 +1,120 @@
1
+ """Source-enum → ZIM-file mapping and anchor-host scope guard.
2
+
3
+ Responsibilities:
4
+ - Map a ``source`` enum value (vikidia | wikipedia_simple | wikibooks | …)
5
+ to the configured ZIM file path.
6
+ - Enforce the scope guard: a node's ``source`` must match the anchor's
7
+ hostname AND the configured ZIM for that source. A ``vikidia`` node must
8
+ never resolve out of the vikidia ZIM.
9
+
10
+ Spec: docs/design/W7_grounding_reader.md (Scope guard row in module contract).
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ from pathlib import Path
17
+ from urllib.parse import urlparse
18
+
19
# Module-scoped logger, named after this module's import path.
logger = logging.getLogger(__name__)

# ── Canonical host suffixes per source ───────────────────────────────────────
# An anchor is in scope when its hostname equals one of the suffixes listed for
# the declared source, or ends with "." + suffix. That blocks cross-source
# roaming without tying a source to a single subdomain.
# An empty tuple marks a source that has no network anchor (parent_upload,
# builtin): the host check is skipped for it.
_SOURCE_HOST_SUFFIXES: dict[str, tuple[str, ...]] = dict(
    vikidia=("vikidia.org",),
    wikipedia_simple=("simple.wikipedia.org",),
    wikibooks=("wikibooks.org",),
    parent_upload=(),
    builtin=(),
)
33
+
34
+
35
class ScopeError(ValueError):
    """Signals that an anchor's host is outside the scope of its declared source.

    Subclasses ``ValueError``, so handlers written for ``ValueError`` catch it too.
    """
37
+
38
+
39
def get_zim_path(source: str, cfg: dict) -> str | None:
    """Look up the configured ZIM *location* for ``source``.

    The ``sources`` entry for ``source`` (a structured spec or a plain filename
    string, as interpreted by ``resolve_filename``) is resolved to a file name
    relative to ``zim_dir`` and then joined with ``zim_dir`` via
    ``join_location``. The result may still be a non-local location;
    :func:`mentar.grounding.sources.materialize_zim` turns it into a local path.

    Args:
        source: Source enum string from the curriculum node (e.g. ``"vikidia"``).
        cfg: The ``grounding:`` config block (``zim_dir``, ``sources`` sub-dict).

    Returns:
        The joined location string, or ``None`` when the source has no entry
        in ``sources`` or no file name could be resolved for it.
    """
    from mentar.grounding.sources import join_location, resolve_filename

    # A missing or empty zim_dir means "current directory".
    zim_dir = cfg.get("zim_dir", "") or "."
    configured_sources = cfg.get("sources") or {}
    spec = configured_sources.get(source)
    if not spec:
        logger.debug("get_zim_path: source %r not in config.grounding.sources", source)
        return None

    filename = resolve_filename(spec, zim_dir, cfg)
    if filename:
        return join_location(zim_dir, filename)

    logger.warning("get_zim_path: no ZIM file resolved for source %r in %r", source, zim_dir)
    return None
68
+
69
+
70
def check_scope(source: str, anchor: str) -> None:
    """Verify that ``anchor``'s hostname matches ``source``'s expected host(s).

    The comparison uses the URL's bare hostname: lower-cased, without port or
    ``user:password@`` prefix, and without a trailing root dot. A host matches
    when it equals an expected suffix or ends with ``"." + suffix``.

    Args:
        source: Declared source enum (e.g. ``"vikidia"``).
        anchor: The anchor URL from the curriculum node.

    Raises:
        ScopeError: If the source is unknown, the anchor cannot be parsed as a
            URL, or the anchor host does not match the expected source hosts.
    """
    suffixes = _SOURCE_HOST_SUFFIXES.get(source)
    if suffixes is None:
        # Unknown source — reject for safety
        raise ScopeError(
            f"Unknown source {source!r}; expected one of {sorted(_SOURCE_HOST_SUFFIXES)}"
        )
    if not suffixes:
        # Sources without a network anchor (parent_upload, builtin) — no URL check needed
        return

    try:
        # .hostname (unlike .netloc) excludes port and credentials, so
        # "en.vikidia.org:443" is judged by its host alone.
        host = (urlparse(anchor).hostname or "").rstrip(".")
    except ValueError as exc:
        # urlparse raises a plain ValueError for some malformed hosts; surface
        # it as a ScopeError so callers that handle scope failures degrade
        # instead of crashing.
        raise ScopeError(
            f"Scope violation: source={source!r} but anchor {anchor!r} is not a valid URL ({exc})"
        ) from exc

    if not any(host == s or host.endswith("." + s) for s in suffixes):
        raise ScopeError(
            f"Scope violation: source={source!r} but anchor host={host!r} "
            f"(expected host matching {suffixes})"
        )
97
+
98
+
99
def resolve_zim(source: str, anchor: str, cfg: dict) -> str | None:
    """Run the scope guard, then look up the ZIM *location* for ``source``.

    Wraps :func:`check_scope` and :func:`get_zim_path`. A scope violation is
    logged as a warning and reported as ``None`` rather than raised, so callers
    can follow the degradation contract.

    Args:
        source: Source enum string.
        anchor: Full wiki URL from the curriculum node.
        cfg: Grounding config block.

    Returns:
        The ZIM location string (local / mounted / SMB), or ``None`` on scope
        error / missing config.
    """
    try:
        check_scope(source, anchor)
    except ScopeError as exc:
        logger.warning("resolve_zim: %s — returning None (degradation path)", exc)
        return None
    else:
        # Only reached when the anchor passed the scope guard.
        return get_zim_path(source, cfg)