mentar 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mentar/__init__.py +6 -0
- mentar/cli/__init__.py +1 -0
- mentar/cli/__main__.py +62 -0
- mentar/db/__init__.py +4 -0
- mentar/db/store.py +416 -0
- mentar/dialogue/__init__.py +4 -0
- mentar/engine/__init__.py +4 -0
- mentar/engine/bkt.py +99 -0
- mentar/engine/fringe.py +104 -0
- mentar/engine/probe_classify.py +79 -0
- mentar/eval/__init__.py +4 -0
- mentar/eval/verify_numeric.py +619 -0
- mentar/grounding/__init__.py +65 -0
- mentar/grounding/cache.py +127 -0
- mentar/grounding/reader.py +271 -0
- mentar/grounding/resolve.py +125 -0
- mentar/grounding/source_map.py +120 -0
- mentar/grounding/sources.py +267 -0
- mentar/grounding/wrapper.py +50 -0
- mentar/inference/__init__.py +7 -0
- mentar/safety/__init__.py +4 -0
- mentar/safety/escalation.py +316 -0
- mentar/tools/__init__.py +4 -0
- mentar/tools/validate_template.py +322 -0
- mentar-0.1.0.dev0.dist-info/METADATA +178 -0
- mentar-0.1.0.dev0.dist-info/RECORD +29 -0
- mentar-0.1.0.dev0.dist-info/WHEEL +5 -0
- mentar-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- mentar-0.1.0.dev0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,619 @@
|
|
|
1
|
+
"""
|
|
2
|
+
verify_numeric.py — Deterministic fraction/integer verifier for Mentar.
|
|
3
|
+
|
|
4
|
+
SAFETY-CRITICAL: per SPEC §15 Layer 2, every numeric/worked step the LLM generates
|
|
5
|
+
must be computationally verified BEFORE display. A wrong-but-confident verification
|
|
6
|
+
is a safety failure. Err on safe-reject over false-pass.
|
|
7
|
+
|
|
8
|
+
Supports T1.3 (eval-time scoring) and T3.5 (runtime serve-time gate).
|
|
9
|
+
Stdlib only — fractions.Fraction + re. No third-party deps.
|
|
10
|
+
|
|
11
|
+
Design decisions documented inline:
|
|
12
|
+
- Decimals (e.g. "0.5"): SAFE_REJECT. Not in pilot scope (SPEC §23, fractions.md
|
|
13
|
+
"Out of scope: decimal/fraction conversion"). Accepting decimals silently could
|
|
14
|
+
produce a false-pass if the LLM gives "0.5" when the expected form is "1/2".
|
|
15
|
+
- Unicode vulgar fractions (½ ¼ ¾ etc.): mapped to their a/b equivalents before
|
|
16
|
+
parsing — cheap via a small lookup table and avoids SAFE_REJECT on copy-paste input.
|
|
17
|
+
- Mixed numbers ("1 1/2"): parsed as whole + fraction; ambiguous forms with more than
|
|
18
|
+
one space-separated token that look like independent fractions → SAFE_REJECT.
|
|
19
|
+
- Negative denominators: Fraction normalises these natively (e.g. 3/-6 → -1/2).
|
|
20
|
+
- Zero denominator: SAFE_REJECT (never crash, never accept).
|
|
21
|
+
- Multiple plausible candidates of equal precedence at the same position → SAFE_REJECT.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import re
|
|
27
|
+
from dataclasses import dataclass
|
|
28
|
+
from enum import Enum
|
|
29
|
+
from fractions import Fraction
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Public types
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
class CheckResult(Enum):
    """Closed set of verdicts a verification attempt can end in."""

    # A candidate was extracted and matched the ground truth.
    PASS = "pass"
    # A candidate was extracted but did not match (or did not parse as expected).
    FAIL = "fail"
    # No candidate answer could be located in the model output.
    EXTRACT_FAIL = "extract_fail"
    # Input was malformed or ambiguous, so the verifier refuses to judge it.
    SAFE_REJECT = "safe_reject"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class CheckOutcome:
    """Record returned by every checker: the verdict plus supporting evidence."""

    # Verdict of the verification attempt.
    result: CheckResult
    # Raw candidate text pulled from the model output; None if nothing was extracted.
    extracted: str | None
    # Normalised rendering of the candidate (e.g. "1/2"); None if not available.
    canonical: str | None
    # Human-readable explanation of how the verdict was reached.
    detail: str
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ---------------------------------------------------------------------------
|
|
52
|
+
# Unicode vulgar-fraction table (optional bonus — cheap lookup)
|
|
53
|
+
# ---------------------------------------------------------------------------
|
|
54
|
+
|
|
55
|
+
_UNICODE_FRACTIONS: dict[str, str] = {
|
|
56
|
+
"½": "1/2",
|
|
57
|
+
"⅓": "1/3",
|
|
58
|
+
"⅔": "2/3",
|
|
59
|
+
"¼": "1/4",
|
|
60
|
+
"¾": "3/4",
|
|
61
|
+
"⅕": "1/5",
|
|
62
|
+
"⅖": "2/5",
|
|
63
|
+
"⅗": "3/5",
|
|
64
|
+
"⅘": "4/5",
|
|
65
|
+
"⅙": "1/6",
|
|
66
|
+
"⅚": "5/6",
|
|
67
|
+
"⅛": "1/8",
|
|
68
|
+
"⅜": "3/8",
|
|
69
|
+
"⅝": "5/8",
|
|
70
|
+
"⅞": "7/8",
|
|
71
|
+
"⅐": "1/7",
|
|
72
|
+
"⅑": "1/9",
|
|
73
|
+
"⅒": "1/10",
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
_UNICODE_FRAC_RE = re.compile("|".join(re.escape(c) for c in _UNICODE_FRACTIONS))
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _expand_unicode_fractions(text: str) -> str:
|
|
80
|
+
"""Replace Unicode vulgar-fraction characters with their a/b form."""
|
|
81
|
+
return _UNICODE_FRAC_RE.sub(lambda m: _UNICODE_FRACTIONS[m.group()], text)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# ---------------------------------------------------------------------------
|
|
85
|
+
# Regex patterns
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
|
|
88
|
+
# Fraction pattern: optional leading sign, optional whole number, then a/b
|
|
89
|
+
# We deliberately do NOT allow spaces inside the numerator/denominator tokens.
|
|
90
|
+
# Matches: "1/2", "-3/5", "2/4", "10/3"
|
|
91
|
+
_FRAC_PAT = r"-?\d+\s*/\s*-?\d+"
|
|
92
|
+
|
|
93
|
+
# Mixed-number pattern: whole SP fraction (e.g. "1 1/2", "-2 3/4")
|
|
94
|
+
# Requires exactly one space between whole and fraction.
|
|
95
|
+
_MIXED_PAT = r"-?\d+\s+\d+\s*/\s*\d+"
|
|
96
|
+
|
|
97
|
+
# Pure integer (no slash)
|
|
98
|
+
_INT_PAT = r"-?\d+"
|
|
99
|
+
|
|
100
|
+
# <answer> tag extraction
|
|
101
|
+
_ANSWER_TAG_RE = re.compile(r"<answer>\s*(.*?)\s*</answer>", re.IGNORECASE | re.DOTALL)
|
|
102
|
+
|
|
103
|
+
# Multiple-choice letter (A-D) or digit (1-4), possibly in parentheses or quoted
|
|
104
|
+
_MC_LETTER_RE = re.compile(r"\b([A-Da-d])\b")
|
|
105
|
+
_MC_DIGIT_RE = re.compile(r"\b([1-4])\b")
|
|
106
|
+
|
|
107
|
+
# Full patterns compiled
|
|
108
|
+
_MIXED_RE = re.compile(_MIXED_PAT)
|
|
109
|
+
_FRAC_RE = re.compile(_FRAC_PAT)
|
|
110
|
+
_INT_RE = re.compile(_INT_PAT)
|
|
111
|
+
|
|
112
|
+
# Decimal detection — reject these explicitly
|
|
113
|
+
_DECIMAL_RE = re.compile(r"\b\d+\.\d+\b")
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# ---------------------------------------------------------------------------
|
|
117
|
+
# normalise_fraction
|
|
118
|
+
# ---------------------------------------------------------------------------
|
|
119
|
+
|
|
120
|
+
def normalise_fraction(s: str) -> Fraction | None:
    """
    Parse a fraction string and return a normalised Fraction, or None on failure.

    Accepted forms (surrounding whitespace is ignored):
      - "1/2", "2/4", "-3/5"  → proper/improper fraction
      - "3"                   → integer (whole number)
      - "1 1/2", "-2 3/4"     → mixed number (whole + fraction)
      - Unicode vulgar fractions via the _expand_unicode_fractions pre-pass

    Returns None (callers treat this as SAFE_REJECT) for:
      - Zero denominator ("1/0", "5/0", "1 1/0")
      - Non-integer components ("a/b", "1.5/2") and any decimal
      - Empty or whitespace-only string
      - Digit runs that int() refuses to convert
      - Anything that doesn't match the above forms
    """
    if not s or not s.strip():
        return None

    token = _expand_unicode_fractions(s.strip())

    # Decimal in the token → reject (not in pilot scope)
    if _DECIMAL_RE.search(token):
        return None

    # All int()/Fraction() calls live inside one try so that a conversion error
    # (e.g. a digit string longer than the interpreter's int-conversion limit)
    # yields None instead of escaping from a function documented to return None.
    try:
        # Mixed number first ("1 1/2"), otherwise "1 1/2" could never be reached.
        mixed_m = re.fullmatch(r"\s*(-?)(\d+)\s+(\d+)\s*/\s*(\d+)\s*", token)
        if mixed_m:
            sign, whole_txt, num_txt, den_txt = mixed_m.groups()
            den = int(den_txt)
            if den == 0:
                return None  # SAFE_REJECT — zero denominator
            magnitude = Fraction(int(whole_txt) * den + int(num_txt), den)
            # The sign is read from the text, not from the parsed whole part:
            # int("-0") is 0, so "-0 1/2" would otherwise come out positive.
            return -magnitude if sign else magnitude

        # Plain fraction ("a/b"); Fraction normalises a negative denominator.
        frac_m = re.fullmatch(r"\s*(-?\d+)\s*/\s*(-?\d+)\s*", token)
        if frac_m:
            den = int(frac_m.group(2))
            if den == 0:
                return None  # SAFE_REJECT — zero denominator
            return Fraction(int(frac_m.group(1)), den)

        # Plain integer
        int_m = re.fullmatch(r"\s*(-?\d+)\s*", token)
        if int_m:
            return Fraction(int(int_m.group(1)))
    except (ValueError, ZeroDivisionError):
        return None

    return None
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# ---------------------------------------------------------------------------
|
|
183
|
+
# extract_answer
|
|
184
|
+
# ---------------------------------------------------------------------------
|
|
185
|
+
|
|
186
|
+
def extract_answer(text: str, answer_type: str) -> str | None:
    """
    Locate the candidate answer inside free-form LLM output.

    Unicode vulgar fractions are expanded to a/b form first, then the work is
    delegated according to ``answer_type``:

      - "mc4"              → _extract_mc (last standalone A-D letter or 1-4 digit)
      - "fraction" / "int" → _extract_numeric (<answer> tag, else mixed number,
                             else fraction, else integer)
      - anything else      → None

    Returns None as well when ``text`` is empty or whitespace-only, or when the
    delegate cannot settle on a single candidate.
    """
    if text and text.strip():
        prepared = _expand_unicode_fractions(text)
        if answer_type == "mc4":
            return _extract_mc(prepared)
        if answer_type in ("fraction", "int"):
            return _extract_numeric(prepared, answer_type)
    return None
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _strip_punct(s: str) -> str:
|
|
222
|
+
"""Strip trailing punctuation and whitespace."""
|
|
223
|
+
return s.rstrip(".!?,;: \t\n")
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _extract_mc(text: str) -> str | None:
    """
    Extract the last multiple-choice token (A-D or 1-4) from text.

    Both the letter pattern and the digit pattern are scanned and whichever
    match starts furthest to the right wins. A letter match and a digit match
    can never start at the same index (each matches exactly one character from
    disjoint sets), so no tie-break is needed.

    Returns the letter upper-cased, the digit unchanged, or None when neither
    pattern matches.

    NOTE(review): the letter pattern also matches a standalone English word
    "a"/"A", so prose after the real answer can displace it — confirm that
    callers constrain the output format.
    """
    rightmost = None
    for pattern in (_MC_LETTER_RE, _MC_DIGIT_RE):
        for hit in pattern.finditer(text):
            if rightmost is None or hit.start() > rightmost.start():
                rightmost = hit

    if rightmost is None:
        return None
    # upper() is a no-op for digits, so one return serves both token kinds.
    return rightmost.group(1).upper()
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _extract_numeric(text: str, answer_type: str) -> str | None:
    """
    Extract a numeric candidate from text.

    Priority:
      1. Content of the LAST <answer>…</answer> tag. If any tag is present the
         result comes from it alone; empty tag content yields None.
      2. Last mixed number
      3. Last standalone fraction
      4. Last standalone integer

    Ambiguity (returns None):
      - the last mixed number and the last standalone fraction are separated by
        fewer than 10 characters containing the word "or" ("1 1/2 or 3/4");
      - the last two standalone fractions are joined by "or", or are separated
        by 5 characters or fewer ("1/2 3/4").

    ``answer_type`` is accepted for signature compatibility and is not consulted.
    Trailing punctuation and whitespace are stripped from the returned candidate.

    NOTE(review): precedence is by kind, not position — a mixed number anywhere
    in the text outranks a later plain fraction. Confirm this is intended.
    """

    def _inside(match: re.Match[str], spans: list[tuple[int, int]]) -> bool:
        # True when ``match`` lies entirely within one of ``spans``.
        return any(lo <= match.start() and match.end() <= hi for lo, hi in spans)

    # 1. <answer> tag — the documented contract is the last tag, not the first.
    tagged = _ANSWER_TAG_RE.findall(text)
    if tagged:
        content = _strip_punct(tagged[-1].strip())
        return content if content else None

    # 2. Decimals are not blocked here; normalise_fraction rejects them later.

    # 3. Fractions first, because they are needed to vet mixed-number matches.
    frac_matches = list(_FRAC_RE.finditer(text))
    frac_spans = [(m.start(), m.end()) for m in frac_matches]

    # 4. Mixed numbers. Drop a match whose "whole" part begins inside an earlier
    #    fraction: in "1/2 3/4" the pattern would otherwise read "2 3/4", reusing
    #    the denominator of "1/2" as a whole number.
    mixed_matches = [
        m for m in _MIXED_RE.finditer(text)
        if not any(fs < m.start() < fe for fs, fe in frac_spans)
    ]
    mixed_spans = [(m.start(), m.end()) for m in mixed_matches]

    # 5. Standalone fractions / integers: those not part of a larger candidate.
    frac_standalone = [m for m in frac_matches if not _inside(m, mixed_spans)]
    int_standalone = [
        m for m in _INT_RE.finditer(text)
        if not _inside(m, mixed_spans + frac_spans)
    ]

    # Select by precedence (mixed > fraction > int), using the last occurrence.
    if mixed_matches:
        last_mixed = mixed_matches[-1]
        if frac_standalone:
            last_frac = frac_standalone[-1]
            gap_start = min(last_mixed.end(), last_frac.end())
            gap_end = max(last_mixed.start(), last_frac.start())
            # Measure the gap BETWEEN the two candidates; comparing start
            # offsets would let a longer first candidate ("11 1/2 or 3/4")
            # slip past the check.
            if gap_end - gap_start < 10:
                if re.search(r"\bor\b", text[gap_start:gap_end], re.IGNORECASE):
                    return None  # Ambiguous
        return _strip_punct(last_mixed.group())

    if frac_standalone:
        last_frac = frac_standalone[-1]
        if len(frac_standalone) >= 2:
            second_last = frac_standalone[-2]
            gap = text[second_last.end():last_frac.start()]
            if re.search(r"\bor\b", gap, re.IGNORECASE):
                return None  # "1/2 or 3/4" → ambiguous
            if last_frac.start() - second_last.end() <= 5:
                return None  # Two fractions with almost nothing between them
        return _strip_punct(last_frac.group())

    if int_standalone:
        return _strip_punct(int_standalone[-1].group())

    return None
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
# ---------------------------------------------------------------------------
|
|
332
|
+
# Canonical string representation
|
|
333
|
+
# ---------------------------------------------------------------------------
|
|
334
|
+
|
|
335
|
+
def _canonical_str(f: Fraction) -> str:
|
|
336
|
+
"""Return the simplest string form of a normalised Fraction."""
|
|
337
|
+
if f.denominator == 1:
|
|
338
|
+
return str(f.numerator)
|
|
339
|
+
return f"{f.numerator}/{f.denominator}"
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
# ---------------------------------------------------------------------------
|
|
343
|
+
# check — main entry point
|
|
344
|
+
# ---------------------------------------------------------------------------
|
|
345
|
+
|
|
346
|
+
def check(
    answer_type: str,
    checker: str,
    llm_output: str,
    ground_truth: str,
) -> CheckOutcome:
    """
    Verify an LLM-generated answer against ground truth.

    Parameters
    ----------
    answer_type : str
        One of "int", "fraction", "mc4", "free_text" (matches fractions.md verifier.answer_type).
        Not consulted for dispatch; ``checker`` alone selects the implementation.
    checker : str
        One of "int_exact", "fraction_equiv", "mc_choice", "none".
    llm_output : str
        The raw LLM-generated text containing (or purportedly containing) the answer.
    ground_truth : str
        The correct answer as a plain string (e.g. "3", "1/2", "A").

    Returns
    -------
    CheckOutcome
        result is PASS / FAIL / EXTRACT_FAIL / SAFE_REJECT.
        Empty, whitespace-only or otherwise falsy llm_output → EXTRACT_FAIL.
        Non-string llm_output or an unknown checker → SAFE_REJECT.
        Never raises — all errors surface as SAFE_REJECT.
    """
    # The input guards live inside the try as well, so the "never raises"
    # promise also covers a caller that passes something other than a string.
    try:
        # Guard: nothing at all (None, "", or any other falsy value)
        if not llm_output:
            return CheckOutcome(
                result=CheckResult.EXTRACT_FAIL,
                extracted=None,
                canonical=None,
                detail="Empty llm_output — nothing to verify.",
            )

        # Guard: wrong type — refuse rather than fail on a missing str method
        if not isinstance(llm_output, str):
            return CheckOutcome(
                result=CheckResult.SAFE_REJECT,
                extracted=None,
                canonical=None,
                detail=(
                    f"llm_output must be a string, got {type(llm_output).__name__}"
                    " — safe-reject."
                ),
            )

        # Guard: whitespace only
        if not llm_output.strip():
            return CheckOutcome(
                result=CheckResult.EXTRACT_FAIL,
                extracted=None,
                canonical=None,
                detail="Empty llm_output — nothing to verify.",
            )

        # Dispatch
        if checker == "none":
            return _check_none()
        if checker == "int_exact":
            return _check_int_exact(llm_output, ground_truth)
        if checker == "fraction_equiv":
            return _check_fraction_equiv(llm_output, ground_truth)
        if checker == "mc_choice":
            return _check_mc_choice(llm_output, ground_truth)

        return CheckOutcome(
            result=CheckResult.SAFE_REJECT,
            extracted=None,
            canonical=None,
            detail=f"Unknown checker '{checker}' — safe-reject to avoid false-pass.",
        )
    except Exception as exc:  # noqa: BLE001
        # Belt-and-suspenders: any unhandled exception → SAFE_REJECT, not crash
        return CheckOutcome(
            result=CheckResult.SAFE_REJECT,
            extracted=None,
            canonical=None,
            detail=f"Unexpected error during verification: {exc!r} — safe-reject.",
        )
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
# ---------------------------------------------------------------------------
|
|
409
|
+
# Individual checker implementations
|
|
410
|
+
# ---------------------------------------------------------------------------
|
|
411
|
+
|
|
412
|
+
def _check_none() -> CheckOutcome:
    """Checker 'none': unconditional PASS with no extracted or canonical value."""
    explanation = "Checker 'none': concept is non-checkable; auto-pass."
    verdict = CheckResult.PASS
    return CheckOutcome(result=verdict, extracted=None, canonical=None, detail=explanation)
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def _check_int_exact(llm_output: str, ground_truth: str) -> CheckOutcome:
    """
    Extract an integer candidate from llm_output and compare to int(ground_truth).

    Outcomes:
      - ground_truth not parseable as an integer → SAFE_REJECT
      - llm_output contains a decimal            → SAFE_REJECT
      - no candidate can be extracted            → EXTRACT_FAIL
      - candidate is not a plain integer literal → FAIL (e.g. "3/4", "1_0")
      - otherwise PASS when the values are equal, FAIL when they differ
    """
    # Validate ground_truth
    try:
        gt_val = int(ground_truth.strip())
    except (ValueError, AttributeError):
        return CheckOutcome(
            result=CheckResult.SAFE_REJECT,
            extracted=None,
            canonical=None,
            detail=f"ground_truth '{ground_truth}' is not a valid integer — safe-reject.",
        )

    # Pre-extraction decimal guard: an LLM answer of "0.5" must not silently
    # fall through to the integer extraction (which would grab "5" or "0").
    if _DECIMAL_RE.search(llm_output):
        return CheckOutcome(
            result=CheckResult.SAFE_REJECT,
            extracted=None,
            canonical=None,
            detail="llm_output contains a decimal — pilot expects integer answers; safe-reject.",
        )

    # Extract candidate
    candidate = extract_answer(llm_output, "int")
    if candidate is None:
        return CheckOutcome(
            result=CheckResult.EXTRACT_FAIL,
            extracted=None,
            canonical=None,
            detail="Could not extract an integer candidate from llm_output.",
        )

    # Parse the candidate strictly. int() alone also accepts underscore digit
    # grouping, so "1_0" would be verified as 10 although it is not what a
    # reader sees as an integer answer. Require sign + digits only.
    token = candidate.strip()
    cand_val = None
    if re.fullmatch(r"[+-]?\d+", token):
        try:
            cand_val = int(token)
        except ValueError:
            cand_val = None  # int() refused the digit run; handled as FAIL below

    if cand_val is None:
        # Candidate extracted but not an integer (e.g. "3/4") — FAIL, not SAFE_REJECT
        return CheckOutcome(
            result=CheckResult.FAIL,
            extracted=candidate,
            canonical=None,
            detail=f"Extracted '{candidate}' but could not parse as integer (expected {gt_val}).",
        )

    canonical = str(cand_val)
    result = CheckResult.PASS if cand_val == gt_val else CheckResult.FAIL
    detail = (
        f"Extracted {cand_val!r}, expected {gt_val!r}: {'match' if result == CheckResult.PASS else 'mismatch'}."
    )
    return CheckOutcome(result=result, extracted=candidate, canonical=canonical, detail=detail)
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def _check_fraction_equiv(llm_output: str, ground_truth: str) -> CheckOutcome:
    """
    Compare the fraction/integer found in llm_output with ground_truth by value.

    Both sides are normalised with normalise_fraction, so "2/4" and "1/2" are
    equivalent.

    Outcomes, in the order they are tested:
      - decimal in ground_truth                          → SAFE_REJECT
      - decimal in llm_output                            → SAFE_REJECT
      - ground_truth cannot be normalised                → SAFE_REJECT
      - no candidate, but a fraction pattern is present  → SAFE_REJECT (ambiguous)
      - no candidate and no fraction pattern             → EXTRACT_FAIL
      - decimal in the extracted candidate               → SAFE_REJECT
      - candidate cannot be normalised (incl. zero den.) → SAFE_REJECT
      - otherwise PASS when equal in value, else FAIL
    """

    def _reject(message: str, extracted: str | None = None) -> CheckOutcome:
        # Every refusal shares this shape; only the message and candidate vary.
        return CheckOutcome(
            result=CheckResult.SAFE_REJECT,
            extracted=extracted,
            canonical=None,
            detail=message,
        )

    # A decimal ground truth is a configuration error.
    if _DECIMAL_RE.search(ground_truth):
        return _reject(
            f"ground_truth '{ground_truth}' contains a decimal — not in pilot scope; safe-reject."
        )

    # Reject decimal-shaped model output before extraction, so the integer
    # fallback cannot pick a lone digit out of something like "0.5".
    if _DECIMAL_RE.search(llm_output):
        return _reject("llm_output contains a decimal — not in pilot scope; safe-reject.")

    expected = normalise_fraction(ground_truth.strip())
    if expected is None:
        return _reject(
            f"ground_truth '{ground_truth}' could not be normalised to a fraction — safe-reject."
        )

    candidate = extract_answer(llm_output, "fraction")

    if candidate is None:
        # extract_answer returns None both for "nothing found" and for "could
        # not choose". Heuristic: any fraction-shaped token in the text means
        # the latter, which must be refused rather than reported as missing.
        expanded = _expand_unicode_fractions(llm_output)
        if _FRAC_RE.search(expanded) or _MIXED_RE.search(expanded):
            return _reject(
                "Multiple fraction candidates found but could not unambiguously select one — safe-reject."
            )
        return CheckOutcome(
            result=CheckResult.EXTRACT_FAIL,
            extracted=None,
            canonical=None,
            detail="No fraction or integer candidate found in llm_output.",
        )

    if _DECIMAL_RE.search(candidate):
        return _reject(
            f"Extracted candidate '{candidate}' contains a decimal — not in pilot scope; safe-reject.",
            candidate,
        )

    actual = normalise_fraction(candidate)
    if actual is None:
        # Name the zero-denominator case explicitly in the explanation.
        if re.search(r"/\s*0\b", candidate):
            return _reject(f"Extracted '{candidate}' has zero denominator — safe-reject.", candidate)
        return _reject(f"Extracted '{candidate}' could not be normalised — safe-reject.", candidate)

    canonical = _canonical_str(actual)
    gt_canonical = _canonical_str(expected)
    equivalent = actual == expected
    verdict = CheckResult.PASS if equivalent else CheckResult.FAIL
    explanation = (
        f"Extracted '{candidate}' → {canonical}; "
        f"expected '{ground_truth}' → {gt_canonical}: "
        f"{'equivalent' if equivalent else 'not equivalent'}."
    )
    return CheckOutcome(
        result=verdict,
        extracted=candidate,
        canonical=canonical,
        detail=explanation,
    )
|
|
581
|
+
|
|
582
|
+
|
|
583
|
+
def _check_mc_choice(llm_output: str, ground_truth: str) -> CheckOutcome:
    """
    Compare the last multiple-choice token in llm_output with ground_truth.

    Letters are compared case-insensitively; digits are compared as written.
    No mapping between letters and digits is applied ("A" and "1" differ).

    Outcomes:
      - ground_truth is not a single A-D / a-d / 1-4 character → SAFE_REJECT
      - no choice can be extracted from llm_output             → EXTRACT_FAIL
      - otherwise PASS on a match, FAIL on a mismatch
    """

    def _fold(choice: str) -> str:
        # Letters → uppercase; digits are returned unchanged.
        return choice.upper() if choice.isalpha() else choice

    gt = ground_truth.strip()
    if re.fullmatch(r"[A-Da-d1-4]", gt) is None:
        return CheckOutcome(
            result=CheckResult.SAFE_REJECT,
            extracted=None,
            canonical=None,
            detail=f"ground_truth '{gt}' is not a valid MC choice (A-D or 1-4) — safe-reject.",
        )

    candidate = extract_answer(llm_output, "mc4")
    if candidate is None:
        return CheckOutcome(
            result=CheckResult.EXTRACT_FAIL,
            extracted=None,
            canonical=None,
            detail="Could not extract an MC choice from llm_output.",
        )

    cand_norm = _fold(candidate)
    gt_norm = _fold(gt)
    matched = cand_norm == gt_norm

    explanation = (
        f"Extracted '{candidate}' (normalised '{cand_norm}'), "
        f"expected '{gt}' (normalised '{gt_norm}'): "
        f"{'match' if matched else 'mismatch'}."
    )
    return CheckOutcome(
        result=CheckResult.PASS if matched else CheckResult.FAIL,
        extracted=candidate,
        canonical=cand_norm,
        detail=explanation,
    )
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Grounding / ZIM-reader module: resolve a curriculum node's grounding block to a passage.
|
|
2
|
+
|
|
3
|
+
Primary path (pilot scope, anchor-resolution only):
|
|
4
|
+
resolve_grounding(node_grounding, cfg) -> str
|
|
5
|
+
node_grounding: dict with keys source, anchor, passage_hint (from curriculum YAML)
|
|
6
|
+
cfg: dict from config/inference.yaml grounding: block
|
|
7
|
+
returns: inner passage text for {{grounding_passage}}, or "" on any failure
|
|
8
|
+
|
|
9
|
+
Degradation contract (SAFETY §1.5 / SPEC §15):
|
|
10
|
+
ZIM missing | anchor not found | empty passage → returns "", logs a warning, NEVER raises.
|
|
11
|
+
A grounding failure must never crash a tutoring turn.
|
|
12
|
+
|
|
13
|
+
Scope: anchor-resolution only (pilot). Title-prediction / BM25 / embeddings deferred to W7.5.
|
|
14
|
+
Deps: libzim (runtime, pinned). OpenZIM MCP (MIT) = reference only. No MCP server, no JSON-RPC.
|
|
15
|
+
Spec: docs/design/W7_grounding_reader.md; SPEC §15 (layer-1 RAG); SAFETY §1.5 (grounding-as-data).
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import logging
|
|
21
|
+
|
|
22
|
+
from mentar.grounding.resolve import resolve_grounding_inner
|
|
23
|
+
from mentar.grounding.wrapper import wrap_passage
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
__all__ = ["resolve_grounding"]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def resolve_grounding(node_grounding: dict, cfg: dict) -> str:
    """Turn a curriculum node's grounding block into a passage string.

    The block is resolved with ``resolve_grounding_inner`` and the result is
    passed through ``wrap_passage``. Per the degradation contract this function
    never raises: any problem is logged as a warning and "" is returned, so a
    missing or broken ZIM cannot crash a tutoring turn.

    Args:
        node_grounding: The ``grounding:`` sub-dict from a curriculum concept
            node (keys ``source``, ``anchor``, ``passage_hint``). Anything that
            is not a dict is treated as "no grounding".
        cfg: The ``grounding:`` section of ``config/inference.yaml``, as loaded
            by the caller.

    Returns:
        The value produced by ``wrap_passage`` on success; "" when
        ``node_grounding`` is not a dict or when resolution/wrapping raises.
    """
    if isinstance(node_grounding, dict):
        try:
            return wrap_passage(resolve_grounding_inner(node_grounding, cfg), cfg)
        except Exception:
            # node_grounding is known to be a dict here, so .get() is safe.
            logger.warning(
                "resolve_grounding: unexpected error for anchor=%r — returning empty passage",
                node_grounding.get("anchor", "<unknown>"),
                exc_info=True,
            )
            return ""

    # Missing or malformed grounding block (None, list, str, ...).
    logger.warning(
        "resolve_grounding: node_grounding is not a dict (%s) — returning empty passage",
        type(node_grounding).__name__,
    )
    return ""
|