zu-checks 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+
9
+ # uv / venv
10
+ .venv/
11
+ uv.lock.bak
12
+
13
+ # Test / type caches
14
+ .pytest_cache/
15
+ .mypy_cache/
16
+ .ruff_cache/
17
+ .coverage
18
+ htmlcov/
19
+
20
+ # Zu runtime artifacts
21
+ *.db
22
+ zu.db
23
+ zu.yaml.local
24
+ zu_review.jsonl
25
+ *.review.jsonl
26
+ # Per-agent cost telemetry ledger — machine-local run history, not source.
27
+ cost.jsonl
28
+ # A recorded replay path is learned per-run and machine-local — regenerated on
29
+ # every successful run, not source. The agent ships; its track does not.
30
+ track.json
31
+ # …except the flagship example ships its track on purpose, as a demo of the
32
+ # record/replay convergence (committed; re-runs show as ordinary modifications).
33
+ !examples/agents/vet-appointment/track.json
34
+
35
+ # Editor / OS
36
+ .idea/
37
+ .vscode/
38
+ .DS_Store
39
+
40
+ # Claude Code local session state
41
+ .claude/
42
+
43
+ # Secrets
44
+ .env
45
+ .env.*
46
+ !.env.example
47
+
48
+ # Microsoft Office temp/lock files
49
+ ~$*
50
+
51
+ # Internal design / strategy docs — kept local, never in the public repo
52
+ *.docx
53
+ *.pdf
54
+ # BUILD.md is the internal build-sequence / deferred-gaps ledger — kept local.
55
+ # (ARCHITECTURE.md is public: an onboarding agent needs the structural map.)
56
+ docs/BUILD.md
57
+
58
+ # Local secret — API key for live validation, never commit
59
+ zu_demo_key.md
60
+ *_key.md
61
+
62
+ # Local PyPI publish token — never commit
63
+ /pypi
64
+
65
+ # Local Discord credentials (bot token / app secrets) — never commit
66
+ /discord
@@ -0,0 +1,37 @@
1
+ Metadata-Version: 2.4
2
+ Name: zu-checks
3
+ Version: 0.2.0
4
+ Summary: Zu built-in checks: detectors (empty, error, js-shell, embedded-widget, bot-wall) + validators (schema, grounding)
5
+ Project-URL: Homepage, https://github.com/k3-mt/zu
6
+ Project-URL: Repository, https://github.com/k3-mt/zu
7
+ License-Expression: Apache-2.0
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
15
+ Classifier: Typing :: Typed
16
+ Requires-Python: >=3.11
17
+ Requires-Dist: jsonschema>=4
18
+ Requires-Dist: zu-core==0.2.0
19
+ Description-Content-Type: text/markdown
20
+
21
+ # zu-checks
22
+
23
+ The built-in **checks** that ship with the Zu base runtime — two stdlib plugin
24
+ kinds in one package:
25
+
26
+ - **detectors** (`zu_checks.detectors`) — `empty`, `error`, `js-shell`, `embedded-widget`, `action-surface-blind`,
27
+ `bot-wall`. Inspect an observation and return a `Verdict`; the severity drives
28
+ the loop (`ESCALATE` climbs the tier ladder, `TERMINAL` ends the run).
29
+ - **validators** (`zu_checks.validators`) — `schema` (does the result fit the
30
+ requested shape?) and `grounding` (does every extracted value actually appear
31
+ in retrieved content? — the anti-hallucination check).
32
+
33
+ They're packaged together because both are pure-stdlib (schema adds only
34
+ `jsonschema`) and always present in the base — unlike the adapter packages
35
+ (`zu-providers`, `zu-tools`, `zu-backends`), whose separation carries distinct
36
+ heavy optional dependencies. All register via the standard `zu.detectors` /
37
+ `zu.validators` entry-point groups, exactly as a third-party check would.
@@ -0,0 +1,17 @@
1
+ # zu-checks
2
+
3
+ The built-in **checks** that ship with the Zu base runtime — two stdlib plugin
4
+ kinds in one package:
5
+
6
+ - **detectors** (`zu_checks.detectors`) — `empty`, `error`, `js-shell`, `embedded-widget`, `action-surface-blind`,
7
+ `bot-wall`. Inspect an observation and return a `Verdict`; the severity drives
8
+ the loop (`ESCALATE` climbs the tier ladder, `TERMINAL` ends the run).
9
+ - **validators** (`zu_checks.validators`) — `schema` (does the result fit the
10
+ requested shape?) and `grounding` (does every extracted value actually appear
11
+ in retrieved content? — the anti-hallucination check).
12
+
13
+ They're packaged together because both are pure-stdlib (schema adds only
14
+ `jsonschema`) and always present in the base — unlike the adapter packages
15
+ (`zu-providers`, `zu-tools`, `zu-backends`), whose separation carries distinct
16
+ heavy optional dependencies. All register via the standard `zu.detectors` /
17
+ `zu.validators` entry-point groups, exactly as a third-party check would.
@@ -0,0 +1,44 @@
1
+ [project]
2
+ name = "zu-checks"
3
+ version = "0.2.0"
4
+ description = "Zu built-in checks: detectors (empty, error, js-shell, embedded-widget, bot-wall) + validators (schema, grounding)"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ license = "Apache-2.0"
8
+ classifiers = [
9
+ "Development Status :: 4 - Beta",
10
+ "Intended Audience :: Developers",
11
+ "License :: OSI Approved :: Apache Software License",
12
+ "Programming Language :: Python :: 3",
13
+ "Programming Language :: Python :: 3.11",
14
+ "Programming Language :: Python :: 3.12",
15
+ "Topic :: Software Development :: Libraries :: Application Frameworks",
16
+ "Typing :: Typed",
17
+ ]
18
+ # The built-in checks are pure-stdlib (+ jsonschema for the schema validator) and
19
+ # always ship with the base runtime — they have no heavy/optional deps, which is
20
+ # why detectors and validators live together here rather than as two packages.
21
+ dependencies = ["zu-core==0.2.0", "jsonschema>=4"]
22
+
23
+ [project.entry-points."zu.detectors"]
24
+ empty = "zu_checks.detectors.empty:EmptyDetector"
25
+ error = "zu_checks.detectors.error:ErrorDetector"
26
+ js-shell = "zu_checks.detectors.js_shell:JsShellDetector"
27
+ embedded-widget = "zu_checks.detectors.embedded_widget:EmbeddedWidgetDetector"
28
+ bot-wall = "zu_checks.detectors.bot_wall:BotWallDetector"
29
+ action-surface-blind = "zu_checks.detectors.action_surface_blind:ActionSurfaceBlindDetector"
30
+
31
+ [project.entry-points."zu.validators"]
32
+ schema = "zu_checks.validators.schema:SchemaValidator"
33
+ grounding = "zu_checks.validators.grounding:GroundingValidator"
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/k3-mt/zu"
37
+ Repository = "https://github.com/k3-mt/zu"
38
+
39
+ [build-system]
40
+ requires = ["hatchling"]
41
+ build-backend = "hatchling.build"
42
+
43
+ [tool.hatch.build.targets.wheel]
44
+ packages = ["src/zu_checks"]
@@ -0,0 +1,13 @@
1
+ """Zu built-in checks — the two stdlib plugin kinds that ship with the base.
2
+
3
+ * ``zu_checks.detectors`` — observation-time detectors whose Verdict severities
4
+ drive the loop (ESCALATE climbs the tier ladder; TERMINAL ends the run).
5
+ * ``zu_checks.validators`` — on-final result checks (schema shape + grounding,
6
+ the anti-hallucination provenance check).
7
+
8
+ They live in one package because both are pure-stdlib (the schema validator adds
9
+ only ``jsonschema``) and always present in the base runtime — unlike the adapter
10
+ packages (providers/tools/backends) whose separation carries distinct heavy
11
+ optional dependencies. They register through the same ``zu.detectors`` /
12
+ ``zu.validators`` entry-point groups any third-party check would.
13
+ """
@@ -0,0 +1,37 @@
1
+ """Zu built-in detectors.
2
+
3
+ A detector inspects an observation and may return a Verdict. Verdict
4
+ severities (WARN, RETRY, ESCALATE, TERMINAL) map onto the loop's control flow:
5
+ ESCALATE is the deterministic signal that climbs the tier ladder. Detectors
6
+ are where escalation is decided — never improvised by the model.
7
+ """
8
+
9
+
10
+ # What counts as page content in an observation, in preference order. The loop
11
+ # stores a fetched/rendered page under one of these keys (mirrors zu_core.loop's
12
+ # own ``_CONTENT_KEYS``); a detector must consult all of them or it goes blind to
13
+ # a tool that returns ``{"text": ...}`` / ``{"content": ...}`` instead of html.
14
+ # One source of truth, reused by ``empty`` too.
15
+ _CONTENT_KEYS = ("html", "text", "content")
16
+
17
+
18
+ def _html_of(ctx) -> str:
19
+ """Best-effort extraction of the page content from a RunContext observation.
20
+
21
+ Concatenates *every* present content key (html, text, content) rather than
22
+ returning only the first, so a marker detector is never blind to a tool that
23
+ splits content across keys — the same all-keys view the ``empty`` detector
24
+ uses, so the detectors agree on what "the content" is."""
25
+ obs = getattr(ctx, "observation", None)
26
+ if isinstance(obs, dict):
27
+ parts = [v for k in _CONTENT_KEYS if isinstance(v := obs.get(k), str) and v]
28
+ if parts:
29
+ return "\n".join(parts)
30
+ return ""
31
+
32
+
33
+ def _contains_any(html: str, markers) -> bool:
34
+ """True if any marker (case-insensitive) appears in ``html`` — the shared
35
+ substring scan behind the marker-list detectors (bot-wall, js-shell)."""
36
+ lowered = html.lower()
37
+ return any(marker in lowered for marker in markers)
@@ -0,0 +1,33 @@
1
+ """action-surface-blind — escalate to vision when the action surface is blind.
2
+
3
+ The Action Surface (Engineering Design §11) is a fast, cheap default for the
4
+ common case; its competence boundary is the trigger for the next tier — pixels
5
+ and a vision model. When the accessibility tree is too thin to trust, the tool
6
+ sets ``surface_blind`` on its observation rather than silently returning an
7
+ incomplete surface. This detector turns that signal into the deterministic
8
+ ESCALATE that climbs the ladder to tier-4 vision (§11.4) — escalation decided by
9
+ a detector, never improvised by the model.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from zu_core.ports import RunContext, Scope, Severity, Verdict
15
+
16
+
17
class ActionSurfaceBlindDetector:
    """Per-observation detector that escalates when a tool flags its surface as blind."""

    name = "action-surface-blind"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return an ESCALATE verdict when the observation sets ``surface_blind``.

        The flag must be exactly ``True``; any other value, or a non-dict
        observation, yields ``None``. The verdict detail is the
        ``blind_reason`` found under the observation's ``action_surface``
        dict when that is present and truthy, otherwise a fixed fallback
        message.
        """
        observation = getattr(ctx, "observation", None)
        if not isinstance(observation, dict):
            return None
        if observation.get("surface_blind") is not True:
            return None

        surface = observation.get("action_surface")
        blind_reason = None
        if isinstance(surface, dict):
            blind_reason = surface.get("blind_reason")
        return Verdict(
            severity=Severity.ESCALATE,
            detector=self.name,
            detail=blind_reason or "action surface too thin to trust; escalate to vision",
        )
@@ -0,0 +1,54 @@
1
+ """bot-wall — fires on an anti-bot interstitial (Cloudflare, captcha, etc.)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from zu_core.ports import RunContext, Scope, Severity, Verdict
6
+
7
+ from . import _contains_any, _html_of
8
+
9
+ # Strong markers: phrasing characteristic of an anti-bot interstitial, specific
10
+ # enough that their presence is treated as the signal on its own. This is a
11
+ # deterministic heuristic, not a proof: a page that *discusses* CAPTCHAs (a news
12
+ # story, this very comment) can contain "captcha" and would escalate — the cost
13
+ # is a wasted tier-2 render, not a wrong answer, and escalating a borderline page
14
+ # is the safer failure. ``cf-browser-verification`` is unambiguous; the natural-
15
+ # language phrases are the ones with residual false-positive surface.
16
_STRONG_MARKERS = (
    "captcha",
    "are you a robot",
    "verify you are human",
    "cf-browser-verification",
)

# Weak markers: genuine Cloudflare wall phrasing, but ordinary enough English
# that a bare substring hit would false-positive (an article titled "Just a
# Moment in History", a banner reading "Attention required"). They count ONLY
# when a Cloudflare fingerprint is found in the same content.
_WEAK_MARKERS = (
    "attention required",
    "just a moment",
)
_CLOUDFLARE_FINGERPRINTS = (
    "cloudflare",
    "cf-ray",
    "cf-browser-verification",
    "__cf",
    "/cdn-cgi/",
)


class BotWallDetector:
    """Per-observation detector for anti-bot interstitial pages."""

    name = "bot-wall"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return an ESCALATE verdict when the content looks like a bot wall.

        A strong marker is sufficient on its own. A weak marker only counts
        when a Cloudflare fingerprint is present in the same content. Returns
        ``None`` when neither condition holds.
        """
        page = _html_of(ctx)
        if _contains_any(page, _STRONG_MARKERS):
            walled = True
        else:
            # Weak phrasing needs corroboration before it is trusted.
            walled = _contains_any(page, _WEAK_MARKERS) and _contains_any(
                page, _CLOUDFLARE_FINGERPRINTS
            )
        if not walled:
            return None
        return Verdict(
            severity=Severity.ESCALATE,
            detector=self.name,
            detail="anti-bot wall detected",
        )
@@ -0,0 +1,97 @@
1
+ """embedded-widget — fires when the page's real content is inside a JS widget.
2
+
3
+ The complement to ``js-shell``. ``js-shell`` catches an *empty* SPA shell (a
4
+ ``<div id="root">`` with no visible text). But a page can be full of human-visible
5
+ chrome — nav, footer, copy — while the data the task actually needs (appointment
6
+ slots, a price table, a seat map) is rendered by an **embedded third-party widget
7
+ or iframe** that loads via JavaScript. A tier-1 ``http_fetch`` sees the chrome and
8
+ the empty mount point, never the data, so it would loop forever or give up. This
9
+ detector is the deterministic signal to *offer* the browser (tier 2) in that case.
10
+
11
+ It is conservative about what counts as a content widget, to avoid escalating on
12
+ ubiquitous analytics/ad scripts:
13
+
14
+ * an ``<iframe>`` with an external ``http(s)`` ``src`` — an embedded application
15
+ whose content is not in this DOM; or
16
+ * a **widget mount point** — an element whose *attributes* (id/class/data-*/domain)
17
+ name a content widget (``widget``, ``embed``, ``scheduler``, or a known booking
18
+ vendor) — together with an external ``<script>`` that fills it.
19
+
20
+ ESCALATE only *unlocks* the browser; the model renders only if it still lacks the
21
+ data, so being a touch generous here is cheap and fail-safe.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import re
27
+
28
+ from zu_core.ports import RunContext, Scope, Severity, Verdict
29
+
30
+ from . import _html_of
31
+
32
+ # Tokens that, when they appear in an element's ATTRIBUTES (not visible text),
33
+ # mark a JS content-widget mount. Generic structural words plus a few common
34
+ # booking/scheduling vendors — kept to attribute context so a nav link like
35
+ # href="/book-an-appointment" or body copy never trips it.
36
+ _WIDGET_TOKENS = (
37
+ "widget", "embed", "scheduler", "data-widget",
38
+ "vetstoria", "oabp", "calendly", "acuityscheduling", "simplybook", "petsapp",
39
+ )
40
+
41
+ # An <iframe ...> carrying an external http(s) src — an embedded app.
42
+ _IFRAME_SRC = re.compile(r"<iframe\b[^>]*\bsrc\s*=\s*[\"']https?://", re.IGNORECASE)
43
+ # Any element's attribute span, to scan for a widget token in attribute context.
44
+ _TAG_ATTRS = re.compile(r"<[a-zA-Z][a-zA-Z0-9]*\b([^>]*)>")
45
+ # An external <script src="http(s)://..."> — the loader that fills a mount point.
46
+ _EXTERNAL_SCRIPT = re.compile(r"<script\b[^>]*\bsrc\s*=\s*[\"']https?://", re.IGNORECASE)
47
+
48
+
49
+ def _has_widget_mount(html: str) -> bool:
50
+ """True if some element's attributes name a content widget."""
51
+ for m in _TAG_ATTRS.finditer(html):
52
+ attrs = m.group(1).lower()
53
+ if any(tok in attrs for tok in _WIDGET_TOKENS):
54
+ return True
55
+ return False
56
+
57
+
58
+ def _already_escalated(ctx: RunContext) -> bool:
59
+ """True if the run has already escalated (or browser-rendered) this run.
60
+
61
+ This detector is an escalation *trigger*: its job is to unlock the browser
62
+ tier once. After that it must go quiet — every later widget page (another
63
+ http_fetch, or the rendered DOM, which still carries the markers) would
64
+ otherwise re-fire, and at the top tier a re-escalation is 'exhausted' and ENDS
65
+ the run before the model can use the browser it just unlocked. So: fire once,
66
+ then defer to the model working at the higher tier."""
67
+ for ev in getattr(ctx, "events", []) or []:
68
+ et = getattr(ev, "type", "")
69
+ if et == "harness.task.escalated":
70
+ return True
71
+ if et == "data.source.fetched" and getattr(ev, "source", "") == "render_dom":
72
+ return True
73
+ return False
74
+
75
+
76
class EmbeddedWidgetDetector:
    """Per-observation detector for pages whose data lives in an embedded widget."""

    name = "embedded-widget"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return an ESCALATE verdict when the page defers content to a widget.

        Fires when the content has an ``<iframe>`` with an external http(s)
        src, or a widget mount point together with an external ``<script>``.
        Returns ``None`` for empty content, and once the run has already
        escalated or rendered in the browser.
        """
        page = _html_of(ctx)
        # Nothing to inspect, or the browser is already unlocked: stay quiet.
        if not page or _already_escalated(ctx):
            return None

        has_external_iframe = _IFRAME_SRC.search(page) is not None
        if not has_external_iframe:
            # A named mount point only counts when an external script is
            # present to fill it; a bare class="...widget..." on a static page
            # is not deferred content.
            mount_with_loader = (
                _has_widget_mount(page) and _EXTERNAL_SCRIPT.search(page) is not None
            )
            if not mount_with_loader:
                return None

        return Verdict(
            severity=Severity.ESCALATE,
            detector=self.name,
            detail="page defers content to an embedded widget/iframe; "
            "escalate to a browser to render it",
        )
@@ -0,0 +1,32 @@
1
+ """empty — fires when a *fetched page* carried no usable content.
2
+
3
+ Scoped to page-content observations on purpose: it judges a fetch (a tool that
4
+ returned ``html``/``text``/``content``) and escalates when that content is empty
5
+ — the signal to climb to a browser. It must NOT fire on observations that are not
6
+ page fetches — e.g. ``html_parse`` returning ``{"matches": [...]}`` (a successful
7
+ extraction) or an error observation — or it would spuriously escalate after real
8
+ work. So: a content key present but blank -> escalate; no content key -> not our
9
+ concern (return None).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from zu_core.ports import RunContext, Scope, Severity, Verdict
15
+
16
+ from . import _CONTENT_KEYS # one source of truth for "what counts as page content"
17
+
18
+
19
class EmptyDetector:
    """Per-observation detector for page fetches that carried no usable content."""

    name = "empty"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return an ESCALATE verdict when every present content key is blank.

        Observations that are not dicts, or that carry none of the
        ``_CONTENT_KEYS``, are not page-content observations and yield
        ``None``. A value counts as blank when it is falsy or its string form
        is only whitespace.
        """
        observation = getattr(ctx, "observation", None)
        if not isinstance(observation, dict):
            return None

        content_keys = [key for key in _CONTENT_KEYS if key in observation]
        if not content_keys:
            return None  # not a page-content observation — "empty" doesn't apply

        for key in content_keys:
            if str(observation.get(key) or "").strip():
                return None  # at least one key holds real content
        return Verdict(severity=Severity.ESCALATE, detector=self.name, detail="empty observation")
@@ -0,0 +1,25 @@
1
+ """error — fires on an HTTP error status in the observation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from zu_core.ports import RunContext, Scope, Severity, Verdict
6
+
7
+
8
class ErrorDetector:
    """Per-observation detector for HTTP error statuses."""

    name = "error"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return a RETRY verdict when the observation's ``status`` is an int >= 400.

        The severity is RETRY rather than TERMINAL on purpose: one bad url (a
        403 WAF wall, a 404, a 5xx) says nothing about whether the run as a
        whole can succeed. The error is recorded and fed back so the model can
        pick another action; a run that truly cannot proceed still ends via
        the step/token budget.
        """
        observation = getattr(ctx, "observation", None)
        if not isinstance(observation, dict):
            return None
        status = observation.get("status")
        if not isinstance(status, int) or status < 400:
            return None
        return Verdict(severity=Severity.RETRY, detector=self.name, detail=f"http {status}")
@@ -0,0 +1,75 @@
1
+ """js-shell — fires when a page is an empty JavaScript shell.
2
+
3
+ The canonical escalation trigger: tier-1 http_fetch returns HTML that is
4
+ essentially a <div id="root"></div> plus scripts, with no real text content.
5
+ That is the signal to give up on the cheap tier and climb to a browser.
6
+
7
+ The test is structural, not size-based: a page is a shell when it has a known
8
+ SPA mount point *and* almost no human-visible text once scripts and styles are
9
+ removed. Measuring visible text (rather than raw HTML length) is what step 5
10
+ finalizes — a shell padded with a large inline bundle is still a shell, and a
11
+ small page that happens to be real content is not escalated.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import re
17
+
18
+ from zu_core.ports import RunContext, Scope, Severity, Verdict
19
+
20
+ from . import _contains_any, _html_of
21
+
22
+ # Common SPA mount points / framework markers.
23
+ _SHELL_MARKERS = ('id="root"', "id='root'", 'id="app"', "id='app'", "__NEXT_DATA__")
24
+
25
+ # Strip the elements whose contents are never visible text before measuring.
26
+ # ``\s*`` in the close tag tolerates ``</script >``; the second pattern handles
27
+ # an *unterminated* script/style — a browser treats everything after an unclosed
28
+ # <script> as script text, so the heuristic does too (consume to end of input).
29
+ # HTML comments are removed FIRST so a commented-out ``<!-- <script> -->`` (or
30
+ # any literal ``<script`` inside a comment) can't trip the greedy _UNCLOSED rule
31
+ # and erase the real article body after it — a deterministic false-positive the
32
+ # unbalanced-tag heuristic would otherwise produce.
33
+ _COMMENT = re.compile(r"<!--.*?-->", re.DOTALL)
34
+ _NONVISIBLE = re.compile(r"<(script|style|template|noscript)\b.*?</\1\s*>", re.IGNORECASE | re.DOTALL)
35
+ _UNCLOSED = re.compile(r"<(script|style|template|noscript)\b.*\Z", re.IGNORECASE | re.DOTALL)
36
+ _TAGS = re.compile(r"<[^>]+>")
37
+ _WS = re.compile(r"\s+")
38
+
39
+ # Below this many characters of visible text, a page with a mount point is
40
+ # treated as an unrendered shell. Tuned against the graded fixture set.
41
+ _MIN_VISIBLE_TEXT = 64
42
+
43
+
44
+ def _visible_text(html: str) -> str:
45
+ """Human-visible text: drop script/style/template/noscript bodies, strip
46
+ the remaining tags, and collapse whitespace."""
47
+ without_code = _COMMENT.sub(" ", html)
48
+ without_code = _NONVISIBLE.sub(" ", without_code)
49
+ without_code = _UNCLOSED.sub(" ", without_code)
50
+ text = _TAGS.sub(" ", without_code)
51
+ return _WS.sub(" ", text).strip()
52
+
53
+
54
class JsShellDetector:
    """Per-observation detector for unrendered JavaScript shell pages."""

    name = "js-shell"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return an ESCALATE verdict when the page looks like an empty JS shell.

        All three conditions must hold: a shell marker is present, the page
        pulls in JavaScript, and the visible text is shorter than
        ``_MIN_VISIBLE_TEXT`` characters. Empty content yields ``None``.
        """
        page = _html_of(ctx)
        if not page:
            return None
        if not _contains_any(page, _SHELL_MARKERS):
            return None

        # The page defers its content to JS: a literal <script>, OR a module
        # graph pulled in via <link rel="modulepreload"> with no inline script.
        page_lower = page.lower()
        if "<script" not in page_lower and "modulepreload" not in page_lower:
            return None

        if len(_visible_text(page)) >= _MIN_VISIBLE_TEXT:
            return None  # enough real text: not a shell

        return Verdict(
            severity=Severity.ESCALATE,
            detector=self.name,
            detail="page appears to be a JS shell; escalate to a browser",
        )
@@ -0,0 +1,7 @@
1
+ """Zu built-in validators — the on-final checks of the result.
2
+
3
+ The two cheapest rungs of the validation ladder: schema (does the result fit
4
+ the requested shape?) and grounding (does every extracted value actually
5
+ appear in retrieved content?). Grounding is the anti-hallucination check — the
6
+ core of the "agents that actually work" claim.
7
+ """
@@ -0,0 +1,162 @@
1
+ """grounding — every extracted value must appear in retrieved content.
2
+
3
+ The anti-making-things-up check: a value the agent reports that is nowhere in
4
+ the content the run actually fetched fails grounding. It reads the run's
5
+ content from the event log via RunContext, so it proves provenance, not just
6
+ plausibility.
7
+
8
+ Matching is token-boundary-aware (build step 6): a value must appear in the
9
+ retrieved content as a standalone token, not merely as a substring, so a short
10
+ value such as ``"5"`` is not spuriously grounded by ``"1985"``.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from collections.abc import Iterator
16
+
17
+ from zu_core.contracts import Result
18
+ from zu_core.ports import RunContext, Severity, Verdict
19
+
20
+
21
+ def _normalize(s: str) -> str:
22
+ """Collapse whitespace and lowercase so trivial formatting differences
23
+ between an extracted value and the page text don't cause false failures."""
24
+ return " ".join(s.split()).lower()
25
+
26
+
27
def _grounded(leaf_norm: str, corpus: str) -> bool:
    """Return True if ``leaf_norm`` occurs in ``corpus`` as a standalone token.

    Plain substring containment is too lenient: a short value such as ``"5"``
    would match inside ``"1985"`` and let a fabricated number pass. Every
    occurrence of the value is therefore tested with ``_standalone`` until one
    qualifies.

    An empty ``leaf_norm`` is never grounded. It has no provenance to prove,
    so it fails safe instead of passing for free.

    Args:
        leaf_norm: The normalized value to look for.
        corpus: The normalized retrieved content to search.
    """
    if not leaf_norm:
        return False
    width = len(leaf_norm)
    position = corpus.find(leaf_norm)
    while position != -1:
        if _standalone(corpus, position, position + width):
            return True
        # Step forward by one character so overlapping occurrences are tested.
        position = corpus.find(leaf_norm, position + 1)
    return False
59
+
60
+
61
+ # Separators that join a number to more digits to form a single larger value or
62
+ # a compound numeric token: decimal/thousands (``.`` ``,``) AND the connectors in
63
+ # dates, versions, times, ranges, SKUs and phone numbers (``-`` ``/`` ``:``). A
64
+ # match flanked by one of these with a digit on its *outer* side is a fragment of
65
+ # a longer token, not a standalone value — so "12" is not grounded by "12-2024",
66
+ # nor "30" by "12:30", just as "14" is not grounded by "3.14".
67
+ _NUM_SEPARATORS = frozenset(".,-/:")
68
+
69
+
70
+ def _standalone(corpus: str, lo: int, hi: int) -> bool:
71
+ """Are the chars flanking ``corpus[lo:hi]`` token boundaries, not part of a
72
+ longer alphanumeric token or a larger/compound number?"""
73
+ before = corpus[lo - 1] if lo > 0 else ""
74
+ after = corpus[hi] if hi < len(corpus) else ""
75
+ if before.isalnum() or after.isalnum():
76
+ return False
77
+ # A numeric separator adjacent to a digit on its outer side means this match
78
+ # is a slice of a larger number or compound token (e.g. "14" inside "3.14",
79
+ # "12" inside "12-2024", "30" inside "12:30").
80
+ if before in _NUM_SEPARATORS and corpus[lo - 2 : lo - 1].isdigit():
81
+ return False
82
+ if after in _NUM_SEPARATORS and corpus[hi + 1 : hi + 2].isdigit():
83
+ return False
84
+ return True
85
+
86
+
87
+ def _leaf_strings(value: object) -> Iterator[str]:
88
+ """Yield every scalar leaf of a result value as a string to ground.
89
+
90
+ Numbers and booleans are real extracted values too — skipping non-strings
91
+ (the previous behaviour) let a fabricated price or count pass ungrounded.
92
+ bool is checked before int because ``isinstance(True, int)`` is True, and a
93
+ boolean is not groundable page text.
94
+ """
95
+ if isinstance(value, bool):
96
+ return
97
+ if isinstance(value, (str, int, float)):
98
+ text = str(value).strip()
99
+ if text:
100
+ yield text
101
+ elif isinstance(value, dict):
102
+ for v in value.values():
103
+ yield from _leaf_strings(v)
104
+ elif isinstance(value, (list, tuple)):
105
+ for v in value:
106
+ yield from _leaf_strings(v)
107
+
108
+
109
+ def _retrieved_corpus(ctx: RunContext) -> str:
110
+ """Concatenate everything the run fetched, from data.source.fetched events.
111
+
112
+ Falls back to the current observation when the event log isn't populated
113
+ yet (the loop wires the full log in build step 4).
114
+ """
115
+ chunks: list[str] = []
116
+ for ev in getattr(ctx, "events", []) or []:
117
+ # Only *retrieved* content grounds a value — i.e. data.source.fetched
118
+ # events. Reading text-like keys from any event would let the model
119
+ # ground its own fabrications: harness.turn.completed carries the model's
120
+ # output text, which must never count as evidence about the page.
121
+ if getattr(ev, "type", "") != "data.source.fetched":
122
+ continue
123
+ payload = getattr(ev, "payload", {}) or {}
124
+ for key in ("html", "text", "content"):
125
+ if isinstance(payload.get(key), str):
126
+ chunks.append(payload[key])
127
+ # Fall back to the current observation ONLY when the event log has no fetched
128
+ # content yet (the loop wires the full log in build step 4). If fetched events
129
+ # exist, we must not also fold in the raw observation: an observation that is
130
+ # not itself retrieved page content (e.g. a model-produced turn that happens
131
+ # to carry a ``text`` key) would reopen the self-grounding hole the event-type
132
+ # filter above exists to close.
133
+ if not chunks:
134
+ obs = getattr(ctx, "observation", None)
135
+ if isinstance(obs, dict):
136
+ for key in ("html", "text", "content"):
137
+ if isinstance(obs.get(key), str):
138
+ chunks.append(obs[key])
139
+ return "\n".join(chunks)
140
+
141
+
142
class GroundingValidator:
    """On-final validator: every extracted value must appear in retrieved content."""

    name = "grounding"

    def check(self, result: Result, ctx: RunContext) -> Verdict | None:
        """Return a RETRY verdict naming the first field with an ungrounded leaf.

        A falsy ``result.value`` is not checked and yields ``None``. A dict
        value is checked field by field; any other root (a list or scalar) is
        treated as a single field named ``"value"``, so a non-object root does
        not raise ``AttributeError`` on ``.items()``.
        """
        if not result.value:
            return None
        corpus = _normalize(_retrieved_corpus(ctx))
        payload = result.value
        if isinstance(payload, dict):
            named_fields = payload.items()
        else:
            named_fields = [("value", payload)]

        for field, field_value in named_fields:
            has_ungrounded_leaf = any(
                not _grounded(_normalize(leaf), corpus)
                for leaf in _leaf_strings(field_value)
            )
            if has_ungrounded_leaf:
                return Verdict(
                    severity=Severity.RETRY,
                    detector=self.name,
                    detail=f"value for {field!r} not found in retrieved content",
                )
        return None
@@ -0,0 +1,41 @@
1
+ """schema — the result must satisfy the task's output JSON schema."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import jsonschema
6
+
7
+ from zu_core.contracts import Result
8
+ from zu_core.ports import RunContext, Severity, Verdict
9
+
10
+
11
class SchemaValidator:
    """Validator that checks the result value against the task's output schema."""

    name = "schema"

    def check(self, result: Result, ctx: RunContext) -> Verdict | None:
        """Validate ``result.value`` against ``ctx.spec.output_schema``.

        Returns ``None`` when there is no (truthy) schema or the value
        conforms, a RETRY verdict when the value does not match the schema, and
        a TERMINAL verdict when validation fails for any other reason.
        """
        output_schema = getattr(ctx.spec, "output_schema", None)
        if not output_schema:
            # Nothing to check against.
            return None

        try:
            jsonschema.validate(instance=result.value, schema=output_schema)
        except jsonschema.ValidationError as mismatch:
            # The data didn't match a valid schema, so a retry might fix it.
            # jsonschema's richer errors expose ``.message``; fall back to str().
            return Verdict(
                severity=Severity.RETRY,
                detector=self.name,
                detail=getattr(mismatch, "message", str(mismatch)),
            )
        except Exception as broken:  # noqa: BLE001 - a broken schema is terminal
            # The output_schema comes from the TaskSpec unvalidated. It may be
            # malformed (jsonschema.SchemaError) or hold an unresolvable
            # ``$ref``, which surfaces as a *referencing* error that is not a
            # SchemaError subclass. Retrying cannot repair a broken schema, so
            # every such failure is caught here unconditionally and reported as
            # terminal rather than escaping into the validation ladder.
            reason = getattr(broken, "message", str(broken))
            return Verdict(
                severity=Severity.TERMINAL,
                detector=self.name,
                detail=f"invalid output_schema: {reason}",
            )
        return None
@@ -0,0 +1,171 @@
1
+ """Smoke tests for the built-in detectors and their discovery.
2
+
3
+ The escalation-ladder logic is finalized against the graded fixture set in
4
+ build step 5; these lock the basic verdicts and the entry-point contract now.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from zu_checks.detectors.bot_wall import BotWallDetector
10
+ from zu_checks.detectors.empty import EmptyDetector
11
+ from zu_checks.detectors.error import ErrorDetector
12
+ from zu_checks.detectors.js_shell import JsShellDetector
13
+ from zu_core.ports import RunContext, Severity
14
+ from zu_core.registry import Registry
15
+
16
+
17
def _ctx(observation: dict) -> RunContext:
    """Build a spec-less ``RunContext`` that carries only *observation*."""
    context = RunContext(observation=observation, spec=None)
    return context
19
+
20
+
21
def test_empty_fires_on_blank() -> None:
    """A whitespace-only ``html`` observation makes the empty detector escalate."""
    blank_page = _ctx({"html": " "})
    verdict = EmptyDetector().inspect(blank_page)
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
24
+
25
+
26
def test_empty_passes_on_content() -> None:
    """A page with real markup does not trip the empty detector."""
    verdict = EmptyDetector().inspect(_ctx({"html": "<p>hi</p>"}))
    assert verdict is None
28
+
29
+
30
def test_empty_ignores_non_page_observations() -> None:
    """Observations without a content key are not treated as empty pages.

    Regression: a successful html_parse result (no content key) must NOT be
    read as an "empty page" and escalate; that misfired after real extraction.
    """
    non_page_observations = (
        {"selector": "h1", "matches": ["X"], "count": 1},
        {"error": "boom"},
        {},
    )
    for observation in non_page_observations:
        assert EmptyDetector().inspect(_ctx(observation)) is None
36
+
37
+
38
def test_error_on_http_status_is_recoverable_not_terminal() -> None:
    """HTTP error statuses yield RETRY, never TERMINAL.

    A single bad url (403 WAF wall, 404, 410, 5xx, 429) must not end a run
    that can try another candidate. A truly stuck run ends via budget instead.
    """
    error_statuses = (400, 403, 404, 405, 410, 429, 451, 500, 503)
    for status in error_statuses:
        verdict = ErrorDetector().inspect(_ctx({"status": status, "html": ""}))
        assert verdict is not None, status
        assert verdict.severity is Severity.RETRY, status
45
+
46
+
47
def test_error_quiet_on_success() -> None:
    """A 200 response with content produces no verdict."""
    ok_page = _ctx({"status": 200, "html": "<p>ok</p>"})
    verdict = ErrorDetector().inspect(ok_page)
    assert verdict is None
49
+
50
+
51
def test_js_shell_fires_on_empty_spa() -> None:
    """An empty mount point plus an external script escalates as a JS shell."""
    spa_shell = '<html><body><div id="root"></div><script src="/app.js"></script></body></html>'
    verdict = JsShellDetector().inspect(_ctx({"html": spa_shell}))
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
55
+
56
+
57
def test_js_shell_passes_on_real_content() -> None:
    """A page made of many visible paragraphs is not flagged as a shell."""
    paragraphs = "<p>real content here</p>" * 500
    document = "<html><body>" + paragraphs + "</body></html>"
    verdict = JsShellDetector().inspect(_ctx({"html": document}))
    assert verdict is None
60
+
61
+
62
def test_js_shell_fires_despite_large_inline_script() -> None:
    """A shell padded with a big inline bundle is still a shell.

    The visible-text test sees through the script, where a raw-length check
    would be fooled.
    """
    inline_code = "var x=1;" * 2000  # ~16 KB of code, zero visible text
    shell = '<html><body><div id="app"></div><script>' + inline_code + "</script></body></html>"
    verdict = JsShellDetector().inspect(_ctx({"html": shell}))
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
69
+
70
+
71
def test_js_shell_fires_on_unterminated_script() -> None:
    """A never-closed ``<script>`` does not make a shell look like content.

    Malformed/streamed HTML: a browser treats everything after an unclosed
    <script> as script text, so the visible-text test must too. The page is
    still a shell, not real content.
    """
    calls = "a();" * 2000
    truncated = f'<html><body><div id="root"></div><script>var x=1;{calls}'
    verdict = JsShellDetector().inspect(_ctx({"html": truncated}))
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
78
+
79
+
80
def test_js_shell_passes_on_small_but_real_page() -> None:
    """A mount point with genuine prose is rendered content, not a shell."""
    pieces = [
        '<html><body><div id="root">',
        "<h1>Acme Widget</h1><p>The finest widget, in stock and ready to ship today.</p>",
        "</div><script src=/app.js></script></body></html>",
    ]
    small_page = "".join(pieces)
    verdict = JsShellDetector().inspect(_ctx({"html": small_page}))
    assert verdict is None
88
+
89
+
90
def test_bot_wall_fires_on_captcha() -> None:
    """An interstitial asking the visitor to prove they are human escalates."""
    challenge = "<h1>Just a moment...</h1> please verify you are human"
    verdict = BotWallDetector().inspect(_ctx({"html": challenge}))
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
93
+
94
+
95
def test_bot_wall_does_not_fire_on_innocent_phrase() -> None:
    """A weak phrase alone must not escalate.

    A real article that happens to contain a weak phrase must NOT escalate
    without a corroborating Cloudflare fingerprint (regression: loose match).
    """
    article = (
        "<article><h1>Just a moment in history</h1>"
        "<p>Attention required: read the safety notice first.</p></article>"
    )
    verdict = BotWallDetector().inspect(_ctx({"html": article}))
    assert verdict is None
101
+
102
+
103
def test_bot_wall_fires_on_weak_phrase_with_cloudflare_fingerprint() -> None:
    """A weak phrase corroborated by Cloudflare markers escalates."""
    challenge = (
        "<title>Just a moment...</title>"
        "<div class='cf-browser-verification'></div><!-- cf-ray: abc -->"
    )
    verdict = BotWallDetector().inspect(_ctx({"html": challenge}))
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
108
+
109
+
110
+ # --- embedded-widget: content deferred to a JS widget/iframe -----------------
111
+
112
# Fixture page: ordinary visible text plus a booking widget whose mount point
# and script both reference an external host.
_VETSTORIA = "".join(
    (
        "<html><body><h1>Park Vets</h1><p>Lots of normal page chrome here, nav, ",
        "footer, plenty of visible text so this is NOT an empty shell.</p>",
        "<div id='oabp-widget' domain='booking.vetstoria.com'></div>",
        "<script src='https://booking.vetstoria.com/js/oabp-widget.js'></script>",
        "</body></html>",
    )
)
119
+
120
+
121
def test_embedded_widget_fires_on_a_js_booking_widget() -> None:
    """The Vetstoria fixture page, which embeds a JS booking widget, escalates."""
    from zu_checks.detectors.embedded_widget import EmbeddedWidgetDetector

    widget_page = _ctx({"html": _VETSTORIA})
    verdict = EmbeddedWidgetDetector().inspect(widget_page)
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
126
+
127
+
128
def test_embedded_widget_fires_on_an_external_iframe_app() -> None:
    """A page whose app lives in an external iframe escalates."""
    from zu_checks.detectors.embedded_widget import EmbeddedWidgetDetector

    iframe_page = "<html><body><p>book below</p><iframe src='https://book.example/app'></iframe></body></html>"
    verdict = EmbeddedWidgetDetector().inspect(_ctx({"html": iframe_page}))
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
134
+
135
+
136
def test_embedded_widget_quiet_on_a_plain_content_page_with_analytics() -> None:
    """Analytics scripts and booking links alone must not escalate.

    A real content page that merely loads an external analytics script and
    links to a booking page must NOT escalate, because the data is in the HTML.
    """
    from zu_checks.detectors.embedded_widget import EmbeddedWidgetDetector

    sections = [
        "<html><body><h1>Opening hours</h1><p>Mon-Fri 9-5. Call 020 555 1234.</p>",
        "<a href='/book-an-appointment'>Book an appointment</a>",
        "<script src='https://www.googletagmanager.com/gtag/js'></script></body></html>",
    ]
    content_page = "".join(sections)
    verdict = EmbeddedWidgetDetector().inspect(_ctx({"html": content_page}))
    assert verdict is None
147
+
148
+
149
def test_embedded_widget_fires_once_then_stays_quiet() -> None:
    """The detector fires with no history, then is silent after escalation.

    It's an escalation trigger: it unlocks the browser once, then must go
    quiet. A later widget page (or the rendered DOM) re-firing at the top tier
    would end the run as 'escalation exhausted' before the model can use the
    browser.
    """
    import types

    from zu_checks.detectors.embedded_widget import EmbeddedWidgetDetector

    detector = EmbeddedWidgetDetector()

    # No prior events: the widget page must trigger.
    fresh = RunContext(spec=None, observation={"html": _VETSTORIA}, events=[])
    assert detector.inspect(fresh) is not None

    # Either kind of prior event on its own must silence the detector.
    already_escalated = types.SimpleNamespace(type="harness.task.escalated", source=None, payload={})
    already_rendered = types.SimpleNamespace(type="data.source.fetched", source="render_dom", payload={})
    for prior in (already_escalated, already_rendered):
        later = RunContext(spec=None, observation={"html": _VETSTORIA}, events=[prior])
        assert detector.inspect(later) is None
165
+
166
+
167
def test_detectors_discoverable() -> None:
    """Registry discovery exposes every built-in detector by name."""
    registry = Registry()
    registry.discover()
    expected = ("empty", "error", "js-shell", "embedded-widget", "bot-wall")
    for detector_name in expected:
        assert detector_name in registry.names("detectors")
@@ -0,0 +1,227 @@
1
+ """Tests for the built-in validators, their discovery, and their behaviour
2
+ inside the loop (build step 6).
3
+
4
+ These lock the core behaviour — schema enforcement and the anti-hallucination
5
+ grounding check, including token-boundary precision — and prove grounding works
6
+ against the real event log when run inside the interpreter loop (at finalise the
7
+ observation is gone, so grounding must read the data.source.fetched events).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from zu_checks.validators.grounding import GroundingValidator
13
+ from zu_checks.validators.schema import SchemaValidator
14
+ from zu_core.bus import EventBus
15
+ from zu_core.contracts import Result, Status, TaskSpec
16
+ from zu_core.loop import run_task
17
+ from zu_core.ports import RunContext, Severity
18
+ from zu_core.registry import Registry
19
+ from zu_providers.scripted import ScriptedProvider
20
+ from zu_testing import fetch_tool
21
+
22
# Output schema shared by these tests: an object with a required string "price".
_SCHEMA = dict(
    type="object",
    properties=dict(price=dict(type="string")),
    required=["price"],
)
27
+
28
+
29
def _ctx(observation: dict | None = None) -> RunContext:
    """Build a ``RunContext`` for the price-extraction task with *observation*."""
    price_task = TaskSpec(query="extract the price", output_schema=_SCHEMA)
    return RunContext(observation=observation, spec=price_task)
32
+
33
+
34
def test_schema_passes_valid_result() -> None:
    """A value that satisfies the schema produces no verdict."""
    valid = Result(status=Status.SUCCESS, value={"price": "$9.00"})
    verdict = SchemaValidator().check(valid, _ctx())
    assert verdict is None
37
+
38
+
39
def test_schema_fails_missing_required() -> None:
    """A missing required field is reported by the schema validator as RETRY.

    A plain data mismatch must be RETRY (the model can correct it), NOT
    TERMINAL. The loop branches on this severity, so lock it, not just "fired".
    """
    incomplete = Result(status=Status.SUCCESS, value={})
    verdict = SchemaValidator().check(incomplete, _ctx())
    assert verdict is not None
    assert verdict.detector == "schema"
    assert verdict.severity == Severity.RETRY
46
+
47
+
48
def test_grounding_fails_invented_value() -> None:
    """A price that is not on the page is reported as not found."""
    page = _ctx({"html": "<span class='price'>$9.00</span>"})
    invented = Result(status=Status.SUCCESS, value={"price": "$1000.00"})
    verdict = GroundingValidator().check(invented, page)
    assert verdict is not None
    assert "not found" in (verdict.detail or "")
53
+
54
+
55
def test_grounding_passes_value_on_page() -> None:
    """A price that appears verbatim on the page is grounded."""
    page = _ctx({"html": "<span class='price'>$9.00</span>"})
    quoted = Result(status=Status.SUCCESS, value={"price": "$9.00"})
    verdict = GroundingValidator().check(quoted, page)
    assert verdict is None
59
+
60
+
61
def test_grounding_checks_numeric_values() -> None:
    """Numeric values are grounded too, not only strings.

    A fabricated *number* must not pass ungrounded (the old code skipped every
    non-string value, so invented prices/counts sailed through).
    """
    page = _ctx({"html": "<span>in stock: 7</span>"})

    fabricated = Result(status=Status.SUCCESS, value={"stock": 4096})
    assert GroundingValidator().check(fabricated, page) is not None

    genuine = Result(status=Status.SUCCESS, value={"stock": 7})
    assert GroundingValidator().check(genuine, page) is None
70
+
71
+
72
def test_grounding_normalizes_whitespace() -> None:
    """Whitespace/case differences between the value and the page don't fail."""
    page = _ctx({"html": "<h1>hello world</h1>"})
    titled = Result(status=Status.SUCCESS, value={"title": "Hello World"})
    verdict = GroundingValidator().check(titled, page)
    assert verdict is None
77
+
78
+
79
def test_grounding_rejects_short_value_inside_larger_token() -> None:
    """Token-boundary precision: "5" must NOT be grounded by "1985".

    Plain substring matching would have let the fabricated rating pass.
    """
    year_only = _ctx({"html": "<p>The product launched in 1985.</p>"})
    rating = Result(status=Status.SUCCESS, value={"rating": 5})
    assert GroundingValidator().check(rating, year_only) is not None

    # A genuinely standalone "5" on the page still grounds.
    with_rating = _ctx({"html": "<p>Rated 5 stars by 1985 reviewers.</p>"})
    rating_again = Result(status=Status.SUCCESS, value={"rating": 5})
    assert GroundingValidator().check(rating_again, with_rating) is None
88
+
89
+
90
def test_grounding_rejects_value_inside_a_decimal() -> None:
    """A fragment of a decimal number does not ground a value.

    A decimal point is a token boundary for *words*, but a fabricated number
    must not be grounded by a fragment of a larger number: "14" is not on a
    page that only says "$3.14", nor is "3".
    """

    def verdict_for(value: dict, ctx: RunContext):
        # Fresh validator and result per check, one probe at a time.
        return GroundingValidator().check(Result(status=Status.SUCCESS, value=value), ctx)

    price_page = _ctx({"html": "<span class='price'>$3.14</span>"})
    assert verdict_for({"n": 14}, price_page) is not None
    assert verdict_for({"n": 3}, price_page) is not None

    # The whole decimal still grounds, and so does an integer the dot merely
    # ends a sentence after (the dot is not flanked by a digit on its outer
    # side).
    assert verdict_for({"p": "3.14"}, price_page) is None
    quantity_page = _ctx({"html": "<p>Qty: 5.</p>"})
    assert verdict_for({"q": 5}, quantity_page) is None
102
+
103
+
104
def test_grounding_rejects_value_inside_a_compound_token() -> None:
    """A piece of a compound token does not ground a short number.

    A short number must not be grounded by a fragment of a date/version/time/
    SKU/phone joined by - / : ("12" is not on a page that only says "12-2024").
    """
    compound_cases = (
        ("<p>Released 12-2024 worldwide.</p>", {"m": 12}),
        ("<p>Build 4/19 shipped.</p>", {"b": 19}),
        ("<p>Starts at 12:30 sharp.</p>", {"t": 30}),
    )
    for markup, fabricated in compound_cases:
        page = _ctx({"html": markup})
        probe = Result(status=Status.SUCCESS, value=fabricated)
        assert GroundingValidator().check(probe, page) is not None

    # A genuinely standalone number flanked by a separator-with-no-adjacent-digit
    # (e.g. a slash ending a path segment) still grounds.
    link_page = _ctx({"html": "<a href='/items/42/'>item</a>"})
    item_id = Result(status=Status.SUCCESS, value={"id": 42})
    assert GroundingValidator().check(item_id, link_page) is None
117
+
118
+
119
def test_grounding_is_unicode_token_aware() -> None:
    """A value is not grounded as a fragment of a non-ASCII word.

    The flank check is Unicode-aware (str.isalnum).
    """
    page = _ctx({"html": "<p>café société</p>"})

    fragment = Result(status=Status.SUCCESS, value={"w": "caf"})
    assert GroundingValidator().check(fragment, page) is not None

    whole_word = Result(status=Status.SUCCESS, value={"w": "café"})
    assert GroundingValidator().check(whole_word, page) is None
125
+
126
+
127
def test_schema_error_is_terminal_not_a_crash() -> None:
    """An invalid output_schema becomes a TERMINAL verdict.

    An invalid output_schema (from the TaskSpec) raises jsonschema.SchemaError
    internally; the validator must turn it into a TERMINAL verdict, never let
    it escape and crash the validation ladder.
    """
    broken_spec = TaskSpec(query="x", output_schema={"type": "not-a-real-type"})
    context = RunContext(spec=broken_spec, observation=None)
    outcome = Result(status=Status.SUCCESS, value={"a": 1})

    verdict = SchemaValidator().check(outcome, context)

    assert verdict is not None
    assert verdict.severity == Severity.TERMINAL
    assert "invalid output_schema" in (verdict.detail or "")
137
+
138
+
139
def test_unresolvable_ref_is_terminal_not_a_crash() -> None:
    """An unresolvable ``$ref`` becomes a TERMINAL verdict.

    A schema with an unresolvable $ref raises a *referencing* error that is NOT
    a subclass of jsonschema.SchemaError, so it would escape the old handlers
    and crash the ladder. It must become a TERMINAL verdict like any other
    broken schema, since the output_schema is untrusted TaskSpec input.
    """
    unresolvable_schemas = (
        {"$ref": "#/nope"},
        {"$ref": "http://evil.example/x"},
    )
    for bad in unresolvable_schemas:
        context = RunContext(spec=TaskSpec(query="x", output_schema=bad), observation=None)
        outcome = Result(status=Status.SUCCESS, value={"a": 1})
        verdict = SchemaValidator().check(outcome, context)
        assert verdict is not None
        assert verdict.severity == Severity.TERMINAL
        assert "invalid output_schema" in (verdict.detail or "")
151
+
152
+
153
+ # --- grounding against the real event log, inside the loop -------------------
154
+
155
# Page body served by the stub fetch tool; its only price is "$9.00".
_PAGE = (
    "<html><body>"
    "<span class='price'>$9.00</span>"
    "</body></html>"
)
156
+
157
+
158
def _loop_registry() -> Registry:
    """Return a registry holding the stub fetch tool and both validators."""
    registry = Registry()
    entries = (
        ("tools", "http_fetch", fetch_tool(text=_PAGE)),
        ("validators", "schema", SchemaValidator()),
        ("validators", "grounding", GroundingValidator()),
    )
    for group, plugin_name, plugin in entries:
        registry.register(group, plugin_name, plugin)
    return registry
164
+
165
+
166
async def test_grounding_in_loop_passes_value_from_event_log() -> None:
    """Inside the loop, grounding reads the price from the event log.

    At finalise the loop passes no observation, so grounding must read the
    price from the data.source.fetched event. This is the step-6 "against the
    event log" promise, end to end.
    """
    script = [
        {"tool": "http_fetch", "args": {"url": "http://x.test/"}},
        {"text": '{"price": "$9.00"}', "finish": "stop"},
    ]
    provider = ScriptedProvider.from_moves(script)
    bus = EventBus()
    task = TaskSpec(query="price", output_schema=_SCHEMA)

    outcome = await run_task(task, provider, _loop_registry(), bus)

    assert outcome.status == Status.SUCCESS
    assert outcome.value == {"price": "$9.00"}
    event_types = [event.type for event in await bus.query()]
    assert "data.source.fetched" in event_types  # grounding had the log to read
    assert "harness.validation.failed" not in event_types
183
+
184
+
185
async def test_grounding_in_loop_rejects_fabrication_then_accepts_correction() -> None:
    """A fabricated price is rejected, then the corrected one succeeds.

    A price that is nowhere on the page fails grounding (RETRY); the loop feeds
    the failure back and the corrected, grounded value then succeeds.
    """
    script = [
        {"tool": "http_fetch", "args": {"url": "http://x.test/"}},
        {"text": '{"price": "$1000.00"}', "finish": "stop"},  # not on page -> RETRY
        {"text": '{"price": "$9.00"}', "finish": "stop"},  # grounded -> SUCCESS
    ]
    provider = ScriptedProvider.from_moves(script)
    bus = EventBus()
    task = TaskSpec(query="price", output_schema=_SCHEMA)

    outcome = await run_task(task, provider, _loop_registry(), bus)

    assert outcome.status == Status.SUCCESS
    assert outcome.value == {"price": "$9.00"}
    validation_failures = [
        event for event in await bus.query() if event.type == "harness.validation.failed"
    ]
    assert validation_failures
    assert validation_failures[0].payload["detector"] == "grounding"
201
+
202
+
203
async def test_grounding_corpus_ignores_the_models_own_text() -> None:
    """The model's own output never counts as retrieved content.

    The model's output is recorded on harness.turn.completed (the live "train
    of thought"). Grounding must NOT treat that as retrieved content, or a
    model could ground a fabrication by simply emitting it. Only the fetched
    page (which here does NOT contain the price) counts, so the value fails.
    """
    script = [
        {"tool": "http_fetch", "args": {"url": "http://x.test/"}},
        {"text": '{"price": "$1000.00"}', "finish": "stop"},  # spoken, not on the page
        {"text": '{"price": "$1000.00"}', "finish": "stop"},  # repeated, still ungrounded
    ]
    provider = ScriptedProvider.from_moves(script)
    bus = EventBus()
    task = TaskSpec(query="price", output_schema=_SCHEMA)

    outcome = await run_task(task, provider, _loop_registry(), bus)

    # Never succeeds: the price is only ever in the model's own text, never the page.
    assert outcome.status != Status.SUCCESS
    validation_failures = [
        event for event in await bus.query() if event.type == "harness.validation.failed"
    ]
    assert any(event.payload["detector"] == "grounding" for event in validation_failures)
221
+
222
+
223
def test_validators_discoverable() -> None:
    """Registry discovery exposes both built-in validators by name."""
    registry = Registry()
    registry.discover()
    expected = ("schema", "grounding")
    for validator_name in expected:
        assert validator_name in registry.names("validators")