zu-checks 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zu_checks-0.2.0/.gitignore +66 -0
- zu_checks-0.2.0/PKG-INFO +37 -0
- zu_checks-0.2.0/README.md +17 -0
- zu_checks-0.2.0/pyproject.toml +44 -0
- zu_checks-0.2.0/src/zu_checks/__init__.py +13 -0
- zu_checks-0.2.0/src/zu_checks/detectors/__init__.py +37 -0
- zu_checks-0.2.0/src/zu_checks/detectors/action_surface_blind.py +33 -0
- zu_checks-0.2.0/src/zu_checks/detectors/bot_wall.py +54 -0
- zu_checks-0.2.0/src/zu_checks/detectors/embedded_widget.py +97 -0
- zu_checks-0.2.0/src/zu_checks/detectors/empty.py +32 -0
- zu_checks-0.2.0/src/zu_checks/detectors/error.py +25 -0
- zu_checks-0.2.0/src/zu_checks/detectors/js_shell.py +75 -0
- zu_checks-0.2.0/src/zu_checks/validators/__init__.py +7 -0
- zu_checks-0.2.0/src/zu_checks/validators/grounding.py +162 -0
- zu_checks-0.2.0/src/zu_checks/validators/schema.py +41 -0
- zu_checks-0.2.0/tests/test_detectors.py +171 -0
- zu_checks-0.2.0/tests/test_validators.py +227 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.eggs/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
|
|
9
|
+
# uv / venv
|
|
10
|
+
.venv/
|
|
11
|
+
uv.lock.bak
|
|
12
|
+
|
|
13
|
+
# Test / type caches
|
|
14
|
+
.pytest_cache/
|
|
15
|
+
.mypy_cache/
|
|
16
|
+
.ruff_cache/
|
|
17
|
+
.coverage
|
|
18
|
+
htmlcov/
|
|
19
|
+
|
|
20
|
+
# Zu runtime artifacts
|
|
21
|
+
*.db
|
|
22
|
+
zu.db
|
|
23
|
+
zu.yaml.local
|
|
24
|
+
zu_review.jsonl
|
|
25
|
+
*.review.jsonl
|
|
26
|
+
# Per-agent cost telemetry ledger — machine-local run history, not source.
|
|
27
|
+
cost.jsonl
|
|
28
|
+
# A recorded replay path is learned per-run and machine-local — regenerated on
|
|
29
|
+
# every successful run, not source. The agent ships; its track does not.
|
|
30
|
+
track.json
|
|
31
|
+
# …except the flagship example ships its track on purpose, as a demo of the
|
|
32
|
+
# record/replay convergence (committed; re-runs show as ordinary modifications).
|
|
33
|
+
!examples/agents/vet-appointment/track.json
|
|
34
|
+
|
|
35
|
+
# Editor / OS
|
|
36
|
+
.idea/
|
|
37
|
+
.vscode/
|
|
38
|
+
.DS_Store
|
|
39
|
+
|
|
40
|
+
# Claude Code local session state
|
|
41
|
+
.claude/
|
|
42
|
+
|
|
43
|
+
# Secrets
|
|
44
|
+
.env
|
|
45
|
+
.env.*
|
|
46
|
+
!.env.example
|
|
47
|
+
|
|
48
|
+
# Microsoft Office temp/lock files
|
|
49
|
+
~$*
|
|
50
|
+
|
|
51
|
+
# Internal design / strategy docs — kept local, never in the public repo
|
|
52
|
+
*.docx
|
|
53
|
+
*.pdf
|
|
54
|
+
# BUILD.md is the internal build-sequence / deferred-gaps ledger — kept local.
|
|
55
|
+
# (ARCHITECTURE.md is public: an onboarding agent needs the structural map.)
|
|
56
|
+
docs/BUILD.md
|
|
57
|
+
|
|
58
|
+
# Local secret — API key for live validation, never commit
|
|
59
|
+
zu_demo_key.md
|
|
60
|
+
*_key.md
|
|
61
|
+
|
|
62
|
+
# Local PyPI publish token — never commit
|
|
63
|
+
/pypi
|
|
64
|
+
|
|
65
|
+
# Local Discord credentials (bot token / app secrets) — never commit
|
|
66
|
+
/discord
|
zu_checks-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: zu-checks
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Zu built-in checks: detectors (empty, error, js-shell, embedded-widget, bot-wall) + validators (schema, grounding)
|
|
5
|
+
Project-URL: Homepage, https://github.com/k3-mt/zu
|
|
6
|
+
Project-URL: Repository, https://github.com/k3-mt/zu
|
|
7
|
+
License-Expression: Apache-2.0
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
15
|
+
Classifier: Typing :: Typed
|
|
16
|
+
Requires-Python: >=3.11
|
|
17
|
+
Requires-Dist: jsonschema>=4
|
|
18
|
+
Requires-Dist: zu-core==0.2.0
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# zu-checks
|
|
22
|
+
|
|
23
|
+
The built-in **checks** that ship with the Zu base runtime — two stdlib plugin
|
|
24
|
+
kinds in one package:
|
|
25
|
+
|
|
26
|
+
- **detectors** (`zu_checks.detectors`) — `empty`, `error`, `js-shell`, `embedded-widget`, `action-surface-blind`,
|
|
27
|
+
`bot-wall`. Inspect an observation and return a `Verdict`; the severity drives
|
|
28
|
+
the loop (`ESCALATE` climbs the tier ladder, `TERMINAL` ends the run).
|
|
29
|
+
- **validators** (`zu_checks.validators`) — `schema` (does the result fit the
|
|
30
|
+
requested shape?) and `grounding` (does every extracted value actually appear
|
|
31
|
+
in retrieved content? — the anti-hallucination check).
|
|
32
|
+
|
|
33
|
+
They're packaged together because both are pure-stdlib (schema adds only
|
|
34
|
+
`jsonschema`) and always present in the base — unlike the adapter packages
|
|
35
|
+
(`zu-providers`, `zu-tools`, `zu-backends`), whose separation carries distinct
|
|
36
|
+
heavy optional dependencies. All register via the standard `zu.detectors` /
|
|
37
|
+
`zu.validators` entry-point groups, exactly as a third-party check would.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# zu-checks
|
|
2
|
+
|
|
3
|
+
The built-in **checks** that ship with the Zu base runtime — two stdlib plugin
|
|
4
|
+
kinds in one package:
|
|
5
|
+
|
|
6
|
+
- **detectors** (`zu_checks.detectors`) — `empty`, `error`, `js-shell`, `embedded-widget`, `action-surface-blind`,
|
|
7
|
+
`bot-wall`. Inspect an observation and return a `Verdict`; the severity drives
|
|
8
|
+
the loop (`ESCALATE` climbs the tier ladder, `TERMINAL` ends the run).
|
|
9
|
+
- **validators** (`zu_checks.validators`) — `schema` (does the result fit the
|
|
10
|
+
requested shape?) and `grounding` (does every extracted value actually appear
|
|
11
|
+
in retrieved content? — the anti-hallucination check).
|
|
12
|
+
|
|
13
|
+
They're packaged together because both are pure-stdlib (schema adds only
|
|
14
|
+
`jsonschema`) and always present in the base — unlike the adapter packages
|
|
15
|
+
(`zu-providers`, `zu-tools`, `zu-backends`), whose separation carries distinct
|
|
16
|
+
heavy optional dependencies. All register via the standard `zu.detectors` /
|
|
17
|
+
`zu.validators` entry-point groups, exactly as a third-party check would.
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "zu-checks"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "Zu built-in checks: detectors (empty, error, js-shell, embedded-widget, bot-wall) + validators (schema, grounding)"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
license = "Apache-2.0"
|
|
8
|
+
classifiers = [
|
|
9
|
+
"Development Status :: 4 - Beta",
|
|
10
|
+
"Intended Audience :: Developers",
|
|
11
|
+
"License :: OSI Approved :: Apache Software License",
|
|
12
|
+
"Programming Language :: Python :: 3",
|
|
13
|
+
"Programming Language :: Python :: 3.11",
|
|
14
|
+
"Programming Language :: Python :: 3.12",
|
|
15
|
+
"Topic :: Software Development :: Libraries :: Application Frameworks",
|
|
16
|
+
"Typing :: Typed",
|
|
17
|
+
]
|
|
18
|
+
# The built-in checks are pure-stdlib (+ jsonschema for the schema validator) and
|
|
19
|
+
# always ship with the base runtime — they have no heavy/optional deps, which is
|
|
20
|
+
# why detectors and validators live together here rather than as two packages.
|
|
21
|
+
dependencies = ["zu-core==0.2.0", "jsonschema>=4"]
|
|
22
|
+
|
|
23
|
+
[project.entry-points."zu.detectors"]
|
|
24
|
+
empty = "zu_checks.detectors.empty:EmptyDetector"
|
|
25
|
+
error = "zu_checks.detectors.error:ErrorDetector"
|
|
26
|
+
js-shell = "zu_checks.detectors.js_shell:JsShellDetector"
|
|
27
|
+
embedded-widget = "zu_checks.detectors.embedded_widget:EmbeddedWidgetDetector"
|
|
28
|
+
bot-wall = "zu_checks.detectors.bot_wall:BotWallDetector"
|
|
29
|
+
action-surface-blind = "zu_checks.detectors.action_surface_blind:ActionSurfaceBlindDetector"
|
|
30
|
+
|
|
31
|
+
[project.entry-points."zu.validators"]
|
|
32
|
+
schema = "zu_checks.validators.schema:SchemaValidator"
|
|
33
|
+
grounding = "zu_checks.validators.grounding:GroundingValidator"
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://github.com/k3-mt/zu"
|
|
37
|
+
Repository = "https://github.com/k3-mt/zu"
|
|
38
|
+
|
|
39
|
+
[build-system]
|
|
40
|
+
requires = ["hatchling"]
|
|
41
|
+
build-backend = "hatchling.build"
|
|
42
|
+
|
|
43
|
+
[tool.hatch.build.targets.wheel]
|
|
44
|
+
packages = ["src/zu_checks"]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Zu built-in checks — the two stdlib plugin kinds that ship with the base.
|
|
2
|
+
|
|
3
|
+
* ``zu_checks.detectors`` — observation-time detectors whose Verdict severities
|
|
4
|
+
drive the loop (ESCALATE climbs the tier ladder; TERMINAL ends the run).
|
|
5
|
+
* ``zu_checks.validators`` — on-final result checks (schema shape + grounding,
|
|
6
|
+
the anti-hallucination provenance check).
|
|
7
|
+
|
|
8
|
+
They live in one package because both are pure-stdlib (the schema validator adds
|
|
9
|
+
only ``jsonschema``) and always present in the base runtime — unlike the adapter
|
|
10
|
+
packages (providers/tools/backends) whose separation carries distinct heavy
|
|
11
|
+
optional dependencies. They register through the same ``zu.detectors`` /
|
|
12
|
+
``zu.validators`` entry-point groups any third-party check would.
|
|
13
|
+
"""
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Zu built-in detectors.
|
|
2
|
+
|
|
3
|
+
A detector inspects an observation and may return a Verdict. Verdict
|
|
4
|
+
severities (WARN, RETRY, ESCALATE, TERMINAL) map onto the loop's control flow:
|
|
5
|
+
ESCALATE is the deterministic signal that climbs the tier ladder. Detectors
|
|
6
|
+
are where escalation is decided — never improvised by the model.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# What counts as page content in an observation, in preference order. The loop
|
|
11
|
+
# stores a fetched/rendered page under one of these keys (mirrors zu_core.loop's
|
|
12
|
+
# own ``_CONTENT_KEYS``); a detector must consult all of them or it goes blind to
|
|
13
|
+
# a tool that returns ``{"text": ...}`` / ``{"content": ...}`` instead of html.
|
|
14
|
+
# One source of truth, reused by ``empty`` too.
|
|
15
|
+
_CONTENT_KEYS = ("html", "text", "content")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _html_of(ctx) -> str:
|
|
19
|
+
"""Best-effort extraction of the page content from a RunContext observation.
|
|
20
|
+
|
|
21
|
+
Concatenates *every* present content key (html, text, content) rather than
|
|
22
|
+
returning only the first, so a marker detector is never blind to a tool that
|
|
23
|
+
splits content across keys — the same all-keys view the ``empty`` detector
|
|
24
|
+
uses, so the detectors agree on what "the content" is."""
|
|
25
|
+
obs = getattr(ctx, "observation", None)
|
|
26
|
+
if isinstance(obs, dict):
|
|
27
|
+
parts = [v for k in _CONTENT_KEYS if isinstance(v := obs.get(k), str) and v]
|
|
28
|
+
if parts:
|
|
29
|
+
return "\n".join(parts)
|
|
30
|
+
return ""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _contains_any(html: str, markers) -> bool:
|
|
34
|
+
"""True if any marker (case-insensitive) appears in ``html`` — the shared
|
|
35
|
+
substring scan behind the marker-list detectors (bot-wall, js-shell)."""
|
|
36
|
+
lowered = html.lower()
|
|
37
|
+
return any(marker in lowered for marker in markers)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""action-surface-blind — escalate to vision when the action surface is blind.
|
|
2
|
+
|
|
3
|
+
The Action Surface (Engineering Design §11) is a fast, cheap default for the
|
|
4
|
+
common case; its competence boundary is the trigger for the next tier — pixels
|
|
5
|
+
and a vision model. When the accessibility tree is too thin to trust, the tool
|
|
6
|
+
sets ``surface_blind`` on its observation rather than silently returning an
|
|
7
|
+
incomplete surface. This detector turns that signal into the deterministic
|
|
8
|
+
ESCALATE that climbs the ladder to tier-4 vision (§11.4) — escalation decided by
|
|
9
|
+
a detector, never improvised by the model.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from zu_core.ports import RunContext, Scope, Severity, Verdict
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ActionSurfaceBlindDetector:
    """Turn an observation's ``surface_blind`` flag into an ESCALATE verdict."""

    name = "action-surface-blind"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return an ESCALATE verdict when the observation is flagged blind.

        Fires only when ``ctx.observation`` is a dict whose ``surface_blind``
        entry is exactly ``True``. The verdict detail is the surface's
        ``blind_reason`` when ``action_surface`` is a dict carrying a truthy
        one, otherwise a fixed fallback message. Returns None in every other
        case.
        """
        observation = getattr(ctx, "observation", None)
        if not isinstance(observation, dict):
            return None
        # Identity check on purpose: a merely truthy value does not count.
        if observation.get("surface_blind") is not True:
            return None
        surface = observation.get("action_surface")
        blind_reason = surface.get("blind_reason") if isinstance(surface, dict) else None
        detail = blind_reason or "action surface too thin to trust; escalate to vision"
        return Verdict(severity=Severity.ESCALATE, detector=self.name, detail=detail)
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""bot-wall — fires on an anti-bot interstitial (Cloudflare, captcha, etc.)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from zu_core.ports import RunContext, Scope, Severity, Verdict
|
|
6
|
+
|
|
7
|
+
from . import _contains_any, _html_of
|
|
8
|
+
|
|
9
|
+
# Strong markers: phrasing characteristic of an anti-bot interstitial, specific
|
|
10
|
+
# enough that their presence is treated as the signal on its own. This is a
|
|
11
|
+
# deterministic heuristic, not a proof: a page that *discusses* CAPTCHAs (a news
|
|
12
|
+
# story, this very comment) can contain "captcha" and would escalate — the cost
|
|
13
|
+
# is a wasted tier-2 render, not a wrong answer, and escalating a borderline page
|
|
14
|
+
# is the safer failure. ``cf-browser-verification`` is unambiguous; the natural-
|
|
15
|
+
# language phrases are the ones with residual false-positive surface.
|
|
16
|
+
_STRONG_MARKERS = (
    "captcha",
    "are you a robot",
    "verify you are human",
    "cf-browser-verification",
)

# Weak markers are genuine Cloudflare wall phrases that also occur in ordinary
# English, so a substring hit alone is not trusted. They count only when one of
# the Cloudflare fingerprints below is present in the same page.
_WEAK_MARKERS = (
    "attention required",
    "just a moment",
)
_CLOUDFLARE_FINGERPRINTS = (
    "cloudflare",
    "cf-ray",
    "cf-browser-verification",
    "__cf",
    "/cdn-cgi/",
)


class BotWallDetector:
    """Escalate when the observed page looks like an anti-bot interstitial."""

    name = "bot-wall"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return an ESCALATE verdict for a suspected bot wall, else None.

        A page qualifies when it contains any strong marker, or when it
        contains a weak marker together with a Cloudflare fingerprint.
        """
        page = _html_of(ctx)
        walled = _contains_any(page, _STRONG_MARKERS) or (
            _contains_any(page, _WEAK_MARKERS)
            and _contains_any(page, _CLOUDFLARE_FINGERPRINTS)
        )
        if not walled:
            return None
        return Verdict(
            severity=Severity.ESCALATE,
            detector=self.name,
            detail="anti-bot wall detected",
        )
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""embedded-widget — fires when the page's real content is inside a JS widget.
|
|
2
|
+
|
|
3
|
+
The complement to ``js-shell``. ``js-shell`` catches an *empty* SPA shell (a
|
|
4
|
+
``<div id="root">`` with no visible text). But a page can be full of human-visible
|
|
5
|
+
chrome — nav, footer, copy — while the data the task actually needs (appointment
|
|
6
|
+
slots, a price table, a seat map) is rendered by an **embedded third-party widget
|
|
7
|
+
or iframe** that loads via JavaScript. A tier-1 ``http_fetch`` sees the chrome and
|
|
8
|
+
the empty mount point, never the data, so it would loop forever or give up. This
|
|
9
|
+
detector is the deterministic signal to *offer* the browser (tier 2) in that case.
|
|
10
|
+
|
|
11
|
+
It is conservative about what counts as a content widget, to avoid escalating on
|
|
12
|
+
ubiquitous analytics/ad scripts:
|
|
13
|
+
|
|
14
|
+
* an ``<iframe>`` with an external ``http(s)`` ``src`` — an embedded application
|
|
15
|
+
whose content is not in this DOM; or
|
|
16
|
+
* a **widget mount point** — an element whose *attributes* (id/class/data-*/domain)
|
|
17
|
+
name a content widget (``widget``, ``embed``, ``scheduler``, or a known booking
|
|
18
|
+
vendor) — together with an external ``<script>`` that fills it.
|
|
19
|
+
|
|
20
|
+
ESCALATE only *unlocks* the browser; the model renders only if it still lacks the
|
|
21
|
+
data, so being a touch generous here is cheap and fail-safe.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import re
|
|
27
|
+
|
|
28
|
+
from zu_core.ports import RunContext, Scope, Severity, Verdict
|
|
29
|
+
|
|
30
|
+
from . import _html_of
|
|
31
|
+
|
|
32
|
+
# Tokens that, when they appear in an element's ATTRIBUTES (not visible text),
|
|
33
|
+
# mark a JS content-widget mount. Generic structural words plus a few common
|
|
34
|
+
# booking/scheduling vendors — kept to attribute context so a nav link like
|
|
35
|
+
# href="/book-an-appointment" or body copy never trips it.
|
|
36
|
+
_WIDGET_TOKENS = (
|
|
37
|
+
"widget", "embed", "scheduler", "data-widget",
|
|
38
|
+
"vetstoria", "oabp", "calendly", "acuityscheduling", "simplybook", "petsapp",
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# An <iframe ...> carrying an external http(s) src — an embedded app.
|
|
42
|
+
_IFRAME_SRC = re.compile(r"<iframe\b[^>]*\bsrc\s*=\s*[\"']https?://", re.IGNORECASE)
|
|
43
|
+
# Any element's attribute span, to scan for a widget token in attribute context.
|
|
44
|
+
_TAG_ATTRS = re.compile(r"<[a-zA-Z][a-zA-Z0-9]*\b([^>]*)>")
|
|
45
|
+
# An external <script src="http(s)://..."> — the loader that fills a mount point.
|
|
46
|
+
_EXTERNAL_SCRIPT = re.compile(r"<script\b[^>]*\bsrc\s*=\s*[\"']https?://", re.IGNORECASE)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _has_widget_mount(html: str) -> bool:
|
|
50
|
+
"""True if some element's attributes name a content widget."""
|
|
51
|
+
for m in _TAG_ATTRS.finditer(html):
|
|
52
|
+
attrs = m.group(1).lower()
|
|
53
|
+
if any(tok in attrs for tok in _WIDGET_TOKENS):
|
|
54
|
+
return True
|
|
55
|
+
return False
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _already_escalated(ctx: RunContext) -> bool:
|
|
59
|
+
"""True if the run has already escalated (or browser-rendered) this run.
|
|
60
|
+
|
|
61
|
+
This detector is an escalation *trigger*: its job is to unlock the browser
|
|
62
|
+
tier once. After that it must go quiet — every later widget page (another
|
|
63
|
+
http_fetch, or the rendered DOM, which still carries the markers) would
|
|
64
|
+
otherwise re-fire, and at the top tier a re-escalation is 'exhausted' and ENDS
|
|
65
|
+
the run before the model can use the browser it just unlocked. So: fire once,
|
|
66
|
+
then defer to the model working at the higher tier."""
|
|
67
|
+
for ev in getattr(ctx, "events", []) or []:
|
|
68
|
+
et = getattr(ev, "type", "")
|
|
69
|
+
if et == "harness.task.escalated":
|
|
70
|
+
return True
|
|
71
|
+
if et == "data.source.fetched" and getattr(ev, "source", "") == "render_dom":
|
|
72
|
+
return True
|
|
73
|
+
return False
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class EmbeddedWidgetDetector:
    """Escalate once when a page defers its content to an embedded widget."""

    name = "embedded-widget"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return an ESCALATE verdict for a widget/iframe page, else None.

        Stays silent when there is no page content or when the run has already
        escalated. Otherwise fires if the page has an ``<iframe>`` with an
        external http(s) ``src``, or a widget mount point accompanied by an
        external ``<script>``.
        """
        html = _html_of(ctx)
        # Fire at most once: after the browser is unlocked, defer to the model.
        if not html or _already_escalated(ctx):
            return None
        has_external_iframe = _IFRAME_SRC.search(html) is not None
        # A named mount point alone is not enough; it must have an external
        # script that could fill it, or a static page would be escalated.
        has_filled_mount = _has_widget_mount(html) and _EXTERNAL_SCRIPT.search(html) is not None
        if not (has_external_iframe or has_filled_mount):
            return None
        return Verdict(
            severity=Severity.ESCALATE,
            detector=self.name,
            detail="page defers content to an embedded widget/iframe; "
            "escalate to a browser to render it",
        )
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""empty — fires when a *fetched page* carried no usable content.
|
|
2
|
+
|
|
3
|
+
Scoped to page-content observations on purpose: it judges a fetch (a tool that
|
|
4
|
+
returned ``html``/``text``/``content``) and escalates when that content is empty
|
|
5
|
+
— the signal to climb to a browser. It must NOT fire on observations that are not
|
|
6
|
+
page fetches — e.g. ``html_parse`` returning ``{"matches": [...]}`` (a successful
|
|
7
|
+
extraction) or an error observation — or it would spuriously escalate after real
|
|
8
|
+
work. So: a content key present but blank -> escalate; no content key -> not our
|
|
9
|
+
concern (return None).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from zu_core.ports import RunContext, Scope, Severity, Verdict
|
|
15
|
+
|
|
16
|
+
from . import _CONTENT_KEYS # one source of truth for "what counts as page content"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class EmptyDetector:
    """Escalate when a page-content observation carries no usable content."""

    name = "empty"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return an ESCALATE verdict when every present content key is blank.

        Observations that are not dicts, or that hold none of the
        ``_CONTENT_KEYS``, are not page fetches and are ignored (None).
        """
        observation = getattr(ctx, "observation", None)
        if not isinstance(observation, dict):
            return None
        values = [observation.get(key) for key in _CONTENT_KEYS if key in observation]
        if not values:
            return None  # no content key at all: "empty" does not apply
        # Any key with non-whitespace text means the fetch produced content.
        if any(str(value or "").strip() for value in values):
            return None
        return Verdict(severity=Severity.ESCALATE, detector=self.name, detail="empty observation")
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""error — fires on an HTTP error status in the observation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from zu_core.ports import RunContext, Scope, Severity, Verdict
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ErrorDetector:
    """Report an HTTP error status on an observation as a RETRY verdict."""

    name = "error"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return a RETRY verdict when ``observation["status"]`` is an int >= 400.

        The severity is RETRY rather than TERMINAL on purpose: one failing URL
        (403, 404, 5xx) says nothing about whether the run can succeed, so the
        error is recorded and fed back and the model is free to try another
        candidate. A run that truly cannot proceed still ends through the
        step/token budget.
        """
        observation = getattr(ctx, "observation", None)
        if not isinstance(observation, dict):
            return None
        status = observation.get("status")
        if not isinstance(status, int) or status < 400:
            return None
        return Verdict(severity=Severity.RETRY, detector=self.name, detail=f"http {status}")
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""js-shell — fires when a page is an empty JavaScript shell.
|
|
2
|
+
|
|
3
|
+
The canonical escalation trigger: tier-1 http_fetch returns HTML that is
|
|
4
|
+
essentially a <div id="root"></div> plus scripts, with no real text content.
|
|
5
|
+
That is the signal to give up on the cheap tier and climb to a browser.
|
|
6
|
+
|
|
7
|
+
The test is structural, not size-based: a page is a shell when it has a known
|
|
8
|
+
SPA mount point *and* almost no human-visible text once scripts and styles are
|
|
9
|
+
removed. Measuring visible text (rather than raw HTML length) is what step 5
|
|
10
|
+
finalizes — a shell padded with a large inline bundle is still a shell, and a
|
|
11
|
+
small page that happens to be real content is not escalated.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import re
|
|
17
|
+
|
|
18
|
+
from zu_core.ports import RunContext, Scope, Severity, Verdict
|
|
19
|
+
|
|
20
|
+
from . import _contains_any, _html_of
|
|
21
|
+
|
|
22
|
+
# Common SPA mount points / framework markers.
|
|
23
|
+
_SHELL_MARKERS = ('id="root"', "id='root'", 'id="app"', "id='app'", "__NEXT_DATA__")
|
|
24
|
+
|
|
25
|
+
# Strip the elements whose contents are never visible text before measuring.
|
|
26
|
+
# ``\s*`` in the close tag tolerates ``</script >``; the second pattern handles
|
|
27
|
+
# an *unterminated* script/style — a browser treats everything after an unclosed
|
|
28
|
+
# <script> as script text, so the heuristic does too (consume to end of input).
|
|
29
|
+
# HTML comments are removed FIRST so a commented-out ``<!-- <script> -->`` (or
|
|
30
|
+
# any literal ``<script`` inside a comment) can't trip the greedy _UNCLOSED rule
|
|
31
|
+
# and erase the real article body after it — a deterministic false-positive the
|
|
32
|
+
# unbalanced-tag heuristic would otherwise produce.
|
|
33
|
+
_COMMENT = re.compile(r"<!--.*?-->", re.DOTALL)
|
|
34
|
+
_NONVISIBLE = re.compile(r"<(script|style|template|noscript)\b.*?</\1\s*>", re.IGNORECASE | re.DOTALL)
|
|
35
|
+
_UNCLOSED = re.compile(r"<(script|style|template|noscript)\b.*\Z", re.IGNORECASE | re.DOTALL)
|
|
36
|
+
_TAGS = re.compile(r"<[^>]+>")
|
|
37
|
+
_WS = re.compile(r"\s+")
|
|
38
|
+
|
|
39
|
+
# Below this many characters of visible text, a page with a mount point is
|
|
40
|
+
# treated as an unrendered shell. Tuned against the graded fixture set.
|
|
41
|
+
_MIN_VISIBLE_TEXT = 64
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _visible_text(html: str) -> str:
|
|
45
|
+
"""Human-visible text: drop script/style/template/noscript bodies, strip
|
|
46
|
+
the remaining tags, and collapse whitespace."""
|
|
47
|
+
without_code = _COMMENT.sub(" ", html)
|
|
48
|
+
without_code = _NONVISIBLE.sub(" ", without_code)
|
|
49
|
+
without_code = _UNCLOSED.sub(" ", without_code)
|
|
50
|
+
text = _TAGS.sub(" ", without_code)
|
|
51
|
+
return _WS.sub(" ", text).strip()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class JsShellDetector:
    """Escalate when a fetched page is an empty JavaScript application shell."""

    name = "js-shell"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return an ESCALATE verdict for a JS shell page, else None.

        Three conditions must all hold: the page carries a known SPA mount
        marker, it defers to JavaScript, and its visible text is shorter than
        ``_MIN_VISIBLE_TEXT`` characters.
        """
        html = _html_of(ctx)
        if not html:
            return None
        if not _contains_any(html, _SHELL_MARKERS):
            return None
        lowered = html.lower()
        # Deferring to JS means a literal <script>, or a module graph pulled in
        # through <link rel="modulepreload"> with no inline script at all.
        if "<script" not in lowered and "modulepreload" not in lowered:
            return None
        if len(_visible_text(html)) >= _MIN_VISIBLE_TEXT:
            return None
        return Verdict(
            severity=Severity.ESCALATE,
            detector=self.name,
            detail="page appears to be a JS shell; escalate to a browser",
        )
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""Zu built-in validators — the on-final checks of the result.
|
|
2
|
+
|
|
3
|
+
The two cheapest rungs of the validation ladder: schema (does the result fit
|
|
4
|
+
the requested shape?) and grounding (does every extracted value actually
|
|
5
|
+
appear in retrieved content?). Grounding is the anti-hallucination check — the
|
|
6
|
+
core of the "agents that actually work" claim.
|
|
7
|
+
"""
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""grounding — every extracted value must appear in retrieved content.
|
|
2
|
+
|
|
3
|
+
The anti-making-things-up check: a value the agent reports that is nowhere in
|
|
4
|
+
the content the run actually fetched fails grounding. It reads the run's
|
|
5
|
+
content from the event log via RunContext, so it proves provenance, not just
|
|
6
|
+
plausibility.
|
|
7
|
+
|
|
8
|
+
Matching is token-boundary-aware (build step 6): a value must appear in the
|
|
9
|
+
retrieved content as a standalone token, not merely as a substring, so a short
|
|
10
|
+
value such as ``"5"`` is not spuriously grounded by ``"1985"``.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from collections.abc import Iterator
|
|
16
|
+
|
|
17
|
+
from zu_core.contracts import Result
|
|
18
|
+
from zu_core.ports import RunContext, Severity, Verdict
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _normalize(s: str) -> str:
|
|
22
|
+
"""Collapse whitespace and lowercase so trivial formatting differences
|
|
23
|
+
between an extracted value and the page text don't cause false failures."""
|
|
24
|
+
return " ".join(s.split()).lower()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _grounded(leaf_norm: str, corpus: str) -> bool:
    """Report whether ``leaf_norm`` occurs in ``corpus`` as a standalone token.

    Plain substring containment is too lenient: ``"5"`` would match inside
    ``"1985"`` and let a fabricated number pass. Every occurrence is therefore
    checked with ``_standalone``, and the value is grounded as soon as one
    occurrence sits on token boundaries.

    An empty ``leaf_norm`` is never grounded. It has no provenance to prove, so
    it fails safe instead of passing for free.
    """
    if not leaf_norm:
        return False
    width = len(leaf_norm)
    position = corpus.find(leaf_norm)
    while position != -1:
        if _standalone(corpus, position, position + width):
            return True
        # Advance by one character so overlapping occurrences are also tried.
        position = corpus.find(leaf_norm, position + 1)
    return False
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Separators that join a number to more digits to form a single larger value or
|
|
62
|
+
# a compound numeric token: decimal/thousands (``.`` ``,``) AND the connectors in
|
|
63
|
+
# dates, versions, times, ranges, SKUs and phone numbers (``-`` ``/`` ``:``). A
|
|
64
|
+
# match flanked by one of these with a digit on its *outer* side is a fragment of
|
|
65
|
+
# a longer token, not a standalone value — so "12" is not grounded by "12-2024",
|
|
66
|
+
# nor "30" by "12:30", just as "14" is not grounded by "3.14".
|
|
67
|
+
_NUM_SEPARATORS = frozenset(".,-/:")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _standalone(corpus: str, lo: int, hi: int) -> bool:
    """Report whether ``corpus[lo:hi]`` sits on token boundaries.

    The match is *not* standalone when either neighbouring character is
    alphanumeric (it is a slice of a longer word or number), or when a
    neighbouring character is one of ``_NUM_SEPARATORS`` with a digit on the
    separator's far side (it is a slice of a larger or compound number, e.g.
    "14" inside "3.14", "12" inside "12-2024", "30" inside "12:30").

    Args:
        corpus: The text that was searched.
        lo: Index of the first character of the match.
        hi: Index one past the last character of the match.

    Returns:
        True when both flanks are boundaries, False otherwise.
    """
    # An edge of the corpus has no neighbour; "" is neither alphanumeric nor a
    # separator, so it always counts as a boundary.
    left = "" if lo <= 0 else corpus[lo - 1]
    right = "" if hi >= len(corpus) else corpus[hi]

    glued_to_word = left.isalnum() or right.isalnum()
    # Slices (not indexing) are used for the character beyond the separator so
    # that running off either end yields "" instead of raising.
    left_fragment = left in _NUM_SEPARATORS and corpus[lo - 2 : lo - 1].isdigit()
    right_fragment = right in _NUM_SEPARATORS and corpus[hi + 1 : hi + 2].isdigit()

    return not (glued_to_word or left_fragment or right_fragment)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _leaf_strings(value: object) -> Iterator[str]:
    """Walk a result value and yield each scalar leaf as stripped text.

    Strings, ints and floats are rendered with ``str`` and stripped; leaves
    that end up empty are dropped. Dicts contribute their values (not their
    keys) and lists/tuples their items, recursively. Booleans are skipped
    outright — and tested first, because ``isinstance(True, int)`` is True and
    a boolean is not groundable page text. Any other type yields nothing.
    """
    if isinstance(value, bool):
        return

    if isinstance(value, dict):
        children = value.values()
    elif isinstance(value, (list, tuple)):
        children = value
    else:
        # Scalar (or unsupported) leaf: emit its text form if there is any.
        if isinstance(value, (str, int, float)):
            rendered = str(value).strip()
            if rendered:
                yield rendered
        return

    for child in children:
        yield from _leaf_strings(child)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _retrieved_corpus(ctx: RunContext) -> str:
    """Concatenate everything the run fetched, from data.source.fetched events.

    Only ``data.source.fetched`` events contribute: reading text-like keys from
    any event would let the model ground its own fabrications, because
    harness.turn.completed carries the model's output text, which must never
    count as evidence about the page.

    When no fetched content is found in the event log, the current observation
    is used instead (the loop wires the full log in build step 4). If fetched
    content exists the observation is deliberately *not* folded in, so that an
    observation which is not itself retrieved page content cannot reopen the
    self-grounding hole the event-type filter closes.

    Args:
        ctx: Run context; its ``events`` and ``observation`` attributes are
            read if present.

    Returns:
        The string values found under the ``html``, ``text`` and ``content``
        keys, joined with newlines; ``""`` when nothing was found.
    """
    text_keys = ("html", "text", "content")
    chunks: list[str] = []

    for ev in getattr(ctx, "events", []) or []:
        if getattr(ev, "type", "") != "data.source.fetched":
            continue
        payload = getattr(ev, "payload", None) or {}
        # A malformed payload (a string, a list, ...) has no mapping-style
        # ``get``; skip it rather than let an AttributeError escape and break
        # the validator. Any mapping-like payload is still accepted.
        lookup = getattr(payload, "get", None)
        if not callable(lookup):
            continue
        for key in text_keys:
            found = lookup(key)
            if isinstance(found, str):
                chunks.append(found)

    if not chunks:
        obs = getattr(ctx, "observation", None)
        if isinstance(obs, dict):
            for key in text_keys:
                found = obs.get(key)
                if isinstance(found, str):
                    chunks.append(found)

    return "\n".join(chunks)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class GroundingValidator:
    """Validator that requires every result leaf to appear in retrieved content."""

    name = "grounding"

    def check(self, result: Result, ctx: RunContext) -> Verdict | None:
        """Return a RETRY verdict for the first field with an ungrounded leaf.

        A falsy ``result.value`` is not checked at all. Otherwise each scalar
        leaf is normalized and looked up in the normalized retrieved corpus;
        ``None`` is returned when every leaf is grounded.
        """
        value = result.value
        if not value:
            return None

        corpus = _normalize(_retrieved_corpus(ctx))

        # The value is usually a JSON object, but the schema may allow a list
        # or scalar at the root. Calling ``.items()`` on those would raise
        # AttributeError, so a non-dict root is treated as one field named
        # "value".
        if isinstance(value, dict):
            fields = value.items()
        else:
            fields = [("value", value)]

        for field, field_value in fields:
            has_ungrounded_leaf = any(
                not _grounded(_normalize(leaf), corpus)
                for leaf in _leaf_strings(field_value)
            )
            if has_ungrounded_leaf:
                return Verdict(
                    severity=Severity.RETRY,
                    detector=self.name,
                    detail=f"value for {field!r} not found in retrieved content",
                )
        return None
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""schema — the result must satisfy the task's output JSON schema."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import jsonschema
|
|
6
|
+
|
|
7
|
+
from zu_core.contracts import Result
|
|
8
|
+
from zu_core.ports import RunContext, Severity, Verdict
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SchemaValidator:
    """Validator that checks a result against the task's output JSON schema."""

    name = "schema"

    def check(self, result: Result, ctx: RunContext) -> Verdict | None:
        """Validate ``result.value`` against ``ctx.spec.output_schema``.

        Returns ``None`` when there is no schema or the value conforms, a RETRY
        verdict when the value does not match a usable schema, and a TERMINAL
        verdict when validation fails for any other reason.
        """
        schema = getattr(ctx.spec, "output_schema", None)
        if not schema:
            # Nothing to check against.
            return None

        try:
            jsonschema.validate(instance=result.value, schema=schema)
        except jsonschema.ValidationError as mismatch:
            # The data didn't fit a valid schema — a retry might fix it.
            # jsonschema's richer errors carry ``.message``; fall back to str().
            reason = getattr(mismatch, "message", str(mismatch))
            return Verdict(severity=Severity.RETRY, detector=self.name, detail=reason)
        except Exception as broken:  # noqa: BLE001 - a broken schema is terminal
            # The output_schema comes from the TaskSpec unvalidated. It may be
            # malformed (jsonschema.SchemaError) or hold an unresolvable
            # ``$ref``, which surfaces as a *referencing* error that is not a
            # SchemaError subclass. Retrying cannot repair a broken schema, so
            # everything is caught here and reported as terminal instead of
            # letting an exception escape into the validation ladder.
            # NOTE(review): whether jsonschema attempts network retrieval for a
            # remote ``$ref`` depends on the installed version — confirm.
            reason = getattr(broken, "message", str(broken))
            return Verdict(
                severity=Severity.TERMINAL,
                detector=self.name,
                detail=f"invalid output_schema: {reason}",
            )
        return None
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""Smoke tests for the built-in detectors and their discovery.
|
|
2
|
+
|
|
3
|
+
The escalation-ladder logic is finalized against the graded fixture set in
|
|
4
|
+
build step 5; these lock the basic verdicts and the entry-point contract now.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from zu_checks.detectors.bot_wall import BotWallDetector
|
|
10
|
+
from zu_checks.detectors.empty import EmptyDetector
|
|
11
|
+
from zu_checks.detectors.error import ErrorDetector
|
|
12
|
+
from zu_checks.detectors.js_shell import JsShellDetector
|
|
13
|
+
from zu_core.ports import RunContext, Severity
|
|
14
|
+
from zu_core.registry import Registry
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _ctx(observation: dict) -> RunContext:
    """Wrap ``observation`` in a RunContext that has no task spec."""
    context = RunContext(spec=None, observation=observation)
    return context
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_empty_fires_on_blank() -> None:
    """Whitespace-only html must produce an ESCALATE verdict."""
    verdict = EmptyDetector().inspect(_ctx({"html": " "}))
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_empty_passes_on_content() -> None:
    """Html with real markup produces no verdict."""
    verdict = EmptyDetector().inspect(_ctx({"html": "<p>hi</p>"}))
    assert verdict is None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_empty_ignores_non_page_observations() -> None:
    # Regression guard: an observation without a page-content key — such as a
    # successful html_parse result — must NOT be read as an "empty page" and
    # escalate; that misfired after real extraction.
    non_page_observations = (
        {"selector": "h1", "matches": ["X"], "count": 1},
        {"error": "boom"},
        {},
    )
    for observation in non_page_observations:
        assert EmptyDetector().inspect(_ctx(observation)) is None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_error_on_http_status_is_recoverable_not_terminal() -> None:
    # An HTTP error on a fetched page is RETRY, never TERMINAL: one bad url
    # (403 WAF wall, 404, 410, 5xx, 429) must not end a run that can still try
    # another candidate. A truly stuck run ends via budget instead.
    error_statuses = (400, 403, 404, 405, 410, 429, 451, 500, 503)
    for status in error_statuses:
        verdict = ErrorDetector().inspect(_ctx({"status": status, "html": ""}))
        assert verdict is not None, status
        assert verdict.severity is Severity.RETRY, status
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_error_quiet_on_success() -> None:
    """A 200 response with content produces no verdict."""
    ok_page = _ctx({"status": 200, "html": "<p>ok</p>"})
    assert ErrorDetector().inspect(ok_page) is None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def test_js_shell_fires_on_empty_spa() -> None:
    """An empty mount point plus an external script escalates."""
    shell = '<html><body><div id="root"></div><script src="/app.js"></script></body></html>'
    verdict = JsShellDetector().inspect(_ctx({"html": shell}))
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def test_js_shell_passes_on_real_content() -> None:
    """A page full of visible paragraphs produces no verdict."""
    paragraphs = "<p>real content here</p>" * 500
    page = "".join(["<html><body>", paragraphs, "</body></html>"])
    assert JsShellDetector().inspect(_ctx({"html": page})) is None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_js_shell_fires_despite_large_inline_script() -> None:
    # A shell padded with a big inline bundle is still a shell: the
    # visible-text test sees through the script, where a raw-length check would
    # be fooled.
    inline_code = "var x=1;" * 2000  # ~16 KB of code, zero visible text
    shell = '<html><body><div id="app"></div><script>' + inline_code + "</script></body></html>"
    verdict = JsShellDetector().inspect(_ctx({"html": shell}))
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def test_js_shell_fires_on_unterminated_script() -> None:
    # Malformed/streamed HTML: a <script> that is never closed. A browser
    # treats everything after it as script text, so the visible-text test must
    # too — the page is still a shell, not real content.
    opening = '<html><body><div id="root"></div><script>var x=1;'
    calls = "a();" * 2000
    verdict = JsShellDetector().inspect(_ctx({"html": "".join((opening, calls))}))
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def test_js_shell_passes_on_small_but_real_page() -> None:
    # A mount point holding genuine prose is rendered content, not a shell.
    pieces = [
        '<html><body><div id="root">',
        "<h1>Acme Widget</h1><p>The finest widget, in stock and ready to ship today.</p>",
        "</div><script src=/app.js></script></body></html>",
    ]
    page = _ctx({"html": "".join(pieces)})
    assert JsShellDetector().inspect(page) is None
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def test_bot_wall_fires_on_captcha() -> None:
    """A human-verification interstitial escalates."""
    wall = "<h1>Just a moment...</h1> please verify you are human"
    verdict = BotWallDetector().inspect(_ctx({"html": wall}))
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def test_bot_wall_does_not_fire_on_innocent_phrase() -> None:
    # Regression (loose match): a real article that merely contains a weak
    # phrase must NOT escalate without a corroborating Cloudflare fingerprint.
    article = "".join(
        [
            "<article><h1>Just a moment in history</h1>",
            "<p>Attention required: read the safety notice first.</p></article>",
        ]
    )
    verdict = BotWallDetector().inspect(_ctx({"html": article}))
    assert verdict is None
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def test_bot_wall_fires_on_weak_phrase_with_cloudflare_fingerprint() -> None:
    """A weak phrase backed by Cloudflare markers escalates."""
    title = "<title>Just a moment...</title>"
    fingerprint = "<div class='cf-browser-verification'></div><!-- cf-ray: abc -->"
    verdict = BotWallDetector().inspect(_ctx({"html": title + fingerprint}))
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# --- embedded-widget: content deferred to a JS widget/iframe -----------------
|
|
111
|
+
|
|
112
|
+
# Fixture page: ordinary visible chrome plus a booking widget loaded from an
# external script, so the page is not an empty shell.
_VETSTORIA = "".join(
    [
        "<html><body><h1>Park Vets</h1><p>Lots of normal page chrome here, nav, ",
        "footer, plenty of visible text so this is NOT an empty shell.</p>",
        "<div id='oabp-widget' domain='booking.vetstoria.com'></div>",
        "<script src='https://booking.vetstoria.com/js/oabp-widget.js'></script>",
        "</body></html>",
    ]
)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def test_embedded_widget_fires_on_a_js_booking_widget() -> None:
    """A page embedding the JS booking widget escalates."""
    from zu_checks.detectors.embedded_widget import EmbeddedWidgetDetector

    detector = EmbeddedWidgetDetector()
    verdict = detector.inspect(_ctx({"html": _VETSTORIA}))
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def test_embedded_widget_fires_on_an_external_iframe_app() -> None:
    """A page whose content lives in an external iframe escalates."""
    from zu_checks.detectors.embedded_widget import EmbeddedWidgetDetector

    iframe_page = (
        "<html><body><p>book below</p>"
        "<iframe src='https://book.example/app'></iframe></body></html>"
    )
    verdict = EmbeddedWidgetDetector().inspect(_ctx({"html": iframe_page}))
    assert verdict is not None
    assert verdict.severity is Severity.ESCALATE
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def test_embedded_widget_quiet_on_a_plain_content_page_with_analytics() -> None:
    from zu_checks.detectors.embedded_widget import EmbeddedWidgetDetector

    # A real content page that only loads an external analytics script and
    # links to a booking page must NOT escalate — the data is in the HTML.
    fragments = [
        "<html><body><h1>Opening hours</h1><p>Mon-Fri 9-5. Call 020 555 1234.</p>",
        "<a href='/book-an-appointment'>Book an appointment</a>",
        "<script src='https://www.googletagmanager.com/gtag/js'></script></body></html>",
    ]
    content_page = _ctx({"html": "".join(fragments)})
    assert EmbeddedWidgetDetector().inspect(content_page) is None
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def test_embedded_widget_fires_once_then_stays_quiet() -> None:
    # The detector is an escalation trigger: it unlocks the browser once, then
    # must go quiet — a later widget page (or the rendered DOM) re-firing at the
    # top tier would end the run as 'escalation exhausted' before the model can
    # use the browser.
    import types

    from zu_checks.detectors.embedded_widget import EmbeddedWidgetDetector

    detector = EmbeddedWidgetDetector()
    first_sight = RunContext(spec=None, observation={"html": _VETSTORIA}, events=[])
    assert detector.inspect(first_sight) is not None

    # Either kind of prior event must silence the detector.
    prior_events = (
        types.SimpleNamespace(type="harness.task.escalated", source=None, payload={}),
        types.SimpleNamespace(type="data.source.fetched", source="render_dom", payload={}),
    )
    for earlier in prior_events:
        later = RunContext(spec=None, observation={"html": _VETSTORIA}, events=[earlier])
        assert detector.inspect(later) is None
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def test_detectors_discoverable() -> None:
    """Every built-in detector name is listed after discovery."""
    registry = Registry()
    registry.discover()
    expected = ("empty", "error", "js-shell", "embedded-widget", "bot-wall")
    for detector_name in expected:
        assert detector_name in registry.names("detectors")
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
"""Tests for the built-in validators, their discovery, and their behaviour
|
|
2
|
+
inside the loop (build step 6).
|
|
3
|
+
|
|
4
|
+
These lock the core behaviour — schema enforcement and the anti-hallucination
|
|
5
|
+
grounding check, including token-boundary precision — and prove grounding works
|
|
6
|
+
against the real event log when run inside the interpreter loop (at finalise the
|
|
7
|
+
observation is gone, so grounding must read the data.source.fetched events).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from zu_checks.validators.grounding import GroundingValidator
|
|
13
|
+
from zu_checks.validators.schema import SchemaValidator
|
|
14
|
+
from zu_core.bus import EventBus
|
|
15
|
+
from zu_core.contracts import Result, Status, TaskSpec
|
|
16
|
+
from zu_core.loop import run_task
|
|
17
|
+
from zu_core.ports import RunContext, Severity
|
|
18
|
+
from zu_core.registry import Registry
|
|
19
|
+
from zu_providers.scripted import ScriptedProvider
|
|
20
|
+
from zu_testing import fetch_tool
|
|
21
|
+
|
|
22
|
+
# Output schema shared by the validator tests: an object with a required
# string ``price``.
_SCHEMA = dict(
    type="object",
    properties=dict(price=dict(type="string")),
    required=["price"],
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _ctx(observation: dict | None = None) -> RunContext:
    """Build a RunContext for the price-extraction task around ``observation``."""
    return RunContext(
        spec=TaskSpec(query="extract the price", output_schema=_SCHEMA),
        observation=observation,
    )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_schema_passes_valid_result() -> None:
    """A value matching the schema produces no verdict."""
    conforming = Result(status=Status.SUCCESS, value={"price": "$9.00"})
    verdict = SchemaValidator().check(conforming, _ctx())
    assert verdict is None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_schema_fails_missing_required() -> None:
    verdict = SchemaValidator().check(Result(status=Status.SUCCESS, value={}), _ctx())
    # A plain data mismatch must be RETRY (the model can correct it) — NOT
    # TERMINAL. The loop branches on this severity, so pin it, not just "fired".
    assert verdict is not None
    assert verdict.detector == "schema"
    assert verdict.severity == Severity.RETRY
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_grounding_fails_invented_value() -> None:
    """A price that is not on the page is reported as not found."""
    fabricated = Result(status=Status.SUCCESS, value={"price": "$1000.00"})
    page = _ctx({"html": "<span class='price'>$9.00</span>"})
    verdict = GroundingValidator().check(fabricated, page)
    assert verdict is not None
    assert "not found" in (verdict.detail or "")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_grounding_passes_value_on_page() -> None:
    """A price that appears on the page produces no verdict."""
    page = _ctx({"html": "<span class='price'>$9.00</span>"})
    extracted = Result(status=Status.SUCCESS, value={"price": "$9.00"})
    assert GroundingValidator().check(extracted, page) is None
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def test_grounding_checks_numeric_values() -> None:
    # A fabricated *number* must not pass ungrounded (the old code skipped
    # every non-string value, so invented prices/counts sailed through).
    page = _ctx({"html": "<span>in stock: 7</span>"})

    fabricated = Result(status=Status.SUCCESS, value={"stock": 4096})
    assert GroundingValidator().check(fabricated, page) is not None

    genuine = Result(status=Status.SUCCESS, value={"stock": 7})
    assert GroundingValidator().check(genuine, page) is None
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_grounding_normalizes_whitespace() -> None:
    # Whitespace/case differences between the value and the page must not fail.
    page = _ctx({"html": "<h1>hello world</h1>"})
    extracted = Result(status=Status.SUCCESS, value={"title": "Hello World"})
    verdict = GroundingValidator().check(extracted, page)
    assert verdict is None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def test_grounding_rejects_short_value_inside_larger_token() -> None:
    rating = {"rating": 5}

    # Token-boundary precision: "5" must NOT be grounded by "1985" (plain
    # substring matching would have let the fabricated rating pass).
    year_only = _ctx({"html": "<p>The product launched in 1985.</p>"})
    rejected = GroundingValidator().check(Result(status=Status.SUCCESS, value=rating), year_only)
    assert rejected is not None

    # A genuinely standalone "5" on the page still grounds.
    with_rating = _ctx({"html": "<p>Rated 5 stars by 1985 reviewers.</p>"})
    accepted = GroundingValidator().check(Result(status=Status.SUCCESS, value=rating), with_rating)
    assert accepted is None
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def test_grounding_rejects_value_inside_a_decimal() -> None:
    def verdict_for(value: dict, ctx: RunContext):
        """Run the grounding check for ``value`` against ``ctx``."""
        return GroundingValidator().check(Result(status=Status.SUCCESS, value=value), ctx)

    # A decimal point is a token boundary for *words*, but a fabricated number
    # must not be grounded by a fragment of a larger number: neither "14" nor
    # "3" is on a page that only says "$3.14".
    price_page = _ctx({"html": "<span class='price'>$3.14</span>"})
    assert verdict_for({"n": 14}, price_page) is not None
    assert verdict_for({"n": 3}, price_page) is not None

    # The whole decimal still grounds, and so does an integer that a dot merely
    # ends a sentence after (no digit on the dot's outer side).
    assert verdict_for({"p": "3.14"}, price_page) is None
    quantity_page = _ctx({"html": "<p>Qty: 5.</p>"})
    assert verdict_for({"q": 5}, quantity_page) is None
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def test_grounding_rejects_value_inside_a_compound_token() -> None:
    # A short number must not be grounded by a fragment of a date/version/time/
    # SKU/phone joined by - / : — "12" is not on a page that only says "12-2024".
    fragment_cases = (
        ("<p>Released 12-2024 worldwide.</p>", {"m": 12}),
        ("<p>Build 4/19 shipped.</p>", {"b": 19}),
        ("<p>Starts at 12:30 sharp.</p>", {"t": 30}),
    )
    for html, value in fragment_cases:
        page = _ctx({"html": html})
        verdict = GroundingValidator().check(Result(status=Status.SUCCESS, value=value), page)
        assert verdict is not None

    # A genuinely standalone number whose neighbouring separator has no digit
    # beyond it (e.g. a slash ending a path segment) still grounds.
    link_page = _ctx({"html": "<a href='/items/42/'>item</a>"})
    item_id = Result(status=Status.SUCCESS, value={"id": 42})
    assert GroundingValidator().check(item_id, link_page) is None
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def test_grounding_is_unicode_token_aware() -> None:
    # The flank check is Unicode-aware (str.isalnum), so a value is not
    # grounded as a fragment of a non-ASCII word.
    page = _ctx({"html": "<p>café société</p>"})

    fragment = Result(status=Status.SUCCESS, value={"w": "caf"})
    assert GroundingValidator().check(fragment, page) is not None

    whole_word = Result(status=Status.SUCCESS, value={"w": "café"})
    assert GroundingValidator().check(whole_word, page) is None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def test_schema_error_is_terminal_not_a_crash() -> None:
    # An invalid output_schema (from the TaskSpec) raises jsonschema.SchemaError
    # internally; the validator must turn it into a TERMINAL verdict and never
    # let it escape and crash the validation ladder.
    broken_spec = TaskSpec(query="x", output_schema={"type": "not-a-real-type"})
    context = RunContext(spec=broken_spec, observation=None)
    candidate = Result(status=Status.SUCCESS, value={"a": 1})

    verdict = SchemaValidator().check(candidate, context)

    assert verdict is not None
    assert verdict.severity == Severity.TERMINAL
    assert "invalid output_schema" in (verdict.detail or "")
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def test_unresolvable_ref_is_terminal_not_a_crash() -> None:
    # A schema with an unresolvable $ref raises a *referencing* error that is
    # NOT a subclass of jsonschema.SchemaError — it would escape the old
    # handlers and crash the ladder. It must become a TERMINAL verdict like any
    # other broken schema, since the output_schema is untrusted TaskSpec input.
    unresolvable_schemas = [{"$ref": "#/nope"}, {"$ref": "http://evil.example/x"}]
    for bad_schema in unresolvable_schemas:
        context = RunContext(
            spec=TaskSpec(query="x", output_schema=bad_schema),
            observation=None,
        )
        candidate = Result(status=Status.SUCCESS, value={"a": 1})
        verdict = SchemaValidator().check(candidate, context)
        assert verdict is not None
        assert verdict.severity == Severity.TERMINAL
        assert "invalid output_schema" in (verdict.detail or "")
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# --- grounding against the real event log, inside the loop -------------------
|
|
154
|
+
|
|
155
|
+
# The page the fake fetch tool serves: it carries exactly one price, $9.00.
_PAGE = (
    "<html><body>"
    "<span class='price'>$9.00</span>"
    "</body></html>"
)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _loop_registry() -> Registry:
    """Build a registry with a fake fetch tool and both validators."""
    registry = Registry()
    entries = (
        ("tools", "http_fetch", fetch_tool(text=_PAGE)),
        ("validators", "schema", SchemaValidator()),
        ("validators", "grounding", GroundingValidator()),
    )
    for group, key, plugin in entries:
        registry.register(group, key, plugin)
    return registry
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
async def test_grounding_in_loop_passes_value_from_event_log() -> None:
    # At finalise the loop passes no observation, so grounding must read the
    # price from the data.source.fetched event — the step-6 "against the event
    # log" promise, end to end.
    moves = [
        {"tool": "http_fetch", "args": {"url": "http://x.test/"}},
        {"text": '{"price": "$9.00"}', "finish": "stop"},
    ]
    provider = ScriptedProvider.from_moves(moves)
    bus = EventBus()
    spec = TaskSpec(query="price", output_schema=_SCHEMA)

    result = await run_task(spec, provider, _loop_registry(), bus)

    assert result.status == Status.SUCCESS
    assert result.value == {"price": "$9.00"}
    logged = await bus.query()
    event_types = [event.type for event in logged]
    assert "data.source.fetched" in event_types  # grounding had the log to read
    assert "harness.validation.failed" not in event_types
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
async def test_grounding_in_loop_rejects_fabrication_then_accepts_correction() -> None:
    # A price that is nowhere on the page fails grounding (RETRY); the loop
    # feeds the failure back and the corrected, grounded value then succeeds.
    moves = [
        {"tool": "http_fetch", "args": {"url": "http://x.test/"}},
        {"text": '{"price": "$1000.00"}', "finish": "stop"},  # not on page -> RETRY
        {"text": '{"price": "$9.00"}', "finish": "stop"},  # grounded -> SUCCESS
    ]
    provider = ScriptedProvider.from_moves(moves)
    bus = EventBus()
    spec = TaskSpec(query="price", output_schema=_SCHEMA)

    result = await run_task(spec, provider, _loop_registry(), bus)

    assert result.status == Status.SUCCESS
    assert result.value == {"price": "$9.00"}
    logged = await bus.query()
    failures = [event for event in logged if event.type == "harness.validation.failed"]
    assert failures
    assert failures[0].payload["detector"] == "grounding"
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
async def test_grounding_corpus_ignores_the_models_own_text() -> None:
    # The model's output is recorded on harness.turn.completed (the live "train
    # of thought"). Grounding must NOT treat that as retrieved content, or a
    # model could ground a fabrication simply by emitting it. Only the fetched
    # page (which here does NOT contain the price) counts — so the value fails.
    moves = [
        {"tool": "http_fetch", "args": {"url": "http://x.test/"}},
        {"text": '{"price": "$1000.00"}', "finish": "stop"},  # spoken, not on the page
        {"text": '{"price": "$1000.00"}', "finish": "stop"},  # repeated — still ungrounded
    ]
    provider = ScriptedProvider.from_moves(moves)
    bus = EventBus()
    spec = TaskSpec(query="price", output_schema=_SCHEMA)

    result = await run_task(spec, provider, _loop_registry(), bus)

    # Never succeeds: the price only ever appears in the model's own text.
    assert result.status != Status.SUCCESS
    logged = await bus.query()
    failures = [event for event in logged if event.type == "harness.validation.failed"]
    assert any(event.payload["detector"] == "grounding" for event in failures)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def test_validators_discoverable() -> None:
    """Both built-in validator names are listed after discovery."""
    registry = Registry()
    registry.discover()
    expected = ("schema", "grounding")
    for validator_name in expected:
        assert validator_name in registry.names("validators")
|