zu-checks 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zu_checks/__init__.py +13 -0
- zu_checks/detectors/__init__.py +37 -0
- zu_checks/detectors/bot_wall.py +54 -0
- zu_checks/detectors/embedded_widget.py +97 -0
- zu_checks/detectors/empty.py +32 -0
- zu_checks/detectors/error.py +25 -0
- zu_checks/detectors/js_shell.py +75 -0
- zu_checks/validators/__init__.py +7 -0
- zu_checks/validators/grounding.py +162 -0
- zu_checks/validators/schema.py +41 -0
- zu_checks-0.1.0.dist-info/METADATA +18 -0
- zu_checks-0.1.0.dist-info/RECORD +14 -0
- zu_checks-0.1.0.dist-info/WHEEL +4 -0
- zu_checks-0.1.0.dist-info/entry_points.txt +10 -0
zu_checks/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Zu built-in checks — the two stdlib plugin kinds that ship with the base.
|
|
2
|
+
|
|
3
|
+
* ``zu_checks.detectors`` — observation-time detectors whose Verdict severities
|
|
4
|
+
drive the loop (ESCALATE climbs the tier ladder; TERMINAL ends the run).
|
|
5
|
+
* ``zu_checks.validators`` — on-final result checks (schema shape + grounding,
|
|
6
|
+
the anti-hallucination provenance check).
|
|
7
|
+
|
|
8
|
+
They live in one package because both are pure-stdlib (the schema validator adds
|
|
9
|
+
only ``jsonschema``) and always present in the base runtime — unlike the adapter
|
|
10
|
+
packages (providers/tools/backends) whose separation carries distinct heavy
|
|
11
|
+
optional dependencies. They register through the same ``zu.detectors`` /
|
|
12
|
+
``zu.validators`` entry-point groups any third-party check would.
|
|
13
|
+
"""
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Zu built-in detectors.
|
|
2
|
+
|
|
3
|
+
A detector inspects an observation and may return a Verdict. Verdict
|
|
4
|
+
severities (WARN, RETRY, ESCALATE, TERMINAL) map onto the loop's control flow:
|
|
5
|
+
ESCALATE is the deterministic signal that climbs the tier ladder. Detectors
|
|
6
|
+
are where escalation is decided — never improvised by the model.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# What counts as page content in an observation, in preference order. The loop
|
|
11
|
+
# stores a fetched/rendered page under one of these keys (mirrors zu_core.loop's
|
|
12
|
+
# own ``_CONTENT_KEYS``); a detector must consult all of them or it goes blind to
|
|
13
|
+
# a tool that returns ``{"text": ...}`` / ``{"content": ...}`` instead of html.
|
|
14
|
+
# One source of truth, reused by ``empty`` too.
|
|
15
|
+
_CONTENT_KEYS = ("html", "text", "content")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _html_of(ctx) -> str:
|
|
19
|
+
"""Best-effort extraction of the page content from a RunContext observation.
|
|
20
|
+
|
|
21
|
+
Concatenates *every* present content key (html, text, content) rather than
|
|
22
|
+
returning only the first, so a marker detector is never blind to a tool that
|
|
23
|
+
splits content across keys — the same all-keys view the ``empty`` detector
|
|
24
|
+
uses, so the detectors agree on what "the content" is."""
|
|
25
|
+
obs = getattr(ctx, "observation", None)
|
|
26
|
+
if isinstance(obs, dict):
|
|
27
|
+
parts = [v for k in _CONTENT_KEYS if isinstance(v := obs.get(k), str) and v]
|
|
28
|
+
if parts:
|
|
29
|
+
return "\n".join(parts)
|
|
30
|
+
return ""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _contains_any(html: str, markers) -> bool:
|
|
34
|
+
"""True if any marker (case-insensitive) appears in ``html`` — the shared
|
|
35
|
+
substring scan behind the marker-list detectors (bot-wall, js-shell)."""
|
|
36
|
+
lowered = html.lower()
|
|
37
|
+
return any(marker in lowered for marker in markers)
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""bot-wall — fires on an anti-bot interstitial (Cloudflare, captcha, etc.)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from zu_core.ports import RunContext, Scope, Severity, Verdict
|
|
6
|
+
|
|
7
|
+
from . import _contains_any, _html_of
|
|
8
|
+
|
|
9
|
+
# Strong markers: phrasing characteristic of an anti-bot interstitial, specific
|
|
10
|
+
# enough that their presence is treated as the signal on its own. This is a
|
|
11
|
+
# deterministic heuristic, not a proof: a page that *discusses* CAPTCHAs (a news
|
|
12
|
+
# story, this very comment) can contain "captcha" and would escalate — the cost
|
|
13
|
+
# is a wasted tier-2 render, not a wrong answer, and escalating a borderline page
|
|
14
|
+
# is the safer failure. ``cf-browser-verification`` is unambiguous; the natural-
|
|
15
|
+
# language phrases are the ones with residual false-positive surface.
|
|
16
|
+
_STRONG_MARKERS = (
|
|
17
|
+
"captcha",
|
|
18
|
+
"are you a robot",
|
|
19
|
+
"verify you are human",
|
|
20
|
+
"cf-browser-verification",
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
# Weak markers: real Cloudflare wall phrasing, but common-enough English that a
|
|
24
|
+
# substring match alone false-positives (an article titled "Just a Moment in
|
|
25
|
+
# History", a banner reading "Attention required"). They fire ONLY when a
|
|
26
|
+
# Cloudflare fingerprint is also present, so a normal page is never escalated.
|
|
27
|
+
_WEAK_MARKERS = (
|
|
28
|
+
"attention required",
|
|
29
|
+
"just a moment",
|
|
30
|
+
)
|
|
31
|
+
_CLOUDFLARE_FINGERPRINTS = (
|
|
32
|
+
"cloudflare",
|
|
33
|
+
"cf-ray",
|
|
34
|
+
"cf-browser-verification",
|
|
35
|
+
"__cf",
|
|
36
|
+
"/cdn-cgi/",
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class BotWallDetector:
    """Per-observation detector that escalates when the page looks like an
    anti-bot interstitial.

    A page is flagged when it contains any strong marker, or when it contains
    a weak marker together with a Cloudflare fingerprint.
    """

    name = "bot-wall"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return an ESCALATE verdict for an anti-bot wall, otherwise None."""
        page = _html_of(ctx)
        if not _contains_any(page, _STRONG_MARKERS):
            # Weak phrases are ordinary English on their own; they only count
            # when a Cloudflare fingerprint is present on the same page.
            cloudflare_wall = _contains_any(page, _WEAK_MARKERS) and _contains_any(
                page, _CLOUDFLARE_FINGERPRINTS
            )
            if not cloudflare_wall:
                return None
        return Verdict(
            severity=Severity.ESCALATE,
            detector=self.name,
            detail="anti-bot wall detected",
        )
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""embedded-widget — fires when the page's real content is inside a JS widget.
|
|
2
|
+
|
|
3
|
+
The complement to ``js-shell``. ``js-shell`` catches an *empty* SPA shell (a
|
|
4
|
+
``<div id="root">`` with no visible text). But a page can be full of human-visible
|
|
5
|
+
chrome — nav, footer, copy — while the data the task actually needs (appointment
|
|
6
|
+
slots, a price table, a seat map) is rendered by an **embedded third-party widget
|
|
7
|
+
or iframe** that loads via JavaScript. A tier-1 ``http_fetch`` sees the chrome and
|
|
8
|
+
the empty mount point, never the data, so it would loop forever or give up. This
|
|
9
|
+
detector is the deterministic signal to *offer* the browser (tier 2) in that case.
|
|
10
|
+
|
|
11
|
+
It is conservative about what counts as a content widget, to avoid escalating on
|
|
12
|
+
ubiquitous analytics/ad scripts:
|
|
13
|
+
|
|
14
|
+
* an ``<iframe>`` with an external ``http(s)`` ``src`` — an embedded application
|
|
15
|
+
whose content is not in this DOM; or
|
|
16
|
+
* a **widget mount point** — an element whose *attributes* (id/class/data-*/domain)
|
|
17
|
+
name a content widget (``widget``, ``embed``, ``scheduler``, or a known booking
|
|
18
|
+
vendor) — together with an external ``<script>`` that fills it.
|
|
19
|
+
|
|
20
|
+
ESCALATE only *unlocks* the browser; the model renders only if it still lacks the
|
|
21
|
+
data, so being a touch generous here is cheap and fail-safe.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import re
|
|
27
|
+
|
|
28
|
+
from zu_core.ports import RunContext, Scope, Severity, Verdict
|
|
29
|
+
|
|
30
|
+
from . import _html_of
|
|
31
|
+
|
|
32
|
+
# Tokens that, when they appear in an element's ATTRIBUTES (not visible text),
|
|
33
|
+
# mark a JS content-widget mount. Generic structural words plus a few common
|
|
34
|
+
# booking/scheduling vendors — kept to attribute context so a nav link like
|
|
35
|
+
# href="/book-an-appointment" or body copy never trips it.
|
|
36
|
+
_WIDGET_TOKENS = (
|
|
37
|
+
"widget", "embed", "scheduler", "data-widget",
|
|
38
|
+
"vetstoria", "oabp", "calendly", "acuityscheduling", "simplybook", "petsapp",
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# An <iframe ...> carrying an external http(s) src — an embedded app.
|
|
42
|
+
_IFRAME_SRC = re.compile(r"<iframe\b[^>]*\bsrc\s*=\s*[\"']https?://", re.IGNORECASE)
|
|
43
|
+
# Any element's attribute span, to scan for a widget token in attribute context.
|
|
44
|
+
_TAG_ATTRS = re.compile(r"<[a-zA-Z][a-zA-Z0-9]*\b([^>]*)>")
|
|
45
|
+
# An external <script src="http(s)://..."> — the loader that fills a mount point.
|
|
46
|
+
_EXTERNAL_SCRIPT = re.compile(r"<script\b[^>]*\bsrc\s*=\s*[\"']https?://", re.IGNORECASE)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _has_widget_mount(html: str) -> bool:
    """Return True if any tag's attribute span contains a widget token.

    Each match of ``_TAG_ATTRS`` contributes its captured attribute text,
    lowercased; the result is True as soon as one of ``_WIDGET_TOKENS`` occurs
    as a substring of one of those spans. Text outside tags is never examined.
    """
    attribute_spans = (tag.group(1).lower() for tag in _TAG_ATTRS.finditer(html))
    return any(
        token in span
        for span in attribute_spans
        for token in _WIDGET_TOKENS
    )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _already_escalated(ctx: RunContext) -> bool:
|
|
59
|
+
"""True if the run has already escalated (or browser-rendered) this run.
|
|
60
|
+
|
|
61
|
+
This detector is an escalation *trigger*: its job is to unlock the browser
|
|
62
|
+
tier once. After that it must go quiet — every later widget page (another
|
|
63
|
+
http_fetch, or the rendered DOM, which still carries the markers) would
|
|
64
|
+
otherwise re-fire, and at the top tier a re-escalation is 'exhausted' and ENDS
|
|
65
|
+
the run before the model can use the browser it just unlocked. So: fire once,
|
|
66
|
+
then defer to the model working at the higher tier."""
|
|
67
|
+
for ev in getattr(ctx, "events", []) or []:
|
|
68
|
+
et = getattr(ev, "type", "")
|
|
69
|
+
if et == "harness.task.escalated":
|
|
70
|
+
return True
|
|
71
|
+
if et == "data.source.fetched" and getattr(ev, "source", "") == "render_dom":
|
|
72
|
+
return True
|
|
73
|
+
return False
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class EmbeddedWidgetDetector:
    """Per-observation detector that escalates when a page defers its content
    to an external iframe or to a script-filled widget mount point."""

    name = "embedded-widget"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return an ESCALATE verdict for an embedded widget/iframe page.

        Returns None when there is no page content, when the run has already
        escalated or rendered, or when neither widget signal is present.
        """
        page = _html_of(ctx)
        # Nothing to inspect, or the browser is already unlocked: stay quiet.
        if not page or _already_escalated(ctx):
            return None
        has_external_iframe = _IFRAME_SRC.search(page) is not None
        # A named mount point only counts alongside an external script that
        # could fill it; a bare widget-ish attribute on a static page does not.
        mount_with_loader = (
            _has_widget_mount(page) and _EXTERNAL_SCRIPT.search(page) is not None
        )
        if not (has_external_iframe or mount_with_loader):
            return None
        return Verdict(
            severity=Severity.ESCALATE,
            detector=self.name,
            detail="page defers content to an embedded widget/iframe; "
            "escalate to a browser to render it",
        )
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""empty — fires when a *fetched page* carried no usable content.
|
|
2
|
+
|
|
3
|
+
Scoped to page-content observations on purpose: it judges a fetch (a tool that
|
|
4
|
+
returned ``html``/``text``/``content``) and escalates when that content is empty
|
|
5
|
+
— the signal to climb to a browser. It must NOT fire on observations that are not
|
|
6
|
+
page fetches — e.g. ``html_parse`` returning ``{"matches": [...]}`` (a successful
|
|
7
|
+
extraction) or an error observation — or it would spuriously escalate after real
|
|
8
|
+
work. So: a content key present but blank -> escalate; no content key -> not our
|
|
9
|
+
concern (return None).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from zu_core.ports import RunContext, Scope, Severity, Verdict
|
|
15
|
+
|
|
16
|
+
from . import _CONTENT_KEYS # one source of truth for "what counts as page content"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class EmptyDetector:
    """Per-observation detector that escalates when a page-content observation
    carries content keys but every one of them is blank."""

    name = "empty"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return an ESCALATE verdict when all present content keys are blank.

        Returns None for a non-dict observation, for an observation with none
        of the ``_CONTENT_KEYS`` (not a page fetch, so "empty" does not apply),
        and for an observation where at least one content key has non-blank
        text.
        """
        observation = getattr(ctx, "observation", None)
        if not isinstance(observation, dict):
            return None
        saw_content_key = False
        for key in _CONTENT_KEYS:
            if key not in observation:
                continue
            saw_content_key = True
            # A falsy value (None, "", 0) is treated as blank; anything else is
            # stringified and must contain more than whitespace.
            if str(observation.get(key) or "").strip():
                return None
        if not saw_content_key:
            return None
        return Verdict(severity=Severity.ESCALATE, detector=self.name, detail="empty observation")
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""error — fires on an HTTP error status in the observation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from zu_core.ports import RunContext, Scope, Severity, Verdict
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ErrorDetector:
    """Per-observation detector that reports an HTTP error status as RETRY."""

    name = "error"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return a RETRY verdict when the observation's ``status`` is an int
        of 400 or more; otherwise None.

        The severity is RETRY rather than TERMINAL on purpose: one bad url (a
        403, a 404, a 5xx) says nothing about whether the run as a whole can
        succeed, so the error is recorded and fed back and the agent stays free
        to try another candidate. A run that truly cannot proceed still ends
        through the step/token budget.
        """
        observation = getattr(ctx, "observation", None)
        if not isinstance(observation, dict):
            return None
        code = observation.get("status")
        # Only integer statuses are judged; a missing or non-int status is not
        # treated as an error.
        if not isinstance(code, int) or code < 400:
            return None
        return Verdict(severity=Severity.RETRY, detector=self.name, detail=f"http {code}")
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""js-shell — fires when a page is an empty JavaScript shell.
|
|
2
|
+
|
|
3
|
+
The canonical escalation trigger: tier-1 http_fetch returns HTML that is
|
|
4
|
+
essentially a <div id="root"></div> plus scripts, with no real text content.
|
|
5
|
+
That is the signal to give up on the cheap tier and climb to a browser.
|
|
6
|
+
|
|
7
|
+
The test is structural, not size-based: a page is a shell when it has a known
|
|
8
|
+
SPA mount point *and* almost no human-visible text once scripts and styles are
|
|
9
|
+
removed. Measuring visible text (rather than raw HTML length) is what step 5
|
|
10
|
+
finalizes — a shell padded with a large inline bundle is still a shell, and a
|
|
11
|
+
small page that happens to be real content is not escalated.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import re
|
|
17
|
+
|
|
18
|
+
from zu_core.ports import RunContext, Scope, Severity, Verdict
|
|
19
|
+
|
|
20
|
+
from . import _contains_any, _html_of
|
|
21
|
+
|
|
22
|
+
# Common SPA mount points / framework markers.
|
|
23
|
+
_SHELL_MARKERS = ('id="root"', "id='root'", 'id="app"', "id='app'", "__NEXT_DATA__")
|
|
24
|
+
|
|
25
|
+
# Strip the elements whose contents are never visible text before measuring.
|
|
26
|
+
# ``\s*`` in the close tag tolerates ``</script >``; the second pattern handles
|
|
27
|
+
# an *unterminated* script/style — a browser treats everything after an unclosed
|
|
28
|
+
# <script> as script text, so the heuristic does too (consume to end of input).
|
|
29
|
+
# HTML comments are removed FIRST so a commented-out ``<!-- <script> -->`` (or
|
|
30
|
+
# any literal ``<script`` inside a comment) can't trip the greedy _UNCLOSED rule
|
|
31
|
+
# and erase the real article body after it — a deterministic false-positive the
|
|
32
|
+
# unbalanced-tag heuristic would otherwise produce.
|
|
33
|
+
_COMMENT = re.compile(r"<!--.*?-->", re.DOTALL)
|
|
34
|
+
_NONVISIBLE = re.compile(r"<(script|style|template|noscript)\b.*?</\1\s*>", re.IGNORECASE | re.DOTALL)
|
|
35
|
+
_UNCLOSED = re.compile(r"<(script|style|template|noscript)\b.*\Z", re.IGNORECASE | re.DOTALL)
|
|
36
|
+
_TAGS = re.compile(r"<[^>]+>")
|
|
37
|
+
_WS = re.compile(r"\s+")
|
|
38
|
+
|
|
39
|
+
# Below this many characters of visible text, a page with a mount point is
|
|
40
|
+
# treated as an unrendered shell. Tuned against the graded fixture set.
|
|
41
|
+
_MIN_VISIBLE_TEXT = 64
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _visible_text(html: str) -> str:
    """Approximate the human-visible text of ``html``.

    Applies the module's patterns in a fixed order, replacing each match with
    a single space: comments, closed script/style/template/noscript elements,
    an unterminated one running to end of input, then any remaining tags.
    Whitespace runs are finally collapsed to one space and the ends trimmed.

    Comments must go first so a literal ``<script`` inside a comment cannot
    trigger the unterminated-element rule and swallow the text after it.
    """
    remaining = html
    for pattern in (_COMMENT, _NONVISIBLE, _UNCLOSED, _TAGS):
        remaining = pattern.sub(" ", remaining)
    return _WS.sub(" ", remaining).strip()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class JsShellDetector:
    """Per-observation detector that escalates when a page looks like an
    unrendered JavaScript shell: a mount-point marker, a script signal, and
    almost no visible text."""

    name = "js-shell"
    scope = Scope.PER_OBSERVATION

    def inspect(self, ctx: RunContext) -> Verdict | None:
        """Return an ESCALATE verdict for a JS shell page, otherwise None.

        All three conditions must hold: a ``_SHELL_MARKERS`` hit, a
        ``<script`` or ``modulepreload`` occurrence, and visible text shorter
        than ``_MIN_VISIBLE_TEXT`` characters.
        """
        page = _html_of(ctx)
        if not page:
            return None
        if not _contains_any(page, _SHELL_MARKERS):
            return None
        page_lower = page.lower()
        # The page hands its content to JS either through a literal <script>
        # or through a module graph pulled in by <link rel="modulepreload">.
        if "<script" not in page_lower and "modulepreload" not in page_lower:
            return None
        # Measure visible text, not raw length: a shell padded with a large
        # inline bundle is still a shell.
        if len(_visible_text(page)) >= _MIN_VISIBLE_TEXT:
            return None
        return Verdict(
            severity=Severity.ESCALATE,
            detector=self.name,
            detail="page appears to be a JS shell; escalate to a browser",
        )
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""Zu built-in validators — the on-final checks of the result.
|
|
2
|
+
|
|
3
|
+
The two cheapest rungs of the validation ladder: schema (does the result fit
|
|
4
|
+
the requested shape?) and grounding (does every extracted value actually
|
|
5
|
+
appear in retrieved content?). Grounding is the anti-hallucination check — the
|
|
6
|
+
core of the "agents that actually work" claim.
|
|
7
|
+
"""
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""grounding — every extracted value must appear in retrieved content.
|
|
2
|
+
|
|
3
|
+
The anti-making-things-up check: a value the agent reports that is nowhere in
|
|
4
|
+
the content the run actually fetched fails grounding. It reads the run's
|
|
5
|
+
content from the event log via RunContext, so it proves provenance, not just
|
|
6
|
+
plausibility.
|
|
7
|
+
|
|
8
|
+
Matching is token-boundary-aware (build step 6): a value must appear in the
|
|
9
|
+
retrieved content as a standalone token, not merely as a substring, so a short
|
|
10
|
+
value such as ``"5"`` is not spuriously grounded by ``"1985"``.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from collections.abc import Iterator
|
|
16
|
+
|
|
17
|
+
from zu_core.contracts import Result
|
|
18
|
+
from zu_core.ports import RunContext, Severity, Verdict
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _normalize(s: str) -> str:
|
|
22
|
+
"""Collapse whitespace and lowercase so trivial formatting differences
|
|
23
|
+
between an extracted value and the page text don't cause false failures."""
|
|
24
|
+
return " ".join(s.split()).lower()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _grounded(leaf_norm: str, corpus: str) -> bool:
|
|
28
|
+
"""Is the normalized value present in the corpus on token boundaries?
|
|
29
|
+
|
|
30
|
+
Plain substring containment is too lenient: a short value like ``"5"`` would
|
|
31
|
+
match incidentally inside ``"1985"`` and let a fabricated number pass. We
|
|
32
|
+
require the value to appear as a standalone token, not a fragment of a longer
|
|
33
|
+
one, on two axes:
|
|
34
|
+
|
|
35
|
+
- **Alphanumeric flanks** (Unicode-aware via ``str.isalnum``): ``"5"`` inside
|
|
36
|
+
``"1985"`` or ``"caf"`` inside ``"café"`` does not ground, while ``"$9.00"``
|
|
37
|
+
between ``>`` and ``<`` still does — punctuation is a boundary.
|
|
38
|
+
- **Number fragments across a decimal/thousands separator**: a ``.`` or ``,``
|
|
39
|
+
flanked by a digit on the *outer* side means the value is part of a larger
|
|
40
|
+
number, so ``"14"`` is not grounded by ``"3.14"`` nor ``"3"`` by ``"3.14"``
|
|
41
|
+
— but ``"5"`` in ``"Qty: 5."`` (the dot ends a sentence) still grounds.
|
|
42
|
+
"""
|
|
43
|
+
if not leaf_norm:
|
|
44
|
+
# An empty normalized value has no provenance to prove, so it is NOT
|
|
45
|
+
# grounded — fail safe rather than free-pass. ``_leaf_strings`` already
|
|
46
|
+
# drops empty/whitespace leaves upstream, so this is reached only if a
|
|
47
|
+
# non-empty value normalizes to nothing; treating that as ungrounded
|
|
48
|
+
# keeps "I said nothing" from passing the anti-fabrication gate.
|
|
49
|
+
return False
|
|
50
|
+
n = len(leaf_norm)
|
|
51
|
+
start = 0
|
|
52
|
+
while True:
|
|
53
|
+
i = corpus.find(leaf_norm, start)
|
|
54
|
+
if i == -1:
|
|
55
|
+
return False
|
|
56
|
+
if _standalone(corpus, i, i + n):
|
|
57
|
+
return True
|
|
58
|
+
start = i + 1
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Separators that join a number to more digits to form a single larger value or
|
|
62
|
+
# a compound numeric token: decimal/thousands (``.`` ``,``) AND the connectors in
|
|
63
|
+
# dates, versions, times, ranges, SKUs and phone numbers (``-`` ``/`` ``:``). A
|
|
64
|
+
# match flanked by one of these with a digit on its *outer* side is a fragment of
|
|
65
|
+
# a longer token, not a standalone value — so "12" is not grounded by "12-2024",
|
|
66
|
+
# nor "30" by "12:30", just as "14" is not grounded by "3.14".
|
|
67
|
+
_NUM_SEPARATORS = frozenset(".,-/:")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _standalone(corpus: str, lo: int, hi: int) -> bool:
|
|
71
|
+
"""Are the chars flanking ``corpus[lo:hi]`` token boundaries, not part of a
|
|
72
|
+
longer alphanumeric token or a larger/compound number?"""
|
|
73
|
+
before = corpus[lo - 1] if lo > 0 else ""
|
|
74
|
+
after = corpus[hi] if hi < len(corpus) else ""
|
|
75
|
+
if before.isalnum() or after.isalnum():
|
|
76
|
+
return False
|
|
77
|
+
# A numeric separator adjacent to a digit on its outer side means this match
|
|
78
|
+
# is a slice of a larger number or compound token (e.g. "14" inside "3.14",
|
|
79
|
+
# "12" inside "12-2024", "30" inside "12:30").
|
|
80
|
+
if before in _NUM_SEPARATORS and corpus[lo - 2 : lo - 1].isdigit():
|
|
81
|
+
return False
|
|
82
|
+
if after in _NUM_SEPARATORS and corpus[hi + 1 : hi + 2].isdigit():
|
|
83
|
+
return False
|
|
84
|
+
return True
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _leaf_strings(value: object) -> Iterator[str]:
|
|
88
|
+
"""Yield every scalar leaf of a result value as a string to ground.
|
|
89
|
+
|
|
90
|
+
Numbers and booleans are real extracted values too — skipping non-strings
|
|
91
|
+
(the previous behaviour) let a fabricated price or count pass ungrounded.
|
|
92
|
+
bool is checked before int because ``isinstance(True, int)`` is True, and a
|
|
93
|
+
boolean is not groundable page text.
|
|
94
|
+
"""
|
|
95
|
+
if isinstance(value, bool):
|
|
96
|
+
return
|
|
97
|
+
if isinstance(value, (str, int, float)):
|
|
98
|
+
text = str(value).strip()
|
|
99
|
+
if text:
|
|
100
|
+
yield text
|
|
101
|
+
elif isinstance(value, dict):
|
|
102
|
+
for v in value.values():
|
|
103
|
+
yield from _leaf_strings(v)
|
|
104
|
+
elif isinstance(value, (list, tuple)):
|
|
105
|
+
for v in value:
|
|
106
|
+
yield from _leaf_strings(v)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _retrieved_corpus(ctx: RunContext) -> str:
|
|
110
|
+
"""Concatenate everything the run fetched, from data.source.fetched events.
|
|
111
|
+
|
|
112
|
+
Falls back to the current observation when the event log isn't populated
|
|
113
|
+
yet (the loop wires the full log in build step 4).
|
|
114
|
+
"""
|
|
115
|
+
chunks: list[str] = []
|
|
116
|
+
for ev in getattr(ctx, "events", []) or []:
|
|
117
|
+
# Only *retrieved* content grounds a value — i.e. data.source.fetched
|
|
118
|
+
# events. Reading text-like keys from any event would let the model
|
|
119
|
+
# ground its own fabrications: harness.turn.completed carries the model's
|
|
120
|
+
# output text, which must never count as evidence about the page.
|
|
121
|
+
if getattr(ev, "type", "") != "data.source.fetched":
|
|
122
|
+
continue
|
|
123
|
+
payload = getattr(ev, "payload", {}) or {}
|
|
124
|
+
for key in ("html", "text", "content"):
|
|
125
|
+
if isinstance(payload.get(key), str):
|
|
126
|
+
chunks.append(payload[key])
|
|
127
|
+
# Fall back to the current observation ONLY when the event log has no fetched
|
|
128
|
+
# content yet (the loop wires the full log in build step 4). If fetched events
|
|
129
|
+
# exist, we must not also fold in the raw observation: an observation that is
|
|
130
|
+
# not itself retrieved page content (e.g. a model-produced turn that happens
|
|
131
|
+
# to carry a ``text`` key) would reopen the self-grounding hole the event-type
|
|
132
|
+
# filter above exists to close.
|
|
133
|
+
if not chunks:
|
|
134
|
+
obs = getattr(ctx, "observation", None)
|
|
135
|
+
if isinstance(obs, dict):
|
|
136
|
+
for key in ("html", "text", "content"):
|
|
137
|
+
if isinstance(obs.get(key), str):
|
|
138
|
+
chunks.append(obs[key])
|
|
139
|
+
return "\n".join(chunks)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class GroundingValidator:
    """On-final validator: every scalar leaf of the result must appear, as a
    standalone token, in the content the run retrieved."""

    name = "grounding"

    def check(self, result: Result, ctx: RunContext) -> Verdict | None:
        """Return a RETRY verdict naming the first field with an ungrounded
        leaf, or None when every leaf is grounded or the result is empty.

        A dict result is checked field by field. Any other root (a list or a
        scalar) is checked as a single field reported under the name
        ``"value"``, so a non-object root never raises on a missing
        ``.items()``.
        """
        payload = result.value
        if not payload:
            return None
        haystack = _normalize(_retrieved_corpus(ctx))
        if isinstance(payload, dict):
            named_fields = payload.items()
        else:
            named_fields = [("value", payload)]
        for field_name, field_value in named_fields:
            has_ungrounded_leaf = any(
                not _grounded(_normalize(leaf), haystack)
                for leaf in _leaf_strings(field_value)
            )
            if has_ungrounded_leaf:
                return Verdict(
                    severity=Severity.RETRY,
                    detector=self.name,
                    detail=f"value for {field_name!r} not found in retrieved content",
                )
        return None
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""schema — the result must satisfy the task's output JSON schema."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import jsonschema
|
|
6
|
+
|
|
7
|
+
from zu_core.contracts import Result
|
|
8
|
+
from zu_core.ports import RunContext, Severity, Verdict
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SchemaValidator:
    """On-final validator: the result must satisfy the task's output JSON
    schema, read from ``ctx.spec.output_schema``."""

    name = "schema"

    def check(self, result: Result, ctx: RunContext) -> Verdict | None:
        """Validate ``result.value`` against the task's output schema.

        Returns None when no schema is set or the value validates. Returns a
        RETRY verdict when the value does not match the schema, and a TERMINAL
        verdict when validation fails for any other reason, which is treated
        as the schema itself being unusable.
        """
        schema = getattr(ctx.spec, "output_schema", None) or {}
        if not schema:
            return None  # nothing to check against

        try:
            jsonschema.validate(instance=result.value, schema=schema)
        except jsonschema.ValidationError as mismatch:
            # The data didn't match a valid schema, so a retry might fix it.
            # jsonschema errors carry ``.message``; fall back to ``str``.
            severity = Severity.RETRY
            detail = getattr(mismatch, "message", str(mismatch))
        except Exception as broken:  # noqa: BLE001 - a broken schema is terminal
            # The output_schema comes from the TaskSpec unvalidated. It may be
            # malformed (jsonschema.SchemaError) or hold an unresolvable
            # ``$ref``, which surfaces as a referencing error that is not a
            # SchemaError subclass. Retrying cannot repair a schema, so every
            # such failure is caught here and reported as terminal instead of
            # escaping into the validation ladder.
            severity = Severity.TERMINAL
            detail = f"invalid output_schema: {getattr(broken, 'message', str(broken))}"
        else:
            return None
        return Verdict(severity=severity, detector=self.name, detail=detail)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: zu-checks
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Zu built-in checks: detectors (empty, error, js-shell, embedded-widget, bot-wall) + validators (schema, grounding)
|
|
5
|
+
Project-URL: Homepage, https://github.com/k3-mt/zu
|
|
6
|
+
Project-URL: Repository, https://github.com/k3-mt/zu
|
|
7
|
+
License-Expression: Apache-2.0
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
15
|
+
Classifier: Typing :: Typed
|
|
16
|
+
Requires-Python: >=3.11
|
|
17
|
+
Requires-Dist: jsonschema>=4
|
|
18
|
+
Requires-Dist: zu-core==0.1.0
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
zu_checks/__init__.py,sha256=DFWxEdZBy4vqncg4IJUg9GxA3Q9tewAF8EyHsDXxGls,746
|
|
2
|
+
zu_checks/detectors/__init__.py,sha256=yBPvvUvk5yoHWHCsTu6NWrYFExO3PcMmkW61bhT7IVI,1667
|
|
3
|
+
zu_checks/detectors/bot_wall.py,sha256=KYH5WrNp4B-Auwn06PK7nG4SYXND6vbGJBc1Kx1KS20,1935
|
|
4
|
+
zu_checks/detectors/embedded_widget.py,sha256=NIMEQEX11gXxHGA0crMUBibCLIzaEQAYKcD6s3xTP4I,4458
|
|
5
|
+
zu_checks/detectors/empty.py,sha256=0eNF2ZZrX4UCVQ_Mp-bNd--q_ejhRFlZn9E5-avWM2s,1375
|
|
6
|
+
zu_checks/detectors/error.py,sha256=63ds67Vz84WLgl3ouqCpmATGPIYHkFnoOSMPuGpGT_E,1232
|
|
7
|
+
zu_checks/detectors/js_shell.py,sha256=N16KhXBB8Pl6QMLqiXhYUDn9pP0w_RowJuphBXbKmW8,3378
|
|
8
|
+
zu_checks/validators/__init__.py,sha256=UIB0gBGtPzR4Zc8hGRQ45R_RU7YrHqItE1b6nR7qg4g,348
|
|
9
|
+
zu_checks/validators/grounding.py,sha256=TaEE8oiE9f5mqEYt_qadxQUudg8v52IxZioODGXmuUs,7281
|
|
10
|
+
zu_checks/validators/schema.py,sha256=VVxM28vzLoA-Vd6Ytf4dT7xVN1aiQdILF-n7qcBm5PQ,1888
|
|
11
|
+
zu_checks-0.1.0.dist-info/METADATA,sha256=MS3oPArfyzBmma7Esb2DvXEJVrvI7AlsSxCs6VLsHQU,804
|
|
12
|
+
zu_checks-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
13
|
+
zu_checks-0.1.0.dist-info/entry_points.txt,sha256=nzsKOh8HkgUgURbDEmXcYnkVDOjCyWOEkxSnUTv0Ebc,432
|
|
14
|
+
zu_checks-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
[zu.detectors]
|
|
2
|
+
bot-wall = zu_checks.detectors.bot_wall:BotWallDetector
|
|
3
|
+
embedded-widget = zu_checks.detectors.embedded_widget:EmbeddedWidgetDetector
|
|
4
|
+
empty = zu_checks.detectors.empty:EmptyDetector
|
|
5
|
+
error = zu_checks.detectors.error:ErrorDetector
|
|
6
|
+
js-shell = zu_checks.detectors.js_shell:JsShellDetector
|
|
7
|
+
|
|
8
|
+
[zu.validators]
|
|
9
|
+
grounding = zu_checks.validators.grounding:GroundingValidator
|
|
10
|
+
schema = zu_checks.validators.schema:SchemaValidator
|