dos-kernel 0.22.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dos/__init__.py +261 -0
- dos/_bin/dos-hook.exe +0 -0
- dos/_filelock.py +255 -0
- dos/_job_policy.py +97 -0
- dos/_tree.py +145 -0
- dos/admission.py +433 -0
- dos/answer_shape.py +299 -0
- dos/arbiter.py +859 -0
- dos/archive_lock.py +266 -0
- dos/arg_provenance.py +814 -0
- dos/attest.py +472 -0
- dos/breaker.py +311 -0
- dos/churn.py +226 -0
- dos/claim_extract.py +229 -0
- dos/claim_ttl.py +150 -0
- dos/cli.py +8721 -0
- dos/commit_audit.py +666 -0
- dos/completion.py +466 -0
- dos/concurrency_class.py +154 -0
- dos/config.py +1380 -0
- dos/config_lint.py +464 -0
- dos/cooldown.py +390 -0
- dos/coverage.py +387 -0
- dos/dangling_intent.py +287 -0
- dos/data_class.py +397 -0
- dos/decisions.py +1274 -0
- dos/decisions_tui.py +251 -0
- dos/dispatch_top.py +740 -0
- dos/dispatch_top_tui.py +116 -0
- dos/drivers/__init__.py +40 -0
- dos/drivers/ci_status.py +630 -0
- dos/drivers/citation_resolve.py +703 -0
- dos/drivers/decision_stop.py +98 -0
- dos/drivers/export_file.py +173 -0
- dos/drivers/export_otlp.py +275 -0
- dos/drivers/export_statsd.py +242 -0
- dos/drivers/hook_dialects.py +391 -0
- dos/drivers/job.py +47 -0
- dos/drivers/llm_judge.py +360 -0
- dos/drivers/memory_recall.py +1231 -0
- dos/drivers/notify_slack.py +373 -0
- dos/drivers/notify_webhook.py +251 -0
- dos/drivers/operator_judge.py +114 -0
- dos/drivers/os_acceptance.py +228 -0
- dos/drivers/paste_log.py +132 -0
- dos/drivers/plan_scope.py +133 -0
- dos/drivers/self_improve.py +375 -0
- dos/drivers/similarity_judge.py +249 -0
- dos/drivers/state_diff.py +274 -0
- dos/drivers/supervisor.py +347 -0
- dos/drivers/watchdog.py +363 -0
- dos/drivers/workshop.py +160 -0
- dos/durable_schema.py +344 -0
- dos/effect_witness.py +393 -0
- dos/efficiency.py +318 -0
- dos/enforce.py +414 -0
- dos/enumerate.py +776 -0
- dos/env_print.py +378 -0
- dos/event_severity.py +258 -0
- dos/evidence.py +692 -0
- dos/exec_capability.py +256 -0
- dos/export_cursor.py +143 -0
- dos/exporter.py +320 -0
- dos/firing_label.py +353 -0
- dos/fleet_roll.py +226 -0
- dos/gate_classify.py +827 -0
- dos/gh4_coverage.py +179 -0
- dos/git_delta.py +122 -0
- dos/guard.py +215 -0
- dos/health.py +552 -0
- dos/help_summary.py +519 -0
- dos/home.py +934 -0
- dos/hook_binary.py +194 -0
- dos/hook_dialect.py +271 -0
- dos/hook_exit.py +191 -0
- dos/hook_install.py +437 -0
- dos/id_alloc.py +304 -0
- dos/improve.py +499 -0
- dos/intent_ledger.py +635 -0
- dos/interpret.py +176 -0
- dos/intervention.py +769 -0
- dos/intervention_eval.py +371 -0
- dos/journal_delta.py +308 -0
- dos/judge_eval.py +328 -0
- dos/judges.py +366 -0
- dos/lane_infer.py +127 -0
- dos/lane_journal.py +1001 -0
- dos/lane_lease.py +952 -0
- dos/lane_overlap.py +228 -0
- dos/lease_health.py +282 -0
- dos/lifecycle.py +211 -0
- dos/liveness.py +352 -0
- dos/lock_modes.py +185 -0
- dos/log_source.py +395 -0
- dos/loop_decide.py +1746 -0
- dos/marker_gate.py +254 -0
- dos/marker_sensor.py +396 -0
- dos/noop_streak.py +280 -0
- dos/notify.py +479 -0
- dos/observe.py +175 -0
- dos/oracle.py +1661 -0
- dos/overlap_eval.py +214 -0
- dos/overlap_policy.py +342 -0
- dos/packet_sidecar.py +267 -0
- dos/phase_shipped.py +1985 -0
- dos/pick_priority.py +225 -0
- dos/pickable.py +369 -0
- dos/picker_oracle.py +1037 -0
- dos/plan_board.py +513 -0
- dos/plan_board_tui.py +113 -0
- dos/plan_source.py +455 -0
- dos/posttool_sensor.py +528 -0
- dos/precursor_gate.py +499 -0
- dos/precursor_gate_eval.py +239 -0
- dos/preflight.py +825 -0
- dos/pretool_sensor.py +490 -0
- dos/proc_delta.py +181 -0
- dos/productivity.py +296 -0
- dos/provider_limit.py +242 -0
- dos/py.typed +4 -0
- dos/reason_morphology.py +299 -0
- dos/reasons.py +449 -0
- dos/reconcile.py +173 -0
- dos/recurring_wedge.py +206 -0
- dos/render.py +393 -0
- dos/result_state.py +468 -0
- dos/resume.py +578 -0
- dos/resume_evidence.py +293 -0
- dos/retention.py +344 -0
- dos/reward.py +372 -0
- dos/rewind.py +587 -0
- dos/rewind_evidence.py +168 -0
- dos/rewind_tokens.py +252 -0
- dos/run_id.py +342 -0
- dos/scope.py +520 -0
- dos/scope_source.py +382 -0
- dos/scout.py +982 -0
- dos/self_modify.py +209 -0
- dos/sibling_scan.py +569 -0
- dos/skills/EXAMPLES.md +584 -0
- dos/skills/dos-class-cycle/SKILL.md +107 -0
- dos/skills/dos-dispatch/SKILL.md +177 -0
- dos/skills/dos-dispatch-loop/SKILL.md +254 -0
- dos/skills/dos-goal-gate/SKILL.md +269 -0
- dos/skills/dos-next-up/SKILL.md +231 -0
- dos/skills/dos-promote/SKILL.md +114 -0
- dos/skills/dos-replan/SKILL.md +159 -0
- dos/skills/dos-replan-loop/SKILL.md +114 -0
- dos/skills/dos-self-improve/SKILL.md +213 -0
- dos/skills/dos-supervise-loop/SKILL.md +180 -0
- dos/skills/dos-unstick/SKILL.md +108 -0
- dos/skills/dos-witness-claim/SKILL.md +251 -0
- dos/stamp.py +1002 -0
- dos/state_health.py +387 -0
- dos/status.py +114 -0
- dos/stop_policy.py +334 -0
- dos/supervise.py +1014 -0
- dos/testwitness.py +392 -0
- dos/timeline.py +1027 -0
- dos/tokens.py +485 -0
- dos/tool_stream.py +393 -0
- dos/tool_stream_eval.py +226 -0
- dos/trace.py +524 -0
- dos/verdict.py +140 -0
- dos/verdict_cli.py +189 -0
- dos/verdict_journal.py +497 -0
- dos/verdict_rollup.py +217 -0
- dos/verdicts.py +181 -0
- dos/wedge_reason.py +282 -0
- dos_kernel-0.22.0.dist-info/METADATA +859 -0
- dos_kernel-0.22.0.dist-info/RECORD +178 -0
- dos_kernel-0.22.0.dist-info/WHEEL +5 -0
- dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
- dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
- dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
- dos_mcp/__init__.py +52 -0
- dos_mcp/py.typed +2 -0
- dos_mcp/server.py +779 -0
dos/arg_provenance.py
ADDED
|
@@ -0,0 +1,814 @@
|
|
|
1
|
+
"""ARG — the argument-provenance verdict: *did the model MINT this id, or RESOLVE it?*
|
|
2
|
+
|
|
3
|
+
docs/143 §5a/§7 (the EnterpriseOps-Gym audit) — the **survivor** binding. Of every
|
|
4
|
+
gate the audit floated for a cheap agent on a stateful enterprise benchmark, exactly
|
|
5
|
+
one passes DOS's own byte-inequality axiom (docs/141) cleanly: a check of
|
|
6
|
+
**provenance-of-a-string**. Before a mutating tool call fires, ask of each id/FK-shaped
|
|
7
|
+
argument:
|
|
8
|
+
|
|
9
|
+
> did this value APPEAR in env-authored bytes the agent already saw (a prior tool
|
|
10
|
+
> RESULT, or the task text), or did the model MINT it from nowhere?
|
|
11
|
+
|
|
12
|
+
That is a clean **byte-author** question — the gym MCP server authored the read-result
|
|
13
|
+
bytes; the judged agent did not — so it sidesteps the **mirror-verifier trap** (docs/141,
|
|
14
|
+
docs/143 §5a) entirely: it needs no answer key, no held-out final state, and **no
|
|
15
|
+
self-authored satisfaction predicate** ("is this the row the task *required*?" — the
|
|
16
|
+
forgeable-in-the-agent's-favor question this module must never ask). It attacks two
|
|
17
|
+
*named* benchmark failure modes (docs/143 §1b) that feed the Integrity (FK-validity)
|
|
18
|
+
verifier:
|
|
19
|
+
|
|
20
|
+
* **Incorrect ID Resolution** — passing unverified IDs minted by the model instead of
|
|
21
|
+
resolving the correct IDs through prior tool interactions.
|
|
22
|
+
* **Missing Prerequisite Lookup** — creating an object without first querying the
|
|
23
|
+
prereqs, so the FK it references was never read.
|
|
24
|
+
|
|
25
|
+
Why this is the kernel's to own, and where the line is
|
|
26
|
+
======================================================
|
|
27
|
+
|
|
28
|
+
`believe=True` here means **only** "no id arg was minted from nowhere" — it is NEVER a
|
|
29
|
+
claim that the args are *correct* (that would be a satisfaction predicate, the trap).
|
|
30
|
+
The structural guarantee that this module cannot launder a self-authored predicate is in
|
|
31
|
+
the type: the provenance corpus (`PriorResults`) is a tuple of `EnvBlob`, and an
|
|
32
|
+
`EnvBlob` can carry only an **env** `CorpusSource` (`TOOL_RESULT` / `TASK_TEXT`). There
|
|
33
|
+
is deliberately **no `AGENT_AUTHORED` member** — a boundary that tried to fold an
|
|
34
|
+
assistant turn into the corpus has no enum to tag it with, so model-authored bytes are
|
|
35
|
+
*unrepresentable* as evidence. That is the docs/143 §5a discipline made structural, the
|
|
36
|
+
same shape `evidence.believe_under_floor` uses (a forgeable-floor source can never grant
|
|
37
|
+
belief) — here pushed one step further: the forgeable class does not exist in the type.
|
|
38
|
+
|
|
39
|
+
The verdict is **advisory**: it REPORTS; it never raises, never dispatches, never mutates.
|
|
40
|
+
The consumer (a `dos_react`-style orchestrator wrapper, benchmark-side — NOT in the
|
|
41
|
+
kernel) reads `unsupported` and injects ONE nudge ToolMessage ("resolve `<value>` via a
|
|
42
|
+
read tool first") instead of dispatching the mutating call, pushing the cheap model to do
|
|
43
|
+
the prerequisite lookup it skipped. The verdict's only power is to nudge-MORE; it has no
|
|
44
|
+
output that can force a call through — **refuse-MORE-only by the shape of the type**, the
|
|
45
|
+
admission-seam / fail-to-abstain discipline re-aimed at the argument grain. The
|
|
46
|
+
per-arg-value re-injection cap (≤1, docs/143 §4) lives in the consumer; the pure verdict
|
|
47
|
+
cannot loop.
|
|
48
|
+
|
|
49
|
+
The two errors, and which one is safe
|
|
50
|
+
=====================================
|
|
51
|
+
|
|
52
|
+
Only two error directions are reachable, and the design biases hard toward the safe one:
|
|
53
|
+
|
|
54
|
+
* **false-SUPPORTED** (a minted id coincidentally substrings the corpus) → the verdict
|
|
55
|
+
declines to nudge → R1 degrades to the baseline for that call. SAFE — no worse than
|
|
56
|
+
not having the gate.
|
|
57
|
+
* **false-UNSUPPORTED** (a *legit derived* id — padded `INC0010023` from a bare env
|
|
58
|
+
`10023`, a composite `user_42@acme.com` from env parts — wrongly flagged) → an
|
|
59
|
+
unnecessary nudge wastes an iteration and, on a thrashing agent near its cap, can
|
|
60
|
+
convert a would-pass run into a timeout (the docs/143 §8 feasible-task **kill-signal**).
|
|
61
|
+
DANGEROUS. The component decomposition (Step D) + the derived-id containment rungs
|
|
62
|
+
(Step E reverse-substring + numeric-pad-normalize) drive this rate toward ~0, which is
|
|
63
|
+
what lets R1 clear its gate (Integrity UP, feasible-task rate FLAT).
|
|
64
|
+
|
|
65
|
+
So the whole module is tuned to **under-fire**: a missed mint is a silent safe ABSTAIN; a
|
|
66
|
+
false flag risks a real regression. Every ambiguous case resolves to ABSTAIN.
|
|
67
|
+
|
|
68
|
+
⚓ Pure kernel, I/O on the edge (the dos idiom — mirrors `liveness.classify`,
|
|
69
|
+
`churn.decide_coalesce`, `evidence.believe_under_floor`): `classify_call(ToolCall,
|
|
70
|
+
PriorResults, policy) -> ProvenanceVerdict` is a frozen dataclass in, a frozen verdict
|
|
71
|
+
out. The caller (the wrapper, at the benchmark boundary) flattens each prior tool RESULT
|
|
72
|
+
to a string and tags it with its env source BEFORE the call; the kernel never parses
|
|
73
|
+
JSON, never reads a file, never calls a clock. That is what lets the whole verdict be
|
|
74
|
+
unit-tested on frozen fixtures with zero benchmark/LLM/MCP access — the keystone the audit
|
|
75
|
+
calls "testable with zero benchmark access."
|
|
76
|
+
"""
|
|
77
|
+
|
|
78
|
+
from __future__ import annotations
|
|
79
|
+
|
|
80
|
+
import enum
|
|
81
|
+
import re
|
|
82
|
+
from dataclasses import dataclass
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ---------------------------------------------------------------------------
|
|
86
|
+
# The closed source vocabulary — the structural non-self-authorship guarantee.
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
class CorpusSource(str, enum.Enum):
    """Closed vocabulary naming who authored the bytes held by an `EnvBlob`.

    Every member denotes an *environment-authored* origin. There is
    intentionally no member for agent/model-authored text, so a caller that
    wants to tag an assistant turn as evidence has no enum value to tag it
    with.

    The enum mixes in `str`, so a member compares equal to its token and can
    be rebuilt from it (`CorpusSource("TASK_TEXT")`) without a lookup table.

    Members:
        TOOL_RESULT: bytes of a prior tool/read RESULT that the agent
            observed but did not write.
        TASK_TEXT: bytes of the task prompt / policy text; an id named by
            the task itself counts as env-authored.
    """

    TOOL_RESULT = "TOOL_RESULT"
    TASK_TEXT = "TASK_TEXT"

    def __str__(self) -> str:  # pragma: no cover - trivial
        # Render as the bare token rather than "CorpusSource.TOOL_RESULT".
        return str(self.value)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class ProvenanceStance(str, enum.Enum):
    """Three-valued per-argument provenance outcome.

    Mixes in `str` so a member equals its token and round-trips through
    JSON / a CLI token without a lookup table.

    Members:
        SUPPORTED: the value is id-shaped, is a reference on a mutating
            call, and every data-bearing component was traced to
            env-authored bytes.
        UNSUPPORTED: the value is id-shaped, is a reference on a mutating
            call, the corpus was non-empty, and at least one data-bearing
            component was found nowhere in env bytes (looks model-minted).
            This is the only stance that drives a nudge.
        ABSTAIN: the no-signal outcome: not id-shaped, a non-mutating call,
            the created object's own new key, or an empty corpus. Never a
            block.
    """

    SUPPORTED = "SUPPORTED"
    UNSUPPORTED = "UNSUPPORTED"
    ABSTAIN = "ABSTAIN"

    def __str__(self) -> str:  # pragma: no cover - trivial
        # Render as the bare token rather than "ProvenanceStance.ABSTAIN".
        return str(self.value)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# ---------------------------------------------------------------------------
|
|
141
|
+
# Frozen inputs — the pure datum a caller gathers at the boundary and hands in.
|
|
142
|
+
# ---------------------------------------------------------------------------
|
|
143
|
+
@dataclass(frozen=True)
class EnvBlob:
    """One env-authored chunk of the provenance corpus (bytes the agent did NOT write).

    Attributes:
        text: one prior tool RESULT (or the task text), already flattened to
            a string by the caller at the boundary; this class does no
            parsing of it.
        source: the env origin of `text`. It is the load-bearing field: only
            a `CorpusSource` member is accepted.

    Raises:
        ValueError: if `source` is neither a `CorpusSource` member nor a
            token naming one (e.g. an invented "AGENT_AUTHORED" tag).
    """

    text: str
    source: CorpusSource

    def __post_init__(self) -> None:
        # A dataclass annotation alone is not checked at runtime, so without
        # this an arbitrary string tag would be stored silently. Accept the
        # str token form (the enum is str-valued so it round-trips JSON) by
        # coercing it to the member; anything else raises ValueError from the
        # enum lookup.
        if not isinstance(self.source, CorpusSource):
            # object.__setattr__ is required because the dataclass is frozen.
            object.__setattr__(self, "source", CorpusSource(self.source))
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
@dataclass(frozen=True)
class PriorResults:
    """The env-authored corpus gathered before the tool call being judged.

    Attributes:
        blobs: tuple of `EnvBlob`, one per prior tool RESULT plus the task
            text, each tagged with its env source. Defaults to the empty
            tuple, which is the state on the first call of an episode. The
            blobs are stored whole rather than pre-tokenized, so an id
            embedded in prose remains findable by containment.
    """

    blobs: tuple[EnvBlob, ...] = ()
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
@dataclass(frozen=True)
class ToolArg:
    """A single argument of the tool call, as seen by the provenance check.

    Attributes:
        name: the argument's name in the tool call.
        value: the raw value the model emitted (scalar, list, or dict).
        is_reference: False marks the slot holding the NEW object's own
            identity / primary key. Such a freshly minted key is legitimate
            and must not be nudged. Defaults to True, so a caller that does
            not annotate its args gets the gating behaviour and the
            own-key exemption is opt-in.
    """

    name: str
    value: object
    is_reference: bool = True
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
@dataclass(frozen=True)
class ToolCall:
    """The tool call whose arguments are being provenance-checked.

    Attributes:
        tool_name: name of the tool being invoked.
        args: the call's arguments as a tuple of `ToolArg`.
        is_mutating: whether the call writes. The caller derives this from
            the tool schema; a non-mutating (read) call is not gated,
            because reads are how provenance enters the corpus. Defaults to
            True.
    """

    tool_name: str
    args: tuple[ToolArg, ...]
    is_mutating: bool = True
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
@dataclass(frozen=True)
class ProvenancePolicy:
    """Tunable thresholds for the provenance check (the knobs, not the mechanism).

    Attributes:
        min_component_len: components shorter than this are too
            collision-prone to demand or to substring-match (a bare "P1" /
            "42" / "US"), so they are left out of the must-trace set.
            Default 4. There is deliberately no fractional-support knob:
            the rule is that every data-bearing component must trace.
        case_sensitive: when False (the default) both sides are casefolded
            before comparison, so a re-cased but legitimate id is not
            reported as unsupported.

    Raises:
        ValueError: if `min_component_len` is smaller than 1.
    """

    min_component_len: int = 4
    case_sensitive: bool = False

    def __post_init__(self) -> None:
        # A zero or negative minimum length is meaningless; reject it at
        # construction time.
        too_small = self.min_component_len < 1
        if too_small:
            raise ValueError("min_component_len must be >= 1")


# Shared default policy instance (min_component_len=4, case-insensitive).
DEFAULT_POLICY = ProvenancePolicy()
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
# ---------------------------------------------------------------------------
|
|
239
|
+
# Frozen verdicts — the folded answer, advisory only.
|
|
240
|
+
# ---------------------------------------------------------------------------
|
|
241
|
+
@dataclass(frozen=True)
class ArgProvenance:
    """Provenance sub-verdict for one argument (the per-arg detail).

    Attributes:
        arg_name: name of the argument judged.
        value_repr: string rendering of the argument's value.
        stance: the `ProvenanceStance` reached for this argument.
        id_shaped: whether the value was recognised as id-shaped.
        is_reference: the argument's reference flag, carried through.
        matched_in: env source(s) that carried the traced components.
        components_checked: the components that were examined.
        components_unmatched: the components found in no env bytes, i.e.
            the precise parts a nudge should ask the agent to resolve.
        reason: human-readable explanation of the stance.
    """

    arg_name: str
    value_repr: str
    stance: ProvenanceStance
    id_shaped: bool
    is_reference: bool
    matched_in: tuple[CorpusSource, ...]
    components_checked: tuple[str, ...]
    components_unmatched: tuple[str, ...]
    reason: str

    def to_dict(self) -> dict:
        """Return a plain-data dict: enums become their tokens, tuples become lists."""
        source_tokens = []
        for src in self.matched_in:
            source_tokens.append(src.value)
        return dict(
            arg_name=self.arg_name,
            value_repr=self.value_repr,
            stance=self.stance.value,
            id_shaped=self.id_shaped,
            is_reference=self.is_reference,
            matched_in=source_tokens,
            components_checked=list(self.components_checked),
            components_unmatched=list(self.components_unmatched),
            reason=self.reason,
        )
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
@dataclass(frozen=True)
class ProvenanceVerdict:
    """Folded, advisory answer for a whole tool call.

    Attributes:
        believe: True iff no argument is UNSUPPORTED. It means only "no id
            was minted from nowhere", never "the arguments are correct".
        args: every per-argument `ArgProvenance`, abstained ones included,
            kept for legibility.
        unsupported: names of the arguments a nudge should target (empty
            exactly when `believe` is True).
        reason: human-readable summary of the verdict.
    """

    believe: bool
    args: tuple[ArgProvenance, ...]
    unsupported: tuple[str, ...]
    reason: str

    def to_dict(self) -> dict:
        """Return a plain-data dict, serialising each per-arg verdict via its `to_dict`."""
        per_arg = []
        for sub in self.args:
            per_arg.append(sub.to_dict())
        return dict(
            believe=self.believe,
            args=per_arg,
            unsupported=list(self.unsupported),
            reason=self.reason,
        )
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
# ---------------------------------------------------------------------------
|
|
302
|
+
# Detection + matching — pure, decidable from the corpus alone, no answer key.
|
|
303
|
+
# ---------------------------------------------------------------------------
|
|
304
|
+
|
|
305
|
+
# Step B negative filters: a value matching any of these is a quantity/literal, NOT an
# FK, so it is rejected as not-id-shaped (the date/money/version/phone false-block
# killer). All are anchored to the full string. The datetime form matches a whole
# ISO-8601 stamp such as `2025-08-23T00:00:00Z`, including its `T`/space separator,
# optional seconds and fraction, and optional `Z` or numeric offset, so that a full
# timestamp is filtered as one unit rather than being split into id-like components.
_RE_ISO_DATE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
_RE_ISO_DATETIME = re.compile(
    r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}(:\d{2}(\.\d+)?)?(Z|[+-]\d{2}:?\d{2})?$"
)
_RE_TIME = re.compile(r"^\d{1,2}:\d{2}(:\d{2})?$")
_RE_DECIMAL = re.compile(r"^\d+\.\d+$")
_RE_VERSION = re.compile(r"^v?\d+(\.\d+)+$")
# Phone-ish: must contain at least one separator character ('-', '+', '(', ')' or
# whitespace) between a leading digit/parenthesis and a trailing digit, so a BARE
# integer such as `1179` is NOT phone-ish (it is a numeric PK / FK).
# NOTE(review): the first character class is `[\d()]`, so a value that starts with `+`
# (e.g. an international prefix) does not match this pattern. Confirm that is intended.
_RE_PHONEISH = re.compile(r"^[\d()][\d\-+()\s]*[\-+()\s][\d\-+()\s]*\d$")
_RE_EPOCH_MS = re.compile(r"^\d{13}$")  # a 13-digit ms-epoch timestamp (a quantity, not an FK)
# Literal words, compared after casefolding by `_is_negative_filtered`.
_LITERAL_WORDS = frozenset({"true", "false", "null", "none"})

# Step C positive signatures.
# NOTE(review): `_RE_HEX32` accepts lowercase hex only, whereas `_RE_UUID` accepts both
# cases. Confirm whether callers casefold before matching.
_RE_HEX32 = re.compile(r"^[0-9a-f]{32}$")
_RE_UUID = re.compile(r"^[0-9a-fA-F]{8}-(?:[0-9a-fA-F]{4}-){3}[0-9a-fA-F]{12}$")
_RE_EMAIL = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
# Unanchored probes: "contains at least one digit" / "contains at least one ASCII letter".
_RE_HAS_DIGIT = re.compile(r"\d")
_RE_HAS_ALPHA = re.compile(r"[A-Za-z]")
# The character class an id-shaped mixed-alnum token may contain (no internal whitespace).
_RE_ID_CHARS = re.compile(r"^[A-Za-z0-9._:#/\-@]+$")
# One maximal run of digits.
_RE_DIGIT_RUN = re.compile(r"\d+")

# The delimiter class a composite id splits on (Step D).
_DELIM_RE = re.compile(r"[@._:#/\-]")
# The tokenizer for the env corpus (Step E exact-match): id-delimiters + whitespace + the
# JSON/structural punctuation that wraps env values (braces, brackets, quotes, commas,
# parens, equals, semicolons, angle brackets, pipe). Without stripping these, a JSON
# value `10023` would tokenize as `10023}` and an exact/pad match would miss, so the env
# corpus must be punctuation-clean.
_ENV_TOKEN_RE = re.compile(r"[@._:#/\-\s{}\[\]\"',()=;<>|]+")
# A "clean" id part: an optional alpha prefix then a trailing digit run (INC0010023,
# acme99, p0001), i.e. the part whose DATA unit is just the digit run. Note the prefix
# is optional, so a pure digit string matches too. Anything else that is alnum (hex,
# interleaved) is demanded WHOLE so a meaningless 1-char run is never demanded.
_RE_PREFIX_THEN_DIGITS = re.compile(r"^[A-Za-z]*\d+$")
|
|
346
|
+
|
|
347
|
+
# Name-hint suffixes (Step C corroboration). Suffix-anchored, NOT substring, so
# `phone_number` / `version_number` / `due_to_date` are NOT hints: none of them ends
# with one of these exact tails.
_NAME_HINT_SUFFIXES = ("_id", "_sys_id", "sys_id", "_ref", "_key", "_email")

# Quantity-name stoplist: an arg whose name signals a NUMBER, not an FK (a price, an
# amount, a count). A bare pure-digit value in such a slot is the model legitimately
# SETTING a quantity (contract_price=33414), never an id to resolve, so it must never be
# treated as id-shaped. These words are consumed by `_name_is_quantity`, which matches
# them against the END of the arg name, so `unit_price` / `total_amount` hit while an
# `*_id` name never does.
_QUANTITY_NAME_PARTS = (
    "price", "amount", "cost", "total", "count", "quantity", "qty", "size", "limit",
    "number", "num", "age", "duration", "score", "rate", "percent", "weight", "height",
    "width", "length", "balance", "fee", "rank", "position", "offset", "page", "max",
    "min", "priority", "level", "year", "month", "day", "hour", "minute", "second",
)
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def _name_is_quantity(arg_name: str) -> bool:
    """Return True iff the arg NAME signals a quantity (price/amount/count/...).

    A bare number in such a slot is a value the model sets, not an FK to
    resolve. Three name shapes are recognised:

    * the trailing snake/kebab token is a quantity word (`unit_price`);
    * the casefolded name ends with a quantity word, which covers camelCase
      tails (`unitPrice`);
    * the leading token is the bound word `max` / `min` (`max_results`,
      `maxResults`), unless the name ends with an id-hint suffix, so an
      `*_id` / `*_ref` / `*_key` name is never classified as a quantity.

    Args:
        arg_name: the tool-call argument name (snake, kebab or camel case).

    Returns:
        True when the name denotes a quantity slot, otherwise False.
    """
    n = arg_name.casefold()
    last = n.replace("-", "_").split("_")[-1]
    # str.endswith accepts a tuple of candidate suffixes.
    if last in _QUANTITY_NAME_PARTS or n.endswith(_QUANTITY_NAME_PARTS):
        return True
    # The suffix tests above miss bound-prefixed names: "max_results" ends in
    # "results", which is not a quantity word. Only the unambiguous bound words
    # max/min are honoured as a LEADING token, and id-hint names are excluded
    # first so an FK slot such as `max_id` is still treated as an id.
    if n.endswith(_NAME_HINT_SUFFIXES):
        return False
    # Split on the ORIGINAL casing so a camelCase boundary (xR) is still visible.
    first = re.split(r"[_\-]|(?<=[a-z])(?=[A-Z])", arg_name, maxsplit=1)[0]
    return first.casefold() in ("max", "min")
|
|
372
|
+
|
|
373
|
+
# Grammar stoplist (Step D): pure-alpha composite parts that are domain GRAMMAR, not the
# data-bearing identifier portion: common TLDs plus a few connective/scheme words the
# model may legitimately type. A pure-alpha part in this set is exempt from must-trace
# even when it is long enough to otherwise be demanded; a pure-alpha part NOT in this set
# and long enough (e.g. the org-name in user_42@acme.com / @evil.com) IS demanded, so a
# minted domain is caught while a real one traces. Kept deliberately small: over-listing
# turns a minted identifier word into exempt grammar (a laundering leak), while
# under-listing risks a false-block on an exotic TLD.
# NOTE(review): the original comment called that false-block "the safe direction", but
# the module docstring names false-UNSUPPORTED as the DANGEROUS direction and a missed
# mint as the safe one. Confirm which trade-off is intended for this list.
_GRAMMAR_WORDS = frozenset({
    "com", "org", "net", "edu", "gov", "mil", "int", "io", "co", "us", "uk", "ca", "au",
    "www", "http", "https", "mailto", "ftp", "api", "mail", "smtp", "imap",
})
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def _casefold(s: str, policy: ProvenancePolicy) -> str:
    """Normalise `s` for comparison under `policy`.

    Returns `s` unchanged when the policy is case-sensitive, otherwise its
    `str.casefold()` form.
    """
    if policy.case_sensitive:
        return s
    return s.casefold()
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def _is_negative_filtered(s: str) -> bool:
    """Step B: report whether `s` is a quantity/literal rather than an FK.

    True when `s` fully matches a date, datetime, time, decimal, version,
    phone-ish or 13-digit epoch pattern, when it is one of the literal words
    (compared casefolded), or when it is pure prose. Such a value is never
    id-shaped.
    """
    quantity_patterns = (
        _RE_ISO_DATE,
        _RE_ISO_DATETIME,
        _RE_TIME,
        _RE_DECIMAL,
        _RE_VERSION,
        _RE_PHONEISH,
        _RE_EPOCH_MS,
    )
    for pattern in quantity_patterns:
        if pattern.match(s):
            return True
    if s.casefold() in _LITERAL_WORDS:
        return True
    # Pure prose: contains a space, has no embedded digit, and is not
    # email-shaped. That is free text (a short_description), never an id.
    has_space = " " in s
    has_digit = _RE_HAS_DIGIT.search(s) is not None
    is_email = _RE_EMAIL.match(s) is not None
    return has_space and not has_digit and not is_email
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def _name_is_hint(arg_name: str) -> bool:
    """Return True iff the casefolded arg NAME ends with an id-bearing suffix.

    The test is suffix-anchored against `_NAME_HINT_SUFFIXES`. It is
    corroborating evidence only: the name alone does not make a value
    id-shaped, since the value must pass its own checks as well.
    """
    # str.endswith accepts the tuple of suffixes directly.
    return arg_name.casefold().endswith(_NAME_HINT_SUFFIXES)
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def _is_id_shaped(s: str, arg_name: str, policy: ProvenancePolicy) -> bool:
    """Step C — report whether `s` carries a POSITIVE id signature.

    Deliberately biased to under-fire: missing an id only yields a silent, safe ABSTAIN,
    whereas flagging a non-id risks a false-block.

    Args:
        s: the stringified argument value.
        arg_name: the argument's name, used for name-hint / quantity-name corroboration.
        policy: supplies `min_component_len`.
    """
    min_len = policy.min_component_len
    all_digits = s.isdigit()

    # Short-integer promotion (docs/143 recall lift): a bare integer of at least 2 digits
    # but shorter than min_component_len is still id-shaped when the NAME is a strong FK
    # slot (`group_id`, `caller_id`, `*_ref`) — the ServiceNow numeric-PK pattern
    # (group_id=81). The name must corroborate and must not signal a quantity, so a
    # `limit`/`priority`/`page` value never qualifies, and the value must still pass Step B.
    if all_digits and 2 <= len(s) < min_len:
        name_backs_it = _name_is_hint(arg_name) and not _name_is_quantity(arg_name)
        if name_backs_it and not _is_negative_filtered(s):
            return True

    # Everything below requires the minimum length.
    if len(s) < min_len:
        return False

    # Step-B literals, anything with internal whitespace (an id token has none), and
    # values containing non-id characters are out.
    if _is_negative_filtered(s) or " " in s or not _RE_ID_CHARS.match(s):
        return False

    # (iii) hex32 / UUID, (iv) email.
    if _RE_HEX32.match(s) or _RE_UUID.match(s) or _RE_EMAIL.match(s):
        return True

    # (i) mixed alnum: at least one digit and at least one letter.
    if _RE_HAS_DIGIT.search(s) and _RE_HAS_ALPHA.search(s):
        return True

    # (ii) a pure-digit key (already long enough, already past Step B) — except where the
    # name signals a quantity (price/amount/count): there a bare number is a value the
    # model sets, not an FK to resolve (the docs/143 contract_price=33414 false-flag).
    if all_digits and not _name_is_quantity(arg_name):
        return True

    # Name-hint corroboration: an alphanumeric-only token in an id-bearing slot counts as
    # an opaque key. The length and Step-B checks above already excluded short or literal
    # values.
    return bool(_name_is_hint(arg_name) and s.isalnum())
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def _data_bearing_components(s: str, policy: ProvenancePolicy) -> tuple[tuple[str, ...], bool]:
    """Step D — decompose `s` and return `(data_bearing_components, genuinely_id_shaped)`.

    The value is cut at its first "@" into a LOCAL region (before) and a DOMAIN region
    (after); each region is split on `_DELIM_RE`, and every non-empty part is sorted into
    one bucket:

    * DATA-BEARING (demanded — must trace to the env corpus):
        - the trailing digit run of a part matching `_RE_PREFIX_THEN_DIGITS`
          ("INC0010023" → "0010023"; the alpha prefix is grammar);
        - any other digit-containing part, kept WHOLE (a high-entropy interleaved alnum
          chunk traces atomically or not at all);
        - in the DOMAIN region only, a digit-free label of at least `min_component_len`
          characters that is not in `_GRAMMAR_WORDS` (the org label after the "@": the
          "acme"/"evil" in user_42@<x>.com — a minted domain is caught, a real one
          resolves).
    * GRAMMAR (exempt): a digit-free part in the LOCAL region (the "INC" prefix, a
      "user"/"jane"/"doe" type-word or name, an enum word) and any stoplist TLD/scheme
      word. This asymmetry keeps the local-part "user" exempt while the org label is
      still checked.
    * DROPPED: a demanded non-digit component shorter than `min_component_len` — too
      collision-prone to demand or to match. A short pure-digit run is kept.

    ENUM GUARD: when nothing data-bearing survives ("itil_admin", "in_progress"), the
    second element of the result is False and the caller abstains.

    Real-data hardening (docs/143):
    * a UUID / 32-hex value is demanded WHOLE as a single component, never split on "-";
    * in an email LOCAL part, a trailing digit run shorter than `min_component_len`
      (`jason.smith10@…` → `10`) is a username discriminator and is not demanded.
    """
    # One opaque identity: demand it whole, never split it.
    if _RE_UUID.match(s) or _RE_HEX32.match(s):
        return (s,), True

    min_len = policy.min_component_len
    local_region, at_sign, domain_region = s.partition("@")
    is_email = bool(at_sign)

    candidates: list[str] = []
    for region, in_domain in ((local_region, False), (domain_region, True)):
        for part in filter(None, _DELIM_RE.split(region)):
            if not _RE_HAS_DIGIT.search(part):
                # Digit-free part: grammar in the local region; in the domain region a
                # long, non-stoplist label is the org identity and is data-bearing.
                is_org_label = (
                    in_domain
                    and len(part) >= min_len
                    and part.casefold() not in _GRAMMAR_WORDS
                )
                if is_org_label:
                    candidates.append(part)
            elif _RE_PREFIX_THEN_DIGITS.match(part):
                # Alpha prefix + trailing digits (INC0010023, acme99, p0001): only the
                # digit run is data.
                digit_runs = _RE_DIGIT_RUN.findall(part)
                if not digit_runs:
                    continue
                trailing_run = digit_runs[-1]
                # A short digit suffix in an email local part (smith10) is a username
                # discriminator — grammar, not a resolvable FK.
                is_discriminator = is_email and not in_domain and len(trailing_run) < min_len
                if not is_discriminator:
                    candidates.append(trailing_run)
            else:
                # Interleaved alnum token: demand the WHOLE chunk. Its individual 1-char
                # digit runs would be meaningless to demand and a false-block risk.
                candidates.append(part)

    # De-duplicate in first-seen order. A too-short candidate survives only if it is a
    # pure-digit run (a short numeric PK is real data); a short non-digit chunk is dropped.
    kept: dict[str, None] = {}
    for candidate in candidates:
        if len(candidate) >= min_len or candidate.isdigit():
            kept.setdefault(candidate, None)
    components = tuple(kept)
    return components, bool(components)
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
def _component_found(component: str, env_text: str, env_tokens: frozenset[str],
                     policy: ProvenancePolicy) -> bool:
    """Step E — True iff `component` traces to the env corpus. Several rungs, any one:

      (a) exact: the (policy-casefolded) component equals an env token.
      (d) numeric-pad normalize: for a pure-digit component, its zero-stripped form equals
          the zero-stripped form of a pure-digit env token ("0010023" vs "10023" and
          vice versa). Checked before the substring rungs so int-equality is
          authoritative, and it is the only non-exact rung open to a short component.
      (b) substring: the component is a substring of the joined env text.
      (c) reverse-substring: an env token is a substring of the component — covers the
          model DERIVING a padded/prefixed id from a bare env fragment. For a pure-digit
          component the zero-stripped form of a pure-digit env token is tried as well.

    Rungs (b) and (c) are length-guarded by `policy.min_component_len`: both the component
    and the env fragment actually being searched for must be at least that long. The
    guard on the zero-stripped fragment matters: without it an env token such as "0001"
    collapses to "1" and would "match" nearly every numeric component.

    Args:
        component: one data-bearing component of an argument value.
        env_text: the joined env text to search for rung (b).
        env_tokens: the set of env tokens for rungs (a), (c) and (d).
        policy: supplies the case policy (via `_casefold`) and `min_component_len`.
    """
    c = _casefold(component, policy)

    # (a) exact token.
    if c in env_tokens:
        return True

    # (d) numeric-pad normalize (before substring so int-equality is authoritative).
    c_is_digits = c.isdigit()
    if c_is_digits:
        c_int = c.lstrip("0") or "0"
        if any(tok.isdigit() and (tok.lstrip("0") or "0") == c_int for tok in env_tokens):
            return True

    # The remaining rungs are substring-based; a short component is too collision-prone
    # for any of them.
    min_len = policy.min_component_len
    if len(c) < min_len:
        return False

    # (b) substring in the joined env text.
    if c in env_text:
        return True

    # (c) reverse-substring: a sufficiently long env token sits inside the component.
    for tok in env_tokens:
        if len(tok) < min_len:
            continue
        if tok in c:
            return True
        # Numeric reverse: the env digit token's int form inside the component's digits.
        # The stripped form must itself satisfy the length guard.
        if c_is_digits and tok.isdigit():
            stripped = tok.lstrip("0") or "0"
            if len(stripped) >= min_len and stripped in c:
                return True
    return False
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
def _flatten_leaves(value: object) -> list[object]:
|
|
591
|
+
"""Recurse a list/dict arg value to its scalar leaves (Step A.3). Each leaf is
|
|
592
|
+
provenance-checked independently; the arg folds to UNSUPPORTED if ANY id-leaf is
|
|
593
|
+
minted, SUPPORTED if all id-leaves trace, ABSTAIN if no leaf is id-shaped."""
|
|
594
|
+
out: list[object] = []
|
|
595
|
+
if isinstance(value, dict):
|
|
596
|
+
for v in value.values():
|
|
597
|
+
out.extend(_flatten_leaves(v))
|
|
598
|
+
elif isinstance(value, (list, tuple)):
|
|
599
|
+
for v in value:
|
|
600
|
+
out.extend(_flatten_leaves(v))
|
|
601
|
+
else:
|
|
602
|
+
out.append(value)
|
|
603
|
+
return out
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
def _build_env(prior: PriorResults, policy: ProvenancePolicy) -> tuple[str, dict[str, set[CorpusSource]]]:
    """Prepare the env corpus for matching.

    Every blob in `prior.blobs` has its text normalized with `_casefold` and split on
    `_ENV_TOKEN_RE`; empty split results are discarded.

    Returns:
        `(joined_text, token_sources)`: the normalized blob texts joined by single spaces,
        and a map from each env token to the set of `CorpusSource`s whose blobs supplied
        it (used to fill `matched_in`).
    """
    normalized_texts: list[str] = []
    sources_by_token: dict[str, set[CorpusSource]] = {}
    for blob in prior.blobs:
        normalized = _casefold(blob.text, policy)
        normalized_texts.append(normalized)
        for token in filter(None, _ENV_TOKEN_RE.split(normalized)):
            if token not in sources_by_token:
                sources_by_token[token] = set()
            sources_by_token[token].add(blob.source)
    joined_text = " ".join(normalized_texts)
    return joined_text, sources_by_token
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
def _attribute_sources(needle: str, token_sources: dict[str, set[CorpusSource]]) -> set[CorpusSource]:
    """Best-effort attribution of a corpus hit on `needle` to its source(s).

    An exact token entry wins. Otherwise the result is the union of the sources of every
    env token that contains `needle` or is contained in it (possibly empty).
    """
    exact = token_sources.get(needle)
    if exact:
        return set(exact)
    related: set[CorpusSource] = set()
    for tok, srcs in token_sources.items():
        if needle in tok or tok in needle:
            related.update(srcs)
    return related


def _classify_arg_with_env(arg: ToolArg, env, policy: ProvenancePolicy) -> ArgProvenance:
    """Classify one argument against a lazily supplied env corpus.

    `env` is a zero-argument callable returning `(env_text, token_sources, env_tokens)`.
    It is only invoked once a value has been found id-shaped, so arguments that abstain
    early never pay for corpus preparation, and container leaves share one corpus.
    """
    name = arg.name
    value = arg.value

    def abstain(value_repr: str, is_reference: bool, reason: str) -> ArgProvenance:
        # Every ABSTAIN verdict has the same shape: not id-shaped, nothing checked.
        return ArgProvenance(
            arg_name=name, value_repr=value_repr, stance=ProvenanceStance.ABSTAIN,
            id_shaped=False, is_reference=is_reference, matched_in=(),
            components_checked=(), components_unmatched=(), reason=reason,
        )

    # Step A.1 — the create's-own-key exemption.
    if not arg.is_reference:
        return abstain(str(value), False,
                       "new-key / own-identity slot — not a reference to resolve")

    # Step A.3 — fold over the scalar leaves of a composite container value.
    if isinstance(value, (list, tuple, dict)):
        any_id = False
        any_unsupported = False
        checked_all: list[str] = []
        unmatched_all: list[str] = []
        leaf_sources: set[CorpusSource] = set()
        for leaf in _flatten_leaves(value):
            sub = _classify_arg_with_env(
                ToolArg(name=name, value=leaf, is_reference=True), env, policy,
            )
            checked_all.extend(sub.components_checked)
            if sub.id_shaped:
                any_id = True
                leaf_sources.update(sub.matched_in)
            if sub.stance is ProvenanceStance.UNSUPPORTED:
                any_unsupported = True
                unmatched_all.extend(sub.components_unmatched)
        if not any_id:
            return abstain(str(value), True,
                           "container arg with no id-shaped leaf — nothing to provenance-check")
        container_matched_in = tuple(sorted(leaf_sources, key=lambda src: src.value))
        if any_unsupported:
            return ArgProvenance(
                arg_name=name, value_repr=str(value), stance=ProvenanceStance.UNSUPPORTED,
                id_shaped=True, is_reference=True, matched_in=container_matched_in,
                components_checked=tuple(checked_all),
                components_unmatched=tuple(unmatched_all),
                reason="at least one id in the container did not appear in env-authored bytes",
            )
        return ArgProvenance(
            arg_name=name, value_repr=str(value), stance=ProvenanceStance.SUPPORTED,
            id_shaped=True, is_reference=True, matched_in=container_matched_in,
            components_checked=tuple(checked_all), components_unmatched=(),
            reason="every id in the container traced to env-authored bytes",
        )

    # Step A.2 — None / bool are never ids.
    if value is None or isinstance(value, bool):
        return abstain(str(value), True, "flag/None value — never an id")

    s = str(value).strip()
    if not _is_id_shaped(s, name, policy):
        return abstain(s, True, "not id/FK-shaped — quantity, literal, or prose")

    env_text, token_sources, env_tokens = env()

    # WHOLE-VALUE DIRECT MATCH (the primary rung — docs/143 live run). The common honest
    # case is the model passing back VERBATIM an id it read (`INC_004`, `msg_001`, a
    # UUID). If the entire value is an exact env token, or a substring of the joined env
    # text, it is resolved here BEFORE decomposition: decomposition is a heuristic for
    # derived/composite ids and can demand a too-short sub-run (`004`) from a verbatim id.
    #
    # The substring half is length-guarded. A promoted short numeric PK (e.g. "81") is a
    # substring of almost any corpus, so for a value shorter than min_component_len only
    # an exact token counts here; otherwise it falls through to component matching, where
    # int-equality handles the collision risk.
    cf_whole = _casefold(s, policy)
    whole_in_text = len(cf_whole) >= policy.min_component_len and cf_whole in env_text
    if cf_whole in env_tokens or whole_in_text:
        direct_sources = _attribute_sources(cf_whole, token_sources)
        return ArgProvenance(
            arg_name=name, value_repr=s, stance=ProvenanceStance.SUPPORTED,
            id_shaped=True, is_reference=True,
            matched_in=tuple(sorted(direct_sources, key=lambda x: x.value)),
            components_checked=(s,), components_unmatched=(),
            reason=f"the id {s!r} appears verbatim in env-authored bytes (direct match)",
        )

    components, genuinely_id = _data_bearing_components(s, policy)
    if not genuinely_id:
        # Step D enum guard: delimiter present but no data-bearing component (itil_admin).
        return abstain(s, True,
                       "enum/role/status token — no data-bearing component to resolve")

    # Step E — every data-bearing component must trace to the corpus.
    unmatched: list[str] = []
    matched_sources: set[CorpusSource] = set()
    for c in components:
        if _component_found(c, env_text, env_tokens, policy):
            matched_sources.update(_attribute_sources(_casefold(c, policy), token_sources))
        else:
            unmatched.append(c)
    matched_in = tuple(sorted(matched_sources, key=lambda x: x.value))

    if unmatched:
        return ArgProvenance(
            arg_name=name, value_repr=s, stance=ProvenanceStance.UNSUPPORTED,
            id_shaped=True, is_reference=True,
            matched_in=matched_in,
            components_checked=components, components_unmatched=tuple(unmatched),
            reason=(
                f"id-shaped reference {s!r} has component(s) {unmatched} that appear in no "
                f"env-authored bytes — looks model-minted (resolve via a read first)"
            ),
        )
    return ArgProvenance(
        arg_name=name, value_repr=s, stance=ProvenanceStance.SUPPORTED,
        id_shaped=True, is_reference=True,
        matched_in=matched_in,
        components_checked=components, components_unmatched=(),
        reason=f"every data-bearing component of {s!r} traced to env-authored bytes",
    )


def classify_arg(arg: ToolArg, prior: PriorResults,
                 policy: ProvenancePolicy = DEFAULT_POLICY) -> ArgProvenance:
    """Per-arg leaf — SUPPORTED / UNSUPPORTED / ABSTAIN for ONE argument.

    Recurses into list/tuple/dict values (fold: any id-leaf UNSUPPORTED → UNSUPPORTED; all
    id-leaves traced → SUPPORTED; no id-leaf → ABSTAIN). Assumes the call-level guards
    (read call, empty corpus) were applied by `classify_call`; a direct caller passing a
    non-empty `prior` gets the full check.

    Args:
        arg: the argument to classify (`name`, `value`, `is_reference`).
        prior: the env-authored corpus the value must trace to.
        policy: matching policy; defaults to `DEFAULT_POLICY`.

    Returns:
        The `ArgProvenance` verdict, including the components checked, those left
        unmatched, and the corpus sources that supplied a hit.
    """
    cached_env: list[tuple] = []

    def env():
        # Prepared lazily and at most once per call: an argument that abstains before
        # matching never builds it, and all leaves of a container share it.
        if not cached_env:
            env_text, token_sources = _build_env(prior, policy)
            cached_env.append((env_text, token_sources, frozenset(token_sources)))
        return cached_env[0]

    return _classify_arg_with_env(arg, env, policy)
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
def classify_call(call: ToolCall, prior: PriorResults,
                  policy: ProvenancePolicy = DEFAULT_POLICY) -> ProvenanceVerdict:
    """Fold the per-argument verdicts of one tool call into a `ProvenanceVerdict`.

    Two call-level guards run first; each yields `believe=True` with nothing flagged:
      * a read / non-mutating call — reads are how provenance ENTERS, so they are never
        gated (the verdict carries no per-arg entries);
      * an empty corpus — the first call of an episode; with zero env bytes mintage
        cannot be proven, so every argument is reported as ABSTAIN.
    Otherwise `classify_arg` is applied to each argument, `unsupported` lists the names of
    the UNSUPPORTED arguments (what the consumer's nudge targets), and `believe` is True
    exactly when that list is empty.
    """
    if not call.is_mutating:
        return ProvenanceVerdict(
            believe=True, args=(), unsupported=(),
            reason="read / non-mutating call — provenance not gated (reads source it)",
        )

    if not prior.blobs:
        abstained = []
        for a in call.args:
            abstained.append(ArgProvenance(
                arg_name=a.name, value_repr=str(a.value), stance=ProvenanceStance.ABSTAIN,
                id_shaped=False, is_reference=a.is_reference, matched_in=(),
                components_checked=(), components_unmatched=(),
                reason="empty corpus (first call) — cannot prove mintage, abstain",
            ))
        return ProvenanceVerdict(
            believe=True,
            args=tuple(abstained),
            unsupported=(),
            reason="empty env corpus — first call of the episode, nothing to check against",
        )

    verdicts = tuple(classify_arg(a, prior, policy) for a in call.args)
    minted = tuple(
        v.arg_name for v in verdicts if v.stance is ProvenanceStance.UNSUPPORTED
    )
    if minted:
        reason = (
            f"{len(minted)} id/FK argument(s) appear model-minted: "
            f"{', '.join(minted)} — resolve via a read tool first"
        )
    else:
        reason = "no id/FK argument was minted from nowhere (all traced or none to check)"
    return ProvenanceVerdict(
        believe=not minted, args=verdicts, unsupported=minted, reason=reason,
    )
|