dos-kernel 0.22.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. dos/__init__.py +261 -0
  2. dos/_bin/dos-hook.exe +0 -0
  3. dos/_filelock.py +255 -0
  4. dos/_job_policy.py +97 -0
  5. dos/_tree.py +145 -0
  6. dos/admission.py +433 -0
  7. dos/answer_shape.py +299 -0
  8. dos/arbiter.py +859 -0
  9. dos/archive_lock.py +266 -0
  10. dos/arg_provenance.py +814 -0
  11. dos/attest.py +472 -0
  12. dos/breaker.py +311 -0
  13. dos/churn.py +226 -0
  14. dos/claim_extract.py +229 -0
  15. dos/claim_ttl.py +150 -0
  16. dos/cli.py +8721 -0
  17. dos/commit_audit.py +666 -0
  18. dos/completion.py +466 -0
  19. dos/concurrency_class.py +154 -0
  20. dos/config.py +1380 -0
  21. dos/config_lint.py +464 -0
  22. dos/cooldown.py +390 -0
  23. dos/coverage.py +387 -0
  24. dos/dangling_intent.py +287 -0
  25. dos/data_class.py +397 -0
  26. dos/decisions.py +1274 -0
  27. dos/decisions_tui.py +251 -0
  28. dos/dispatch_top.py +740 -0
  29. dos/dispatch_top_tui.py +116 -0
  30. dos/drivers/__init__.py +40 -0
  31. dos/drivers/ci_status.py +630 -0
  32. dos/drivers/citation_resolve.py +703 -0
  33. dos/drivers/decision_stop.py +98 -0
  34. dos/drivers/export_file.py +173 -0
  35. dos/drivers/export_otlp.py +275 -0
  36. dos/drivers/export_statsd.py +242 -0
  37. dos/drivers/hook_dialects.py +391 -0
  38. dos/drivers/job.py +47 -0
  39. dos/drivers/llm_judge.py +360 -0
  40. dos/drivers/memory_recall.py +1231 -0
  41. dos/drivers/notify_slack.py +373 -0
  42. dos/drivers/notify_webhook.py +251 -0
  43. dos/drivers/operator_judge.py +114 -0
  44. dos/drivers/os_acceptance.py +228 -0
  45. dos/drivers/paste_log.py +132 -0
  46. dos/drivers/plan_scope.py +133 -0
  47. dos/drivers/self_improve.py +375 -0
  48. dos/drivers/similarity_judge.py +249 -0
  49. dos/drivers/state_diff.py +274 -0
  50. dos/drivers/supervisor.py +347 -0
  51. dos/drivers/watchdog.py +363 -0
  52. dos/drivers/workshop.py +160 -0
  53. dos/durable_schema.py +344 -0
  54. dos/effect_witness.py +393 -0
  55. dos/efficiency.py +318 -0
  56. dos/enforce.py +414 -0
  57. dos/enumerate.py +776 -0
  58. dos/env_print.py +378 -0
  59. dos/event_severity.py +258 -0
  60. dos/evidence.py +692 -0
  61. dos/exec_capability.py +256 -0
  62. dos/export_cursor.py +143 -0
  63. dos/exporter.py +320 -0
  64. dos/firing_label.py +353 -0
  65. dos/fleet_roll.py +226 -0
  66. dos/gate_classify.py +827 -0
  67. dos/gh4_coverage.py +179 -0
  68. dos/git_delta.py +122 -0
  69. dos/guard.py +215 -0
  70. dos/health.py +552 -0
  71. dos/help_summary.py +519 -0
  72. dos/home.py +934 -0
  73. dos/hook_binary.py +194 -0
  74. dos/hook_dialect.py +271 -0
  75. dos/hook_exit.py +191 -0
  76. dos/hook_install.py +437 -0
  77. dos/id_alloc.py +304 -0
  78. dos/improve.py +499 -0
  79. dos/intent_ledger.py +635 -0
  80. dos/interpret.py +176 -0
  81. dos/intervention.py +769 -0
  82. dos/intervention_eval.py +371 -0
  83. dos/journal_delta.py +308 -0
  84. dos/judge_eval.py +328 -0
  85. dos/judges.py +366 -0
  86. dos/lane_infer.py +127 -0
  87. dos/lane_journal.py +1001 -0
  88. dos/lane_lease.py +952 -0
  89. dos/lane_overlap.py +228 -0
  90. dos/lease_health.py +282 -0
  91. dos/lifecycle.py +211 -0
  92. dos/liveness.py +352 -0
  93. dos/lock_modes.py +185 -0
  94. dos/log_source.py +395 -0
  95. dos/loop_decide.py +1746 -0
  96. dos/marker_gate.py +254 -0
  97. dos/marker_sensor.py +396 -0
  98. dos/noop_streak.py +280 -0
  99. dos/notify.py +479 -0
  100. dos/observe.py +175 -0
  101. dos/oracle.py +1661 -0
  102. dos/overlap_eval.py +214 -0
  103. dos/overlap_policy.py +342 -0
  104. dos/packet_sidecar.py +267 -0
  105. dos/phase_shipped.py +1985 -0
  106. dos/pick_priority.py +225 -0
  107. dos/pickable.py +369 -0
  108. dos/picker_oracle.py +1037 -0
  109. dos/plan_board.py +513 -0
  110. dos/plan_board_tui.py +113 -0
  111. dos/plan_source.py +455 -0
  112. dos/posttool_sensor.py +528 -0
  113. dos/precursor_gate.py +499 -0
  114. dos/precursor_gate_eval.py +239 -0
  115. dos/preflight.py +825 -0
  116. dos/pretool_sensor.py +490 -0
  117. dos/proc_delta.py +181 -0
  118. dos/productivity.py +296 -0
  119. dos/provider_limit.py +242 -0
  120. dos/py.typed +4 -0
  121. dos/reason_morphology.py +299 -0
  122. dos/reasons.py +449 -0
  123. dos/reconcile.py +173 -0
  124. dos/recurring_wedge.py +206 -0
  125. dos/render.py +393 -0
  126. dos/result_state.py +468 -0
  127. dos/resume.py +578 -0
  128. dos/resume_evidence.py +293 -0
  129. dos/retention.py +344 -0
  130. dos/reward.py +372 -0
  131. dos/rewind.py +587 -0
  132. dos/rewind_evidence.py +168 -0
  133. dos/rewind_tokens.py +252 -0
  134. dos/run_id.py +342 -0
  135. dos/scope.py +520 -0
  136. dos/scope_source.py +382 -0
  137. dos/scout.py +982 -0
  138. dos/self_modify.py +209 -0
  139. dos/sibling_scan.py +569 -0
  140. dos/skills/EXAMPLES.md +584 -0
  141. dos/skills/dos-class-cycle/SKILL.md +107 -0
  142. dos/skills/dos-dispatch/SKILL.md +177 -0
  143. dos/skills/dos-dispatch-loop/SKILL.md +254 -0
  144. dos/skills/dos-goal-gate/SKILL.md +269 -0
  145. dos/skills/dos-next-up/SKILL.md +231 -0
  146. dos/skills/dos-promote/SKILL.md +114 -0
  147. dos/skills/dos-replan/SKILL.md +159 -0
  148. dos/skills/dos-replan-loop/SKILL.md +114 -0
  149. dos/skills/dos-self-improve/SKILL.md +213 -0
  150. dos/skills/dos-supervise-loop/SKILL.md +180 -0
  151. dos/skills/dos-unstick/SKILL.md +108 -0
  152. dos/skills/dos-witness-claim/SKILL.md +251 -0
  153. dos/stamp.py +1002 -0
  154. dos/state_health.py +387 -0
  155. dos/status.py +114 -0
  156. dos/stop_policy.py +334 -0
  157. dos/supervise.py +1014 -0
  158. dos/testwitness.py +392 -0
  159. dos/timeline.py +1027 -0
  160. dos/tokens.py +485 -0
  161. dos/tool_stream.py +393 -0
  162. dos/tool_stream_eval.py +226 -0
  163. dos/trace.py +524 -0
  164. dos/verdict.py +140 -0
  165. dos/verdict_cli.py +189 -0
  166. dos/verdict_journal.py +497 -0
  167. dos/verdict_rollup.py +217 -0
  168. dos/verdicts.py +181 -0
  169. dos/wedge_reason.py +282 -0
  170. dos_kernel-0.22.0.dist-info/METADATA +859 -0
  171. dos_kernel-0.22.0.dist-info/RECORD +178 -0
  172. dos_kernel-0.22.0.dist-info/WHEEL +5 -0
  173. dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
  174. dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
  175. dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
  176. dos_mcp/__init__.py +52 -0
  177. dos_mcp/py.typed +2 -0
  178. dos_mcp/server.py +779 -0
dos/arg_provenance.py ADDED
@@ -0,0 +1,814 @@
1
+ """ARG — the argument-provenance verdict: *did the model MINT this id, or RESOLVE it?*
2
+
3
+ docs/143 §5a/§7 (the EnterpriseOps-Gym audit) — the **survivor** binding. Of every
4
+ gate the audit floated for a cheap agent on a stateful enterprise benchmark, exactly
5
+ one passes DOS's own byte-inequality axiom (docs/141) cleanly: a check of
6
+ **provenance-of-a-string**. Before a mutating tool call fires, ask of each id/FK-shaped
7
+ argument:
8
+
9
+ > did this value APPEAR in env-authored bytes the agent already saw (a prior tool
10
+ > RESULT, or the task text), or did the model MINT it from nowhere?
11
+
12
+ That is a clean **byte-author** question — the gym MCP server authored the read-result
13
+ bytes; the judged agent did not — so it sidesteps the **mirror-verifier trap** (docs/141,
14
+ docs/143 §5a) entirely: it needs no answer key, no held-out final state, and **no
15
+ self-authored satisfaction predicate** ("is this the row the task *required*?" — the
16
+ forgeable-in-the-agent's-favor question this module must never ask). It attacks two
17
+ *named* benchmark failure modes (docs/143 §1b) that feed the Integrity (FK-validity)
18
+ verifier:
19
+
20
+ * **Incorrect ID Resolution** — passing unverified IDs minted by the model instead of
21
+ resolving the correct IDs through prior tool interactions.
22
+ * **Missing Prerequisite Lookup** — creating an object without first querying the
23
+ prereqs, so the FK it references was never read.
24
+
25
+ Why this is the kernel's to own, and where the line is
26
+ ======================================================
27
+
28
+ `believe=True` here means **only** "no id arg was minted from nowhere" — it is NEVER a
29
+ claim that the args are *correct* (that would be a satisfaction predicate, the trap).
30
+ The structural guarantee that this module cannot launder a self-authored predicate is in
31
+ the type: the provenance corpus (`PriorResults`) is a tuple of `EnvBlob`, and an
32
+ `EnvBlob` can carry only an **env** `CorpusSource` (`TOOL_RESULT` / `TASK_TEXT`). There
33
+ is deliberately **no `AGENT_AUTHORED` member** — a boundary that tried to fold an
34
+ assistant turn into the corpus has no enum to tag it with, so model-authored bytes are
35
+ *unrepresentable* as evidence. That is the docs/143 §5a discipline made structural, the
36
+ same shape `evidence.believe_under_floor` uses (a forgeable-floor source can never grant
37
+ belief) — here pushed one step further: the forgeable class does not exist in the type.
38
+
39
+ The verdict is **advisory**: it REPORTS; it never raises, never dispatches, never mutates.
40
+ The consumer (a `dos_react`-style orchestrator wrapper, benchmark-side — NOT in the
41
+ kernel) reads `unsupported` and injects ONE nudge ToolMessage ("resolve `<value>` via a
42
+ read tool first") instead of dispatching the mutating call, pushing the cheap model to do
43
+ the prerequisite lookup it skipped. The verdict's only power is to nudge-MORE; it has no
44
+ output that can force a call through — **refuse-MORE-only by the shape of the type**, the
45
+ admission-seam / fail-to-abstain discipline re-aimed at the argument grain. The
46
+ per-arg-value re-injection cap (≤1, docs/143 §4) lives in the consumer; the pure verdict
47
+ cannot loop.
48
+
49
+ The two errors, and which one is safe
50
+ =====================================
51
+
52
+ Only two error directions are reachable, and the design biases hard toward the safe one:
53
+
54
+ * **false-SUPPORTED** (a minted id coincidentally substrings the corpus) → the verdict
55
+ declines to nudge → R1 degrades to the baseline for that call. SAFE — no worse than
56
+ not having the gate.
57
+ * **false-UNSUPPORTED** (a *legit derived* id — padded `INC0010023` from a bare env
58
+ `10023`, a composite `user_42@acme.com` from env parts — wrongly flagged) → an
59
+ unnecessary nudge wastes an iteration and, on a thrashing agent near its cap, can
60
+ convert a would-pass run into a timeout (the docs/143 §8 feasible-task **kill-signal**).
61
+ DANGEROUS. The component decomposition (Step D) + the derived-id containment rungs
62
+ (Step E reverse-substring + numeric-pad-normalize) drive this rate toward ~0, which is
63
+ what lets R1 clear its gate (Integrity UP, feasible-task rate FLAT).
64
+
65
+ So the whole module is tuned to **under-fire**: a missed mint is a silent safe ABSTAIN; a
66
+ false flag risks a real regression. Every ambiguous case resolves to ABSTAIN.
67
+
68
+ ⚓ Pure kernel, I/O on the edge (the dos idiom — mirrors `liveness.classify`,
69
+ `churn.decide_coalesce`, `evidence.believe_under_floor`): `classify_call(ToolCall,
70
+ PriorResults, policy) -> ProvenanceVerdict` is a frozen dataclass in, a frozen verdict
71
+ out. The caller (the wrapper, at the benchmark boundary) flattens each prior tool RESULT
72
+ to a string and tags it with its env source BEFORE the call; the kernel never parses
73
+ JSON, never reads a file, never calls a clock. That is what lets the whole verdict be
74
+ unit-tested on frozen fixtures with zero benchmark/LLM/MCP access — the keystone the audit
75
+ calls "testable with zero benchmark access."
76
+ """
77
+
78
+ from __future__ import annotations
79
+
80
+ import enum
81
+ import re
82
+ from dataclasses import dataclass
83
+
84
+
85
+ # ---------------------------------------------------------------------------
86
+ # The closed source vocabulary — the structural non-self-authorship guarantee.
87
+ # ---------------------------------------------------------------------------
88
class CorpusSource(str, enum.Enum):
    """Tag naming which env-authored channel produced an `EnvBlob`'s bytes.

    The vocabulary is deliberately closed over environment-authored classes: it has
    exactly two members and no agent-authored one, so there is no enum value with which
    a model turn could be labelled as provenance evidence.

    The enum mixes in `str`, so a member compares equal to (and serialises as) its plain
    token without a lookup table.

    Members:
        TOOL_RESULT: bytes from a prior tool/read RESULT the agent observed but did not
            write. The primary provenance source.
        TASK_TEXT: bytes from the task prompt / policy document, so an id named by the
            task itself counts as env-authored.
    """

    TOOL_RESULT = "TOOL_RESULT"
    TASK_TEXT = "TASK_TEXT"

    def __str__(self) -> str:  # pragma: no cover - trivial
        # Render as the bare token rather than the default "CorpusSource.X" form.
        return str(self.value)
114
+
115
+
116
class ProvenanceStance(str, enum.Enum):
    """Three-valued verdict attached to a single tool-call argument.

    The enum mixes in `str`, so a member round-trips through a token / JSON value
    without a lookup table.

    Members:
        SUPPORTED: the value is id-shaped, is a reference on a mutating call, and every
            data-bearing component traced to env-authored bytes.
        UNSUPPORTED: the value is id-shaped, is a reference on a mutating call, the
            corpus was non-empty, and at least one data-bearing component appears
            nowhere in env bytes (it looks model-minted). The only stance that drives a
            nudge.
        ABSTAIN: the fail-safe zero. Used when the value is not id-shaped, the call is
            a read / non-mutating call, the slot is the create's own new key, or the
            corpus is empty (mintage cannot be proven with zero env bytes). It signals
            "no evidence either way" and is never a block.
    """

    SUPPORTED = "SUPPORTED"
    UNSUPPORTED = "UNSUPPORTED"
    ABSTAIN = "ABSTAIN"

    def __str__(self) -> str:  # pragma: no cover - trivial
        # Render as the bare token rather than the default "ProvenanceStance.X" form.
        return str(self.value)
138
+
139
+
140
+ # ---------------------------------------------------------------------------
141
+ # Frozen inputs — the pure datum a caller gathers at the boundary and hands in.
142
+ # ---------------------------------------------------------------------------
143
@dataclass(frozen=True)
class EnvBlob:
    """One env-authored chunk of the provenance corpus — bytes the agent did NOT write.

    Attributes:
        text: one prior tool RESULT (or the task text), already flattened to a string by
            the caller at the boundary; this class never parses it.
        source: the env channel the bytes came from. Must be a `CorpusSource` member, or
            a plain token equal to one (e.g. ``"TOOL_RESULT"``), which is normalised to
            the member on construction.

    Raises:
        ValueError: if `source` is not a valid `CorpusSource` value. Without this check
            the "only env sources can tag a blob" guarantee would hold for type checkers
            only; an arbitrary string such as ``"AGENT_AUTHORED"`` would be accepted at
            runtime.
    """

    text: str
    source: CorpusSource

    def __post_init__(self) -> None:
        # Enum lookup by value: returns the same member for a member, maps a valid token
        # to its member, and raises ValueError for anything outside the closed set.
        # The dataclass is frozen, so the normalised value is stored through
        # object.__setattr__ (the documented route for frozen instances).
        object.__setattr__(self, "source", CorpusSource(self.source))
155
+
156
+
157
@dataclass(frozen=True)
class PriorResults:
    """Everything env-authored that was accumulated before the call being judged.

    Attributes:
        blobs: a tuple of `EnvBlob` — each prior tool RESULT plus the task text, every
            one tagged with its env source. It defaults to the empty tuple, the state on
            the very first call of an episode; the module documents that an empty corpus
            leads to ABSTAIN for every argument, since mintage cannot be proven against
            zero env bytes. Blobs are stored whole rather than pre-tokenised, so an id
            embedded in running prose can still be located by containment.
    """

    blobs: tuple[EnvBlob, ...] = ()
169
+
170
+
171
@dataclass(frozen=True)
class ToolArg:
    """A single argument of the tool call, in the shape a provenance check consumes.

    Attributes:
        name: the argument's name.
        value: the raw value the model emitted (str, int, float, bool, None, list or
            dict). Per the module's design, scalars are provenance-checked and
            lists/dicts are recursed into.
        is_reference: False marks the slot that holds the NEW object's own identity /
            primary key, which the caller resolves from the tool schema. A freshly
            minted natural key is both minted and correct, so it must never be nudged.
            The default is True (most arguments reference existing rows): a caller that
            does not annotate gets the gating behaviour, and the own-key exemption is
            opt-in.
    """

    name: str
    value: object
    is_reference: bool = True
188
+
189
+
190
@dataclass(frozen=True)
class ToolCall:
    """The tool call being judged — the `AdmissionRequest` analogue.

    Attributes:
        tool_name: name of the tool being invoked.
        args: the call's arguments, one `ToolArg` each.
        is_mutating: set by the caller from the tool schema (a write-verb classifier).
            Reads are never gated, because reads are how provenance enters the corpus,
            so a False value is documented to short-circuit the whole fold to
            ABSTAIN-all. The caller-side classifier is meant to be fail-open (when
            unsure, treat the call as a read): under-gating merely degrades to the
            baseline, whereas a false gate risks a real regression. Defaults to True.
    """

    tool_name: str
    args: tuple[ToolArg, ...]
    is_mutating: bool = True
206
+
207
+
208
@dataclass(frozen=True)
class ProvenancePolicy:
    """Tunable thresholds for the provenance check (the mechanism lives in the kernel).

    The defaults are generic; per the module's design a host may declare its own values
    in `dos.toml [arg_provenance]`.

    Attributes:
        min_component_len: components shorter than this are dropped from the must-trace
            set, being too collision-prone either to demand or to substring-match (a
            standalone "P1" / "42" / "US"). There is intentionally no fractional-support
            knob: the only rule is that every data-bearing component traces.
        case_sensitive: False (the default) casefolds both sides before comparing, so a
            re-cased but legitimate id is not wrongly reported as unsupported.

    Raises:
        ValueError: when `min_component_len` is below 1.
    """

    min_component_len: int = 4
    case_sensitive: bool = False

    def __post_init__(self) -> None:
        # A zero or negative minimum length would make the "too short" filter meaningless.
        too_small = self.min_component_len < 1
        if too_small:
            raise ValueError("min_component_len must be >= 1")


# Shared instance carrying the generic defaults.
DEFAULT_POLICY = ProvenancePolicy()
236
+
237
+
238
+ # ---------------------------------------------------------------------------
239
+ # Frozen verdicts — the folded answer, advisory only.
240
+ # ---------------------------------------------------------------------------
241
@dataclass(frozen=True)
class ArgProvenance:
    """Per-argument provenance sub-verdict — the detail behind the folded answer.

    Attributes:
        arg_name: name of the argument this sub-verdict describes.
        value_repr: string rendering of the argument's value.
        stance: the `ProvenanceStance` reached for this argument.
        id_shaped: whether the value was judged id-shaped.
        is_reference: the argument's reference flag, carried through.
        matched_in: the env source(s) that carried the traced components.
        components_checked: the components that were examined.
        components_unmatched: the precise sub-ids that did not trace, so that a nudge
            can name exactly what to resolve instead of a vague "resolve your id".
        reason: human-readable explanation of the stance.
    """

    arg_name: str
    value_repr: str
    stance: ProvenanceStance
    id_shaped: bool
    is_reference: bool
    matched_in: tuple[CorpusSource, ...]
    components_checked: tuple[str, ...]
    components_unmatched: tuple[str, ...]
    reason: str

    def to_dict(self) -> dict:
        """Return a plain-dict view: enums become their values, tuples become lists."""
        source_tokens = [source.value for source in self.matched_in]
        return dict(
            arg_name=self.arg_name,
            value_repr=self.value_repr,
            stance=self.stance.value,
            id_shaped=self.id_shaped,
            is_reference=self.is_reference,
            matched_in=source_tokens,
            components_checked=list(self.components_checked),
            components_unmatched=list(self.components_unmatched),
            reason=self.reason,
        )
273
+
274
+
275
@dataclass(frozen=True)
class ProvenanceVerdict:
    """Folded top-level answer for one tool call — the `LivenessVerdict` analogue.

    The verdict is advisory: it is a plain value for the consumer to read when deciding
    whether to nudge.

    Attributes:
        believe: True iff no argument is UNSUPPORTED, i.e. every id-shaped reference
            argument either traced to env bytes or there was nothing to check. It means
            only "no id was minted from nowhere" and is never a claim that the arguments
            are correct.
        args: every per-argument sub-verdict, abstained ones included, for legibility.
        unsupported: names of the arguments a nudge should target (empty exactly when
            `believe` holds).
        reason: human-readable summary.
    """

    believe: bool
    args: tuple[ArgProvenance, ...]
    unsupported: tuple[str, ...]
    reason: str

    def to_dict(self) -> dict:
        """Return a plain-dict view; each sub-verdict is expanded via its own `to_dict`."""
        per_arg = [sub_verdict.to_dict() for sub_verdict in self.args]
        return dict(
            believe=self.believe,
            args=per_arg,
            unsupported=list(self.unsupported),
            reason=self.reason,
        )
299
+
300
+
301
+ # ---------------------------------------------------------------------------
302
+ # Detection + matching — pure, decidable from the corpus alone, no answer key.
303
+ # ---------------------------------------------------------------------------
304
+
305
+ # Step B negative filters — a value matching any of these is a quantity/literal, NOT an
306
+ # FK, so it is rejected as not-id-shaped BEFORE any positive test (the date/money/version/
307
+ # phone false-block killer). Anchored full-string with `^…$` (note `$` also matches just before one trailing newline). The datetime forms are real-data-
308
+ # hardened (docs/143 live run): a full ISO-8601 timestamp `2025-08-23T00:00:00Z` was being
309
+ # mis-split into id components (`23T00`, `59`), so the filter matches the whole stamp with
310
+ # its `T`/`:`/`Z`/offset, not just a bare `YYYY-MM-DD`.
311
+ _RE_ISO_DATE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
312
+ _RE_ISO_DATETIME = re.compile(
313
+ r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}(:\d{2}(\.\d+)?)?(Z|[+-]\d{2}:?\d{2})?$"
314
+ )
315
+ _RE_TIME = re.compile(r"^\d{1,2}:\d{2}(:\d{2})?$")
316
+ _RE_DECIMAL = re.compile(r"^\d+\.\d+$")
317
+ _RE_VERSION = re.compile(r"^v?\d+(\.\d+)+$")
318
+ # Phone-ish: must carry a phone SEPARATOR ('-', '+', '(', ')' or whitespace) — a BARE integer is NOT
319
+ # phone-ish (it is a numeric PK / FK). The old `^[\d\-+()\s]+$` matched every pure-digit
320
+ # string, wrongly negative-filtering numeric ids like `1179` (docs/143 real-data fix).
321
+ _RE_PHONEISH = re.compile(r"^[\d()][\d\-+()\s]*[\-+()\s][\d\-+()\s]*\d$")
322
+ _RE_EPOCH_MS = re.compile(r"^\d{13}$") # a 13-digit ms-epoch timestamp (a quantity, not an FK)
323
+ _LITERAL_WORDS = frozenset({"true", "false", "null", "none"})
324
+
325
+ # Step C positive signatures. Note: _RE_HEX32 is lowercase-only, while _RE_UUID accepts either case.
326
+ _RE_HEX32 = re.compile(r"^[0-9a-f]{32}$")
327
+ _RE_UUID = re.compile(r"^[0-9a-fA-F]{8}-(?:[0-9a-fA-F]{4}-){3}[0-9a-fA-F]{12}$")
328
+ _RE_EMAIL = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
329
+ _RE_HAS_DIGIT = re.compile(r"\d")
330
+ _RE_HAS_ALPHA = re.compile(r"[A-Za-z]")
331
+ # The character class an id-shaped mixed-alnum token may contain (no internal whitespace).
332
+ _RE_ID_CHARS = re.compile(r"^[A-Za-z0-9._:#/\-@]+$")
333
+ _RE_DIGIT_RUN = re.compile(r"\d+")
334
+
335
+ # The delimiter class a composite id splits on (Step D).
336
+ _DELIM_RE = re.compile(r"[@._:#/\-]")
337
+ # The tokenizer for the env corpus (Step E exact-match): id-delimiters + whitespace + the
338
+ # JSON/structural punctuation that wraps env values (braces, brackets, quotes, commas,
339
+ # parens, equals, semicolons). Without stripping these, a JSON value `10023` tokenizes as
340
+ # `10023}` and an exact/pad match misses — so the env corpus must be punctuation-clean.
341
+ _ENV_TOKEN_RE = re.compile(r"[@._:#/\-\s{}\[\]\"',()=;<>|]+")
342
+ # A "clean" id part: an optional alpha prefix then a trailing digit run (INC0010023,
343
+ # acme99, p0001) — the part whose DATA unit is just the digit run. Anything else that is
344
+ # alnum (hex, interleaved) is demanded WHOLE so we never demand a meaningless 1-char run.
345
+ _RE_PREFIX_THEN_DIGITS = re.compile(r"^[A-Za-z]*\d+$")
346
+
347
+ # Name-hint suffixes (Step C corroboration). Suffix-anchored, NOT substring — so
348
+ # `phone_number` / `version_number` / `due_to_date` are NOT hints (their substrings
349
+ # `_number`/`_to`/`_date` are excluded by anchoring to these exact tails).
350
+ _NAME_HINT_SUFFIXES = ("_id", "_sys_id", "sys_id", "_ref", "_key", "_email")
351
+
352
+ # Quantity-name stoplist (docs/143 real-data fix): an arg whose name signals a NUMBER, not
353
+ # an FK — a price, an amount, a count. A bare pure-digit value in such a slot is the model
354
+ # legitimately SETTING a quantity (contract_price=33414), never an id to resolve, so it must
355
+ # never be treated as id-shaped (it caused live false-flags). Matched as a name SUFFIX /
356
+ # whole-word so `unit_price`/`total_amount`/`page_size` hit but an `*_id` never does; a name whose quantity word is not at the tail (`max_results`) is NOT matched.
357
+ _QUANTITY_NAME_PARTS = (
358
+ "price", "amount", "cost", "total", "count", "quantity", "qty", "size", "limit",
359
+ "number", "num", "age", "duration", "score", "rate", "percent", "weight", "height",
360
+ "width", "length", "balance", "fee", "rank", "position", "offset", "page", "max",
361
+ "min", "priority", "level", "year", "month", "day", "hour", "minute", "second",
362
+ )
363
+
364
+
365
def _name_is_quantity(arg_name: str) -> bool:
    """Return True when the argument NAME ends in a quantity word (price/amount/count/…).

    A bare number in such a slot is a value the model sets, not an FK to resolve.

    The name is casefolded, then two tail tests are applied: the last `_`/`-` separated
    token must be a quantity word, or the whole name must end with one (which is what
    catches camelCase such as `unitPrice`). Only the tail is inspected, so `unit_price`
    matches, `account_id` does not, and neither does a name whose quantity word is not
    last (`max_results`).

    NOTE(review): the whole-name suffix test is a plain string suffix, so an unrelated
    word that merely ends in a quantity word (`message` → `age`, `admin` → `min`) also
    matches. Confirm this is acceptable.
    """
    folded = arg_name.casefold()
    tail_token = folded.replace("-", "_").rsplit("_", 1)[-1]
    if tail_token in _QUANTITY_NAME_PARTS:
        return True
    # str.endswith accepts a tuple of candidate suffixes.
    return folded.endswith(_QUANTITY_NAME_PARTS)
372
+
373
+ # Grammar stoplist (Step D) — pure-alpha composite parts that are domain GRAMMAR, not the
374
+ # data-bearing identifier portion: common TLDs + a few connective/scheme words the model
375
+ # may legitimately type. A pure-alpha part in this set is exempt from must-trace even when
376
+ # it is long enough to otherwise be demanded; a pure-alpha part NOT in this set and long
377
+ # enough (e.g. the org-name in user_42@acme.com / @evil.com) IS demanded, so a minted
378
+ # domain is caught while a real one traces. Kept deliberately small — over-listing turns a
379
+ # minted identifier word into exempt grammar (a laundering leak), under-listing risks a
380
+ # false-block on an exotic TLD (the safe direction: a false-block degrades, see module doc). NOTE(review): the module docstring classes a false-block (false-UNSUPPORTED) as the DANGEROUS direction and false-SUPPORTED as the safe one — confirm which trade-off is intended here.
381
+ _GRAMMAR_WORDS = frozenset({
382
+ "com", "org", "net", "edu", "gov", "mil", "int", "io", "co", "us", "uk", "ca", "au",
383
+ "www", "http", "https", "mailto", "ftp", "api", "mail", "smtp", "imap",
384
+ })
385
+
386
+
387
+ def _casefold(s: str, policy: ProvenancePolicy) -> str:
388
+ return s if policy.case_sensitive else s.casefold()
389
+
390
+
391
def _is_negative_filtered(s: str) -> bool:
    """Step B — report whether `s` is a quantity/literal rather than an FK.

    True for a date, datetime, time, decimal, version, phone-like string, 13-digit
    epoch, one of the literal words (true/false/null/none, any case), or pure prose.
    A value this returns True for is never id-shaped.
    """
    quantity_patterns = (
        _RE_ISO_DATE,
        _RE_ISO_DATETIME,
        _RE_TIME,
        _RE_DECIMAL,
        _RE_VERSION,
        _RE_PHONEISH,
        _RE_EPOCH_MS,
    )
    if any(pattern.match(s) for pattern in quantity_patterns):
        return True
    if s.casefold() in _LITERAL_WORDS:
        return True
    # Pure prose: contains a space, carries no digit, and is not email-shaped — free
    # text such as a short_description, never an id.
    return (
        " " in s
        and _RE_HAS_DIGIT.search(s) is None
        and _RE_EMAIL.match(s) is None
    )
407
+
408
+
409
def _name_is_hint(arg_name: str) -> bool:
    """Return True when the casefolded argument NAME ends with an id-bearing slot suffix.

    This is corroborating evidence only: callers still require a positive value
    signature and never use it to promote a Step-B-rejected value.
    """
    # str.endswith accepts a tuple of candidate suffixes.
    return arg_name.casefold().endswith(_NAME_HINT_SUFFIXES)
414
+
415
+
416
def _is_id_shaped(s: str, arg_name: str, policy: ProvenancePolicy) -> bool:
    """Step C — decide whether `s` carries a POSITIVE id signature.

    The test is biased to under-fire: a missed id is a silent, safe ABSTAIN, whereas a
    false flag risks a false-block.

    Args:
        s: the scalar value, as a string.
        arg_name: the argument's name, used only as corroboration.
        policy: supplies `min_component_len`.

    Returns:
        True when the value should be provenance-checked as an id.
    """
    floor = policy.min_component_len

    # Recall lift: a bare SHORT integer (2 chars up to, not including, the floor) in a
    # strong FK-name slot (`group_id`, `caller_id`, `*_ref`) is a numeric primary key
    # (group_id=81). It is promoted only when the NAME corroborates, the name is not a
    # quantity, and the value survives Step B.
    if s.isdigit() and 2 <= len(s) < floor:
        if (
            _name_is_hint(arg_name)
            and not _name_is_quantity(arg_name)
            and not _is_negative_filtered(s)
        ):
            return True

    # Too short to check, or a Step-B quantity/literal.
    if len(s) < floor or _is_negative_filtered(s):
        return False
    # An id token has no internal space and stays inside the id character class.
    if " " in s or not _RE_ID_CHARS.match(s):
        return False

    # (iii) hex32 / UUID, (iv) email.
    if any(signature.match(s) for signature in (_RE_HEX32, _RE_UUID, _RE_EMAIL)):
        return True
    # (i) mixed alnum.
    if _RE_HAS_DIGIT.search(s) and _RE_HAS_ALPHA.search(s):
        return True
    # (ii) pure-digit key (length already >= floor, and it survived Step B) — unless the
    # name signals a quantity, where a bare number is a value the model sets.
    if s.isdigit() and not _name_is_quantity(arg_name):
        return True
    # Name-hint corroboration: an opaque alnum token (length already >= floor) sitting
    # in an *_id/_ref/_key slot is treated as an opaque key.
    return _name_is_hint(arg_name) and s.isalnum()
458
+
459
+
460
def _data_bearing_components(s: str, policy: ProvenancePolicy) -> tuple[tuple[str, ...], bool]:
    """Step D — decompose `s` and keep only the components that must trace to the env.

    Returns `(components, genuinely_id)`: the ordered, de-duplicated data-bearing
    components, and a flag that is True exactly when at least one component survived.

    How each delimiter-separated part is treated:
      * DATA-BEARING (demanded):
          - the trailing digit run of an alpha-prefix + digits part ("INC0010023" →
            "0010023"; the prefix is grammar);
          - any other digit-containing part, kept WHOLE (an interleaved hex/UUID-like
            chunk traces atomically or not at all);
          - a long, non-stoplisted pure-alpha label in the DOMAIN region (after the first
            "@") — the org label, so a minted domain is caught while a real one resolves.
      * GRAMMAR (exempt): a pure-alpha part in the local region (type-words, names, enum
        words) and any stoplisted word (`_GRAMMAR_WORDS`) in the domain region.
      * DROPPED: a demanded non-digit chunk shorter than `policy.min_component_len`
        (too collision-prone). Short pure-digit runs are kept.

    ENUM GUARD: when nothing is demanded ("itil_admin", "in_progress") the flag is False
    and the caller abstains — a role/status/enum token never nudges.

    REAL-DATA HARDENING (docs/143 live run):
      * a UUID / 32-hex value is demanded as ONE component and never split on "-";
      * a digit run in an EMAIL LOCAL part shorter than `min_component_len`
        ("smith10" → "10") is a username discriminator, not an FK, and is not demanded.
    """
    # One opaque identity: demand it whole (splitting on '-' is the documented false-flag).
    if _RE_HEX32.match(s) or _RE_UUID.match(s):
        return (s,), True

    min_len = policy.min_component_len

    # Everything before the first "@" is the local region, everything after is the domain.
    local_region, at_sign, domain_region = s.partition("@")
    is_email = bool(at_sign)

    tagged = [(piece, False) for piece in _DELIM_RE.split(local_region) if piece]
    tagged += [(piece, True) for piece in _DELIM_RE.split(domain_region) if piece]

    candidates: list[str] = []
    for piece, in_domain in tagged:
        if _RE_HAS_DIGIT.search(piece) is None:
            # Digit-free part: grammar in the local region; in the domain region a long
            # non-stoplisted label is the org identity and is demanded.
            is_org_label = (
                in_domain
                and len(piece) >= min_len
                and piece.casefold() not in _GRAMMAR_WORDS
            )
            if is_org_label:
                candidates.append(piece)
            continue

        if not _RE_PREFIX_THEN_DIGITS.match(piece):
            # Interleaved alnum token: demanded as one atomic unit — its individual
            # short digit runs would match almost anything.
            candidates.append(piece)
            continue

        # Clean alpha-prefix + digits: only the last digit run carries data.
        digit_runs = _RE_DIGIT_RUN.findall(piece)
        if not digit_runs:
            continue
        tail = digit_runs[-1]
        if is_email and not in_domain and len(tail) < min_len:
            continue  # username discriminator — grammar, not a resolvable FK
        candidates.append(tail)

    # Ordered dedup. A short chunk is kept only when it is pure digits (a short numeric PK
    # is real data); any other short chunk is too collision-prone to demand or match.
    kept: dict[str, None] = {}
    for comp in candidates:
        if comp.isdigit() or len(comp) >= min_len:
            kept.setdefault(comp, None)
    components = tuple(kept)
    return components, bool(components)
548
+
549
+
550
def _component_found(component: str, env_text: str, env_tokens: frozenset[str],
                     policy: ProvenancePolicy) -> bool:
    """Step E — True iff `component` traces to the env corpus. Several rungs, any one:

      (a) exact: the casefolded component equals an env token.
      (d) numeric-pad normalize: for a pure-digit component, its zero-stripped form equals
          the zero-stripped form of a pure-digit env token ("0010023" vs "10023"). Checked
          before the substring rungs so int-equality is authoritative, and NOT length-
          guarded, so short numeric keys can still match by value.
      (b) substring: the component is a substring of the joined env text.
      (c) reverse-substring: an env token is a substring of the component — covers the
          model DERIVING a padded/prefixed id from a bare env fragment. For a pure-digit
          component the env token's zero-stripped form is tried as well.

    Rungs (b) and (c) apply only when the component is at least
    `policy.min_component_len` long, and every needle searched for in (c) must itself be
    at least that long. The guard is applied to the zero-STRIPPED form, not the padded
    token: otherwise an env token like "0001" would strip to "1" and match any digit
    component containing a "1".

    Args:
        component: one data-bearing component of the argument value.
        env_text: the joined env corpus text (as produced by `_build_env`).
        env_tokens: the set of env tokens.
        policy: supplies `min_component_len` and is forwarded to `_casefold`.
    """
    c = _casefold(component, policy)

    # (a) exact token.
    if c in env_tokens:
        return True

    min_len = policy.min_component_len
    c_is_digits = c.isdigit()

    # (d) numeric-pad normalize: compare zero-stripped forms of pure-digit strings.
    if c_is_digits:
        c_int = c.lstrip("0") or "0"
        for tok in env_tokens:
            if tok.isdigit() and (tok.lstrip("0") or "0") == c_int:
                return True

    # The remaining rungs are substring tests, which are only meaningful (and safe from
    # chance collisions) for a sufficiently long component.
    if len(c) < min_len:
        return False

    # (b) substring in the joined env text.
    if c in env_text:
        return True

    # (c) reverse-substring: a sufficiently long env token sits inside the component.
    for tok in env_tokens:
        if len(tok) < min_len:
            continue
        if tok in c:
            return True
        # Numeric reverse: the env digit token's zero-stripped form inside the
        # component's digits. The stripped form must still clear the length guard.
        if c_is_digits and tok.isdigit():
            stripped = tok.lstrip("0") or "0"
            if len(stripped) >= min_len and stripped in c:
                return True
    return False
588
+
589
+
590
def _flatten_leaves(value: object) -> list[object]:
    """Return the scalar leaves of a (possibly nested) list/tuple/dict value (Step A.3).

    Dicts contribute their values (keys are ignored); lists and tuples contribute their
    items; anything else is itself a leaf. Leaves come back in depth-first, left-to-right
    order. The caller provenance-checks each leaf independently and folds the results:
    UNSUPPORTED if ANY id-leaf is minted, SUPPORTED if all id-leaves trace, ABSTAIN if no
    leaf is id-shaped.
    """
    if isinstance(value, dict):
        children = value.values()
    elif isinstance(value, (list, tuple)):
        children = value
    else:
        # Not a container: the value is its own single leaf.
        return [value]
    return [leaf for child in children for leaf in _flatten_leaves(child)]
604
+
605
+
606
def _build_env(prior: PriorResults, policy: ProvenancePolicy) -> tuple[str, dict[str, set[CorpusSource]]]:
    """Prepare the env corpus for matching: joined casefolded text + token→sources map.

    Each blob in `prior.blobs` is casefolded via `_casefold`. The folded texts are joined
    with a single space to form the first return value; each folded text is also split on
    `_ENV_TOKEN_RE`, and every non-empty token is mapped to the set of blob sources in
    which it occurred (used by callers to populate `matched_in`).

    Returns:
        `(joined_text, token_sources)`.
    """
    folded_texts: list[str] = []
    sources_by_token: dict[str, set[CorpusSource]] = {}
    for blob in prior.blobs:
        folded = _casefold(blob.text, policy)
        folded_texts.append(folded)
        # filter(None, …) drops the empty pieces the split leaves around delimiters.
        for token in filter(None, _ENV_TOKEN_RE.split(folded)):
            if token not in sources_by_token:
                sources_by_token[token] = set()
            sources_by_token[token].add(blob.source)
    return " ".join(folded_texts), sources_by_token
621
+
622
+
623
def _sources_supplying(cf: str, token_sources: dict[str, set[CorpusSource]]) -> set[CorpusSource]:
    """Best-effort attribution of a matched (casefolded) string to corpus sources.

    Uses the exact-token entry when there is one; otherwise returns the union of the
    sources of every env token that contains `cf` or is contained in it. The returned
    set may be the map's own entry, so callers must not mutate it.
    """
    exact = token_sources.get(cf)
    if exact:
        return exact
    found: set[CorpusSource] = set()
    for tok, srcs in token_sources.items():
        if cf in tok or tok in cf:
            found.update(srcs)
    return found


def _classify_scalar_arg(
    name: str,
    value: object,
    prior: PriorResults,
    policy: ProvenancePolicy,
    env_memo: list[tuple[str, dict[str, set[CorpusSource]], frozenset[str]]],
) -> ArgProvenance:
    """Classify ONE scalar reference value (Steps A.2 and B–E).

    `env_memo` is a single-slot cache shared by all leaves of one `classify_arg` call:
    the env corpus is built lazily on the first id-shaped value and reused afterwards,
    so a container with many ids prepares the corpus once instead of once per leaf.
    """
    # Step A.2 — None / bool are never ids.
    if value is None or isinstance(value, bool):
        return ArgProvenance(
            arg_name=name, value_repr=str(value), stance=ProvenanceStance.ABSTAIN,
            id_shaped=False, is_reference=True, matched_in=(), components_checked=(),
            components_unmatched=(), reason="flag/None value — never an id",
        )

    s = str(value).strip()
    if not _is_id_shaped(s, name, policy):
        return ArgProvenance(
            arg_name=name, value_repr=s, stance=ProvenanceStance.ABSTAIN,
            id_shaped=False, is_reference=True, matched_in=(), components_checked=(),
            components_unmatched=(), reason="not id/FK-shaped — quantity, literal, or prose",
        )

    # Build the corpus only now (non-id values never need it) and only once per call.
    if not env_memo:
        built_text, built_sources = _build_env(prior, policy)
        env_memo.append((built_text, built_sources, frozenset(built_sources)))
    env_text, token_sources, env_tokens = env_memo[0]

    # WHOLE-VALUE DIRECT MATCH (the primary rung — docs/143 live run). The common honest
    # case is the model passing an id it read back VERBATIM (`INC_004`, `msg_001`, a UUID).
    # If the entire value appears in the env corpus — as an exact token OR a substring of
    # the joined text — it is RESOLVED. This is answered BEFORE decomposing, because
    # decomposition is a heuristic for derived/composite ids and on a verbatim id it can
    # demand a too-short sub-run (`004`) that the matcher then misses (a false-flag).
    cf_whole = _casefold(s, policy)
    if cf_whole in env_tokens or cf_whole in env_text:
        srcs = _sources_supplying(cf_whole, token_sources)
        return ArgProvenance(
            arg_name=name, value_repr=s, stance=ProvenanceStance.SUPPORTED,
            id_shaped=True, is_reference=True,
            matched_in=tuple(sorted(srcs, key=lambda x: x.value)),
            components_checked=(s,), components_unmatched=(),
            reason=f"the id {s!r} appears verbatim in env-authored bytes (direct match)",
        )

    components, genuinely_id = _data_bearing_components(s, policy)
    if not genuinely_id:
        # Step D enum guard: delimiter present but no data-bearing component (itil_admin).
        return ArgProvenance(
            arg_name=name, value_repr=s, stance=ProvenanceStance.ABSTAIN,
            id_shaped=False, is_reference=True, matched_in=(), components_checked=(),
            components_unmatched=(),
            reason="enum/role/status token — no data-bearing component to resolve",
        )

    unmatched: list[str] = []
    matched_sources: set[CorpusSource] = set()
    for c in components:
        if _component_found(c, env_text, env_tokens, policy):
            # Record which source(s) supplied the hit (best-effort).
            matched_sources.update(_sources_supplying(_casefold(c, policy), token_sources))
        else:
            unmatched.append(c)

    if unmatched:
        return ArgProvenance(
            arg_name=name, value_repr=s, stance=ProvenanceStance.UNSUPPORTED,
            id_shaped=True, is_reference=True,
            matched_in=tuple(sorted(matched_sources, key=lambda x: x.value)),
            components_checked=components, components_unmatched=tuple(unmatched),
            reason=(
                f"id-shaped reference {s!r} has component(s) {unmatched} that appear in no "
                f"env-authored bytes — looks model-minted (resolve via a read first)"
            ),
        )
    return ArgProvenance(
        arg_name=name, value_repr=s, stance=ProvenanceStance.SUPPORTED,
        id_shaped=True, is_reference=True,
        matched_in=tuple(sorted(matched_sources, key=lambda x: x.value)),
        components_checked=components, components_unmatched=(),
        reason=f"every data-bearing component of {s!r} traced to env-authored bytes",
    )


def classify_arg(arg: ToolArg, prior: PriorResults,
                 policy: ProvenancePolicy = DEFAULT_POLICY) -> ArgProvenance:
    """Per-arg leaf — SUPPORTED / UNSUPPORTED / ABSTAIN for ONE argument. PURE.

    Order of checks:
      * A.1 — a non-reference arg (the create's own key) always ABSTAINs.
      * A.3 — a list/tuple/dict value is flattened to its scalar leaves; each leaf is
        classified as a reference under the same arg name and the results are folded:
        any id-leaf UNSUPPORTED → UNSUPPORTED; all id-leaves traced → SUPPORTED;
        no id-leaf → ABSTAIN. The env corpus is prepared at most once for all leaves.
      * otherwise the scalar value is classified directly (see `_classify_scalar_arg`).

    Assumes the call-level guards (read call, empty corpus) were applied by
    `classify_call`; a direct caller passing a non-empty `prior` gets the full check.
    """
    name = arg.name

    # Step A.1 — the create's-own-key exemption.
    if not arg.is_reference:
        return ArgProvenance(
            arg_name=name, value_repr=str(arg.value), stance=ProvenanceStance.ABSTAIN,
            id_shaped=False, is_reference=False, matched_in=(), components_checked=(),
            components_unmatched=(),
            reason="new-key / own-identity slot — not a reference to resolve",
        )

    # Shared single-slot cache so the corpus is built once per classify_arg call.
    env_memo: list[tuple[str, dict[str, set[CorpusSource]], frozenset[str]]] = []

    if not isinstance(arg.value, (list, tuple, dict)):
        return _classify_scalar_arg(name, arg.value, prior, policy, env_memo)

    # Step A.3 — fold over the container's scalar leaves.
    any_id = False
    any_unsupported = False
    checked_all: list[str] = []
    unmatched_all: list[str] = []
    matched_sources: set[CorpusSource] = set()
    for leaf in _flatten_leaves(arg.value):
        sub = _classify_scalar_arg(name, leaf, prior, policy, env_memo)
        checked_all.extend(sub.components_checked)
        if sub.id_shaped:
            any_id = True
            matched_sources.update(sub.matched_in)
            if sub.stance is ProvenanceStance.UNSUPPORTED:
                any_unsupported = True
                unmatched_all.extend(sub.components_unmatched)

    if not any_id:
        return ArgProvenance(
            arg_name=name, value_repr=str(arg.value), stance=ProvenanceStance.ABSTAIN,
            id_shaped=False, is_reference=True, matched_in=(), components_checked=(),
            components_unmatched=(),
            reason="container arg with no id-shaped leaf — nothing to provenance-check",
        )
    if any_unsupported:
        return ArgProvenance(
            arg_name=name, value_repr=str(arg.value), stance=ProvenanceStance.UNSUPPORTED,
            id_shaped=True, is_reference=True,
            matched_in=tuple(sorted(matched_sources, key=lambda s: s.value)),
            components_checked=tuple(checked_all), components_unmatched=tuple(unmatched_all),
            reason="at least one id in the container did not appear in env-authored bytes",
        )
    return ArgProvenance(
        arg_name=name, value_repr=str(arg.value), stance=ProvenanceStance.SUPPORTED,
        id_shaped=True, is_reference=True,
        matched_in=tuple(sorted(matched_sources, key=lambda s: s.value)),
        components_checked=tuple(checked_all), components_unmatched=(),
        reason="every id in the container traced to env-authored bytes",
    )
768
+
769
+
770
def classify_call(call: ToolCall, prior: PriorResults,
                  policy: ProvenancePolicy = DEFAULT_POLICY) -> ProvenanceVerdict:
    """Fold `classify_arg` over every argument of one tool call. PURE.

    Two call-level guards short-circuit with `believe=True`:
      * a read / non-mutating call — reads are how provenance ENTERS, so they are never
        gated (the verdict carries no per-arg entries);
      * an empty corpus (`prior.blobs` is empty) — the first call of an episode; with
        zero env bytes mintage cannot be proven, so every arg ABSTAINs.

    Otherwise each arg is classified; `unsupported` lists the names of the UNSUPPORTED
    args (what the consumer's nudge targets) and `believe` is True iff that list is empty.
    """
    if not call.is_mutating:
        return ProvenanceVerdict(
            believe=True, args=(), unsupported=(),
            reason="read / non-mutating call — provenance not gated (reads source it)",
        )

    if not prior.blobs:
        abstained = tuple(
            ArgProvenance(
                arg_name=a.name, value_repr=str(a.value), stance=ProvenanceStance.ABSTAIN,
                id_shaped=False, is_reference=a.is_reference, matched_in=(),
                components_checked=(), components_unmatched=(),
                reason="empty corpus (first call) — cannot prove mintage, abstain",
            )
            for a in call.args
        )
        return ProvenanceVerdict(
            believe=True, args=abstained, unsupported=(),
            reason="empty env corpus — first call of the episode, nothing to check against",
        )

    verdicts = tuple(classify_arg(a, prior, policy) for a in call.args)
    minted = tuple(
        v.arg_name for v in verdicts if v.stance is ProvenanceStance.UNSUPPORTED
    )
    if minted:
        summary = (
            f"{len(minted)} id/FK argument(s) appear model-minted: "
            f"{', '.join(minted)} — resolve via a read tool first"
        )
    else:
        summary = "no id/FK argument was minted from nowhere (all traced or none to check)"
    return ProvenanceVerdict(
        believe=not minted, args=verdicts, unsupported=minted, reason=summary,
    )