tracegauge 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tes/__init__.py ADDED
@@ -0,0 +1,32 @@
1
"""tes — Token-Efficiency Scorer SDK."""

# The docstring above must be the first statement of the module: a string
# placed after the imports is just a discarded expression and leaves
# ``tes.__doc__`` set to None.  A ``from __future__`` import is allowed to
# follow the module docstring.
from __future__ import annotations

from importlib.metadata import PackageNotFoundError, version as _pkg_version

# Resolve the installed distribution's version; fall back to a development
# placeholder when the "tracegauge" distribution metadata cannot be found.
# This stays ahead of the package imports below, as in the original layout.
try:
    __version__: str = _pkg_version("tracegauge")
except PackageNotFoundError:
    __version__ = "0.0.0.dev0"

from tes.judge import JUDGE_SETUP_HINT, JudgeConfig  # noqa: F401
from tes.score import (  # noqa: F401
    ThreeAxisResult,
    TOKEN_DOMAIN_OF_VALIDITY,
    TRAJECTORY_DOMAIN_OF_VALIDITY,
    WASTE_DOMAIN_OF_VALIDITY,
    load_baselines,
    score_session,
)

# Names exported by ``from tes import *``.
__all__ = [
    "__version__",
    "ThreeAxisResult",
    "JudgeConfig",
    "JUDGE_SETUP_HINT",
    "load_baselines",
    "score_session",
    "TOKEN_DOMAIN_OF_VALIDITY",
    "TRAJECTORY_DOMAIN_OF_VALIDITY",
    "WASTE_DOMAIN_OF_VALIDITY",
]
tes/__main__.py ADDED
@@ -0,0 +1,6 @@
1
from __future__ import annotations

from tes.cli import main

# Run the CLI only when this file is executed as the main module (for a
# package ``__main__.py`` that is ``python -m tes``); importing it has no
# effect beyond the import above.  The return value of main() is discarded.
if __name__ == "__main__":
    main()
tes/_digest.py ADDED
@@ -0,0 +1,87 @@
1
"""tes/_digest.py — Shared digest dataclasses used by tes.adapt and tes.judge.

Self-contained (no src/ or scripts/ imports). These are a direct copy of the
dataclasses in src/token_efficiency/trace_digest.py, kept here so the installed
wheel does not depend on the repo's src/ tree.

These are internal to the tes package — not part of the public API.
"""

# The docstring has to precede the future import: a string literal that comes
# after another statement is not stored as the module's ``__doc__``.
from __future__ import annotations

from dataclasses import dataclass
13
+
14
+
15
@dataclass
class TurnDigest:
    """Compact record describing a single conversation turn.

    Field notes (constructor order is the declaration order below):
        role: one of "ai" | "user" | "tool" | "system".
        tool_names: names of the tools called in this turn.
        content_snippet: first 300 chars of content_text, stripped.
        h2_duplicate: True if annotation flagged this turn as
            llm_h2_duplicate_message.

    turn_index, token_count_input, token_count_output and cache_read are
    plain integers supplied by whoever builds the digest.
    """

    turn_index: int
    role: str
    tool_names: list[str]
    content_snippet: str
    token_count_input: int
    token_count_output: int
    cache_read: int
    h2_duplicate: bool
27
+
28
+
29
@dataclass
class SessionDigest:
    """Digest of a whole session, readable by both humans and the judge.

    Field notes (constructor order is the declaration order below):
        output_tokens_available: True when per-turn output tokens are recorded.
        task_description: first user turn content, first 800 chars.
        turns: all turns, ordered by turn_index.

    The remaining fields are scalar summary values stored as given.
    """

    session_id: str
    domain: str
    resolved: bool
    total_tokens: int
    turn_count: int
    h2_duplicate_count: int
    cache_hit_rate: float
    p25_token_ratio: float
    output_tokens_available: bool
    task_description: str
    turns: list[TurnDigest]
44
+
45
+
46
def reconstruct_digest(d: dict) -> SessionDigest:
    """Rebuild a SessionDigest from the plain dict kept in adapted records.

    Each entry of ``d["turns"]`` is expanded into a TurnDigest; every other
    key of ``d`` is passed to SessionDigest as a keyword argument.

    Records written before ``output_tokens_available`` existed are accepted:
    the field defaults to False when absent (safe: swe_agent sessions lack it).

    Raises:
        KeyError: if ``d`` has no "turns" entry.
        TypeError: if a dict carries keys the dataclasses do not accept, or
            omits required ones.
    """
    rebuilt_turns = [TurnDigest(**raw_turn) for raw_turn in d["turns"]]

    session_kwargs = {}
    for key, value in d.items():
        if key == "turns":
            continue
        session_kwargs[key] = value
    if "output_tokens_available" not in session_kwargs:
        session_kwargs["output_tokens_available"] = False

    return SessionDigest(turns=rebuilt_turns, **session_kwargs)
56
+
57
+
58
def digest_to_text(digest: SessionDigest) -> str:
    """Format a SessionDigest as text for the judge (show_stats=False mode).

    Formula-derived stats are left out so that the judge anchors on agent
    behaviour rather than token math.

    Returns:
        A newline-joined string: a session banner, a one-line summary, the
        task description, then two lines per non-system turn (a header line
        and the turn's content snippet).
    """
    if digest.output_tokens_available:
        output_state = "available"
    else:
        output_state = "unavailable (swe_agent)"

    rendered: list[str] = [
        f"=== SESSION {digest.session_id} ===",
        (
            f"Domain: {digest.domain} | Resolved: {digest.resolved} | "
            f"Turns: {digest.turn_count} | "
            f"Output Tokens: {output_state}"
        ),
        "",
        f"TASK: {digest.task_description}",
        "",
        "TRAJECTORY:",
    ]

    for entry in digest.turns:
        # System turns are not shown to the judge.
        if entry.role == "system":
            continue

        if entry.tool_names:
            tools_label = ", ".join(entry.tool_names)
        else:
            tools_label = "none"

        # ENV_RESULT is the display label for openhands environment-response
        # turns (role="tool"); every other role is shown upper-cased.
        if entry.role == "tool":
            role_label = "ENV_RESULT"
        else:
            role_label = entry.role.upper()

        rendered.extend(
            (
                f"[T{entry.turn_index}] {role_label} — tools: {tools_label} — "
                f"in: {entry.token_count_input} / out: {entry.token_count_output}",
                f" {entry.content_snippet}",
            )
        )

    return "\n".join(rendered)
85
+
86
+
87
# Names exported by a star-import of this module: the two digest dataclasses
# and the two helper functions defined above.
__all__ = ["TurnDigest", "SessionDigest", "reconstruct_digest", "digest_to_text"]
@@ -0,0 +1,409 @@
1
"""tes._waste_detectors — Deterministic waste event detectors over session trace digests.

No LLM inference. No model dependency. Each detector fires only on behavior that is
waste under any reasonable definition: conservative (under-detect defensible waste),
uncontestable (any evaluator agrees), auditable (proof turns attached to every event).
"""

# The docstring has to precede the future import: a string literal that comes
# after another statement is not stored as the module's ``__doc__``.
from __future__ import annotations

import re
from dataclasses import dataclass, field
from typing import Any
13
+
14
+
15
+ # ---------------------------------------------------------------------------
16
+ # Shared output type
17
+ # ---------------------------------------------------------------------------
18
+
19
+
20
@dataclass
class WasteEvent:
    """One detected waste event together with the turns that prove it.

    Field notes:
        detector: label of the detector that produced the event.
        session_id: identifier of the session the event belongs to.
        turns: turn_index values (from digest) that prove the event.
        repeat_count: number of consecutive failures in REPEATED-FAILED-RETRY;
            defaults to 1.
        evidence: free-form detector-specific details; each instance gets its
            own empty dict by default.
    """

    detector: str
    session_id: str
    turns: list[int]
    repeat_count: int = 1
    evidence: dict[str, Any] = field(default_factory=dict)
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # REPEATED-FAILED-RETRY detector
33
+ # ---------------------------------------------------------------------------
34
+ # Fires when a shell tool produces an identical error ≥ 2 times consecutively,
35
+ # with no state-changing operation between any pair of identical failures.
36
+ #
37
+ # Design constraints (from B4 spec credibility rule):
38
+ # CONSERVATIVE — under-detect defensible waste, not over-detect arguable waste.
39
+ # UNCONTESTABLE — the specific failing resource is named in both error messages;
40
+ # exact full-snippet match is required (not prefix).
41
+ # AUDITABLE — every event carries the specific turns that prove it.
42
+ #
43
+ # Key limitation (documented):
44
+ # The digest content_snippet captures only 300 chars. Two failures identical in
45
+ # their first 300 chars but differing after would match — an acceptable
46
+ # over-fire risk given the conservative posture everywhere else.
47
+ # ---------------------------------------------------------------------------
48
+
49
# Tool names that run shell commands, and tool names that write files.
_SHELL_TOOLS: frozenset[str] = frozenset(("Bash", "PowerShell"))
_WRITE_TOOLS: frozenset[str] = frozenset(("Write", "Edit", "NotebookEdit"))

# Transient availability errors and CI-polling status codes: retry or re-poll is
# correct behaviour, not waste. Excluded unconditionally so the rule never fires.
# All patterns are matched case-insensitively.
_TRANSIENT_PATTERNS: list[re.Pattern[str]] = [
    re.compile(r"ZONE_RESOURCE_POOL_EXHAUSTED", re.IGNORECASE),
    re.compile(r"RESOURCE_POOL_EXHAUSTED", re.IGNORECASE),
    re.compile(r"QUOTA_EXCEEDED", re.IGNORECASE),
    re.compile(r"rateLimitExceeded", re.IGNORECASE),
    re.compile(r"rate.?limit.?exceeded", re.IGNORECASE),
    re.compile(r"quota.?exceeded", re.IGNORECASE),
    re.compile(r"429 Too Many Requests", re.IGNORECASE),
    re.compile(r"503 Service Unavailable", re.IGNORECASE),
    # gh CLI CI-polling status codes — not fixable failures; polling is transient.
    # `gh pr checks` returns exit code 8 + tabular "pending" when checks are running.
    # `gh pr checks` returns exit code 1 + "no checks reported" when CI hasn't started.
    # Both are CI-polling, not agent-fixable failures.
    re.compile(r"\tpending\t", re.IGNORECASE),
    re.compile(r"no checks reported on the ", re.IGNORECASE),
]

# Signals in tool-result snippets that indicate Bash-driven state mutation.
# If any appears between two identical failures, the state DID change and
# the second failure is not an uncontested repeat — don't fire.
_STATE_MUTATION_PATTERNS: list[re.Pattern[str]] = [
    re.compile(mutation_source, re.IGNORECASE)
    for mutation_source in (
        # Package installs
        r"successfully installed",
        r"added \d+ package",
        r"packages installed successfully",
        r"\bnpm\b.{0,30}\badded\b",
        r"\bpip\b.{0,30}\binstalled\b",
        # git state changes
        r"HEAD is now at",
        r"\bfast.?forward\b",
        r"\bupdated branch\b",
        # File write confirmations (CC tool messages)
        r"The file .{0,120} has been (?:updated|created) successfully",
        r"written successfully",
    )
]
95
+
96
+
97
def _is_shell_call(turn: dict[str, Any]) -> bool:
    """Return True for an "ai" turn that calls at least one shell tool."""
    if turn.get("role") != "ai":
        return False
    called = turn.get("tool_names", [])
    return len(_SHELL_TOOLS.intersection(called)) > 0
101
+
102
+
103
def _is_write_call(turn: dict[str, Any]) -> bool:
    """Return True for an "ai" turn that calls at least one file-writing tool."""
    if turn.get("role") != "ai":
        return False
    called = turn.get("tool_names", [])
    return len(_WRITE_TOOLS.intersection(called)) > 0
107
+
108
+
109
def _is_error_result(snippet: str) -> bool:
    """Tell whether a tool-result snippet carries a recognisable failure signal.

    Snippets shorter than 20 characters after stripping never qualify.  Beyond
    that, any one of these is enough:
      * the text contains "exit code" followed by a digit 1-9 (case-insensitive);
      * the snippet starts with "fatal:" (case-sensitive, no leading whitespace);
      * the snippet starts with "grep: " and mentions "no such file"
        (grep "No such file" without an exit-code prefix — short but unambiguous).
    """
    if len(snippet.strip()) < 20:
        return False

    lowered = snippet.lower()
    return (
        re.search(r"exit code [1-9]", lowered) is not None
        or snippet.startswith("fatal:")
        or (snippet.startswith("grep: ") and "no such file" in lowered)
    )
122
+
123
+
124
def _is_transient(snippet: str) -> bool:
    """Return True if any transient-error pattern occurs anywhere in snippet."""
    for transient_re in _TRANSIENT_PATTERNS:
        if transient_re.search(snippet):
            return True
    return False
126
+
127
+
128
def _has_state_mutation(snippet: str) -> bool:
    """Return True if any state-mutation pattern occurs anywhere in snippet."""
    for mutation_re in _STATE_MUTATION_PATTERNS:
        if mutation_re.search(snippet):
            return True
    return False
130
+
131
+
132
def _next_tool_pos(turns: list[dict[str, Any]], from_pos: int) -> int | None:
    """Find the list position of the first "tool" turn after ``from_pos``.

    Returns None when a "user" turn is reached first (a human message
    intervenes, so the result cannot be attributed to the call at
    ``from_pos``) or when the list ends without a "tool" turn.  Turns with any
    other role are skipped.
    """
    pos = from_pos + 1
    total = len(turns)
    while pos < total:
        current_role = turns[pos].get("role")
        if current_role == "tool":
            return pos
        if current_role == "user":
            return None
        pos += 1
    return None
141
+
142
+
143
def detect_repeated_failed_retry(
    session_id: str,
    turns: list[dict[str, Any]],
) -> list[WasteEvent]:
    """Detect runs of consecutive identical shell failures with no state change between.

    A run of N consecutive identical failures emits ONE WasteEvent with repeat_count=N
    and all proof turns in the ``turns`` field. A single failure (N=1) is not an event.

    State-change barriers that end a run (conservative — any sign of change stops the run):
      - An "ai" turn calling Write, Edit, or NotebookEdit
      - A "tool" turn whose snippet matches any _STATE_MUTATION_PATTERNS entry
      - A "user" (human) turn (new instructions = new context)

    A run also ends at the first shell call whose result differs from the
    run's error snippet, or whose result cannot be attributed to it.

    Transient errors (zone exhaustion, rate limits, quotas) are excluded unconditionally.

    Args:
        session_id: identifier copied onto every emitted event.
        turns: digest turns as plain dicts, in session order.  Turns that end
            up in a run must carry "turn_index".

    Returns:
        One WasteEvent per run of two or more identical failures, in session
        order.  ``turns`` interleaves call and result turn indices;
        ``evidence`` holds the error snippet and the run's extent.
    """
    if not turns:
        return []

    events: list[WasteEvent] = []
    n = len(turns)

    i = 0
    while i < n:
        if not _is_shell_call(turns[i]):
            i += 1
            continue

        result_pos = _next_tool_pos(turns, i)
        if result_pos is None:
            i += 1
            continue

        snippet = turns[result_pos].get("content_snippet", "")
        if not _is_error_result(snippet) or _is_transient(snippet):
            i = result_pos + 1
            continue

        # Qualifying error — try to extend into a run
        run_call_idxs: list[int] = [turns[i]["turn_index"]]
        run_result_idxs: list[int] = [turns[result_pos]["turn_index"]]
        target_snip = snippet
        # List position of the run's last result turn.  Tracked directly
        # rather than recovered from a turn_index -> position map, so the
        # outer scan resumes at the right place even if turn_index values are
        # not unique, and turns outside any run need no "turn_index" key.
        last_result_pos = result_pos

        k = result_pos + 1
        while k < n:
            t = turns[k]
            role = t.get("role")

            if role == "user":
                break  # human turn: barrier

            if role == "ai":
                if _is_write_call(t):
                    break  # file-write barrier

                if _is_shell_call(t):
                    nrp = _next_tool_pos(turns, k)
                    if nrp is None:
                        break  # result not attributable to this call
                    if turns[nrp].get("content_snippet", "") != target_snip:
                        break  # different result: run ended
                    run_call_idxs.append(t["turn_index"])
                    run_result_idxs.append(turns[nrp]["turn_index"])
                    last_result_pos = nrp
                    k = nrp + 1
                    continue

                # Non-write, non-shell ai turn (text/reasoning/read-only tool): OK
                k += 1
                continue

            if role == "tool" and _has_state_mutation(t.get("content_snippet", "")):
                break  # Bash-driven state change

            k += 1  # non-mutating tool result or any other role: skip

        if len(run_call_idxs) >= 2:
            # Proof turns are listed as call, result, call, result, ...
            proof_turns: list[int] = [
                turn_idx
                for call_and_result in zip(run_call_idxs, run_result_idxs)
                for turn_idx in call_and_result
            ]

            events.append(
                WasteEvent(
                    detector="REPEATED-FAILED-RETRY",
                    session_id=session_id,
                    turns=proof_turns,
                    repeat_count=len(run_call_idxs),
                    evidence={
                        "error_snippet": target_snip,
                        "first_call_turn": run_call_idxs[0],
                        "last_result_turn": run_result_idxs[-1],
                        "turns_gap": run_result_idxs[-1] - run_call_idxs[0],
                    },
                )
            )

        # Resume after the last result consumed.  For a single failure this is
        # simply the turn after its own result.
        i = last_result_pos + 1

    return events
250
+
251
+
252
+ # ---------------------------------------------------------------------------
253
+ # REDUNDANT-READ detector
254
+ # ---------------------------------------------------------------------------
255
+ # Fires when the agent re-reads an unchanged file: same content fetched again
256
+ # with no Write/Edit/NotebookEdit or context-reset (user turn) between reads.
257
+ #
258
+ # Two detection paths, separately labeled and reported:
259
+ #
260
+ # PATH A — CC Read tool returns "File unchanged since last read"
261
+ # The CC tool itself detected the redundancy and said so. This is the
262
+ # tool's verdict, not an inference — maximally uncontestable. The hint
263
+ # appears when the file is already in the conversation context unchanged.
264
+ #
265
+ # PATH B — Identical content_snippet from two Read results (conservative)
266
+ # Same line-numbered file content (≥80 chars, starts with \d+\t (pre-v2.1.38) or \s+\d+→ (v2.1.38+)) appears
267
+ # in two Read results at most _REDUNDANT_READ_GAP_MAX (currently 5) turns apart with no state change between.
268
+ # The cap is deliberately conservative: re-orientation reads after longer stretches of work are
269
+ # excluded. Report the gap distribution before changing this threshold.
270
+ #
271
+ # Design principle: content-matching is MORE robust here than command-matching
272
+ # (which we can't do — the digest doesn't capture file paths). If the same
273
+ # file content appears in two Read results, the same portion of the same file
274
+ # was fetched twice. File changes between reads produce different content and
275
+ # naturally don't fire.
276
+ #
277
+ # Documented limitation: 300-char truncation. Two reads of the same file where
278
+ # an edit changed only content beyond char 300 would match. Conservative posture
279
+ # means we accept this rare over-fire rather than miss clear redundant reads.
280
+ # ---------------------------------------------------------------------------
281
+
282
# PATH A marker: a Read result that starts with this text is reported as redundant.
_FILE_UNCHANGED_PREFIX = "File unchanged since last read"
# Matches a snippet whose start looks like line-numbered file content:
# digits followed by a tab, or whitespace, digits and an arrow.
_LINE_NUMBERED_RE = re.compile(r"^\d+\t|^\s+\d+→")
# PATH B: largest allowed distance, in turn_index units, between the first
# read's result and the second read's call.
_REDUNDANT_READ_GAP_MAX = 5  # PATH B: gaps 7-9 are contestable (re-orientation after
# several intervening operations is plausibly legitimate); ≤5 stays uncontestable.
286
+
287
+
288
def _is_read_call(turn: dict[str, Any]) -> bool:
    """Return True for an "ai" turn whose tool_names include "Read"."""
    if turn.get("role") != "ai":
        return False
    called = turn.get("tool_names", [])
    return "Read" in called
290
+
291
+
292
def _is_line_numbered_content(snippet: str) -> bool:
    """Decide whether snippet looks like genuine file content from the Read tool.

    Rejected outright: snippets under 80 characters after stripping, snippets
    starting with the "File unchanged" hint (PATH A territory), and snippets
    starting with "<" (system-reminder injections, error XML).  Anything else
    qualifies only if it begins with a line-number prefix.
    """
    if len(snippet.strip()) < 80:
        return False
    if snippet.startswith((_FILE_UNCHANGED_PREFIX, "<")):
        return False
    return _LINE_NUMBERED_RE.match(snippet) is not None
301
+
302
+
303
def _extract_path_from_hint(snippet: str) -> str | None:
    """Pull the first path-looking token out of a 'File unchanged' hint.

    A candidate starts with "/" or a drive prefix such as ``C:\\``, runs up to
    the next whitespace, quote or angle bracket, and must end in a dot plus
    word characters.  Returns None when nothing in the snippet matches.
    """
    found = re.search(r"(?:[A-Za-z]:\\|/)[^\s'\"<>]+\.\w+", snippet)
    if found is None:
        return None
    return found.group()
307
+
308
+
309
def detect_redundant_read(
    session_id: str,
    turns: list[dict[str, Any]],
) -> list[WasteEvent]:
    """Detect redundant file reads: same file content fetched again with no change between.

    PATH A events: the CC Read tool itself reported "File unchanged since last read."
    PATH B events: two Read results carry identical line-numbered content at most
    _REDUNDANT_READ_GAP_MAX turns apart (measured from the first read's result
    turn_index to the second read's call turn_index), with no
    Write/Edit/NotebookEdit or user (context-reset) turn between.

    Every event carries path="A" or path="B" in evidence so callers can report
    fire rates per path separately (PATH A is tool-authoritative; PATH B is inferred).

    Args:
        session_id: identifier copied onto every emitted event.
        turns: digest turns as plain dicts, in session order.  Read call turns
            and their result turns must carry "turn_index".

    Returns:
        All PATH A events in session order, followed by all PATH B events in
        order of their first read.  A first read produces at most one PATH B
        event, pairing it with the earliest later read that qualifies.
    """
    if not turns:
        return []

    events: list[WasteEvent] = []
    # Qualifying PATH B reads: (call_idx, result_idx, call_pos, result_pos, snippet).
    # Both list positions are kept so the barrier scan below never has to map
    # a turn_index back to a position.
    reads: list[tuple[int, int, int, int, str]] = []

    # ---- Single scan: pair every Read call with its result ------------------
    # PATH A events are appended here; PATH B candidates are only collected,
    # so every PATH A event still precedes every PATH B event in the output.
    for call_pos, call_turn in enumerate(turns):
        if not _is_read_call(call_turn):
            continue
        result_pos = _next_tool_pos(turns, call_pos)
        if result_pos is None:
            continue
        result_turn = turns[result_pos]
        snip = result_turn.get("content_snippet", "")

        if snip.startswith(_FILE_UNCHANGED_PREFIX):
            events.append(
                WasteEvent(
                    detector="REDUNDANT-READ",
                    session_id=session_id,
                    turns=[call_turn["turn_index"], result_turn["turn_index"]],
                    evidence={
                        "path": "A",
                        "call_turn": call_turn["turn_index"],
                        "result_turn": result_turn["turn_index"],
                        "content_snippet": snip[:120],
                        "file_path": _extract_path_from_hint(snip),
                        "gap": 0,
                    },
                )
            )
        elif _is_line_numbered_content(snip):
            # _is_line_numbered_content rejects the PATH A hint itself, so the
            # two paths never claim the same result.
            reads.append(
                (
                    call_turn["turn_index"],
                    result_turn["turn_index"],
                    call_pos,
                    result_pos,
                    snip,
                )
            )

    # ---- PATH B pairing -----------------------------------------------------
    for ia, (call_a, res_a, _call_pos_a, res_pos_a, snip_a) in enumerate(reads):
        for call_b, res_b, call_pos_b, _res_pos_b, snip_b in reads[ia + 1 :]:
            if snip_a != snip_b:
                continue
            gap = call_b - res_a
            if gap <= 0 or gap > _REDUNDANT_READ_GAP_MAX:
                continue

            # Barriers are looked for strictly between the first read's result
            # and the second read's call.
            has_barrier = any(
                _is_write_call(turns[k]) or turns[k].get("role") == "user"
                for k in range(res_pos_a + 1, call_pos_b)
            )
            if has_barrier:
                continue

            events.append(
                WasteEvent(
                    detector="REDUNDANT-READ",
                    session_id=session_id,
                    turns=[call_a, res_a, call_b, res_b],
                    evidence={
                        "path": "B",
                        "call_1_turn": call_a,
                        "result_1_turn": res_a,
                        "call_2_turn": call_b,
                        "result_2_turn": res_b,
                        "content_snippet": snip_a[:120],
                        "gap": gap,
                    },
                )
            )
            break  # one event per first-read; move to next ia

    return events