contextrot 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contextrot/__init__.py +3 -0
- contextrot/adapters/__init__.py +12 -0
- contextrot/adapters/base.py +28 -0
- contextrot/adapters/claude_code.py +197 -0
- contextrot/analysis/__init__.py +106 -0
- contextrot/analysis/composition.py +72 -0
- contextrot/analysis/prescriptions.py +101 -0
- contextrot/analysis/rot.py +132 -0
- contextrot/cli.py +165 -0
- contextrot/models.py +78 -0
- contextrot/pricing.py +69 -0
- contextrot/report/__init__.py +4 -0
- contextrot/report/html.py +161 -0
- contextrot/report/template.html.j2 +169 -0
- contextrot/report/terminal.py +164 -0
- contextrot/signals/__init__.py +140 -0
- contextrot-0.1.0.dist-info/METADATA +133 -0
- contextrot-0.1.0.dist-info/RECORD +21 -0
- contextrot-0.1.0.dist-info/WHEEL +4 -0
- contextrot-0.1.0.dist-info/entry_points.txt +2 -0
- contextrot-0.1.0.dist-info/licenses/LICENSE +21 -0
contextrot/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Adapter registry."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from contextrot.adapters.base import SessionAdapter
|
|
6
|
+
from contextrot.adapters.claude_code import ClaudeCodeAdapter
|
|
7
|
+
|
|
8
|
+
# Registry of available adapters, keyed by each adapter class's ``name``.
# One instance per adapter is created here at import time.
ADAPTERS: dict[str, SessionAdapter] = {
    ClaudeCodeAdapter.name: ClaudeCodeAdapter(),
}

# Names exported as this package's public API.
__all__ = ["ADAPTERS", "SessionAdapter", "ClaudeCodeAdapter"]
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Adapter interface.
|
|
2
|
+
|
|
3
|
+
To add support for a new agent CLI, subclass SessionAdapter, implement
|
|
4
|
+
discover() and parse(), and register it in adapters/__init__.py. Adapters
|
|
5
|
+
must be tolerant: unknown fields are ignored, malformed lines are skipped,
|
|
6
|
+
and a partially parsed session is better than a crash.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from abc import ABC, abstractmethod
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from contextrot.models import Session
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class SessionAdapter(ABC):
    """Parses one agent CLI's native transcripts into normalized Sessions.

    Concrete adapters override ``name`` and implement ``discover()`` and
    ``parse()``.
    """

    # Identifier of the adapter; subclasses override this placeholder.
    name: str = "base"

    @abstractmethod
    def discover(self, data_dir: Path | None = None) -> list[Path]:
        """Return transcript files available on this machine.

        Args:
            data_dir: Optional directory to search instead of the adapter's
                own default location.
        """

    @abstractmethod
    def parse(self, path: Path) -> Session | None:
        """Parse one transcript file. Return None if it holds no usable steps."""
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""Claude Code transcript adapter.
|
|
2
|
+
|
|
3
|
+
Claude Code stores each session as a JSONL file under
|
|
4
|
+
``~/.claude/projects/<project-slug>/<session-uuid>.jsonl``. Relevant entry
|
|
5
|
+
types:
|
|
6
|
+
|
|
7
|
+
- ``assistant``: one model API call. ``message.usage`` carries token
|
|
8
|
+
accounting (``input_tokens``, ``cache_creation_input_tokens``,
|
|
9
|
+
``cache_read_input_tokens``, ``output_tokens``); ``message.content`` is a
|
|
10
|
+
list of blocks (``text``, ``thinking``, ``tool_use``).
|
|
11
|
+
- ``user``: either a human prompt (string content) or tool results
|
|
12
|
+
(``tool_result`` blocks with ``tool_use_id`` and ``is_error``).
|
|
13
|
+
- ``isSidechain: true`` marks sub-agent traffic; it is counted but excluded
|
|
14
|
+
from the main step list so fill percentages reflect the primary context
|
|
15
|
+
window.
|
|
16
|
+
|
|
17
|
+
Format observed on Claude Code 2.x. Parsing is tolerant by design: fields
|
|
18
|
+
we don't recognize are ignored, lines that fail to decode are skipped.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import json
|
|
24
|
+
from datetime import datetime
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
|
|
27
|
+
from contextrot.adapters.base import SessionAdapter
|
|
28
|
+
from contextrot.models import Session, Step, ToolCall
|
|
29
|
+
|
|
30
|
+
# Tool input keys used as the retry/re-read "target", in priority order.
|
|
31
|
+
_TARGET_KEYS = ("file_path", "path", "url", "pattern", "command", "query")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _parse_ts(raw: object) -> datetime | None:
|
|
35
|
+
if not isinstance(raw, str):
|
|
36
|
+
return None
|
|
37
|
+
try:
|
|
38
|
+
return datetime.fromisoformat(raw.replace("Z", "+00:00"))
|
|
39
|
+
except ValueError:
|
|
40
|
+
return None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _target_of(name: str, tool_input: dict) -> str | None:
    """Pick the string that identifies what a tool call acted on.

    The keys in ``_TARGET_KEYS`` are tried in priority order and the first
    one holding a non-empty string wins. ``name`` is accepted but not
    consulted.

    Returns:
        The first line of the winning value, cut to 300 characters, or
        ``None`` when no key holds a non-empty string.
    """
    candidates = (tool_input.get(key) for key in _TARGET_KEYS)
    chosen = next((value for value in candidates if isinstance(value, str) and value), None)
    if chosen is None:
        return None
    # Only the first line is kept: for shell commands that is enough to
    # recognise a retry.
    first_line, _, _ = chosen.partition("\n")
    return first_line[:300]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _result_text(content: object) -> str:
|
|
53
|
+
if isinstance(content, str):
|
|
54
|
+
return content
|
|
55
|
+
if isinstance(content, list):
|
|
56
|
+
parts = []
|
|
57
|
+
for block in content:
|
|
58
|
+
if isinstance(block, dict) and isinstance(block.get("text"), str):
|
|
59
|
+
parts.append(block["text"])
|
|
60
|
+
return "\n".join(parts)
|
|
61
|
+
return ""
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class ClaudeCodeAdapter(SessionAdapter):
    """Adapter for Claude Code JSONL session transcripts.

    Parsing is tolerant: undecodable lines, non-dict entries, unknown entry
    types and malformed fields are skipped rather than raised.
    """

    name = "claude-code"

    def discover(self, data_dir: Path | None = None) -> list[Path]:
        """Return every ``<root>/<project>/<session>.jsonl`` file, sorted.

        Args:
            data_dir: Root to search; defaults to ``~/.claude/projects``.

        Returns:
            Sorted transcript paths, or an empty list when the root is not
            a directory.
        """
        root = data_dir or Path.home() / ".claude" / "projects"
        if not root.is_dir():
            return []
        return sorted(root.glob("*/*.jsonl"))

    def parse(self, path: Path) -> Session | None:
        """Parse one transcript file into a ``Session``.

        Args:
            path: JSONL transcript; the file stem becomes the session id.

        Returns:
            The session, or ``None`` when the file cannot be read or yields
            no steps.
        """
        session = Session(
            session_id=path.stem,
            source=self.name,
            # Directory slug; replaced by the real working directory as soon
            # as an entry carries a ``cwd`` (see ``_consume``).
            project=path.parent.name,
        )
        # tool_use_id -> ToolCall, so results (which arrive in later user
        # entries) can be attached to the step that made the call.
        open_calls: dict[str, ToolCall] = {}
        # requestId -> Step, so streamed fragments of one API call are merged
        # into a single step instead of being counted once per fragment.
        steps_by_request: dict[str, Step] = {}

        try:
            with path.open(encoding="utf-8", errors="replace") as fh:
                for line in fh:
                    try:
                        entry = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    if not isinstance(entry, dict):
                        continue
                    self._consume(entry, session, open_calls, steps_by_request)
        except OSError:
            return None

        if not session.steps:
            return None
        return session

    @staticmethod
    def _token_count(usage: dict, key: str) -> int:
        """Read one token counter from ``usage``, treating junk values as 0."""
        try:
            return int(usage.get(key) or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    def _consume(
        self,
        entry: dict,
        session: Session,
        open_calls: dict[str, ToolCall],
        steps_by_request: dict[str, Step] | None = None,
    ) -> None:
        """Dispatch one transcript entry by its ``type``.

        Sidechain assistant entries only bump ``session.sidechain_steps``;
        sidechain user entries are ignored. Any other entry that carries a
        non-empty string ``cwd`` overwrites ``session.project`` with it.

        Args:
            entry: Decoded JSON object for one transcript line.
            session: Session being built; mutated in place.
            open_calls: Tool calls still waiting for their result.
            steps_by_request: Optional requestId -> Step map used to merge
                streamed assistant entries; ``None`` disables merging.
        """
        etype = entry.get("type")
        if etype == "assistant":
            if entry.get("isSidechain"):
                session.sidechain_steps += 1
                return
            self._consume_assistant(entry, session, open_calls, steps_by_request)
        elif etype == "user" and not entry.get("isSidechain"):
            self._consume_user(entry, session, open_calls)

        cwd = entry.get("cwd")
        if isinstance(cwd, str) and cwd:
            session.project = cwd

    def _consume_assistant(
        self,
        entry: dict,
        session: Session,
        open_calls: dict[str, ToolCall],
        steps_by_request: dict[str, Step] | None = None,
    ) -> None:
        """Turn one assistant entry into a step, or fold it into an existing one.

        Claude Code streams one API call across several assistant entries
        that share a ``requestId``, and usage rides on each. The first entry
        of a request that carries usage starts a new step; later entries with
        the same ``requestId`` are folded into that step so the call is not
        counted twice. Entries without usage are folded into the request's
        step when known, otherwise into the most recent step.

        Args:
            entry: Assistant transcript entry.
            session: Session being built; mutated in place.
            open_calls: Receives every tool call that has an id.
            steps_by_request: Optional requestId -> Step map; when ``None``
                every entry with usage starts a new step.
        """
        message = entry.get("message")
        if not isinstance(message, dict):
            return
        usage = message.get("usage")
        ts = _parse_ts(entry.get("timestamp"))

        content = message.get("content")
        texts: list[str] = []
        calls: list[ToolCall] = []
        if isinstance(content, list):
            for block in content:
                if not isinstance(block, dict):
                    continue
                btype = block.get("type")
                if btype == "text" and isinstance(block.get("text"), str):
                    texts.append(block["text"])
                elif btype == "tool_use":
                    name = block.get("name") or "unknown"
                    raw_input = block.get("input")
                    tool_input: dict = raw_input if isinstance(raw_input, dict) else {}
                    call = ToolCall(
                        name=str(name),
                        tool_use_id=str(block.get("id") or ""),
                        target=_target_of(str(name), tool_input),
                    )
                    calls.append(call)
                    if call.tool_use_id:
                        open_calls[call.tool_use_id] = call

        request_id = entry.get("requestId")
        tracked = (
            steps_by_request is not None
            and isinstance(request_id, str)
            and bool(request_id)
        )
        existing = steps_by_request.get(request_id) if tracked else None

        if isinstance(usage, dict) and existing is None:
            step = Step(
                timestamp=ts,
                model=str(message.get("model") or "unknown"),
                input_tokens=self._token_count(usage, "input_tokens"),
                cache_creation_tokens=self._token_count(usage, "cache_creation_input_tokens"),
                cache_read_tokens=self._token_count(usage, "cache_read_input_tokens"),
                output_tokens=self._token_count(usage, "output_tokens"),
                tool_calls=calls,
                assistant_text="\n".join(texts),
            )
            session.steps.append(step)
            if tracked:
                steps_by_request[request_id] = step
            if session.started_at is None:
                session.started_at = ts
            if ts is not None:
                session.ended_at = ts
            return

        if existing is not None:
            target = existing
        elif session.steps:
            target = session.steps[-1]
        else:
            # Nothing to attach the content to yet.
            return

        target.tool_calls.extend(calls)
        if texts:
            target.assistant_text = (target.assistant_text + "\n" + "\n".join(texts)).strip()
        if existing is not None and isinstance(usage, dict):
            # Same API call seen again: keep the largest output count
            # reported for it rather than adding the usage a second time.
            target.output_tokens = max(
                target.output_tokens, self._token_count(usage, "output_tokens")
            )
            if ts is not None:
                session.ended_at = ts

    def _consume_user(self, entry: dict, session: Session, open_calls: dict[str, ToolCall]) -> None:
        """Account for one user entry: human text or tool results.

        String content and ``text`` blocks add to
        ``session.user_message_chars``. Each ``tool_result`` block is matched
        to its pending call by ``tool_use_id``; the call gets its result size
        and, when the block is flagged ``is_error``, the error flag plus the
        first 500 characters of the result. Results with no pending call are
        ignored.
        """
        message = entry.get("message")
        if not isinstance(message, dict):
            return
        content = message.get("content")
        if isinstance(content, str):
            session.user_message_chars += len(content)
            return
        if not isinstance(content, list):
            return
        for block in content:
            if not isinstance(block, dict):
                continue
            if block.get("type") == "tool_result":
                call = open_calls.pop(str(block.get("tool_use_id") or ""), None)
                if call is None:
                    continue
                text = _result_text(block.get("content"))
                call.result_chars = len(text)
                if block.get("is_error"):
                    call.is_error = True
                    call.error_text = text[:500]
            elif block.get("type") == "text" and isinstance(block.get("text"), str):
                session.user_message_chars += len(block["text"])
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Analysis orchestrator: transcripts in, AnalysisResult out."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from datetime import UTC, datetime, timedelta
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from contextrot.adapters import ADAPTERS
|
|
10
|
+
from contextrot.analysis.composition import Composition, estimate_composition
|
|
11
|
+
from contextrot.analysis.prescriptions import Prescription, prescribe
|
|
12
|
+
from contextrot.analysis.rot import RotCurve, build_rot_curve
|
|
13
|
+
from contextrot.models import Session
|
|
14
|
+
from contextrot.pricing import DEFAULT_CONTEXT_WINDOW, context_window_for
|
|
15
|
+
from contextrot.signals import StepSignals, extract_signals
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class AnalysisResult:
    # Everything analyze() produces for the report layer.
    sessions: list[Session]  # sessions that were loaded and analyzed
    steps: list[StepSignals]  # per-step signals from all sessions, concatenated
    curve: RotCurve  # rot curve built from ``steps``
    composition: Composition  # estimated context composition
    prescriptions: list[Prescription]  # recommendations derived from the above
    context_window: int  # largest window seen (override/default vs. per-session)
    total_cost_usd: float  # sum of cost over all steps
    rework_cost_usd: float  # cost of degraded steps + their outputs
    steps_past_knee: int  # steps with fill_pct >= knee; 0 when no knee was found
    days: int | None  # look-back window that was requested
    skipped_sessions: int = 0  # transcripts that did not parse or were too short
    signal_rates: dict[str, float] = field(default_factory=dict)  # signal -> count / steps
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def load_sessions(
    data_dir: Path | None = None,
    project_filter: str | None = None,
    days: int | None = None,
    min_steps: int = 3,
) -> tuple[list[Session], int]:
    """Discover and parse sessions across all adapters.

    Args:
        data_dir: Passed to every adapter's ``discover()``.
        project_filter: Case-insensitive substring that ``session.project``
            must contain; falsy disables the filter.
        days: When truthy, transcript files last modified more than this many
            days ago are ignored.
        min_steps: Sessions with fewer steps are skipped.

    Returns:
        ``(sessions, skipped)``. ``sessions`` is sorted by start time, with
        sessions lacking one first. ``skipped`` counts transcripts that did
        not parse or had fewer than ``min_steps`` steps.
    """
    sessions: list[Session] = []
    skipped = 0
    cutoff = datetime.now(UTC) - timedelta(days=days) if days else None

    for adapter in ADAPTERS.values():
        for path in adapter.discover(data_dir):
            if cutoff is not None:
                try:
                    mtime = datetime.fromtimestamp(path.stat().st_mtime, tz=UTC)
                except OSError:
                    continue
                if mtime < cutoff:
                    continue
            session = adapter.parse(path)
            if session is None or len(session.steps) < min_steps:
                skipped += 1
                continue
            if project_filter and project_filter.lower() not in session.project.lower():
                continue
            sessions.append(session)

    earliest = datetime.min.replace(tzinfo=UTC)

    def _start_key(session: Session) -> datetime:
        """Sort key that is always timezone-aware."""
        started = session.started_at
        if started is None:
            return earliest
        if started.tzinfo is None:
            # Naive and aware datetimes cannot be compared; treat a naive
            # start as UTC so one odd session cannot break the sort.
            return started.replace(tzinfo=UTC)
        return started

    sessions.sort(key=_start_key)
    return sessions, skipped
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def analyze(
    data_dir: Path | None = None,
    project_filter: str | None = None,
    days: int | None = 30,
    window_override: int | None = None,
) -> AnalysisResult:
    """Run the full pipeline: load sessions, extract signals, summarize.

    Args:
        data_dir: Transcript root handed to ``load_sessions``.
        project_filter: Project substring filter handed to ``load_sessions``.
        days: Look-back window handed to ``load_sessions``.
        window_override: Context window to use instead of the per-model one.

    Returns:
        An ``AnalysisResult`` holding the sessions, step signals, rot curve,
        composition, prescriptions and cost totals.
    """
    sessions, skipped = load_sessions(data_dir, project_filter, days)

    # The reported window is the widest of the starting value and every
    # per-session window.
    widest_window = window_override or DEFAULT_CONTEXT_WINDOW
    step_signals: list[StepSignals] = []
    for session in sessions:
        first_model = session.steps[0].model if session.steps else ""
        per_session_window = context_window_for(first_model, window_override)
        widest_window = max(widest_window, per_session_window)
        step_signals.extend(extract_signals(session, per_session_window).steps)

    curve = build_rot_curve(step_signals)
    composition = estimate_composition(sessions, widest_window)

    total_cost = sum(sig.cost_usd for sig in step_signals)
    rework_cost = sum(sig.cost_usd for sig in step_signals if sig.degraded)

    knee = curve.knee_pct
    if knee is None:
        past_knee = 0
    else:
        past_knee = sum(1 for sig in step_signals if sig.fill_pct >= knee)

    # Floor the divisor at 1 so an empty run yields zero rates.
    divisor = max(len(step_signals), 1)
    signal_rates = {
        signal: total / divisor for signal, total in curve.signal_totals.items()
    }

    prescriptions = prescribe(curve, composition, rework_cost, past_knee)
    return AnalysisResult(
        sessions=sessions,
        steps=step_signals,
        curve=curve,
        composition=composition,
        prescriptions=prescriptions,
        context_window=widest_window,
        total_cost_usd=total_cost,
        rework_cost_usd=rework_cost,
        steps_past_knee=past_knee,
        days=days,
        skipped_sessions=skipped,
        signal_rates=signal_rates,
    )
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Context-composition estimation.
|
|
2
|
+
|
|
3
|
+
Transcripts don't record the literal context content, but token accounting
|
|
4
|
+
lets us attribute where the window goes with useful accuracy:
|
|
5
|
+
|
|
6
|
+
- session overhead: prompt tokens of the *first* API call — system prompt,
|
|
7
|
+
tool schemas, MCP schemas, CLAUDE.md — everything loaded before the user
|
|
8
|
+
typed a word
|
|
9
|
+
- tool outputs: estimated from result characters (chars/4 heuristic)
|
|
10
|
+
- conversation: user + assistant text, same heuristic
|
|
11
|
+
- other growth: whatever remains of peak prompt size (thinking, file
|
|
12
|
+
snapshots, framework bookkeeping)
|
|
13
|
+
|
|
14
|
+
Figures are labeled as estimates in every report.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
|
|
21
|
+
from contextrot.models import Session
|
|
22
|
+
|
|
23
|
+
CHARS_PER_TOKEN = 4
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class Composition:
    """Estimated token attribution for the context window."""

    overhead_tokens: int  # loaded before the first user word
    tool_output_tokens: int
    conversation_tokens: int
    other_growth_tokens: int
    peak_prompt_tokens: int
    context_window: int

    @property
    def overhead_pct_of_window(self) -> float:
        """Overhead as a percentage of the window (window floored at 1)."""
        window = max(self.context_window, 1)
        share = 100.0 * self.overhead_tokens
        return share / window
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def estimate_composition(sessions: list[Session], context_window: int) -> Composition:
    """Average the composition over sessions so all figures share one scale.

    Tool-output and conversation figures count tokens that *flowed through*
    a session (characters // ``CHARS_PER_TOKEN``); with compaction they may
    exceed the window itself. Overhead is the prompt size of each session's
    first step. "Other growth" is what remains of the average peak prompt
    after the other three, never below zero.
    """
    session_count = len(sessions) or 1
    overhead_total = peak_total = tool_total = convo_total = 0

    for session in sessions:
        steps = session.steps
        if steps:
            overhead_total += steps[0].prompt_tokens
        peak_total += session.peak_prompt_tokens
        convo_total += sum(len(step.assistant_text) // CHARS_PER_TOKEN for step in steps)
        tool_total += sum(
            call.result_chars // CHARS_PER_TOKEN
            for step in steps
            for call in step.tool_calls
        )
        convo_total += session.user_message_chars // CHARS_PER_TOKEN

    avg_overhead = overhead_total // session_count
    avg_peak = peak_total // session_count
    avg_tool = tool_total // session_count
    avg_convo = convo_total // session_count
    remainder = avg_peak - avg_overhead - avg_tool - avg_convo

    return Composition(
        overhead_tokens=avg_overhead,
        tool_output_tokens=avg_tool,
        conversation_tokens=avg_convo,
        other_growth_tokens=max(0, remainder),
        peak_prompt_tokens=avg_peak,
        context_window=context_window,
    )
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""Prescription engine v1.
|
|
2
|
+
|
|
3
|
+
Rule-based recommendations, each quantified from the user's own data.
|
|
4
|
+
A prescription is only emitted when its evidence threshold is met — an
|
|
5
|
+
empty list is a valid, honest output.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
from contextrot.analysis.composition import Composition
|
|
13
|
+
from contextrot.analysis.rot import RotCurve
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class Prescription:
    # One recommendation shown to the user.
    title: str  # short headline
    detail: str  # explanation, quoting the user's own numbers
    impact: str  # quantified expected benefit
    priority: int  # lower = more important
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def prescribe(
    curve: RotCurve,
    comp: Composition,
    rework_cost_usd: float,
    steps_past_knee: int,
) -> list[Prescription]:
    """Build the recommendations whose evidence thresholds are met.

    Args:
        curve: Rot-curve statistics (knee, zone rates, signal totals).
        comp: Estimated context composition.
        rework_cost_usd: Cost attributed to degraded steps.
        steps_past_knee: Number of steps at or beyond the knee.

    Returns:
        Prescriptions sorted by ascending ``priority``; may be empty.
    """
    out: list[Prescription] = []

    low_rate = curve.low_fill_rate
    high_rate = curve.high_fill_rate
    # The text below says the rate *rises* from fresh to deep context, so it
    # is only emitted when both rates are non-zero and the deep-context rate
    # really is the higher one.
    if curve.knee_pct is not None and high_rate and low_rate and high_rate > low_rate:
        out.append(
            Prescription(
                title=f"Compact or restart sessions before ~{curve.knee_pct}% context fill",
                detail=(
                    f"Your failure-signal rate rises from "
                    f"{curve.low_fill_rate:.1%} below {int(curve.knee_pct)}% fill to "
                    f"{curve.high_fill_rate:.1%} in deep context. "
                    f"{steps_past_knee} of your recent steps ran past that threshold."
                ),
                impact=(
                    f"Estimated ${rework_cost_usd:.2f} of recent spend went to degraded "
                    "steps and their retries; most of it is concentrated past the knee."
                ),
                priority=1,
            )
        )

    # Startup overhead taking 15% or more of the window.
    if comp.overhead_pct_of_window >= 15:
        out.append(
            Prescription(
                title="Audit your session startup overhead",
                detail=(
                    f"On average, ~{comp.overhead_tokens:,} tokens "
                    f"({comp.overhead_pct_of_window:.0f}% of the context window) are "
                    "loaded before your first word — system prompt, MCP tool schemas, "
                    "CLAUDE.md. Disable MCP servers you don't use in this project and "
                    "trim stale CLAUDE.md sections."
                ),
                impact=(
                    "Every point of startup overhead is a point of working context "
                    "you never get back, in every session."
                ),
                priority=2,
            )
        )

    # Tool results making up at least half of all estimated growth.
    total_growth = comp.tool_output_tokens + comp.conversation_tokens + comp.other_growth_tokens
    if total_growth > 0 and comp.tool_output_tokens / total_growth >= 0.5:
        out.append(
            Prescription(
                title="Tool outputs dominate your context growth",
                detail=(
                    f"~{comp.tool_output_tokens:,} tokens (est.) of context growth come "
                    "from tool results. Prefer targeted reads (offsets, limits), "
                    "narrower searches, and quieter commands over dumping whole files "
                    "and logs into the window."
                ),
                impact="Slower fill means more steps before the degradation zone.",
                priority=3,
            )
        )

    # Re-read signal firing on 8% or more of steps.
    reread_total = curve.signal_totals.get("reread", 0)
    if curve.total_steps and reread_total / curve.total_steps >= 0.08:
        out.append(
            Prescription(
                title="Your agent frequently re-reads files it already read",
                detail=(
                    f"Re-reads fired on {reread_total} steps "
                    f"({reread_total / curve.total_steps:.0%}). That usually means the "
                    "original content scrolled out of effective attention. Splitting "
                    "large tasks into shorter sessions keeps files 'fresh'."
                ),
                impact="Fewer re-reads is both cheaper and a direct rot symptom removed.",
                priority=4,
            )
        )

    return sorted(out, key=lambda p: p.priority)
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Rot-curve statistics.
|
|
2
|
+
|
|
3
|
+
Buckets steps by context-fill percentage and measures how often failure
|
|
4
|
+
signals fire in each bucket. All statistics are observational: this is a
|
|
5
|
+
diagnostic on your own sessions, not a controlled experiment, and the
|
|
6
|
+
report says so. Wilson score intervals are used because bucket counts are
|
|
7
|
+
often small and rates are near the boundary.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import math
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
|
|
15
|
+
from contextrot.signals import SIGNAL_NAMES, StepSignals
|
|
16
|
+
|
|
17
|
+
BUCKET_WIDTH = 10 # percent
|
|
18
|
+
LOW_FILL_MAX = 40.0 # "fresh context" zone
|
|
19
|
+
HIGH_FILL_MIN = 60.0 # "deep context" zone
|
|
20
|
+
MIN_BUCKET_N = 15 # buckets below this are shown but marked low-confidence
|
|
21
|
+
KNEE_RATIO = 1.5 # bucket rate vs baseline that marks the degradation knee
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def wilson_interval(successes: int, n: int, z: float = 1.96) -> tuple[float, float]:
    """Wilson score interval for a binomial proportion.

    Args:
        successes: Number of positive outcomes.
        n: Number of trials.
        z: Normal quantile that sets the confidence level.

    Returns:
        ``(lower, upper)`` clamped to ``[0, 1]``; ``(0.0, 1.0)`` when
        ``n`` is 0.
    """
    if n == 0:
        return (0.0, 1.0)
    z_sq = z * z
    p_hat = successes / n
    scale = 1 + z_sq / n
    midpoint = (p_hat + z_sq / (2 * n)) / scale
    half_width = (z / scale) * math.sqrt(p_hat * (1 - p_hat) / n + z_sq / (4 * n * n))
    lower = max(0.0, midpoint - half_width)
    upper = min(1.0, midpoint + half_width)
    return (lower, upper)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class Bucket:
    """Step counts for one context-fill range."""

    lo: int  # inclusive fill %
    hi: int  # exclusive fill %
    n: int = 0
    degraded: int = 0
    by_signal: dict[str, int] = field(default_factory=dict)

    @property
    def rate(self) -> float:
        """Share of steps in this bucket that were degraded; 0.0 when empty."""
        if not self.n:
            return 0.0
        return self.degraded / self.n

    @property
    def ci(self) -> tuple[float, float]:
        """Wilson interval for ``rate``."""
        return wilson_interval(self.degraded, self.n)

    @property
    def low_confidence(self) -> bool:
        """True when the bucket holds fewer than ``MIN_BUCKET_N`` steps."""
        return self.n < MIN_BUCKET_N
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class RotCurve:
    """Failure-signal statistics across context-fill buckets."""

    buckets: list[Bucket]
    total_steps: int
    total_degraded: int
    low_fill_rate: float | None  # rate below LOW_FILL_MAX
    high_fill_rate: float | None  # rate at/above HIGH_FILL_MIN
    low_fill_n: int
    high_fill_n: int
    degradation_ratio: float | None  # high / low
    ratio_significant: bool  # Wilson CIs of the two zones don't overlap
    knee_pct: int | None  # start of first bucket where rate >= KNEE_RATIO * baseline
    signal_totals: dict[str, int] = field(default_factory=dict)

    @property
    def overall_rate(self) -> float:
        """Degraded share of all steps; 0.0 when there are no steps."""
        if not self.total_steps:
            return 0.0
        return self.total_degraded / self.total_steps
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def build_rot_curve(steps: list[StepSignals]) -> RotCurve:
    """Bucket steps by context fill and summarize failure-signal rates.

    Args:
        steps: Per-step signals; each provides ``fill_pct``, ``degraded``
            and one attribute per name in ``SIGNAL_NAMES``.

    Returns:
        A ``RotCurve`` with per-bucket counts, fresh-zone and deep-zone
        rates, their ratio and significance, and the knee, if any.
    """
    buckets = [Bucket(lo, lo + BUCKET_WIDTH) for lo in range(0, 100, BUCKET_WIDTH)]
    signal_totals = dict.fromkeys(SIGNAL_NAMES, 0)
    last_idx = len(buckets) - 1

    low_n = low_d = high_n = high_d = 0
    total_degraded = 0

    for s in steps:
        # Clamp at both ends: fill at or above 100% lands in the top bucket,
        # and a negative fill lands in the bottom one. Without the lower
        # clamp a negative index would count from the end of the list.
        idx = max(0, min(int(s.fill_pct // BUCKET_WIDTH), last_idx))
        b = buckets[idx]
        b.n += 1
        if s.degraded:
            b.degraded += 1
            total_degraded += 1
        for name in SIGNAL_NAMES:
            if getattr(s, name):
                b.by_signal[name] = b.by_signal.get(name, 0) + 1
                signal_totals[name] += 1

        # Steps between the two thresholds belong to neither zone.
        if s.fill_pct < LOW_FILL_MAX:
            low_n += 1
            low_d += 1 if s.degraded else 0
        elif s.fill_pct >= HIGH_FILL_MIN:
            high_n += 1
            high_d += 1 if s.degraded else 0

    low_rate = low_d / low_n if low_n else None
    high_rate = high_d / high_n if high_n else None

    ratio = None
    significant = False
    if low_rate is not None and high_rate is not None and low_n and high_n:
        # A zero baseline gives an infinite ratio when the deep zone has
        # failures, and 1.0 when neither zone has any.
        ratio = high_rate / low_rate if low_rate > 0 else (float("inf") if high_rate > 0 else 1.0)
        lo_ci = wilson_interval(low_d, low_n)
        hi_ci = wilson_interval(high_d, high_n)
        significant = hi_ci[0] > lo_ci[1]  # high zone's floor above low zone's ceiling

    # Knee: first sufficiently populated bucket at or beyond the fresh zone
    # whose rate reaches KNEE_RATIO times the fresh-zone baseline.
    knee = None
    if low_rate is not None and low_rate > 0:
        for b in buckets:
            if b.lo < LOW_FILL_MAX or b.low_confidence:
                continue
            if b.rate >= KNEE_RATIO * low_rate:
                knee = b.lo
                break

    return RotCurve(
        buckets=buckets,
        total_steps=len(steps),
        total_degraded=total_degraded,
        low_fill_rate=low_rate,
        high_fill_rate=high_rate,
        low_fill_n=low_n,
        high_fill_n=high_n,
        degradation_ratio=ratio,
        ratio_significant=significant,
        knee_pct=knee,
        signal_totals=signal_totals,
    )
|