coding-agent-wrapper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
caw/faststats.py ADDED
@@ -0,0 +1,298 @@
1
+ """Fast extraction of frequently-needed statistics from trajectory files.
2
+
3
+ A full ``Trajectory.from_dict(json.loads(...))`` round-trip is slow on large
4
+ trajectories because the ``turns`` array can hold many MB of tool I/O that
5
+ the caller does not need. Most consumers (cost dashboards, spend limiters,
6
+ list views) only want a handful of header / footer fields:
7
+
8
+ cost_usd, model, created_at, completed_at, duration_ms, token totals.
9
+
10
+ ``FastStats`` extracts these by reading only the head and tail of the file
11
+ (~8 KB total) and parsing the predictable indent=2 layout that
12
+ :class:`caw.storage.SessionStore` writes. The fast path is roughly 3x
13
+ quicker than ``json.loads`` on a directory of small trajectories and 25x+
14
+ faster on multi-MB files. When the fast path fails (non-CAW layout, hand
15
+ edited file, etc.) it falls back to a full JSON parse.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import json
21
+ import re
22
+ from dataclasses import asdict, dataclass
23
+ from pathlib import Path
24
+ from typing import Any, Iterable, Iterator, Optional
25
+
26
+ from caw.models import Trajectory
27
+
28
+ __all__ = ["FastStats"]
29
+
30
+ # Bytes read from the head and tail of each file. The CAW writer puts every
31
+ # header field (agent, model, session_id, created_at, completed_at,
32
+ # usage_limited) in the first ~300 bytes and the trailing usage / total_usage
33
+ # / duration_ms / metadata block in the last few hundred bytes, so 4 KB on
34
+ # each side is plenty of headroom even for files with long ``system_prompt``
35
+ # values stretching the header.
36
+ _HEAD_BYTES = 4096
37
+ _TAIL_BYTES = 4096
38
+
39
+
40
+ def _str_field(blob: str, key: str) -> str:
41
+ """Return the first JSON string value for ``"key": "..."`` in *blob*."""
42
+ m = re.search(rf'"{re.escape(key)}"\s*:\s*"((?:[^"\\]|\\.)*)"', blob)
43
+ if not m:
44
+ return ""
45
+ raw = m.group(1)
46
+ # Decode JSON escapes (e.g. \", \\, \n, \uXXXX) by parsing as a JSON string
47
+ try:
48
+ return json.loads(f'"{raw}"')
49
+ except json.JSONDecodeError:
50
+ return raw
51
+
52
+
53
+ def _num_field(blob: str, key: str, *, default: float = 0.0) -> float:
54
+ """Return the first JSON number value for ``"key": <num>`` in *blob*."""
55
+ m = re.search(rf'"{re.escape(key)}"\s*:\s*([0-9eE.+-]+)', blob)
56
+ return float(m.group(1)) if m else default
57
+
58
+
59
+ def _bool_field(blob: str, key: str) -> bool:
60
+ m = re.search(rf'"{re.escape(key)}"\s*:\s*(true|false)', blob)
61
+ return bool(m and m.group(1) == "true")
62
+
63
+
64
@dataclass
class FastStats:
    """Lightweight statistics for a CAW trajectory.

    The class is intentionally narrow: it exposes only the fields that
    consumers ask for repeatedly without paying for a full trajectory parse.
    For everything else (turns, tool calls, content blocks) load the file
    via :meth:`caw.agent.Session.load_trajectory` instead.

    All ``cost_usd`` / token values come from the trajectory's
    ``total_usage`` (recursive across subagents) when present, falling back
    to ``usage`` for older trajectories that did not record it separately.
    """

    # File the stats were read from, if any.
    path: Optional[Path] = None

    # Header fields (from the start of the file).
    agent: str = ""
    model: str = ""
    session_id: str = ""
    created_at: str = ""
    completed_at: str = ""
    usage_limited: bool = False

    # Footer fields (from the tail of the file).
    duration_ms: int = 0
    cost_usd: float = 0.0
    input_tokens: int = 0
    output_tokens: int = 0
    cache_read_tokens: int = 0
    cache_write_tokens: int = 0

    @property
    def total_tokens(self) -> int:
        """Return ``input_tokens + output_tokens`` (cache tokens are not included)."""
        return self.input_tokens + self.output_tokens

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serializable dict (``path`` is stringified)."""
        d = asdict(self)
        d["path"] = str(self.path) if self.path is not None else None
        return d

    # ------------------------------------------------------------------
    # Constructors
    # ------------------------------------------------------------------

    @classmethod
    def from_trajectory(cls, trajectory: Trajectory, *, path: str | Path | None = None) -> FastStats:
        """Build :class:`FastStats` from an in-memory :class:`Trajectory`.

        Cost and token figures are copied from ``trajectory.total_usage``;
        *path*, when given, is stored as a :class:`~pathlib.Path`.
        """
        usage = trajectory.total_usage
        return cls(
            path=Path(path) if path is not None else None,
            agent=trajectory.agent,
            model=trajectory.model,
            session_id=trajectory.session_id,
            created_at=trajectory.created_at,
            completed_at=trajectory.completed_at,
            usage_limited=trajectory.usage_limited,
            duration_ms=trajectory.duration_ms,
            cost_usd=usage.cost_usd,
            input_tokens=usage.input_tokens,
            output_tokens=usage.output_tokens,
            cache_read_tokens=usage.cache_read_tokens,
            cache_write_tokens=usage.cache_write_tokens,
        )

    @classmethod
    def from_path(cls, path: str | Path) -> Optional[FastStats]:
        """Read fast stats from *path*.

        Returns ``None`` if the file does not exist, is empty, or is not a
        recognizable trajectory file. Tries the head/tail fast path first
        and falls back to a full JSON parse on failure.
        """
        path = Path(path)
        try:
            size = path.stat().st_size
        except OSError:
            return None
        if size == 0:
            return None

        try:
            with open(path, "rb") as f:
                head_raw = f.read(min(_HEAD_BYTES, size))
                if size <= _HEAD_BYTES:
                    tail_raw = head_raw
                elif size <= _HEAD_BYTES + _TAIL_BYTES:
                    # Join the raw bytes before decoding so a multi-byte
                    # UTF-8 character straddling the head boundary is not
                    # turned into replacement characters.
                    tail_raw = head_raw + f.read()
                else:
                    f.seek(size - _TAIL_BYTES)
                    tail_raw = f.read(_TAIL_BYTES)
        except OSError:
            return None
        # errors="replace": either window may still start or end in the
        # middle of a multi-byte character.
        head = head_raw.decode("utf-8", errors="replace")
        tail = tail_raw.decode("utf-8", errors="replace")

        try:
            stats = cls._fast_extract(head, tail, path)
        except (ValueError, OverflowError):
            # A numeric field that matched but could not be converted
            # (e.g. ``int()`` of inf/nan) counts as a fast-path miss, so the
            # documented fallback still applies instead of raising.
            stats = None
        if stats is not None:
            return stats

        # Fallback: parse the full document. ``Trajectory.from_dict``
        # tolerates missing keys, so we explicitly require ``model`` to be
        # present and non-empty before treating the file as a trajectory.
        try:
            data = json.loads(path.read_bytes())
        except (OSError, ValueError):
            return None
        if not isinstance(data, dict) or not data.get("model"):
            return None
        try:
            traj = Trajectory.from_dict(data)
        except (ValueError, KeyError, TypeError, AttributeError):
            # NOTE(review): AttributeError is included on the assumption that
            # from_dict may call dict methods on wrongly-typed nested values;
            # confirm against caw.models.
            return None
        return cls.from_trajectory(traj, path=path)

    # ------------------------------------------------------------------
    # Directory iteration
    # ------------------------------------------------------------------

    @classmethod
    def iter_directory(
        cls,
        directory: str | Path,
        *,
        patterns: Iterable[str] = ("**/trajectory.json", "**/*.traj.json"),
        skip_parts: Iterable[str] = (),
    ) -> Iterator[FastStats]:
        """Yield :class:`FastStats` for every trajectory file under *directory*.

        ``patterns`` is a list of globs (relative to *directory*) to scan;
        the default catches both the canonical CAW layout
        (``sessions/<id>/trajectory.json``) and the ``.traj.json`` files
        produced by ad-hoc writers. Files whose path contains any directory
        component listed in ``skip_parts`` are excluded. Unreadable or
        malformed files are silently dropped.

        A bare string passed for ``patterns`` or ``skip_parts`` is treated
        as a single entry rather than iterated character by character.
        Nothing is yielded when *directory* is not a directory.
        """
        directory = Path(directory)
        if not directory.is_dir():
            return
        if isinstance(patterns, str):
            patterns = (patterns,)
        skip_set = {skip_parts} if isinstance(skip_parts, str) else set(skip_parts)
        # A file can match more than one pattern; report it only once.
        seen: set[Path] = set()
        for pattern in patterns:
            for file in directory.glob(pattern):
                if file in seen or not file.is_file():
                    continue
                seen.add(file)
                if skip_set:
                    try:
                        rel_parts = file.relative_to(directory).parts
                    except ValueError:
                        rel_parts = file.parts
                    if any(part in skip_set for part in rel_parts):
                        continue
                stats = cls.from_path(file)
                if stats is not None:
                    yield stats

    @classmethod
    def directory_total_cost(
        cls,
        directory: str | Path,
        **kwargs: Any,
    ) -> float:
        """Sum ``cost_usd`` across every trajectory under *directory*.

        Extra keyword arguments are forwarded to :meth:`iter_directory`.
        Returns ``0.0`` when no trajectory is found.
        """
        # Float start value: an empty directory yields 0.0, not the int 0.
        return sum((s.cost_usd for s in cls.iter_directory(directory, **kwargs)), 0.0)

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    @classmethod
    def _fast_extract(cls, head: str, tail: str, path: Path) -> Optional[FastStats]:
        """Pull fields from the raw head/tail text. Returns ``None`` on miss.

        A miss means either no non-empty ``model`` string in *head*, or no
        complete top-level ``total_usage`` / ``usage`` block in *tail*.
        """
        # ``model`` is a required CAW field. Its absence in the head means
        # this isn't a CAW trajectory and the caller should fall back.
        model = _str_field(head, "model")
        if not model:
            return None

        agent = _str_field(head, "agent")
        session_id = _str_field(head, "session_id")
        created_at = _str_field(head, "created_at")
        completed_at = _str_field(head, "completed_at")
        usage_limited = _bool_field(head, "usage_limited")

        # The trailing top-level usage block. With indent=2 the canonical
        # writer always emits a newline followed by exactly two spaces before
        # ``"total_usage": {`` (or ``"usage": {``) -- nested usage blocks
        # inside ``turns`` use indent 6+, so anchoring on the 2-space form is
        # unambiguous. The indent is built explicitly so the required width
        # cannot be lost to whitespace normalisation of this source.
        top_level = "\n" + " " * 2
        anchor = tail.rfind(top_level + '"total_usage": {')
        if anchor == -1:
            anchor = tail.rfind(top_level + '"usage": {')
        if anchor == -1:
            return None

        trailing = tail[anchor:]
        # Find the first matching closing brace. ``UsageStats`` has no
        # nested objects so a simple ``find`` is correct.
        end = trailing.find("}")
        if end == -1:
            return None
        usage_blob = trailing[:end]

        cost_usd = _num_field(usage_blob, "cost_usd")
        input_tokens = int(_num_field(usage_blob, "input_tokens"))
        output_tokens = int(_num_field(usage_blob, "output_tokens"))
        cache_read_tokens = int(_num_field(usage_blob, "cache_read_tokens"))
        cache_write_tokens = int(_num_field(usage_blob, "cache_write_tokens"))

        # ``duration_ms`` lives at the very end of the file, after the usage
        # blocks. Search the slice past the usage block we just consumed to
        # avoid matching any per-turn ``duration_ms`` that might still be in
        # the tail buffer. `` {2}`` pins the same top-level 2-space indent.
        post_usage = trailing[end:]
        m = re.search(r'\n {2}"duration_ms"\s*:\s*([0-9]+)', post_usage)
        duration_ms = int(m.group(1)) if m else 0

        return cls(
            path=path,
            agent=agent,
            model=model,
            session_id=session_id,
            created_at=created_at,
            completed_at=completed_at,
            usage_limited=usage_limited,
            duration_ms=duration_ms,
            cost_usd=cost_usd,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cache_read_tokens=cache_read_tokens,
            cache_write_tokens=cache_write_tokens,
        )