@ictechgy/context-guard 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/CHANGELOG.md +49 -0
  2. package/LICENSE +201 -0
  3. package/NOTICE +4 -0
  4. package/README.ko.md +353 -0
  5. package/README.md +353 -0
  6. package/context-guard-kit/README.md +76 -0
  7. package/context-guard-kit/benchmark_runner.py +1898 -0
  8. package/context-guard-kit/claude_transcript_cost_audit.py +1591 -0
  9. package/context-guard-kit/context_compress.py +543 -0
  10. package/context-guard-kit/context_escrow.py +919 -0
  11. package/context-guard-kit/context_guard_cli.py +149 -0
  12. package/context-guard-kit/context_guard_diet.py +1036 -0
  13. package/context-guard-kit/context_pack.py +929 -0
  14. package/context-guard-kit/failed_attempt_nudge.py +567 -0
  15. package/context-guard-kit/guard_large_read.py +690 -0
  16. package/context-guard-kit/hook_secret_patterns.py +43 -0
  17. package/context-guard-kit/read_symbol.py +483 -0
  18. package/context-guard-kit/rewrite_bash_for_token_budget.py +501 -0
  19. package/context-guard-kit/sanitize_output.py +725 -0
  20. package/context-guard-kit/settings.example.json +67 -0
  21. package/context-guard-kit/setup_wizard.py +1724 -0
  22. package/context-guard-kit/statusline.sh +362 -0
  23. package/context-guard-kit/statusline_merged.sh +157 -0
  24. package/context-guard-kit/tool_schema_pruner.py +837 -0
  25. package/context-guard-kit/trim_command_output.py +1098 -0
  26. package/docs/distribution.md +55 -0
  27. package/package.json +70 -0
  28. package/packaging/homebrew/context-guard.rb.template +34 -0
  29. package/plugins/context-guard/.claude-plugin/plugin.json +41 -0
  30. package/plugins/context-guard/LICENSE +201 -0
  31. package/plugins/context-guard/NOTICE +4 -0
  32. package/plugins/context-guard/README.ko.md +135 -0
  33. package/plugins/context-guard/README.md +135 -0
  34. package/plugins/context-guard/bin/claude-read-symbol +6 -0
  35. package/plugins/context-guard/bin/claude-sanitize-output +6 -0
  36. package/plugins/context-guard/bin/claude-token-artifact +6 -0
  37. package/plugins/context-guard/bin/claude-token-audit +6 -0
  38. package/plugins/context-guard/bin/claude-token-bench +6 -0
  39. package/plugins/context-guard/bin/claude-token-diet +6 -0
  40. package/plugins/context-guard/bin/claude-token-failed-nudge +6 -0
  41. package/plugins/context-guard/bin/claude-token-guard-read +6 -0
  42. package/plugins/context-guard/bin/claude-token-rewrite-bash +6 -0
  43. package/plugins/context-guard/bin/claude-token-setup +6 -0
  44. package/plugins/context-guard/bin/claude-token-statusline +6 -0
  45. package/plugins/context-guard/bin/claude-token-statusline-merged +6 -0
  46. package/plugins/context-guard/bin/claude-trim-output +6 -0
  47. package/plugins/context-guard/bin/context-guard +149 -0
  48. package/plugins/context-guard/bin/context-guard-artifact +919 -0
  49. package/plugins/context-guard/bin/context-guard-audit +1591 -0
  50. package/plugins/context-guard/bin/context-guard-bench +1898 -0
  51. package/plugins/context-guard/bin/context-guard-compress +543 -0
  52. package/plugins/context-guard/bin/context-guard-diet +1036 -0
  53. package/plugins/context-guard/bin/context-guard-failed-nudge +567 -0
  54. package/plugins/context-guard/bin/context-guard-guard-read +690 -0
  55. package/plugins/context-guard/bin/context-guard-pack +929 -0
  56. package/plugins/context-guard/bin/context-guard-read-symbol +483 -0
  57. package/plugins/context-guard/bin/context-guard-rewrite-bash +501 -0
  58. package/plugins/context-guard/bin/context-guard-sanitize-output +725 -0
  59. package/plugins/context-guard/bin/context-guard-setup +1724 -0
  60. package/plugins/context-guard/bin/context-guard-statusline +362 -0
  61. package/plugins/context-guard/bin/context-guard-statusline-merged +157 -0
  62. package/plugins/context-guard/bin/context-guard-tool-prune +837 -0
  63. package/plugins/context-guard/bin/context-guard-trim-output +1098 -0
  64. package/plugins/context-guard/brief/README.md +65 -0
  65. package/plugins/context-guard/brief/brief-mode.lite.md +29 -0
  66. package/plugins/context-guard/brief/brief-mode.standard.md +31 -0
  67. package/plugins/context-guard/brief/brief-mode.ultra.md +32 -0
  68. package/plugins/context-guard/lib/hook_secret_patterns.py +43 -0
  69. package/plugins/context-guard/skills/audit/SKILL.md +39 -0
  70. package/plugins/context-guard/skills/optimize/SKILL.md +48 -0
  71. package/plugins/context-guard/skills/setup/SKILL.md +40 -0
@@ -0,0 +1,1898 @@
1
+ #!/usr/bin/env python3
2
+ """Claude Code 토큰 절감 벤치마크 자동 실행 runner.
3
+
4
+ `research/benchmark-plan.md` 의 task set × variant 조합을 비대화형 `claude -p`
5
+ 호출로 실행하고, `tokens_per_successful_task` 측정에 필요한 컬럼을 CSV 에 적재한다.
6
+
7
+ 사용 예:
8
+
9
+ ```bash
10
+ context-guard-kit/benchmark_runner.py \
11
+ --tasks bench/tasks.json --variants bench/variants.json \
12
+ --csv bench/results.csv
13
+
14
+ context-guard-kit/benchmark_runner.py --tasks bench/tasks.json \
15
+ --variants bench/variants.json --task-id t01 --variant baseline --dry-run
16
+ ```
17
+
18
+ Task fixture (`tasks.json`): 각 task 는 다음 필드를 가진다.
19
+
20
+ ```json
21
+ [
22
+ {
23
+ "id": "t01",
24
+ "prompt": "Add validation to src/auth/session.ts ...",
25
+ "model": "sonnet",
26
+ "effort": "medium",
27
+ "max_turns": 3,
28
+ "max_budget_usd": 1.0,
29
+ "allowed_tools": ["Read", "Edit", "Bash(npm test*)"],
30
+ "success_command": "npm test -- auth/session",
31
+ "success_cwd": "."
32
+ }
33
+ ]
34
+ ```
35
+
36
+ Variant fixture (`variants.json`): 각 variant 는 `claude -p` 에 추가할 옵션 묶음을 정의한다.
37
+
38
+ ```json
39
+ [
40
+ {"name": "baseline", "extra_args": []},
41
+ {"name": "context_hygiene", "extra_args": ["--strict-mcp-config", "--mcp-config", "bench/minimal-mcp.json"]}
42
+ ]
43
+ ```
44
+
45
+ dry-run 모드는 실제 호출은 하지 않고 어떤 명령이 실행될지만 출력한다.
46
+ """
47
+ from __future__ import annotations
48
+
49
+ import argparse
50
+ import collections
51
+ from contextlib import contextmanager
52
+ import csv
53
+ import datetime as _dt
54
+ import json
55
+ import math
56
+ import os
57
+ import re
58
+ import selectors
59
+ import shlex
60
+ import shutil
61
+ import signal
62
+ import stat
63
+ import subprocess
64
+ import sys
65
+ import time
66
+ import unicodedata
67
+ from dataclasses import dataclass, field
68
+ from pathlib import Path
69
+ from typing import Any
70
+
71
+ try:
72
+ import fcntl
73
+ except ImportError: # pragma: no cover - benchmark runner already requires POSIX no-follow IO.
74
+ fcntl = None # type: ignore[assignment]
75
+
76
# Column names for the benchmark results CSV, in output order.
# NOTE(review): the writer is outside this view — confirm the order is used as the CSV header.
CSV_COLUMNS = [
    "date",
    "claude_version",
    "task_id",
    "variant",
    "model",
    "effort",
    "total_tokens",
    "input_tokens",
    "output_tokens",
    "cache_read",
    "cache_creation",
    "provider_cached_tokens",
    "provider_cached_tokens_measured",
    "cost_usd",
    "cost_measured",
    "wall_time_seconds",
    "turns",
    "hook_triggers",
    "bytes_before",
    "bytes_after",
    "artifacts_used",
    "external_tokens",
    "external_tokens_measured",
    "external_cost_usd",
    "external_cost_measured",
    "total_cost_with_shift_usd",
    "success",
    "corrections",
    "notes",
    "primary_tokens_measured",
]
# NOTE(review): presumably the maximum length of the CSV "notes" cell — confirm at the use site.
MAX_CSV_NOTE_CHARS = 500
# NOTE(review): presumably an upper bound on the number of CSV rows handled — confirm at the use site.
MAX_CSV_ROWS = 100_000
# Leading characters that spreadsheet applications may interpret as the start of a formula.
# NOTE(review): presumably used to neutralise CSV formula injection — confirm at the use site.
CSV_FORMULA_PREFIXES = ("=", "+", "-", "@")
# Claude CLI flags that variant `extra_args` may not supply; validate_variant_extra_args
# rejects any argument whose flag part (text before the first "=") is in this set.
PROTECTED_VARIANT_FLAGS = frozenset({
    "--",
    "-p",
    "--print",
    "--model",
    "--max-turns",
    "--output-format",
    "--allowedTools",
    "--allowed-tools",
    "--max-budget-usd",
    "--effort",
})
123
# Regex fragment matching a key name that contains a secret-like word
# (api key, token, secret, password, client secret), with optional surrounding
# identifier characters.
SECRET_NOTE_KEY_RE = r"[A-Za-z0-9_.-]*(?:api[-_]?key|token|secret|password|client[-_]?secret)[A-Za-z0-9_.-]*"
# Regex fragment matching a value: a single-quoted string, a double-quoted
# string, or a run of characters up to whitespace or one of , } & # ;
SECRET_NOTE_VALUE_RE = r"(?:'[^']*'|\"[^\"]*\"|[^\s,}&#;]+)"
# (pattern, replacement) pairs. Each replacement substitutes "[REDACTED]" for
# the matched secret while keeping any captured prefix (key name, flag, scheme).
# NOTE(review): presumably applied in order to CSV note text — confirm at the use site.
SECRET_NOTE_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = (
    # HTTP-style "Bearer <credential>" and "Basic <credential>" strings.
    (re.compile(r"(?i)\bBearer\s+[A-Za-z0-9._~+/=-]+"), "[REDACTED]"),
    (re.compile(r"(?i)\bBasic\s+[A-Za-z0-9._~+/=-]+"), "[REDACTED]"),
    # key=value where the key looks secret and follows one of ? & # ;
    (re.compile(rf"(?i)([?&#;]({SECRET_NOTE_KEY_RE})=)[^\s?&#;]+"), r"\1[REDACTED]"),
    # key: value / key = value assignments with an optionally quoted secret-like key.
    (re.compile(rf"(?i)(^|[\s{{,?&#;])([\"']?(?:{SECRET_NOTE_KEY_RE})[\"']?\s*[:=]\s*){SECRET_NOTE_VALUE_RE}"), r"\1\2[REDACTED]"),
    # Long command-line options whose name looks secret: --name value / --name=value.
    (re.compile(rf"(?i)(^|[\s\"'])(--(?:{SECRET_NOTE_KEY_RE})(?:\s+|=))(?:'[^']*'|\"[^\"]*\"|[^\s\"']+)"), r"\1\2[REDACTED]"),
    # -u / --user option values.
    (re.compile(r"(?i)(^|[\s\"'])((?:-u|--user)(?:\s+|=))(?:'[^']*'|\"[^\"]*\"|[^\s\"']+)"), r"\1\2[REDACTED]"),
    # Fixed-prefix token shapes. NOTE(review): vendor attribution is inferred
    # from the prefixes (GitHub, GitLab, Slack, AWS, Stripe, Anthropic/OpenAI,
    # npm, Google, SendGrid) — confirm against the shared secret-pattern list.
    (re.compile(r"gh[pousr]_[A-Za-z0-9_]{20,}"), "[REDACTED]"),
    (re.compile(r"github_pat_[A-Za-z0-9_]{20,}"), "[REDACTED]"),
    (re.compile(r"glpat-[A-Za-z0-9_-]{12,}"), "[REDACTED]"),
    (re.compile(r"xox[abprs]-[A-Za-z0-9-]{10,}"), "[REDACTED]"),
    (re.compile(r"(?:AKIA|ASIA)[0-9A-Z]{16}"), "[REDACTED]"),
    (re.compile(r"(?:sk|pk|rk)_(?:live|test)_[A-Za-z0-9]{16,}"), "[REDACTED]"),
    (re.compile(r"sk-(?:ant|proj)-[A-Za-z0-9_-]{12,}"), "[REDACTED]"),
    (re.compile(r"npm_[A-Za-z0-9]{20,}"), "[REDACTED]"),
    (re.compile(r"AIza[0-9A-Za-z_\-]{20,}"), "[REDACTED]"),
    (re.compile(r"SG\.[A-Za-z0-9_-]{16,}\.[A-Za-z0-9_-]{16,}"), "[REDACTED]"),
    # Three dot-separated base64url-looking segments starting with "eyJ" (JWT-like).
    (re.compile(r"eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+"), "[REDACTED]"),
    # Userinfo in URLs: scheme://user[:password]@ -> scheme://[REDACTED]@
    (re.compile(r"([a-z][a-z0-9+.-]*://)[^/\s@]+@", re.IGNORECASE), r"\1[REDACTED]@"),
)
145
+
146
# Candidate usage keys for `claude -p --output-format json` and compatible
# benchmark providers. The Anthropic SDK, Claude Code, and OpenAI-style JSON
# output formats may change over time, so matching is best-effort against
# several candidate key names per bucket.
USAGE_KEY_GROUPS: tuple[tuple[str, tuple[str, ...]], ...] = (
    ("input_tokens", ("input_tokens", "inputTokens", "prompt_tokens", "promptTokens")),
    ("output_tokens", ("output_tokens", "outputTokens", "completion_tokens", "completionTokens")),
    ("cache_read", ("cache_read_input_tokens", "cacheRead")),
    ("cache_creation", ("cache_creation_input_tokens", "cacheCreation")),
)
# Keys of the nested "details" object that may carry provider cached-token counts.
PROVIDER_CACHE_DETAIL_KEYS = (
    "prompt_tokens_details",
    "promptTokensDetails",
    "input_tokens_details",
    "inputTokensDetails",
)
# Keys inside a details object that hold the cached-token count.
PROVIDER_CACHED_TOKEN_KEYS = ("cached_tokens", "cachedTokens")
# Candidate keys for the reported cost in USD.
COST_KEYS = ("total_cost_usd", "cost_usd", "costUSD")
# (bucket, candidate keys) pairs for optional cost-shift / byte-saving metrics.
SHIFT_METRIC_KEY_GROUPS: tuple[tuple[str, tuple[str, ...]], ...] = (
    ("turns", ("turns", "num_turns", "total_turns")),
    ("hook_triggers", ("hook_triggers", "hookTriggerCount", "hook_trigger_count")),
    ("bytes_before", ("bytes_before", "bytesBefore", "raw_bytes_before")),
    ("bytes_after", ("bytes_after", "bytesAfter", "visible_bytes_after")),
    ("artifacts_used", ("artifacts_used", "artifact_count", "artifactsUsed")),
)
# Keys for external work reported as a single aggregate.
EXTERNAL_TOKEN_AGGREGATE_KEYS = ("external_tokens",)
EXTERNAL_COST_AGGREGATE_KEYS = ("external_cost_usd",)
# (source, token keys, cost keys) triples for external work reported per source.
EXTERNAL_SOURCE_KEY_GROUPS: tuple[tuple[str, tuple[str, ...], tuple[str, ...]], ...] = (
    ("auxiliary", ("auxiliary_tokens",), ("auxiliary_cost_usd",)),
    ("subagent", ("subagent_tokens",), ("subagent_cost_usd",)),
    ("provider", ("provider_tokens",), ("provider_cost_usd",)),
)
# Upper bounds used by normalize_usage_token / normalize_usage_cost; larger
# values are treated as invalid metrics.
MAX_USAGE_TOKEN_COUNT = 10**12
MAX_USAGE_COST_USD = 10**9
# Byte -> token proxy conversion factor. This is a conservative estimate
# derived from byte deltas, not measured model tokens, and reports label it
# clearly with evidence="inferred". It uses the commonly cited approximation
# of ~4 bytes/token for English text.
TOKEN_PROXY_BYTES_PER_TOKEN = 4
# Output size caps in bytes. VERSION_OUTPUT_MAX_BYTES is passed to
# run_bounded_command by claude_version.
# NOTE(review): the other two are presumably the caps for the `claude` run and
# the success command — confirm at their call sites.
CLAUDE_OUTPUT_MAX_BYTES = 1_000_000
SUCCESS_COMMAND_OUTPUT_MAX_BYTES = 64_000
VERSION_OUTPUT_MAX_BYTES = 16_000
# NOTE(review): presumably the grace period between terminate and kill — confirm in run_bounded_command.
PROCESS_TERMINATE_GRACE_SECONDS = 2.0
# First path components that may be a root-level symlink, mapped to the only
# target accepted for them (see _normalize_allowed_first_absolute_symlink).
ALLOWED_FIRST_ABSOLUTE_SYMLINKS = {
    "tmp": Path("/private/tmp"),
    "var": Path("/private/var"),
}
191
+
192
+
193
def _base_open_flags() -> int:
    """Return read-only open flags, adding O_CLOEXEC where the platform defines it."""
    return os.O_RDONLY | getattr(os, "O_CLOEXEC", 0)
198
+
199
+
200
def _no_follow_flag() -> int:
    """Return os.O_NOFOLLOW.

    Raises:
        OSError: if the platform does not define O_NOFOLLOW.
    """
    flag = getattr(os, "O_NOFOLLOW", None)
    if flag is None:
        raise OSError("platform does not support no-follow file opens")
    return flag
204
+
205
+
206
def no_follow_file_ops_supported() -> bool:
    """Report whether O_NOFOLLOW and dir_fd-relative open/mkdir are all available."""
    if not hasattr(os, "O_NOFOLLOW"):
        return False
    dir_fd_capable = os.supports_dir_fd
    return os.open in dir_fd_capable and os.mkdir in dir_fd_capable
208
+
209
+
210
def require_no_follow_file_ops_supported() -> None:
    """Exit with an explanatory message when no-follow file operations are unavailable.

    Raises:
        SystemExit: if no_follow_file_ops_supported() returns False.
    """
    if no_follow_file_ops_supported():
        return
    message = (
        "benchmark runner requires POSIX no-follow file operations for safe fixture and CSV paths; "
        "this platform is not supported yet."
    )
    raise SystemExit(message)
216
+
217
+
218
def _directory_flag() -> int:
    """Return os.O_DIRECTORY when the platform defines it, otherwise 0."""
    if hasattr(os, "O_DIRECTORY"):
        return os.O_DIRECTORY
    return 0
220
+
221
+
222
def _normalized_link_target(parent: Path, raw_target: str) -> Path:
    """Return the lexically normalized target of a symlink.

    A relative ``raw_target`` is interpreted relative to ``parent``. The result
    is normalized with os.path.normpath only; the filesystem is not consulted.
    """
    candidate = Path(raw_target)
    anchored = candidate if candidate.is_absolute() else parent / candidate
    return Path(os.path.normpath(str(anchored)))
227
+
228
+
229
def _normalize_allowed_first_absolute_symlink(path: Path) -> Path:
    """Rewrite an allow-listed root-level symlink prefix to its expected target.

    Only applies to absolute paths whose first component is a key of
    ALLOWED_FIRST_ABSOLUTE_SYMLINKS, and only when that component really is a
    symlink whose normalized target equals the allow-listed value. In every
    other case, including any OSError while inspecting the link, ``path`` is
    returned unchanged.
    """
    parts = path.parts
    if not path.is_absolute() or len(parts) < 2:
        return path
    top_level = parts[1]
    canonical = ALLOWED_FIRST_ABSOLUTE_SYMLINKS.get(top_level)
    if canonical is None:
        return path
    anchor = Path(path.anchor)
    candidate_link = anchor / top_level
    try:
        is_link = stat.S_ISLNK(os.lstat(candidate_link).st_mode)
        if not is_link:
            return path
        actual = _normalized_link_target(anchor, os.readlink(candidate_link))
        if actual != canonical:
            return path
    except OSError:
        return path
    return canonical.joinpath(*parts[2:])
245
+
246
+
247
def _open_directory_at(dir_fd: int, component: str, path: Path) -> int:
    """Open ``component`` relative to ``dir_fd`` as a directory without following symlinks.

    ``path`` is used only in the error message. The returned descriptor is
    owned by the caller; it is closed here if the directory check fails.

    Raises:
        OSError: if the open fails or the opened object is not a directory.
    """
    flags = _base_open_flags() | _directory_flag() | _no_follow_flag()
    fd = os.open(component, flags, dir_fd=dir_fd)
    try:
        is_directory = stat.S_ISDIR(os.fstat(fd).st_mode)
        if not is_directory:
            raise OSError(f"not a directory: {path}")
    except Exception:
        os.close(fd)
        raise
    return fd
256
+
257
+
258
def _ensure_directory_no_symlink(path: Path, *, create: bool = False) -> int:
    """Walk ``path`` one component at a time and return an fd for the final directory.

    Each component is opened relative to the previous descriptor with
    no-follow flags. With ``create=True`` a missing component is created
    (mode 0o777) and then opened. The caller owns the returned descriptor.

    Raises:
        OSError: if dir_fd-relative open/mkdir is unsupported, or a component
            cannot be opened (FileNotFoundError when missing and ``create`` is False).
    """
    if not (os.open in os.supports_dir_fd and os.mkdir in os.supports_dir_fd):
        raise OSError("platform does not support directory-relative no-follow directory access")
    path = _normalize_allowed_first_absolute_symlink(path)
    absolute = path.is_absolute()
    segments = list(path.parts)
    if absolute and segments:
        # parts[0] is the anchor itself; it is opened separately below.
        segments = segments[1:]
    start = path.anchor if absolute else "."
    current_fd = os.open(start or ".", _base_open_flags() | _directory_flag())
    try:
        for segment in segments:
            try:
                child_fd = _open_directory_at(current_fd, segment, path)
            except FileNotFoundError:
                if not create:
                    raise
                os.mkdir(segment, 0o777, dir_fd=current_fd)
                child_fd = _open_directory_at(current_fd, segment, path)
            # Hand over: the parent descriptor is no longer needed.
            os.close(current_fd)
            current_fd = child_fd
        return current_fd
    except Exception:
        os.close(current_fd)
        raise
282
+
283
+
284
def _open_regular_no_symlink(
    path: Path,
    flags: int | None = None,
    mode: int = 0o666,
    *,
    create_parent: bool = False,
) -> int:
    """Open ``path`` as a regular file without following symlinks and return its fd.

    The parent directory is reached through _ensure_directory_no_symlink and
    the final component is opened relative to it with O_NOFOLLOW added.

    Args:
        path: File to open.
        flags: os.open flags; defaults to _base_open_flags() when None.
        mode: Permission bits passed to os.open.
        create_parent: Passed to _ensure_directory_no_symlink as ``create``.

    Returns:
        A file descriptor owned by the caller.

    Raises:
        OSError: if no-follow or dir_fd opens are unsupported, the open fails,
            or the opened object is not a regular file.
    """
    if os.open not in os.supports_dir_fd:
        raise OSError("platform does not support directory-relative no-follow opens")
    # Resolve the flags before acquiring the parent descriptor:
    # _no_follow_flag() raises OSError on platforms without O_NOFOLLOW, and
    # raising after the parent fd is open would leak it.
    open_flags = (flags if flags is not None else _base_open_flags()) | _no_follow_flag()
    path = _normalize_allowed_first_absolute_symlink(path)
    parent_fd = _ensure_directory_no_symlink(path.parent, create=create_parent)
    try:
        fd = os.open(path.name, open_flags, mode, dir_fd=parent_fd)
        try:
            if not stat.S_ISREG(os.fstat(fd).st_mode):
                raise OSError(f"not a regular file: {path}")
        except BaseException:
            # Close on any failure, including KeyboardInterrupt, so the fd never leaks.
            os.close(fd)
            raise
        return fd
    finally:
        os.close(parent_fd)
307
+
308
+
309
def _read_text_no_follow(path: Path) -> str:
    """Read ``path`` as UTF-8 text via a no-follow open and return its content."""
    fd = _open_regular_no_symlink(path)
    try:
        handle = os.fdopen(fd, "r", encoding="utf-8")
    except BaseException:
        # fdopen did not take ownership, so the raw descriptor is still ours to close.
        os.close(fd)
        raise
    with handle:
        return handle.read()
318
+
319
+
320
@contextmanager
def csv_file_lock(csv_path: Path, *, create_parent: bool) -> Any:
    """Hold an exclusive advisory lock on ``<csv name>.lock`` for the duration of the block.

    The sidecar lock file is opened with the no-follow helper (created with
    mode 0o600 if missing). The lock is released and the descriptor closed on
    exit.

    Raises:
        OSError: if fcntl is unavailable, or the lock file cannot be opened or locked.
    """
    if fcntl is None:
        raise OSError("platform does not support advisory CSV locks")
    sidecar = csv_path.with_name(f"{csv_path.name}.lock")
    lock_fd = _open_regular_no_symlink(
        sidecar,
        os.O_CREAT | os.O_RDWR,
        0o600,
        create_parent=create_parent,
    )
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX)
        try:
            yield
        finally:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
    finally:
        os.close(lock_fd)
338
+
339
+
340
# Reproducibility first: fields that the fixture does not specify are not
# passed on argv. Whether a field was given explicitly is preserved at parse
# time, so a variant intended as a baseline is not distorted by an implicit
# default (for example effort="medium").
@dataclass
class TaskFixture:
    """One benchmark task as read from the tasks fixture file."""

    # Identifier and prompt are the only required fields.
    id: str
    prompt: str
    model: str = "sonnet"
    # None means "not specified"; build_claude_argv then omits --effort.
    effort: str | None = None
    max_turns: int = 3
    # None means "not specified"; build_claude_argv then omits --max-budget-usd.
    max_budget_usd: float | None = None
    allowed_tools: list[str] = field(default_factory=list)
    success_command: str | None = None
    success_cwd: str = "."
354
+
355
+
356
@dataclass
class Variant:
    """A named bundle of extra arguments appended to the `claude -p` command line."""

    name: str
    # Extra CLI arguments; each instance gets its own empty list by default.
    extra_args: list[str] = field(default_factory=list)
360
+
361
+
362
@dataclass
class RunResult:
    """Measurements recorded for one task/variant run."""

    # Required identity and primary measurements.
    task_id: str
    variant: str
    model: str
    effort: str
    tokens: dict[str, int]
    cost_usd: float
    success: bool
    notes: str
    # Optional measurements. Each "*_measured" flag defaults to False and each
    # count defaults to zero.
    corrections: int = 0
    cost_measured: bool = False
    wall_time_seconds: float = 0.0
    turns: int = 0
    hook_triggers: int = 0
    bytes_before: int = 0
    bytes_after: int = 0
    artifacts_used: int = 0
    external_tokens: int = 0
    external_tokens_measured: bool = False
    external_cost_usd: float = 0.0
    external_cost_measured: bool = False
    provider_cached_tokens: int = 0
    provider_cached_tokens_measured: bool = False
    primary_tokens_measured: bool = False
387
+
388
+
389
@dataclass
class BoundedProcessResult:
    """Outcome of a subprocess run with a time limit and an output size limit."""

    returncode: int
    stdout: str
    stderr: str
    # Both flags default to False.
    timed_out: bool = False
    output_truncated: bool = False
396
+
397
+
398
def parse_positive_int(value: Any, *, field: str, owner: str) -> int:
    """Validate a fixture value as a positive integer and return it.

    Accepts an int (but not a bool) or a string of ASCII digits, ignoring
    surrounding whitespace.

    Raises:
        SystemExit: if the value has the wrong type or format, or is not > 0.
    """
    type_error = f"{owner} {field} must be a positive integer"
    if isinstance(value, bool):
        # bool is a subclass of int; reject it explicitly.
        raise SystemExit(type_error)
    if isinstance(value, int):
        number = value
    else:
        text = value.strip() if isinstance(value, str) else None
        if text is None or re.fullmatch(r"[0-9]+", text) is None:
            raise SystemExit(type_error)
        number = int(text)
    if number <= 0:
        raise SystemExit(f"{owner} {field} must be > 0")
    return number
411
+
412
+
413
def parse_string_list(value: Any, *, field: str, owner: str) -> list[str]:
    """Validate a fixture value as a list of non-blank strings and return a copy.

    Raises:
        SystemExit: if the value is not a list, or an item is not a string or
            is empty/whitespace-only.
    """
    if not isinstance(value, list):
        # Covers None as well as any other non-list value.
        raise SystemExit(f"{owner} {field} must be a JSON list of strings")
    for position, entry in enumerate(value):
        if not isinstance(entry, str):
            raise SystemExit(f"{owner} {field}[{position}] must be a string")
        if entry.strip() == "":
            raise SystemExit(f"{owner} {field}[{position}] must be non-empty")
    return list(value)
427
+
428
+
429
def validate_variant_extra_args(extra_args: list[str], *, owner: str) -> list[str]:
    """Reject variant arguments that would override flags the runner sets itself.

    The flag part of each argument (text before the first "=") is checked
    against PROTECTED_VARIANT_FLAGS. Returns ``extra_args`` unchanged when all
    arguments are acceptable.

    Raises:
        SystemExit: on the first argument that names a protected flag.
    """
    for position, argument in enumerate(extra_args):
        flag, _, _ = argument.partition("=")
        if flag not in PROTECTED_VARIANT_FLAGS:
            continue
        raise SystemExit(
            f"{owner} extra_args[{position}] must not override runner-controlled Claude flags: {flag}"
        )
    return extra_args
437
+
438
+
439
def normalize_usage_token(value: Any) -> int | None:
    """Convert a reported token count to a non-negative int, or None if unusable.

    Rejects bools, non-numeric types, non-finite values, negatives, and values
    above MAX_USAGE_TOKEN_COUNT. Float inputs are truncated with int().
    """
    if isinstance(value, bool):
        return None
    if not isinstance(value, (int, float)):
        return None
    try:
        as_float = float(value)
    except (OverflowError, ValueError):
        return None
    if not math.isfinite(as_float):
        return None
    if as_float < 0 or as_float > MAX_USAGE_TOKEN_COUNT:
        return None
    return int(as_float)
450
+
451
+
452
def normalize_usage_cost(value: Any) -> float | None:
    """Convert a reported cost to a non-negative float, or None if unusable.

    Rejects bools, non-numeric types, non-finite values, negatives, and values
    above MAX_USAGE_COST_USD.
    """
    if isinstance(value, bool):
        return None
    if not isinstance(value, (int, float)):
        return None
    try:
        as_float = float(value)
    except (OverflowError, ValueError):
        return None
    if not math.isfinite(as_float):
        return None
    if as_float < 0 or as_float > MAX_USAGE_COST_USD:
        return None
    return as_float
463
+
464
+
465
def parse_tasks(path: Path) -> list[TaskFixture]:
    """Load and validate the task fixture file.

    The file must contain a JSON list of objects. ``id`` and ``prompt`` are
    required; the other fields fall back to the TaskFixture defaults.
    ``effort`` and ``max_budget_usd`` stay None when absent so that they are
    not passed on argv.

    Args:
        path: Location of the tasks JSON file (read without following symlinks).

    Returns:
        One TaskFixture per entry, in file order.

    Raises:
        SystemExit: on any structural or value error in the fixture.
    """
    raw = json.loads(_read_text_no_follow(path))
    if not isinstance(raw, list):
        raise SystemExit(f"tasks file must be a JSON list: {path}")
    fixtures: list[TaskFixture] = []
    for item in raw:
        if not isinstance(item, dict):
            raise SystemExit(f"task entry must be a JSON object: {item}")
        owner = f"task {item.get('id')}"
        # Report missing required fields like every other fixture error,
        # instead of letting a bare KeyError escape.
        for required in ("id", "prompt"):
            if required not in item:
                raise SystemExit(f"{owner} is missing required field: {required}")
        budget_raw = item.get("max_budget_usd")
        budget: float | None = None
        if budget_raw is not None:
            # bool is a subclass of int; float(True) would silently become 1.0.
            if isinstance(budget_raw, bool):
                raise SystemExit(f"{owner} max_budget_usd must be number or null")
            try:
                budget = float(budget_raw)
            except (TypeError, ValueError, OverflowError):
                raise SystemExit(f"{owner} max_budget_usd must be number or null") from None
            if not math.isfinite(budget) or budget <= 0:
                raise SystemExit(f"{owner} max_budget_usd must be finite and > 0 (use null for unlimited)")
        effort_raw = item.get("effort")
        fixtures.append(TaskFixture(
            id=str(item["id"]),
            prompt=str(item["prompt"]),
            model=str(item.get("model", "sonnet")),
            effort=str(effort_raw) if effort_raw is not None else None,
            max_turns=parse_positive_int(item.get("max_turns", 3), field="max_turns", owner=owner),
            max_budget_usd=budget,
            allowed_tools=parse_string_list(
                item.get("allowed_tools", []),
                field="allowed_tools",
                owner=owner,
            ),
            success_command=item.get("success_command"),
            success_cwd=str(item.get("success_cwd", ".")),
        ))
    return fixtures
500
+
501
+
502
def parse_variants(path: Path) -> list[Variant]:
    """Load and validate the variant fixture file.

    The file must contain a JSON list of objects. ``name`` is required.
    ``extra_args`` defaults to an empty list and may not contain flags that
    the runner controls itself.

    Args:
        path: Location of the variants JSON file (read without following symlinks).

    Returns:
        One Variant per entry, in file order.

    Raises:
        SystemExit: on any structural or value error in the fixture.
    """
    raw = json.loads(_read_text_no_follow(path))
    if not isinstance(raw, list):
        raise SystemExit(f"variants file must be a JSON list: {path}")
    variants: list[Variant] = []
    for item in raw:
        if not isinstance(item, dict):
            raise SystemExit(f"variant entry must be a JSON object: {item}")
        # Report a missing name like every other fixture error, instead of
        # letting a bare KeyError escape.
        if "name" not in item:
            raise SystemExit(f"variant entry is missing required field: name: {item}")
        owner = f"variant {item.get('name')}"
        extra_args = parse_string_list(item.get("extra_args", []), field="extra_args", owner=owner)
        variants.append(Variant(
            name=str(item["name"]),
            extra_args=validate_variant_extra_args(extra_args, owner=owner),
        ))
    return variants
522
+
523
+
524
def collect_usage(payload: Any) -> tuple[dict[str, int], float, bool, bool]:
    """Extract token counts and cost from a `claude -p --output-format json` response.

    Intended policy: when a response carries both a top-level usage object and
    nested per-message usage, summing them would double count and over-report
    cost. Each token bucket and the cost therefore take only their **first
    match**, in breadth-first order starting at the top level. If the response
    structure changes so that the first match is not the intended one, check
    the measurements per fixture/variant.

    Returns:
        ``(tokens, cost, cost_seen, primary_tokens_measured)``. ``tokens`` has
        one entry per USAGE_KEY_GROUPS bucket (0 when not found). The last flag
        is True only when both input and output token buckets were found.
    """
    bucket_names = [bucket for bucket, _ in USAGE_KEY_GROUPS]
    tokens: dict[str, int] = dict.fromkeys(bucket_names, 0)
    seen_token: dict[str, bool] = dict.fromkeys(bucket_names, False)
    cost = 0.0
    seen_cost = False
    # Breadth-first walk so that a top-level dict is evaluated before nested ones.
    pending: collections.deque[Any] = collections.deque([payload])
    while pending:
        node = pending.popleft()
        if isinstance(node, list):
            pending.extend(node)
            continue
        if not isinstance(node, dict):
            continue
        for bucket, aliases in USAGE_KEY_GROUPS:
            if seen_token[bucket]:
                continue
            counts = (normalize_usage_token(node.get(alias)) for alias in aliases)
            found = next((count for count in counts if count is not None), None)
            if found is not None:
                tokens[bucket] = found
                seen_token[bucket] = True
        if not seen_cost:
            prices = (normalize_usage_cost(node.get(alias)) for alias in COST_KEYS)
            price = next((value for value in prices if value is not None), None)
            if price is not None:
                cost = price
                seen_cost = True
        pending.extend(node.values())
    # Token-savings claims need a comparable primary-token total. Cache buckets
    # may legitimately be absent, but input and output must both be observed;
    # otherwise a partial payload would count the missing side as a measured
    # zero and could overstate savings.
    primary_tokens_measured = seen_token["input_tokens"] and seen_token["output_tokens"]
    return tokens, cost, seen_cost, primary_tokens_measured
567
+
568
+
569
def collect_provider_cache_telemetry(payload: Any) -> tuple[int, bool]:
    """Find provider prompt-cache telemetry without touching the token totals.

    OpenAI-style responses report cached prompt tokens under
    `usage.prompt_tokens_details.cached_tokens`. Because `prompt_tokens` may
    already include those tokens, the value is kept apart from the primary
    token buckets and from ContextGuard savings claims. Anthropic-style
    `cache_read_input_tokens` stays in the `cache_read` bucket handled by
    `collect_usage`.

    Returns:
        ``(cached_tokens, True)`` for the first valid value found in
        breadth-first order, or ``(0, False)`` when none exists.
    """
    pending: collections.deque[Any] = collections.deque([payload])
    while pending:
        node = pending.popleft()
        if isinstance(node, list):
            pending.extend(node)
            continue
        if not isinstance(node, dict):
            continue
        for details_key in PROVIDER_CACHE_DETAIL_KEYS:
            details = node.get(details_key)
            if isinstance(details, dict):
                for cached_key in PROVIDER_CACHED_TOKEN_KEYS:
                    count = normalize_usage_token(details.get(cached_key))
                    if count is not None:
                        return count, True
        pending.extend(node.values())
    return 0, False
595
+
596
+
597
def collect_provider_cached_tokens(payload: Any) -> int:
    """Return only the cached-token count from collect_provider_cache_telemetry."""
    return collect_provider_cache_telemetry(payload)[0]
601
+
602
+
603
def elapsed_seconds_since(start: float) -> float:
    """Return the monotonic-clock seconds elapsed since ``start``, never below 0.0."""
    delta = time.monotonic() - start
    return delta if delta > 0.0 else 0.0
605
+
606
+
607
def first_normalized_token(cur: dict[str, Any], keys: tuple[str, ...]) -> int | None:
    """Return the first valid token count found in ``cur`` under ``keys``, in order, or None."""
    candidates = (normalize_usage_token(cur.get(key)) for key in keys)
    return next((count for count in candidates if count is not None), None)
613
+
614
+
615
def first_normalized_cost(cur: dict[str, Any], keys: tuple[str, ...]) -> float | None:
    """Return the first valid cost found in ``cur`` under ``keys``, in order, or None."""
    candidates = (normalize_usage_cost(cur.get(key)) for key in keys)
    return next((amount for amount in candidates if amount is not None), None)
621
+
622
+
623
def contains_external_source_tokens(value: Any) -> bool:
    """Report whether any dict nested in ``value`` holds a per-source external token count."""
    pending: collections.deque[Any] = collections.deque([value])
    while pending:
        node = pending.popleft()
        if isinstance(node, dict):
            if any(
                first_normalized_token(node, token_keys) is not None
                for _source, token_keys, _cost_keys in EXTERNAL_SOURCE_KEY_GROUPS
            ):
                return True
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)
    return False
635
+
636
+
637
def collect_shift_metrics(payload: Any) -> dict[str, int | float | bool]:
    """Gather optional cost-shift and byte-saving metrics from a response payload.

    External work may be reported in one of two shapes: a single aggregate
    (`external_tokens` + `external_cost_usd`), or explicit per-source records
    (`auxiliary_*`, `subagent_*`, `provider_*`). The two are never mixed. An
    aggregate token count, when present, is authoritative. Otherwise only the
    per-source token records are summed, and the cost counts as measured only
    if every positive source-token record carries its matching source cost.

    Returns:
        A dict with one entry per SHIFT_METRIC_KEY_GROUPS bucket (first match
        in breadth-first order, 0 when absent) plus ``external_cost_usd``,
        ``external_cost_measured``, ``external_tokens`` and
        ``external_tokens_measured``.
    """
    shift_buckets = [bucket for bucket, _ in SHIFT_METRIC_KEY_GROUPS]
    metrics: dict[str, int | float | bool] = dict.fromkeys(shift_buckets, 0)
    metrics.update(
        external_cost_usd=0.0,
        external_cost_measured=False,
        external_tokens=0,
        external_tokens_measured=False,
    )
    resolved: set[str] = set()
    total_tokens: int | None = None
    total_cost = 0.0
    total_cost_found = False
    summed_tokens = 0
    any_source_tokens = False
    summed_cost = 0.0
    every_source_priced = True

    pending: collections.deque[Any] = collections.deque([payload])
    while pending:
        node = pending.popleft()
        if isinstance(node, list):
            pending.extend(node)
            continue
        if not isinstance(node, dict):
            continue

        # Simple shift metrics: the first valid match per bucket wins.
        for bucket, aliases in SHIFT_METRIC_KEY_GROUPS:
            if bucket in resolved:
                continue
            count = first_normalized_token(node, aliases)
            if count is not None:
                metrics[bucket] = count
                resolved.add(bucket)

        # Aggregate shape: the cost is only read from the dict that supplied
        # the aggregate token count.
        if total_tokens is None:
            total_tokens = first_normalized_token(node, EXTERNAL_TOKEN_AGGREGATE_KEYS)
            if total_tokens is not None:
                priced = first_normalized_cost(node, EXTERNAL_COST_AGGREGATE_KEYS)
                if priced is not None:
                    total_cost = priced
                    total_cost_found = True

        # Per-source shape: count a dict only when none of its children carry
        # source tokens themselves.
        source_records = []
        for _source, token_keys, cost_keys in EXTERNAL_SOURCE_KEY_GROUPS:
            count = first_normalized_token(node, token_keys)
            if count is not None:
                source_records.append((count, cost_keys))
        if source_records and not any(contains_external_source_tokens(child) for child in node.values()):
            for count, cost_keys in source_records:
                summed_tokens += count
                any_source_tokens = True
                priced = first_normalized_cost(node, cost_keys)
                if priced is not None:
                    summed_cost += priced
                elif count > 0:
                    every_source_priced = False

        pending.extend(node.values())

    if total_tokens is not None:
        metrics["external_tokens"] = total_tokens
        metrics["external_tokens_measured"] = True
        metrics["external_cost_usd"] = total_cost if total_cost_found else 0.0
        metrics["external_cost_measured"] = total_cost_found
    elif any_source_tokens:
        metrics["external_tokens"] = summed_tokens
        metrics["external_tokens_measured"] = True
        metrics["external_cost_usd"] = summed_cost
        metrics["external_cost_measured"] = every_source_priced
    return metrics
711
+
712
+
713
def claude_version(claude_bin: str) -> str:
    """Return the first line of `<claude_bin> --version`, or "unknown".

    "unknown" is returned when the command cannot be run, times out, or prints
    nothing but whitespace on stdout.
    """
    try:
        proc = run_bounded_command(
            [claude_bin, "--version"],
            cwd=Path.cwd(),
            timeout_seconds=5,
            max_output_bytes=VERSION_OUTPUT_MAX_BYTES,
        )
    except (OSError, subprocess.TimeoutExpired, ValueError):
        return "unknown"
    # Whitespace-only stdout strips to "", whose splitlines() is empty;
    # indexing [0] there would raise an uncaught IndexError.
    lines = proc.stdout.strip().splitlines() if proc.stdout else []
    return lines[0] if lines else "unknown"
724
+
725
+
726
def build_claude_argv(claude_bin: str, task: TaskFixture, variant: Variant) -> list[str]:
    """Build the `claude -p` argv for one task/variant pair.

    Options the fixture does not specify (effort, max_budget_usd) are left out
    of argv, so that the real meaning of a baseline variant (the CLI defaults
    as they are) is not distorted by an implicit runner default. The prompt is
    always the last argument, after a "--" separator.
    """
    effort_args = ["--effort", task.effort] if task.effort else []
    budget_args = [] if task.max_budget_usd is None else ["--max-budget-usd", str(task.max_budget_usd)]
    tool_args = ["--allowedTools", ",".join(task.allowed_tools)] if task.allowed_tools else []
    return [
        claude_bin,
        "-p",
        "--model",
        task.model,
        "--max-turns",
        str(task.max_turns),
        "--output-format",
        "json",
        *effort_args,
        *budget_args,
        *tool_args,
        *variant.extra_args,
        "--",
        task.prompt,
    ]
745
+
746
+
747
def executable_argv0(command: str) -> str:
    """Turn ``command`` into an absolute path string usable as ``argv[0]``.

    A command that ``shutil.which`` can locate is returned as its fully
    resolved path. Otherwise ``command`` is treated as a filesystem path:
    ``~`` is expanded, an absolute path is returned unchanged and a relative
    one is resolved (against the current working directory).
    """
    located = shutil.which(command)
    if located:
        candidate = Path(located).expanduser().resolve()
    else:
        candidate = Path(command).expanduser()
        if not candidate.is_absolute():
            candidate = candidate.resolve()
    return str(candidate)
755
+
756
+
757
+ def _signal_process_group(proc: subprocess.Popen[bytes], sig: int, pgid: int | None) -> None:
758
+ if pgid is not None:
759
+ try:
760
+ os.killpg(pgid, sig)
761
+ return
762
+ except (AttributeError, ProcessLookupError):
763
+ pass
764
+ except OSError:
765
+ pass
766
+ try:
767
+ if sig == signal.SIGKILL:
768
+ proc.kill()
769
+ else:
770
+ proc.terminate()
771
+ except OSError:
772
+ pass
773
+
774
+
775
def run_bounded_command(
    argv: list[str],
    *,
    cwd: Path,
    timeout_seconds: int,
    max_output_bytes: int,
) -> BoundedProcessResult:
    """Run ``argv`` with a wall-clock deadline and a per-stream output cap.

    The child is started in its own session (``start_new_session=True``) so
    that signals can be sent to its whole process group. stdout and stderr are
    drained through a selector; at most ``max_output_bytes`` bytes are kept for
    each stream and anything beyond that is discarded.

    Termination policy:
      * when the deadline passes or a stream exceeds its cap, the group gets
        SIGTERM;
      * ``PROCESS_TERMINATE_GRACE_SECONDS`` later it gets SIGKILL;
      * if the pipes are still open after twice the grace period, reading is
        abandoned and the run is reported as timed out.

    Args:
        argv: Command and arguments, executed without a shell.
        cwd: Working directory for the child.
        timeout_seconds: Deadline measured from just before the read loop.
        max_output_bytes: Maximum bytes retained per stream.

    Returns:
        A ``BoundedProcessResult`` whose ``returncode`` is 124 when
        ``timed_out`` is set, 125 when only ``output_truncated`` is set, and
        the child's own exit status otherwise. Output is decoded as UTF-8 with
        replacement of undecodable bytes.

    Raises:
        OSError: If the process cannot be started (raised by ``Popen``).
        Any exception raised while reading is re-raised after the child's
        process group has been killed and its pipes closed.
    """
    proc = subprocess.Popen(
        argv,
        cwd=cwd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        start_new_session=True,
    )
    try:
        pgid = os.getpgid(proc.pid)
    except (AttributeError, OSError):
        # os.getpgid may be missing or fail; fall back to the child's pid.
        # _signal_process_group already tolerates a failing/missing os.killpg.
        pgid = proc.pid
    selector = selectors.DefaultSelector()
    buffers: dict[str, bytearray] = {"stdout": bytearray(), "stderr": bytearray()}
    streams = {"stdout": proc.stdout, "stderr": proc.stderr}

    timed_out = False
    output_truncated = False
    terminated_at: float | None = None
    sent_kill = False
    try:
        for name, stream in streams.items():
            if stream is None:
                continue
            try:
                os.set_blocking(stream.fileno(), False)
            except (AttributeError, OSError):
                pass
            selector.register(stream, selectors.EVENT_READ, name)

        deadline = time.monotonic() + timeout_seconds
        # Loop until both pipes reached EOF (each is unregistered on EOF).
        while selector.get_map():
            now = time.monotonic()
            if now >= deadline:
                timed_out = True
                if terminated_at is None:
                    _signal_process_group(proc, signal.SIGTERM, pgid)
                    terminated_at = now
            if terminated_at is not None and not sent_kill:
                if now - terminated_at >= PROCESS_TERMINATE_GRACE_SECONDS:
                    _signal_process_group(proc, signal.SIGKILL, pgid)
                    sent_kill = True
            if sent_kill and terminated_at is not None:
                if now - terminated_at >= PROCESS_TERMINATE_GRACE_SECONDS * 2:
                    # Pipes are still open long after SIGKILL; stop waiting
                    # for EOF and report the run as timed out.
                    timed_out = True
                    break
            events = selector.select(timeout=0.05)
            for key, _ in events:
                name = key.data
                stream = key.fileobj
                try:
                    chunk = os.read(stream.fileno(), 65536)
                except BlockingIOError:
                    continue
                if not chunk:
                    # EOF on this pipe.
                    selector.unregister(stream)
                    try:
                        stream.close()
                    except OSError:
                        pass
                    continue
                buffer = buffers[name]
                remaining = max_output_bytes - len(buffer)
                if remaining > 0:
                    buffer.extend(chunk[:remaining])
                if len(chunk) > remaining:
                    # Cap exceeded: keep draining (and discarding) but ask the
                    # child to stop.
                    output_truncated = True
                    if terminated_at is None:
                        _signal_process_group(proc, signal.SIGTERM, pgid)
                        terminated_at = time.monotonic()
    except BaseException:
        # The child lives in its own session, so an interrupt or error in this
        # process would otherwise leave it running. Kill the group, try to
        # reap the child, then propagate the original exception.
        _signal_process_group(proc, signal.SIGKILL, pgid)
        try:
            proc.wait(timeout=PROCESS_TERMINATE_GRACE_SECONDS)
        except subprocess.TimeoutExpired:
            pass
        raise
    finally:
        selector.close()
        # Pipes are closed on EOF inside the loop; close whatever is still
        # open after an early break or an exception so no descriptor leaks.
        for leftover in streams.values():
            if leftover is not None and not leftover.closed:
                try:
                    leftover.close()
                except OSError:
                    pass

    try:
        returncode = proc.wait(timeout=PROCESS_TERMINATE_GRACE_SECONDS)
    except subprocess.TimeoutExpired:
        _signal_process_group(proc, signal.SIGKILL, pgid)
        try:
            returncode = proc.wait(timeout=PROCESS_TERMINATE_GRACE_SECONDS)
        except subprocess.TimeoutExpired:
            returncode = 124
            timed_out = True
    if timed_out:
        returncode = 124
    elif output_truncated:
        returncode = 125
    return BoundedProcessResult(
        returncode=returncode,
        stdout=bytes(buffers["stdout"]).decode("utf-8", "replace"),
        stderr=bytes(buffers["stderr"]).decode("utf-8", "replace"),
        timed_out=timed_out,
        output_truncated=output_truncated,
    )
873
+
874
+
875
# shlex.split prevents shell injection, but it splits input such as
# `true ; echo pwned` into `["true", ";", "echo", "pwned"]`; /usr/bin/true then
# simply ignores ";", "echo" and "pwned" as arguments and exits successfully,
# which can produce a false-positive success=true.
# Therefore any token from the shlex split that appears to carry
# shell-composition intent causes the command to be rejected.
_SHELL_META_TOKENS = frozenset({";", "&&", "||", "|", "&", "<", ">", ">>", "<<", "<<<"})


def _has_shell_meta(argv: list[str]) -> bool:
    """Return True when any token of ``argv`` looks like shell composition.

    A token is flagged when it is exactly one of ``_SHELL_META_TOKENS`` or
    when it contains a trace of command substitution (``$(`` or a backtick)
    anywhere inside it.

    NOTE: only whole-token operators are matched, so an operator glued to a
    word (for example ``ok;``) is not detected by this check.
    """
    return any(
        token in _SHELL_META_TOKENS or "$(" in token or "`" in token
        for token in argv
    )
890
+
891
+
892
def run_success_command(task: TaskFixture, project_root: Path) -> tuple[bool, str]:
    """Run the fixture's ``success_command`` and report ``(passed, note)``.

    - The command is split with ``shlex.split`` and executed without a shell,
      so exactly one argv is run.
    - If the split tokens show shell-composition intent (``;``, ``&&``, ``|``,
      ``$()``, backticks, ...) the command is rejected; ``success_command``
      must be a single verification command or the path of one helper script.
    - A ``success_cwd`` that escapes ``project_root`` (e.g. ``..//../etc``) is
      rejected.

    A fixture without a ``success_command`` counts as passed.
    """
    command_text = task.success_command
    if not command_text:
        return True, "no success_command configured"

    try:
        argv = shlex.split(command_text)
    except ValueError as exc:
        return False, f"success_command parse error: {exc}"
    if not argv:
        return False, "success_command parsed to empty argv"
    if _has_shell_meta(argv):
        return False, "success_command contains shell-composition tokens (use a helper script)"

    # Compare fully resolved paths so ".." segments and symlinks cannot be
    # used to leave the project root.
    root = project_root.resolve()
    cwd = (project_root / task.success_cwd).resolve()
    try:
        cwd.relative_to(root)
    except ValueError:
        return False, f"success_cwd escapes project_root: {cwd}"

    try:
        outcome = run_bounded_command(
            argv,
            cwd=cwd,
            timeout_seconds=600,
            max_output_bytes=SUCCESS_COMMAND_OUTPUT_MAX_BYTES,
        )
    except (OSError, subprocess.TimeoutExpired, ValueError) as exc:
        return False, f"success_command failed to launch: {exc}"

    if outcome.timed_out:
        return False, "success_command timed out after 600s"
    if outcome.output_truncated:
        return False, f"success_command output limit exceeded ({SUCCESS_COMMAND_OUTPUT_MAX_BYTES} bytes)"
    passed = outcome.returncode == 0
    return passed, f"exit={outcome.returncode}"
930
+
931
+
932
def run_fixture(task: TaskFixture, variant: Variant, claude_bin: str,
                project_root: Path, dry_run: bool) -> RunResult:
    """Execute one (task, variant) benchmark run and collect its metrics.

    With ``dry_run`` nothing is executed: the result is marked successful and
    its note carries the shell-quoted argv. Otherwise claude is run with a
    1800 second timeout and an output cap of ``CLAUDE_OUTPUT_MAX_BYTES``.
    A launch error, timeout, output overflow, non-zero exit or non-JSON stdout
    yields a failed result with zeroed tokens and cost. On a parsable payload
    the usage, provider-cache and shift metrics are collected, and the success
    flag and note come from ``run_success_command``.
    """

    def without_usage(succeeded: bool, note: str, wall_time: float) -> RunResult:
        # Result for a run that produced no usage data: every token group is
        # zero and the cost is 0.0.
        return RunResult(
            task_id=task.id, variant=variant.name, model=task.model, effort=task.effort,
            tokens={group: 0 for group, _ in USAGE_KEY_GROUPS}, cost_usd=0.0,
            success=succeeded, notes=note,
            wall_time_seconds=wall_time,
        )

    argv = build_claude_argv(claude_bin, task, variant)
    started_at = time.monotonic()
    if dry_run:
        return without_usage(True, f"dry-run: {shlex.join(argv)}", 0.0)

    argv[0] = executable_argv0(argv[0])
    try:
        proc = run_bounded_command(
            argv,
            cwd=project_root,
            timeout_seconds=1800,
            max_output_bytes=CLAUDE_OUTPUT_MAX_BYTES,
        )
    except (OSError, subprocess.TimeoutExpired, ValueError) as exc:
        return without_usage(False, f"claude launch failed: {exc}", elapsed_seconds_since(started_at))

    # Failure reasons are checked in priority order: timeout, output overflow,
    # then a non-zero exit status.
    if proc.timed_out:
        failure_note = "claude timed out after 1800s"
    elif proc.output_truncated:
        failure_note = f"claude output limit exceeded ({CLAUDE_OUTPUT_MAX_BYTES} bytes)"
    elif proc.returncode != 0:
        failure_note = f"claude exit={proc.returncode}: {proc.stderr[-200:].strip()}"
    else:
        failure_note = None
    if failure_note is not None:
        return without_usage(False, failure_note, elapsed_seconds_since(started_at))

    try:
        payload = json.loads(proc.stdout)
    except json.JSONDecodeError as exc:
        return without_usage(
            False, f"claude returned non-JSON: {exc.msg}", elapsed_seconds_since(started_at)
        )

    tokens, cost, cost_measured, primary_tokens_measured = collect_usage(payload)
    cached_tokens, cached_tokens_measured = collect_provider_cache_telemetry(payload)
    shift = collect_shift_metrics(payload)
    success, success_note = run_success_command(task, project_root)
    return RunResult(
        task_id=task.id, variant=variant.name, model=task.model, effort=task.effort,
        tokens=tokens, cost_usd=cost, success=success, notes=success_note,
        cost_measured=cost_measured,
        primary_tokens_measured=primary_tokens_measured,
        wall_time_seconds=elapsed_seconds_since(started_at),
        turns=int(shift["turns"]),
        hook_triggers=int(shift["hook_triggers"]),
        bytes_before=int(shift["bytes_before"]),
        bytes_after=int(shift["bytes_after"]),
        artifacts_used=int(shift["artifacts_used"]),
        external_tokens=int(shift["external_tokens"]),
        external_tokens_measured=bool(shift["external_tokens_measured"]),
        external_cost_usd=float(shift["external_cost_usd"]),
        external_cost_measured=bool(shift["external_cost_measured"]),
        provider_cached_tokens=cached_tokens,
        provider_cached_tokens_measured=cached_tokens_measured,
    )
1010
+
1011
+
1012
def append_csv(csv_path: Path, claude_ver: str, result: RunResult, *, skip_existing: bool = False) -> bool:
    """Append ``result`` as one row of the benchmark CSV, under its file lock.

    The header is written first when the file is empty; a non-empty file must
    already carry the ``CSV_COLUMNS`` header (``validate_csv_schema`` exits
    otherwise).

    Args:
        csv_path: Target CSV, opened via ``_open_regular_no_symlink``.
        claude_ver: Version string stored in the ``claude_version`` column.
        result: The run to record.
        skip_existing: When true, do nothing if a row with the same
            ``(task_id, variant)`` is already present.

    Returns:
        ``False`` when the row was skipped because of ``skip_existing``,
        ``True`` after a row has been written.
    """

    def as_flag(value: object) -> str:
        # Booleans are stored as the lowercase words "true" / "false".
        return "true" if value else "false"

    with csv_file_lock(csv_path, create_parent=True):
        if skip_existing:
            run_key = (result.task_id, result.variant)
            if run_key in _read_existing_keys_unlocked(csv_path):
                return False
        descriptor = _open_regular_no_symlink(
            csv_path, os.O_CREAT | os.O_APPEND | os.O_WRONLY, 0o600, create_parent=True
        )
        try:
            is_empty = os.fstat(descriptor).st_size == 0
            if not is_empty:
                validate_csv_schema(csv_path, read_csv_header_unlocked(csv_path))
            with os.fdopen(descriptor, "a", encoding="utf-8", newline="") as out:
                # The file object now owns the descriptor; -1 tells the
                # finally clause not to close it a second time.
                descriptor = -1
                writer = csv.DictWriter(out, fieldnames=CSV_COLUMNS)
                if is_empty:
                    writer.writeheader()
                tokens = result.tokens
                shifted_cost_known = cost_shift_measured(result)
                # The combined cost is left blank unless every part of it was
                # measured.
                shifted_total = (
                    f"{(result.cost_usd + result.external_cost_usd):.6f}" if shifted_cost_known else ""
                )
                record = {
                    "date": sanitize_csv_cell(_dt.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")),
                    "claude_version": sanitize_csv_cell(claude_ver),
                    "task_id": sanitize_csv_cell(result.task_id),
                    "variant": sanitize_csv_cell(result.variant),
                    "model": sanitize_csv_cell(result.model),
                    "effort": sanitize_csv_cell(result.effort),
                    "total_tokens": sum(tokens.values()),
                    "input_tokens": tokens.get("input_tokens", 0),
                    "output_tokens": tokens.get("output_tokens", 0),
                    "cache_read": tokens.get("cache_read", 0),
                    "cache_creation": tokens.get("cache_creation", 0),
                    "provider_cached_tokens": result.provider_cached_tokens,
                    "provider_cached_tokens_measured": as_flag(result.provider_cached_tokens_measured),
                    "cost_usd": f"{result.cost_usd:.6f}",
                    "cost_measured": as_flag(result.cost_measured),
                    "wall_time_seconds": f"{result.wall_time_seconds:.6f}",
                    "turns": result.turns,
                    "hook_triggers": result.hook_triggers,
                    "bytes_before": result.bytes_before,
                    "bytes_after": result.bytes_after,
                    "artifacts_used": result.artifacts_used,
                    "external_tokens": result.external_tokens,
                    "external_tokens_measured": as_flag(result.external_tokens_measured),
                    "external_cost_usd": f"{result.external_cost_usd:.6f}",
                    "external_cost_measured": as_flag(result.external_cost_measured),
                    "total_cost_with_shift_usd": shifted_total,
                    "success": as_flag(result.success),
                    "corrections": result.corrections,
                    "notes": sanitize_csv_note(result.notes),
                    "primary_tokens_measured": as_flag(result.primary_tokens_measured),
                }
                writer.writerow(record)
        finally:
            if descriptor != -1:
                os.close(descriptor)
    return True
1070
+
1071
+
1072
def cost_shift_measured(result: RunResult) -> bool:
    """Tell whether the combined (primary + external) cost of a run is known.

    That requires a measured primary cost and measured external tokens, and,
    unless the external token count is zero, a measured external cost.
    """
    external_cost_known = result.external_tokens == 0 or result.external_cost_measured
    return result.cost_measured and result.external_tokens_measured and external_cost_known
1078
+
1079
+
1080
def read_csv_header_unlocked(csv_path: Path) -> list[str] | None:
    """Return the first CSV record of ``csv_path``, or ``None`` if it has none.

    The file is opened through ``_open_regular_no_symlink``; no lock is taken
    here.
    """
    descriptor = _open_regular_no_symlink(csv_path)
    try:
        with os.fdopen(descriptor, "r", encoding="utf-8", newline="") as source:
            # Ownership of the descriptor moved to the file object.
            descriptor = -1
            return next(csv.reader(source), None)
    finally:
        if descriptor != -1:
            os.close(descriptor)
1093
+
1094
+
1095
+ def validate_csv_schema(csv_path: Path, fieldnames: list[str] | None) -> None:
1096
+ """Fail loudly instead of appending/reporting across incompatible CSV schemas."""
1097
+ if fieldnames is None:
1098
+ return
1099
+ if fieldnames != CSV_COLUMNS:
1100
+ raise SystemExit(
1101
+ f"CSV schema mismatch for {csv_path}; start a new --csv file or migrate the header "
1102
+ f"to: {','.join(CSV_COLUMNS)}"
1103
+ )
1104
+
1105
+
1106
def write_text_no_follow(path: Path, text: str) -> None:
    """Replace the contents of ``path`` with ``text``, encoded as UTF-8.

    The file is opened through ``_open_regular_no_symlink`` with
    create/truncate/write-only flags, mode ``0o600`` and ``create_parent=True``.
    """
    open_flags = os.O_CREAT | os.O_TRUNC | os.O_WRONLY
    descriptor = _open_regular_no_symlink(path, open_flags, 0o600, create_parent=True)
    try:
        with os.fdopen(descriptor, "w", encoding="utf-8") as sink:
            # The file object owns the descriptor from here on.
            descriptor = -1
            sink.write(text)
    finally:
        # Only reached with a live descriptor when os.fdopen itself failed.
        if descriptor != -1:
            os.close(descriptor)
1115
+
1116
+
1117
def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) -> None:
    """Append one JSON line describing the cost shift of ``result`` to ``path``.

    The record is serialised with sorted keys and non-ASCII characters kept
    as-is. ``total_cost_with_shift_usd`` is ``None`` unless
    ``cost_shift_measured`` reports the combined cost as known. The write
    happens while holding ``csv_file_lock`` for ``path``.
    """
    if cost_shift_measured(result):
        shifted_total = round(result.cost_usd + result.external_cost_usd, 6)
    else:
        shifted_total = None
    record = {
        "date": _dt.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
        "claude_version": claude_ver,
        "task_id": result.task_id,
        "variant": result.variant,
        "success": result.success,
        "primary_cost_measured": result.cost_measured,
        "primary_cost_usd": round(result.cost_usd, 6),
        "primary_tokens_measured": result.primary_tokens_measured,
        "provider_cached_tokens": result.provider_cached_tokens,
        "provider_cached_tokens_measured": result.provider_cached_tokens_measured,
        "wall_time_seconds": round(result.wall_time_seconds, 6),
        "external_tokens_measured": result.external_tokens_measured,
        "external_cost_measured": result.external_cost_measured,
        "external_cost_usd": round(result.external_cost_usd, 6),
        "total_cost_with_shift_usd": shifted_total,
        "primary_tokens": sum(result.tokens.values()),
        "external_tokens": result.external_tokens,
        "artifacts_used": result.artifacts_used,
        "bytes_before": result.bytes_before,
        "bytes_after": result.bytes_after,
        "hook_triggers": result.hook_triggers,
        "turns": result.turns,
        "notes": sanitize_csv_note(result.notes),
    }
    with csv_file_lock(path, create_parent=True):
        descriptor = _open_regular_no_symlink(
            path, os.O_CREAT | os.O_APPEND | os.O_WRONLY, 0o600, create_parent=True
        )
        try:
            with os.fdopen(descriptor, "a", encoding="utf-8") as ledger:
                # The file object owns the descriptor from here on.
                descriptor = -1
                ledger.write(json.dumps(record, ensure_ascii=False, sort_keys=True) + "\n")
        finally:
            if descriptor != -1:
                os.close(descriptor)
1155
+
1156
+
1157
def _read_existing_keys_unlocked(csv_path: Path) -> set[tuple[str, str]]:
    """Collect the ``(task_id, variant)`` pairs already stored in ``csv_path``.

    A missing file yields an empty set. The header is checked with
    ``validate_csv_schema`` before any row is read; rows with an empty
    ``task_id`` or ``variant`` are ignored. No lock is taken here.
    """
    try:
        descriptor = _open_regular_no_symlink(csv_path)
    except FileNotFoundError:
        return set()
    recorded: set[tuple[str, str]] = set()
    try:
        with os.fdopen(descriptor, "r", encoding="utf-8", newline="") as source:
            # Ownership of the descriptor moved to the file object.
            descriptor = -1
            reader = csv.DictReader(source)
            header = reader.fieldnames
            validate_csv_schema(csv_path, None if header is None else list(header))
            for record in reader:
                task_id = record.get("task_id") or ""
                variant = record.get("variant") or ""
                if not (task_id and variant):
                    continue
                recorded.add((task_id, variant))
    finally:
        if descriptor != -1:
            os.close(descriptor)
    return recorded
1178
+
1179
+
1180
def _csv_exists_no_follow(csv_path: Path) -> bool:
    """Probe for the CSV itself, without following symlinks or creating a sidecar lock.

    Returns ``False`` when ``_open_regular_no_symlink`` reports the file as
    missing, ``True`` once it could be opened (the descriptor is closed again
    immediately).
    """
    try:
        descriptor = _open_regular_no_symlink(csv_path)
    except FileNotFoundError:
        return False
    os.close(descriptor)
    return True
1189
+
1190
+
1191
def existing_keys(csv_path: Path) -> set[tuple[str, str]]:
    """Return the ``(task_id, variant)`` combinations already recorded.

    Used on resume to decide which runs to skip. When the CSV does not exist
    an empty set is returned without taking the lock; otherwise the keys are
    read while holding it.
    """
    if _csv_exists_no_follow(csv_path):
        with csv_file_lock(csv_path, create_parent=False):
            return _read_existing_keys_unlocked(csv_path)
    return set()
1197
+
1198
+
1199
def read_csv_rows(csv_path: Path) -> list[dict[str, str]]:
    """Load every data row of the benchmark CSV as a dict keyed by column name.

    A missing file yields an empty list. The header is checked with
    ``validate_csv_schema`` first.

    Raises:
        SystemExit: When the file holds more than ``MAX_CSV_ROWS`` data rows
            (or when the schema check fails).
    """
    try:
        descriptor = _open_regular_no_symlink(csv_path)
    except FileNotFoundError:
        return []
    try:
        with os.fdopen(descriptor, "r", encoding="utf-8", newline="") as source:
            # Ownership of the descriptor moved to the file object.
            descriptor = -1
            reader = csv.DictReader(source)
            header = reader.fieldnames
            validate_csv_schema(csv_path, None if header is None else list(header))
            loaded: list[dict[str, str]] = []
            for record in reader:
                # Refuse the row that would push the total past the limit.
                if len(loaded) >= MAX_CSV_ROWS:
                    raise SystemExit(f"CSV row limit exceeded for {csv_path}: > {MAX_CSV_ROWS}")
                loaded.append(record)
            return loaded
    finally:
        if descriptor != -1:
            os.close(descriptor)
1219
+
1220
+
1221
def row_int(row: dict[str, str], key: str) -> int:
    """Read ``row[key]`` as an integer, truncating toward zero.

    The value is parsed as a float first, so ``"3.9"`` gives ``3``. A missing
    or empty cell, unparsable text and non-finite values all give ``0``.
    """
    cell = row.get(key) or 0
    try:
        number = float(cell)
        return int(number)
    except (TypeError, ValueError, OverflowError):
        # int() raises ValueError for NaN and OverflowError for infinities.
        return 0
1226
+
1227
+
1228
+ def row_optional_nonnegative_int(row: dict[str, str], key: str) -> int | None:
1229
+ raw = row.get(key)
1230
+ if raw is None:
1231
+ return None
1232
+ text = str(raw).strip()
1233
+ if not re.fullmatch(r"[0-9]+", text):
1234
+ return None
1235
+ try:
1236
+ return int(text)
1237
+ except (TypeError, ValueError, OverflowError):
1238
+ return None
1239
+
1240
+
1241
def row_float(row: dict[str, str], key: str) -> float:
    """Read ``row[key]`` as a finite float, defaulting to ``0.0``.

    A missing or empty cell, unparsable text, NaN and infinities all give
    ``0.0``.
    """
    cell = row.get(key) or 0
    try:
        number = float(cell)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    if not math.isfinite(number):
        return 0.0
    return number
1247
+
1248
+
1249
+ def row_optional_float(row: dict[str, str], key: str) -> float | None:
1250
+ raw = row.get(key)
1251
+ if raw is None or str(raw).strip() == "":
1252
+ return None
1253
+ try:
1254
+ value = float(raw)
1255
+ except (TypeError, ValueError, OverflowError):
1256
+ return None
1257
+ return value if math.isfinite(value) else None
1258
+
1259
+
1260
def row_has_finite_float(row: dict[str, str], key: str) -> bool:
    """Tell whether ``row_optional_float`` finds a finite float in ``row[key]``."""
    parsed = row_optional_float(row, key)
    return parsed is not None
1262
+
1263
+
1264
def row_bool(row: dict[str, str], key: str) -> bool:
    """Tell whether ``row[key]`` spells ``true``.

    The comparison ignores case and surrounding whitespace; a missing or
    empty cell and any other text count as ``False``.
    """
    cell = row.get(key) or ""
    normalized = str(cell).strip().lower()
    return normalized == "true"
1266
+
1267
+
1268
def row_success(row: dict[str, str]) -> bool:
    """Tell whether the row's ``success`` cell spells ``true``.

    The comparison ignores case and surrounding whitespace; a missing or
    empty cell counts as ``False``.
    """
    cell = row.get("success") or ""
    normalized = str(cell).strip().lower()
    return normalized == "true"
1270
+
1271
+
1272
def row_cost_shift_measured(row: dict[str, str]) -> bool:
    """Tell whether a CSV row carries a fully measured combined cost.

    Both ``cost_measured`` and ``external_tokens_measured`` must be true; in
    addition either ``external_tokens`` is zero or ``external_cost_measured``
    is true.
    """
    if not row_bool(row, "cost_measured"):
        return False
    if not row_bool(row, "external_tokens_measured"):
        return False
    # With no external tokens there is no external cost left to measure.
    if row_int(row, "external_tokens") == 0:
        return True
    return row_bool(row, "external_cost_measured")
1278
+
1279
+
1280
+ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str) -> dict[str, Any]:
1281
+ by_variant: dict[str, dict[str, Any]] = {}
1282
+ successful_rows_by_variant_task: dict[str, dict[str, list[dict[str, str]]]] = {}
1283
+ seen_tasks_by_variant: dict[str, set[str]] = {}
1284
+ successful_tasks_by_variant: dict[str, set[str]] = {}
1285
+
1286
+ for row in rows:
1287
+ variant = row.get("variant") or "unknown"
1288
+ task_id = row.get("task_id") or "unknown"
1289
+ seen_tasks_by_variant.setdefault(variant, set()).add(task_id)
1290
+ bucket = by_variant.setdefault(
1291
+ variant,
1292
+ {
1293
+ "runs": 0,
1294
+ "successful_runs": 0,
1295
+ "failed_runs": 0,
1296
+ "total_tokens_all_runs": 0,
1297
+ "primary_tokens_measured_runs": 0,
1298
+ "primary_cost_all_runs_usd": 0.0,
1299
+ "primary_cost_measured_runs": 0,
1300
+ "wall_time_seconds_all_runs": 0.0,
1301
+ "wall_time_seconds_measured_runs": 0,
1302
+ "provider_cached_tokens_all_runs": 0,
1303
+ "provider_cached_tokens_measured_runs": 0,
1304
+ "total_cost_with_shift_all_runs_usd": 0.0,
1305
+ "total_cost_with_shift_measured_runs": 0,
1306
+ "total_tokens_successful": 0,
1307
+ "primary_tokens_measured_successful": 0,
1308
+ "primary_cost_successful_usd": 0.0,
1309
+ "primary_cost_measured_successful": 0,
1310
+ "wall_time_seconds_successful": 0.0,
1311
+ "wall_time_seconds_measured_successful": 0,
1312
+ "provider_cached_tokens_successful": 0,
1313
+ "provider_cached_tokens_measured_successful": 0,
1314
+ "external_cost_successful_usd": 0.0,
1315
+ "external_cost_unknown_successful": 0,
1316
+ "total_cost_with_shift_successful_usd": 0.0,
1317
+ "total_cost_with_shift_measured_successful": 0,
1318
+ "external_tokens_successful": 0,
1319
+ "external_tokens_measured_successful": 0,
1320
+ "artifacts_used_successful": 0,
1321
+ "corrections_successful": 0,
1322
+ "bytes_before_successful": 0,
1323
+ "bytes_after_successful": 0,
1324
+ "turns_successful": 0,
1325
+ "hook_triggers_successful": 0,
1326
+ },
1327
+ )
1328
+ bucket["runs"] += 1
1329
+ bucket["total_tokens_all_runs"] += row_int(row, "total_tokens")
1330
+ if row_bool(row, "primary_tokens_measured"):
1331
+ bucket["primary_tokens_measured_runs"] += 1
1332
+ bucket["wall_time_seconds_all_runs"] += row_float(row, "wall_time_seconds")
1333
+ if row_has_finite_float(row, "wall_time_seconds"):
1334
+ bucket["wall_time_seconds_measured_runs"] += 1
1335
+ bucket["provider_cached_tokens_all_runs"] += row_int(row, "provider_cached_tokens")
1336
+ if row_bool(row, "provider_cached_tokens_measured"):
1337
+ bucket["provider_cached_tokens_measured_runs"] += 1
1338
+ if row_bool(row, "cost_measured"):
1339
+ bucket["primary_cost_all_runs_usd"] += row_float(row, "cost_usd")
1340
+ bucket["primary_cost_measured_runs"] += 1
1341
+ shifted_cost = row_optional_float(row, "total_cost_with_shift_usd")
1342
+ if row_cost_shift_measured(row) and shifted_cost is not None:
1343
+ bucket["total_cost_with_shift_all_runs_usd"] += shifted_cost
1344
+ bucket["total_cost_with_shift_measured_runs"] += 1
1345
+ if not row_success(row):
1346
+ bucket["failed_runs"] += 1
1347
+ continue
1348
+ bucket["successful_runs"] += 1
1349
+ successful_tasks_by_variant.setdefault(variant, set()).add(task_id)
1350
+ successful_rows_by_variant_task.setdefault(variant, {}).setdefault(task_id, []).append(row)
1351
+ bucket["total_tokens_successful"] += row_int(row, "total_tokens")
1352
+ if row_bool(row, "primary_tokens_measured"):
1353
+ bucket["primary_tokens_measured_successful"] += 1
1354
+ bucket["wall_time_seconds_successful"] += row_float(row, "wall_time_seconds")
1355
+ if row_has_finite_float(row, "wall_time_seconds"):
1356
+ bucket["wall_time_seconds_measured_successful"] += 1
1357
+ bucket["provider_cached_tokens_successful"] += row_int(row, "provider_cached_tokens")
1358
+ if row_bool(row, "provider_cached_tokens_measured"):
1359
+ bucket["provider_cached_tokens_measured_successful"] += 1
1360
+ if row_bool(row, "cost_measured"):
1361
+ bucket["primary_cost_successful_usd"] += row_float(row, "cost_usd")
1362
+ bucket["primary_cost_measured_successful"] += 1
1363
+ if row_bool(row, "external_tokens_measured") and (
1364
+ row_int(row, "external_tokens") == 0 or row_bool(row, "external_cost_measured")
1365
+ ):
1366
+ bucket["external_cost_successful_usd"] += row_float(row, "external_cost_usd")
1367
+ else:
1368
+ bucket["external_cost_unknown_successful"] += 1
1369
+ if row_cost_shift_measured(row) and shifted_cost is not None:
1370
+ bucket["total_cost_with_shift_successful_usd"] += shifted_cost
1371
+ bucket["total_cost_with_shift_measured_successful"] += 1
1372
+ if row_bool(row, "external_tokens_measured"):
1373
+ bucket["external_tokens_successful"] += row_int(row, "external_tokens")
1374
+ bucket["external_tokens_measured_successful"] += 1
1375
+ bucket["artifacts_used_successful"] += row_int(row, "artifacts_used")
1376
+ bucket["corrections_successful"] += row_int(row, "corrections")
1377
+ bucket["bytes_before_successful"] += row_int(row, "bytes_before")
1378
+ bucket["bytes_after_successful"] += row_int(row, "bytes_after")
1379
+ bucket["turns_successful"] += row_int(row, "turns")
1380
+ bucket["hook_triggers_successful"] += row_int(row, "hook_triggers")
1381
+
1382
+ for variant, bucket in by_variant.items():
1383
+ successes = bucket["successful_runs"]
1384
+ runs = bucket["runs"]
1385
+ bucket["failure_rate"] = (bucket["failed_runs"] / runs) if runs else None
1386
+ bucket["task_count"] = len(seen_tasks_by_variant.get(variant, set()))
1387
+ bucket["successful_task_count"] = len(successful_tasks_by_variant.get(variant, set()))
1388
+ if bucket["task_count"]:
1389
+ bucket["tokens_per_task_including_failures"] = (
1390
+ bucket["total_tokens_all_runs"] / bucket["task_count"]
1391
+ if bucket["primary_tokens_measured_runs"] == runs
1392
+ else None
1393
+ )
1394
+ bucket["wall_time_seconds_per_task_including_failures"] = (
1395
+ bucket["wall_time_seconds_all_runs"] / bucket["task_count"]
1396
+ )
1397
+ bucket["provider_cached_tokens_per_task_including_failures"] = (
1398
+ bucket["provider_cached_tokens_all_runs"] / bucket["task_count"]
1399
+ )
1400
+ if bucket["primary_cost_measured_runs"] == runs:
1401
+ bucket["primary_cost_per_task_including_failures_usd"] = (
1402
+ bucket["primary_cost_all_runs_usd"] / bucket["task_count"]
1403
+ )
1404
+ else:
1405
+ bucket["primary_cost_per_task_including_failures_usd"] = None
1406
+ if bucket["total_cost_with_shift_measured_runs"] == runs:
1407
+ bucket["total_cost_with_shift_per_task_including_failures_usd"] = (
1408
+ bucket["total_cost_with_shift_all_runs_usd"] / bucket["task_count"]
1409
+ )
1410
+ else:
1411
+ bucket["total_cost_with_shift_per_task_including_failures_usd"] = None
1412
+ else:
1413
+ bucket["tokens_per_task_including_failures"] = None
1414
+ bucket["wall_time_seconds_per_task_including_failures"] = None
1415
+ bucket["provider_cached_tokens_per_task_including_failures"] = None
1416
+ bucket["primary_cost_per_task_including_failures_usd"] = None
1417
+ bucket["total_cost_with_shift_per_task_including_failures_usd"] = None
1418
+ if successes:
1419
+ bucket["tokens_per_successful_task"] = (
1420
+ bucket["total_tokens_successful"] / successes
1421
+ if bucket["primary_tokens_measured_successful"] == successes
1422
+ else None
1423
+ )
1424
+ bucket["wall_time_seconds_per_successful_task"] = bucket["wall_time_seconds_successful"] / successes
1425
+ bucket["provider_cached_tokens_per_successful_task"] = (
1426
+ bucket["provider_cached_tokens_successful"] / successes
1427
+ )
1428
+ if bucket["primary_cost_measured_successful"] == successes:
1429
+ bucket["primary_cost_per_successful_task_usd"] = (
1430
+ bucket["primary_cost_successful_usd"] / successes
1431
+ )
1432
+ else:
1433
+ bucket["primary_cost_per_successful_task_usd"] = None
1434
+ if bucket["total_cost_with_shift_measured_successful"] == successes:
1435
+ bucket["total_cost_with_shift_per_successful_task_usd"] = (
1436
+ bucket["total_cost_with_shift_successful_usd"] / successes
1437
+ )
1438
+ else:
1439
+ bucket["total_cost_with_shift_per_successful_task_usd"] = None
1440
+ bucket["external_tokens_per_successful_task"] = (
1441
+ bucket["external_tokens_successful"] / successes
1442
+ if bucket["external_tokens_measured_successful"] == successes
1443
+ else None
1444
+ )
1445
+ bucket["artifacts_used_per_successful_task"] = bucket["artifacts_used_successful"] / successes
1446
+ bucket["corrections_per_successful_task"] = bucket["corrections_successful"] / successes
1447
+ before = bucket["bytes_before_successful"]
1448
+ after = bucket["bytes_after_successful"]
1449
+ bucket["byte_reduction_ratio"] = (after / before) if before else None
1450
+ else:
1451
+ bucket["tokens_per_successful_task"] = None
1452
+ bucket["wall_time_seconds_per_successful_task"] = None
1453
+ bucket["provider_cached_tokens_per_successful_task"] = None
1454
+ bucket["primary_cost_per_successful_task_usd"] = None
1455
+ bucket["total_cost_with_shift_per_successful_task_usd"] = None
1456
+ bucket["external_tokens_per_successful_task"] = None
1457
+ bucket["artifacts_used_per_successful_task"] = None
1458
+ bucket["corrections_per_successful_task"] = None
1459
+ bucket["byte_reduction_ratio"] = None
1460
+
1461
+ # 각 variant는 하나의 compression strategy를 대표한다. byte 절감/토큰 proxy/
1462
+ # 텔레메트리 증거 등급을 보수적으로(additive) 노출한다. 토큰 proxy는 측정된
1463
+ # 모델 토큰이 아니라 byte delta 기반 추정치이므로 evidence="inferred"로 둔다.
1464
+ bucket["compression_strategy"] = variant
1465
+ bucket["is_baseline_strategy"] = variant == baseline_variant
1466
+ bytes_before = bucket["bytes_before_successful"]
1467
+ bytes_after = bucket["bytes_after_successful"]
1468
+ byte_metrics_present = bool(bytes_before or bytes_after)
1469
+ if successes and byte_metrics_present:
1470
+ bytes_saved = max(0, bytes_before - bytes_after)
1471
+ token_proxy_saved = bytes_saved // TOKEN_PROXY_BYTES_PER_TOKEN
1472
+ bucket["bytes_saved_successful"] = bytes_saved
1473
+ bucket["bytes_saved_per_successful_task"] = bytes_saved / successes
1474
+ bucket["byte_savings_pct"] = ((bytes_before - bytes_after) / bytes_before * 100.0) if bytes_before else None
1475
+ bucket["token_proxy_saved_successful"] = token_proxy_saved
1476
+ bucket["token_proxy_saved_per_successful_task"] = token_proxy_saved / successes
1477
+ else:
1478
+ bucket["bytes_saved_successful"] = None
1479
+ bucket["bytes_saved_per_successful_task"] = None
1480
+ bucket["byte_savings_pct"] = None
1481
+ bucket["token_proxy_saved_successful"] = None
1482
+ bucket["token_proxy_saved_per_successful_task"] = None
1483
+ bucket["observed_telemetry"] = {
1484
+ "tokens": (
1485
+ "observed" if runs and bucket["primary_tokens_measured_runs"] == runs
1486
+ else ("partial" if bucket["primary_tokens_measured_runs"] else "unavailable")
1487
+ ),
1488
+ "primary_cost": (
1489
+ "observed" if runs and bucket["primary_cost_measured_runs"] == runs
1490
+ else ("partial" if bucket["primary_cost_measured_runs"] else "unavailable")
1491
+ ),
1492
+ "external_tokens": (
1493
+ "observed" if successes and bucket["external_tokens_measured_successful"] == successes
1494
+ else ("partial" if bucket["external_tokens_measured_successful"] else "unavailable")
1495
+ ),
1496
+ "byte_savings": "observed" if byte_metrics_present else "unavailable",
1497
+ "token_proxy": "inferred" if (successes and byte_metrics_present) else "unavailable",
1498
+ "wall_time": (
1499
+ "observed" if runs and bucket["wall_time_seconds_measured_runs"] == runs
1500
+ else ("partial" if bucket["wall_time_seconds_measured_runs"] else "unavailable")
1501
+ ),
1502
+ "provider_cache": (
1503
+ "observed" if runs and bucket["provider_cached_tokens_measured_runs"] == runs
1504
+ else ("partial" if bucket["provider_cached_tokens_measured_runs"] else "unavailable")
1505
+ ),
1506
+ }
1507
+
1508
def average_task_metric(variant: str, task_id: str, key: str) -> float | None:
    """Return the mean of ``key`` over the successful rows of one (variant, task).

    Rows are looked up in ``successful_rows_by_variant_task`` from the enclosing
    scope. Rows for which ``row_optional_float`` yields ``None`` are left out of
    the mean; ``None`` is returned when no row contributes a value (including
    when the variant or task is unknown).
    """
    task_rows = successful_rows_by_variant_task.get(variant, {}).get(task_id, [])
    measured = []
    for row in task_rows:
        reading = row_optional_float(row, key)
        if reading is not None:
            measured.append(reading)
    if not measured:
        return None
    return sum(measured) / len(measured)
1515
+
1516
def average_task_int_metric(variant: str, task_id: str, key: str) -> float | None:
    """Return the mean of an integer metric, but only when every row has it.

    Unlike ``average_task_metric`` this is all-or-nothing: if there are no
    successful rows for the (variant, task), or ``row_optional_nonnegative_int``
    yields ``None`` for any of them, the result is ``None`` rather than a mean
    over the rows that happen to be populated.
    """
    task_rows = successful_rows_by_variant_task.get(variant, {}).get(task_id, [])
    parsed = [row_optional_nonnegative_int(row, key) for row in task_rows]
    # An empty list and a list with any gap are both treated as "no data".
    if not parsed or any(item is None for item in parsed):
        return None
    return sum(parsed) / len(parsed)
1524
+
1525
def average_paired_metric(
    variant: str,
    task_ids: set[str],
    key: str,
) -> tuple[float | None, float | None, int]:
    """Average ``key`` for baseline and ``variant`` over tasks measured in both.

    Each task contributes its per-task mean (``average_task_metric``) for the
    baseline variant and for ``variant``. A task is used only when both sides
    have a value, so the two returned means cover the same set of tasks.

    Returns:
        ``(baseline_mean, variant_mean, paired_task_count)``, or
        ``(None, None, 0)`` when no task has a value on both sides.
    """
    pairs: list[tuple[float, float]] = []
    # Sorted iteration keeps the float summation order deterministic.
    for task_id in sorted(task_ids):
        base_reading = average_task_metric(baseline_variant, task_id, key)
        other_reading = average_task_metric(variant, task_id, key)
        if base_reading is not None and other_reading is not None:
            pairs.append((base_reading, other_reading))
    if not pairs:
        return None, None, 0
    paired_count = len(pairs)
    baseline_mean = sum(base for base, _ in pairs) / paired_count
    variant_mean = sum(other for _, other in pairs) / paired_count
    return baseline_mean, variant_mean, paired_count
1546
+
1547
def average_paired_int_metric(
    variant: str,
    task_ids: set[str],
    key: str,
) -> tuple[float | None, float | None, int]:
    """Average an integer metric for baseline and ``variant`` over paired tasks.

    Same pairing rule as ``average_paired_metric``, but per-task values come
    from ``average_task_int_metric``, which yields ``None`` unless every
    successful row of the task carries the metric. Tasks with a gap on either
    side are therefore dropped from both means.

    Returns:
        ``(baseline_mean, variant_mean, paired_task_count)``, or
        ``(None, None, 0)`` when no task qualifies.
    """
    pairs: list[tuple[float, float]] = []
    # Sorted iteration keeps the float summation order deterministic.
    for task_id in sorted(task_ids):
        base_reading = average_task_int_metric(baseline_variant, task_id, key)
        other_reading = average_task_int_metric(variant, task_id, key)
        if base_reading is not None and other_reading is not None:
            pairs.append((base_reading, other_reading))
    if not pairs:
        return None, None, 0
    paired_count = len(pairs)
    baseline_mean = sum(base for base, _ in pairs) / paired_count
    variant_mean = sum(other for _, other in pairs) / paired_count
    return baseline_mean, variant_mean, paired_count
1568
+
1569
+ comparisons: list[dict[str, Any]] = []
1570
+ baseline = by_variant.get(baseline_variant)
1571
+ baseline_successful_tasks = successful_tasks_by_variant.get(baseline_variant, set())
1572
+ baseline_failure_rate = baseline.get("failure_rate") if baseline else None
1573
+ for variant, bucket in sorted(by_variant.items()):
1574
+ if variant == baseline_variant:
1575
+ continue
1576
+ variant_successful_tasks = successful_tasks_by_variant.get(variant, set())
1577
+ matched_tasks = baseline_successful_tasks & variant_successful_tasks
1578
+ token_matched_tasks = {
1579
+ task_id for task_id in matched_tasks
1580
+ if all(
1581
+ row_bool(row, "primary_tokens_measured")
1582
+ for row in successful_rows_by_variant_task[baseline_variant][task_id]
1583
+ )
1584
+ and all(
1585
+ row_bool(row, "primary_tokens_measured")
1586
+ for row in successful_rows_by_variant_task[variant][task_id]
1587
+ )
1588
+ }
1589
+ base_tokens, variant_tokens, token_task_count = average_paired_metric(
1590
+ variant,
1591
+ token_matched_tasks,
1592
+ "total_tokens",
1593
+ )
1594
+ base_wall_time, variant_wall_time, wall_time_task_count = average_paired_metric(
1595
+ variant,
1596
+ matched_tasks,
1597
+ "wall_time_seconds",
1598
+ )
1599
+ base_corrections, variant_corrections, corrections_task_count = average_paired_int_metric(
1600
+ variant,
1601
+ matched_tasks,
1602
+ "corrections",
1603
+ )
1604
+ base_cost, variant_cost, cost_task_count = average_paired_metric(
1605
+ variant,
1606
+ {
1607
+ task_id for task_id in matched_tasks
1608
+ if all(
1609
+ row_cost_shift_measured(row)
1610
+ for row in successful_rows_by_variant_task[baseline_variant][task_id]
1611
+ )
1612
+ and all(
1613
+ row_cost_shift_measured(row)
1614
+ for row in successful_rows_by_variant_task[variant][task_id]
1615
+ )
1616
+ },
1617
+ "total_cost_with_shift_usd",
1618
+ )
1619
+ failure_rate = bucket.get("failure_rate")
1620
+ failure_delta = None
1621
+ if isinstance(baseline_failure_rate, (int, float)) and isinstance(failure_rate, (int, float)):
1622
+ failure_delta = (failure_rate - baseline_failure_rate) * 100.0
1623
+ missing_baseline_success_tasks = sorted(baseline_successful_tasks - variant_successful_tasks)
1624
+ quality_gate = "pass"
1625
+ if not baseline or not baseline.get("successful_runs"):
1626
+ quality_gate = "insufficient_baseline"
1627
+ elif not bucket.get("successful_runs"):
1628
+ quality_gate = "insufficient_success"
1629
+ elif missing_baseline_success_tasks:
1630
+ quality_gate = "matched_task_regression"
1631
+ elif failure_delta is not None and failure_delta >= 10.0:
1632
+ quality_gate = "failure_rate_regression"
1633
+ elif matched_tasks and corrections_task_count < len(matched_tasks):
1634
+ quality_gate = "insufficient_corrections_data"
1635
+ elif (
1636
+ isinstance(base_corrections, (int, float))
1637
+ and isinstance(variant_corrections, (int, float))
1638
+ and variant_corrections > base_corrections
1639
+ ):
1640
+ quality_gate = "corrections_regression"
1641
+ comparison: dict[str, Any] = {
1642
+ "variant": variant,
1643
+ "baseline_variant": baseline_variant,
1644
+ "quality_gate": quality_gate,
1645
+ "baseline_failure_rate": baseline_failure_rate,
1646
+ "variant_failure_rate": failure_rate,
1647
+ "failure_rate_delta_pp": failure_delta,
1648
+ "matched_successful_task_count": len(matched_tasks),
1649
+ "baseline_successful_task_count": len(baseline_successful_tasks),
1650
+ "missing_baseline_success_tasks": missing_baseline_success_tasks,
1651
+ "baseline_corrections_per_successful_task": base_corrections,
1652
+ "variant_corrections_per_successful_task": variant_corrections,
1653
+ "paired_corrections_task_count": corrections_task_count,
1654
+ }
1655
+ if isinstance(base_corrections, (int, float)) and isinstance(variant_corrections, (int, float)):
1656
+ comparison["corrections_delta_per_successful_task"] = variant_corrections - base_corrections
1657
+ if isinstance(base_tokens, (int, float)) and isinstance(variant_tokens, (int, float)) and base_tokens:
1658
+ comparison["token_delta_per_successful_task"] = variant_tokens - base_tokens
1659
+ comparison["token_savings_pct"] = (base_tokens - variant_tokens) / base_tokens * 100.0
1660
+ comparison["paired_token_task_count"] = token_task_count
1661
+ else:
1662
+ comparison["token_savings_pct"] = None
1663
+ comparison["paired_token_task_count"] = 0
1664
+ if (
1665
+ isinstance(base_wall_time, (int, float))
1666
+ and isinstance(variant_wall_time, (int, float))
1667
+ and base_wall_time
1668
+ ):
1669
+ comparison["wall_time_delta_seconds_per_successful_task"] = variant_wall_time - base_wall_time
1670
+ comparison["wall_time_change_pct"] = (variant_wall_time - base_wall_time) / base_wall_time * 100.0
1671
+ comparison["paired_wall_time_task_count"] = wall_time_task_count
1672
+ else:
1673
+ comparison["wall_time_delta_seconds_per_successful_task"] = None
1674
+ comparison["wall_time_change_pct"] = None
1675
+ comparison["paired_wall_time_task_count"] = wall_time_task_count
1676
+ if isinstance(base_cost, (int, float)) and isinstance(variant_cost, (int, float)) and base_cost:
1677
+ comparison["total_cost_with_shift_delta_usd"] = variant_cost - base_cost
1678
+ comparison["cost_savings_pct_with_shift"] = (base_cost - variant_cost) / base_cost * 100.0
1679
+ comparison["paired_cost_task_count"] = cost_task_count
1680
+ else:
1681
+ comparison["cost_savings_pct_with_shift"] = None
1682
+ comparison["paired_cost_task_count"] = cost_task_count
1683
+ comparisons.append(comparison)
1684
+
1685
+ claim_status = "insufficient_baseline"
1686
+ if baseline and baseline.get("successful_runs"):
1687
+ claim_status = "compare_variants" if comparisons else "baseline_only"
1688
+ if comparisons:
1689
+ quality_ok = all(item.get("quality_gate") == "pass" for item in comparisons)
1690
+ paired_token_data = all((item.get("paired_token_task_count") or 0) > 0 for item in comparisons)
1691
+ token_savings_observed = all((item.get("token_savings_pct") or 0) > 0 for item in comparisons)
1692
+ shifted_cost_savings = [
1693
+ item.get("cost_savings_pct_with_shift")
1694
+ for item in comparisons
1695
+ if isinstance(item.get("cost_savings_pct_with_shift"), (int, float))
1696
+ ]
1697
+ all_shifted_cost_measured = len(shifted_cost_savings) == len(comparisons)
1698
+ shifted_cost_ok = all_shifted_cost_measured and all(value > 0 for value in shifted_cost_savings)
1699
+ if not quality_ok:
1700
+ claim_status = "quality_gate_watch"
1701
+ elif not paired_token_data:
1702
+ claim_status = "insufficient_paired_data"
1703
+ elif token_savings_observed and shifted_cost_ok:
1704
+ claim_status = "token_and_shifted_cost_savings_observed"
1705
+ elif token_savings_observed and not all_shifted_cost_measured:
1706
+ claim_status = "token_savings_observed_cost_unmeasured"
1707
+ elif token_savings_observed:
1708
+ claim_status = "token_savings_observed_cost_shift_watch"
1709
+ return {
1710
+ "schema": "context-guard-bench-report-v1",
1711
+ "baseline_variant": baseline_variant,
1712
+ "row_count": len(rows),
1713
+ "summary_by_variant": by_variant,
1714
+ "comparisons": comparisons,
1715
+ "claim_status": claim_status,
1716
+ "caveat": (
1717
+ "Proxy byte reductions are reported separately from matched-task token/cost metrics; "
1718
+ "shifted cost savings require measured primary cost and measured external cost when "
1719
+ "external tokens are present. Wall time and provider cached-token fields are diagnostic "
1720
+ "telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache "
1721
+ "discounts must stay separate from token-reduction claims."
1722
+ ),
1723
+ }
1724
+
1725
def write_report_json(csv_path: Path, report_path: Path, baseline_variant: str) -> dict[str, Any]:
    """Summarize the results CSV and write the summary to ``report_path`` as JSON.

    Args:
        csv_path: Results CSV to read.
        report_path: Destination for the JSON report.
        baseline_variant: Variant name passed to ``summarize_benchmark_rows``
            as the comparison baseline.

    Returns:
        The report dictionary that was serialized.
    """
    # Lock order is fixed for every report write: the source CSV lock is taken
    # first and the derived report lock second. Never add a code path that
    # locks report -> CSV; concurrent report generation could then deadlock.
    with csv_file_lock(csv_path, create_parent=True):
        rows = read_csv_rows(csv_path)
        report = summarize_benchmark_rows(rows, baseline_variant)
        with csv_file_lock(report_path, create_parent=True):
            payload = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) + "\n"
            write_text_no_follow(report_path, payload)
    return report
1737
+
1738
+
1739
def sanitize_note_text(value: Any) -> str:
    """Clean untrusted benchmark note text; no length limit is applied here.

    ``None`` becomes the empty string and anything else is passed through
    ``str()``. Characters in a Unicode "C*" general category are replaced by
    spaces, whitespace runs are collapsed to single spaces (which also trims
    both ends), and finally every ``(pattern, replacement)`` pair in
    ``SECRET_NOTE_PATTERNS`` is applied in order.
    """
    raw = "" if value is None else str(value)
    # General categories starting with "C" are the control/format/surrogate/
    # private-use/unassigned groups; none of them belong in a one-line note.
    visible = "".join(
        " " if unicodedata.category(char).startswith("C") else char
        for char in raw
    )
    cleaned = " ".join(visible.split())
    for pattern, replacement in SECRET_NOTE_PATTERNS:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned
1747
+
1748
+
1749
def sanitize_csv_note(value: Any) -> str:
    """Prepare an untrusted note for the benchmark CSV.

    The note is cleaned with ``sanitize_note_text``, prefixed with an
    apostrophe when it starts with one of ``CSV_FORMULA_PREFIXES`` (so a
    spreadsheet does not evaluate it), and then capped at
    ``MAX_CSV_NOTE_CHARS`` characters including the truncation marker.
    """
    note = sanitize_note_text(value)
    if note.startswith(CSV_FORMULA_PREFIXES):
        note = f"'{note}"
    if len(note) <= MAX_CSV_NOTE_CHARS:
        return note
    # The marker below is 12 characters long, so reserve exactly that much.
    kept = note[:MAX_CSV_NOTE_CHARS - 12].rstrip()
    return kept + "…[truncated]"
1757
+
1758
+
1759
def sanitize_csv_cell(value: Any) -> str:
    """Clean a short untrusted CSV label and neutralize spreadsheet formulas.

    The value is cleaned with ``sanitize_note_text``; if the result starts
    with one of ``CSV_FORMULA_PREFIXES`` an apostrophe is prepended. No length
    limit is applied.
    """
    cell = sanitize_note_text(value)
    return f"'{cell}" if cell.startswith(CSV_FORMULA_PREFIXES) else cell
1765
+
1766
+
1767
def filter_targets(tasks: list[TaskFixture], variants: list[Variant],
                   only_task: str | None, only_variant: str | None) -> list[tuple[TaskFixture, Variant]]:
    """Build the (task, variant) pairs to run, honouring the optional filters.

    Pairs are produced task-major: every selected variant for the first
    selected task, then the next task, and so on. A falsy filter (``None`` or
    an empty string) selects everything on that axis; otherwise only the task
    whose ``id`` / the variant whose ``name`` equals the filter is kept.
    """
    return [
        (task, variant)
        for task in tasks
        if not only_task or task.id == only_task
        for variant in variants
        if not only_variant or variant.name == only_variant
    ]
1778
+
1779
+
1780
def normalized_output_path(path: Path) -> Path:
    """Return an absolute, lexically normalized form of an output path.

    ``~`` is expanded, relative paths are anchored at the current working
    directory, the result is passed through
    ``_normalize_allowed_first_absolute_symlink``, and ``os.path.normpath``
    then collapses ``.``/``..`` segments and duplicate separators.
    """
    candidate = path.expanduser()
    absolute = candidate if candidate.is_absolute() else Path.cwd() / candidate
    adjusted = _normalize_allowed_first_absolute_symlink(absolute)
    return Path(os.path.normpath(str(adjusted)))
1785
+
1786
+
1787
def existing_file_identity(path: Path) -> tuple[int, int] | None:
    """Return ``(st_dev, st_ino)`` for an existing output file, else ``None``.

    The path is normalized and opened through ``_open_regular_no_symlink``;
    only ``FileNotFoundError`` is treated as "no such file". Any other error
    from normalizing or opening propagates to the caller. The descriptor is
    always closed before returning.
    """
    try:
        descriptor = _open_regular_no_symlink(normalized_output_path(path))
    except FileNotFoundError:
        return None
    try:
        info = os.fstat(descriptor)
    finally:
        os.close(descriptor)
    return int(info.st_dev), int(info.st_ino)
1797
+
1798
+
1799
def validate_distinct_output_paths(csv_path: Path, ledger_path: Path | None, report_path: Path | None) -> None:
    """Abort when two output options would write to the same file.

    Outputs are checked in the order csv, ledger-jsonl, report-json; ``None``
    entries are ignored. Two checks are made per output: the normalized path
    must be new, and, if the file already exists, its ``(device, inode)``
    identity must be new as well (this catches different names for one file).

    Raises:
        SystemExit: with a message naming both conflicting options.
    """
    labels_by_path: dict[Path, str] = {}
    labels_by_identity: dict[tuple[int, int], str] = {}
    candidates = (
        ("csv", csv_path),
        ("ledger-jsonl", ledger_path),
        ("report-json", report_path),
    )
    for label, path in candidates:
        if path is None:
            continue
        normalized = normalized_output_path(path)
        if normalized in labels_by_path:
            previous = labels_by_path[normalized]
            raise SystemExit(f"--{label} must not point to the same path as --{previous}: {normalized}")
        labels_by_path[normalized] = label

        identity = existing_file_identity(normalized)
        if identity is None:
            # Nothing on disk yet, so there is no inode to compare against.
            continue
        if identity in labels_by_identity:
            previous_identity = labels_by_identity[identity]
            raise SystemExit(f"--{label} must not point to the same file as --{previous_identity}: {normalized}")
        labels_by_identity[identity] = label
1817
+
1818
+
1819
def main() -> int:
    """Command-line entry point: run the selected (task, variant) fixtures.

    Parses arguments, validates the output paths, runs each target through
    ``run_fixture``, appends real (non-dry-run) results to the CSV and the
    optional cost-shift ledger, and optionally writes the JSON summary report.

    Returns:
        0 on success, 1 when no (task, variant) target matches the filters,
        2 when the claude binary cannot be found for a real run.

    Raises:
        SystemExit: from ``validate_distinct_output_paths`` when two output
            options point at the same file.
    """
    # The module docstring is None under ``python -OO`` and may be empty, so
    # do not index into it unconditionally.
    doc_lines = (__doc__ or "").splitlines()
    parser = argparse.ArgumentParser(description=doc_lines[0] if doc_lines else None)
    parser.add_argument("--tasks", required=True, type=Path, help="task fixture JSON")
    parser.add_argument("--variants", required=True, type=Path, help="variant fixture JSON")
    parser.add_argument("--csv", default=Path("bench/results.csv"), type=Path,
                        help="results CSV path (header is added on first write)")
    parser.add_argument("--task-id", default=None, help="run only the named task id")
    parser.add_argument("--variant", default=None, help="run only the named variant")
    parser.add_argument("--claude-bin", default=os.environ.get("CLAUDE_BIN", "claude"),
                        help="claude CLI executable (default: $CLAUDE_BIN or 'claude')")
    parser.add_argument("--project-root", default=Path("."), type=Path,
                        help="working directory used for success_command (default: cwd)")
    parser.add_argument("--dry-run", action="store_true",
                        help="print the claude command without invoking it")
    parser.add_argument("--resume", action="store_true",
                        help="skip (task_id, variant) rows already present in --csv")
    parser.add_argument("--ledger-jsonl", default=None, type=Path,
                        help="optional JSONL ledger path for cost-shift accounting per run")
    parser.add_argument("--report-json", default=None, type=Path,
                        help="optional A/B summary report JSON path generated from --csv after real runs")
    parser.add_argument("--baseline-variant", default="baseline",
                        help="variant name used as the report baseline (default: baseline)")
    args = parser.parse_args()

    require_no_follow_file_ops_supported()
    validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json)

    if not args.dry_run and shutil.which(args.claude_bin) is None:
        # shutil.which can return None when claude_bin is given as an absolute
        # path, so check the path directly before giving up.
        if not Path(args.claude_bin).exists():
            print(f"claude binary not found: {args.claude_bin}", file=sys.stderr)
            return 2

    tasks = parse_tasks(args.tasks)
    variants = parse_variants(args.variants)
    targets = filter_targets(tasks, variants, args.task_id, args.variant)
    if not targets:
        print("no (task, variant) targets matched the filters", file=sys.stderr)
        return 1

    skip_keys = existing_keys(args.csv) if args.resume else set()
    project_root = args.project_root.resolve()
    claude_ver = "dry-run" if args.dry_run else claude_version(args.claude_bin)

    completed = 0
    for task, variant in targets:
        if (task.id, variant.name) in skip_keys:
            print(f"skip {task.id}/{variant.name} (already in {args.csv})")
            continue
        print(f"run {task.id}/{variant.name} ...", flush=True)
        result = run_fixture(task, variant, args.claude_bin, project_root, args.dry_run)
        # Dry-run rows are never written to the CSV. Writing them would
        # (a) drag the averages down with tokens=0/cost=0 and (b) make --resume
        # skip that (task, variant), so the real measurement would be
        # permanently missing.
        wrote = True
        if not args.dry_run:
            wrote = append_csv(args.csv, claude_ver, result, skip_existing=args.resume)
            # The ledger only records rows that actually landed in the CSV.
            if wrote and args.ledger_jsonl is not None:
                append_cost_shift_ledger(args.ledger_jsonl, claude_ver, result)
        completed += 1
        status = "ok" if result.success else "FAIL"
        if args.dry_run:
            suffix = " (dry-run; CSV not updated)"
        elif not wrote:
            suffix = " (CSV not updated; row already present)"
        else:
            suffix = ""
        print(
            f" {status} tokens={sum(result.tokens.values())} cost=${result.cost_usd:.4f} "
            f"wall_time={result.wall_time_seconds:.3f}s {sanitize_note_text(result.notes)}{suffix}"
        )
    target = args.csv if not args.dry_run else "(dry-run; no CSV writes)"
    # The report is derived from the CSV, so it is only produced for real runs.
    if args.report_json is not None and not args.dry_run:
        report = write_report_json(args.csv, args.report_json, args.baseline_variant)
        print(f"report {args.report_json}: {report['claim_status']}")
    print(f"completed {completed} run(s); results in {target}")
    return 0
1895
+
1896
+
1897
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())