@ictechgy/context-guard 0.4.9 → 0.4.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/CHANGELOG.md +28 -0
  2. package/README.ko.md +59 -31
  3. package/README.md +85 -36
  4. package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
  5. package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
  6. package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
  7. package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
  8. package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
  9. package/docs/benchmark-workflow-examples.md +3 -0
  10. package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
  11. package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
  12. package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
  13. package/docs/distribution.md +10 -7
  14. package/docs/experimental-benchmark-fixtures.md +30 -6
  15. package/package.json +4 -6
  16. package/packaging/homebrew/context-guard.rb.template +1 -1
  17. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  18. package/plugins/context-guard/README.ko.md +20 -14
  19. package/plugins/context-guard/README.md +26 -17
  20. package/plugins/context-guard/bin/context-guard +147 -25
  21. package/plugins/context-guard/bin/context-guard-artifact +884 -79
  22. package/plugins/context-guard/bin/context-guard-audit +33 -2
  23. package/plugins/context-guard/bin/context-guard-bench +1542 -31
  24. package/plugins/context-guard/bin/context-guard-cache-score +665 -0
  25. package/plugins/context-guard/bin/context-guard-compress +146 -1
  26. package/plugins/context-guard/bin/context-guard-cost +790 -6
  27. package/plugins/context-guard/bin/context-guard-experiments +463 -26
  28. package/plugins/context-guard/bin/context-guard-failed-nudge +9 -2
  29. package/plugins/context-guard/bin/context-guard-filter +163 -7
  30. package/plugins/context-guard/bin/context-guard-guard-read +3 -0
  31. package/plugins/context-guard/bin/context-guard-pack +892 -49
  32. package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
  33. package/plugins/context-guard/bin/context-guard-sanitize-output +76 -12
  34. package/plugins/context-guard/bin/context-guard-setup +165 -31
  35. package/plugins/context-guard/bin/context-guard-statusline +490 -283
  36. package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
  37. package/plugins/context-guard/bin/context-guard-tool-prune +480 -53
  38. package/plugins/context-guard/bin/context-guard-trim-output +288 -41
  39. package/plugins/context-guard/brief/README.md +5 -5
  40. package/plugins/context-guard/lib/context_guard_commands.py +230 -0
  41. package/plugins/context-guard/skills/setup/SKILL.md +1 -0
  42. package/context-guard-kit/README.md +0 -91
  43. package/context-guard-kit/benchmark_runner.py +0 -2401
  44. package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
  45. package/context-guard-kit/context_compress.py +0 -695
  46. package/context-guard-kit/context_escrow.py +0 -935
  47. package/context-guard-kit/context_filter.py +0 -637
  48. package/context-guard-kit/context_guard_cli.py +0 -325
  49. package/context-guard-kit/context_guard_diet.py +0 -1711
  50. package/context-guard-kit/context_pack.py +0 -2713
  51. package/context-guard-kit/cost_guard.py +0 -2349
  52. package/context-guard-kit/experimental_registry.py +0 -4348
  53. package/context-guard-kit/failed_attempt_nudge.py +0 -567
  54. package/context-guard-kit/guard_large_read.py +0 -690
  55. package/context-guard-kit/hook_secret_patterns.py +0 -43
  56. package/context-guard-kit/read_symbol.py +0 -483
  57. package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
  58. package/context-guard-kit/sanitize_output.py +0 -725
  59. package/context-guard-kit/settings.example.json +0 -67
  60. package/context-guard-kit/setup_wizard.py +0 -2515
  61. package/context-guard-kit/statusline.sh +0 -362
  62. package/context-guard-kit/statusline_merged.sh +0 -157
  63. package/context-guard-kit/tool_schema_pruner.py +0 -837
  64. package/context-guard-kit/trim_command_output.py +0 -1449
@@ -1,2401 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Claude Code 토큰 절감 벤치마크 자동 실행 runner.
3
-
4
- `research/benchmark-plan.md` 의 task set × variant 조합을 비대화형 `claude -p`
5
- 호출로 실행하고, `tokens_per_successful_task` 측정에 필요한 컬럼을 CSV 에 적재한다.
6
-
7
- 사용 예:
8
-
9
- ```bash
10
- context-guard-kit/benchmark_runner.py \
11
- --tasks bench/tasks.json --variants bench/variants.json \
12
- --csv bench/results.csv
13
-
14
- context-guard-kit/benchmark_runner.py --tasks bench/tasks.json \
15
- --variants bench/variants.json --task-id t01 --variant baseline --dry-run
16
- ```
17
-
18
- Task fixture (`tasks.json`): 각 task 는 다음 필드를 가진다.
19
-
20
- ```json
21
- [
22
- {
23
- "id": "t01",
24
- "prompt": "Add validation to src/auth/session.ts ...",
25
- "model": "sonnet",
26
- "effort": "medium",
27
- "max_turns": 3,
28
- "max_budget_usd": 1.0,
29
- "allowed_tools": ["Read", "Edit", "Bash(npm test*)"],
30
- "variant_prompt_files": {"context_hygiene": "t01.context_hygiene.prompt.md"},
31
- "success_command": "npm test -- auth/session",
32
- "success_cwd": "."
33
- }
34
- ]
35
- ```
36
-
37
- Variant fixture (`variants.json`): 각 variant 는 `claude -p` 에 추가할 옵션 묶음을 정의한다.
38
-
39
- ```json
40
- [
41
- {"name": "baseline", "extra_args": []},
42
- {"name": "context_hygiene", "extra_args": ["--strict-mcp-config", "--mcp-config", "bench/minimal-mcp.json"]}
43
- ]
44
- ```
45
-
46
- dry-run 모드는 실제 호출은 하지 않고 어떤 명령이 실행될지만 출력한다.
47
- """
48
- from __future__ import annotations
49
-
50
- import argparse
51
- import collections
52
- from contextlib import contextmanager
53
- import csv
54
- import datetime as _dt
55
- import json
56
- import math
57
- import os
58
- import re
59
- import selectors
60
- import shlex
61
- import shutil
62
- import signal
63
- import stat
64
- import subprocess
65
- import sys
66
- import time
67
- import unicodedata
68
- from dataclasses import dataclass, field
69
- from pathlib import Path
70
- from typing import Any
71
-
72
- try:
73
- import fcntl
74
- except ImportError: # pragma: no cover - benchmark runner already requires POSIX no-follow IO.
75
- fcntl = None # type: ignore[assignment]
76
-
77
# Column names for the results CSV, in output order.
CSV_COLUMNS = [
    "date",
    "claude_version",
    "task_id",
    "variant",
    "model",
    "effort",
    "total_tokens",
    "input_tokens",
    "output_tokens",
    "cache_read",
    "cache_creation",
    "provider_cached_tokens",
    "provider_cached_tokens_measured",
    "cost_usd",
    "cost_measured",
    "wall_time_seconds",
    "turns",
    "hook_triggers",
    "bytes_before",
    "bytes_after",
    "artifacts_used",
    "external_tokens",
    "external_tokens_measured",
    "external_cost_usd",
    "external_cost_measured",
    "total_cost_with_shift_usd",
    "success",
    "corrections",
    "notes",
    "primary_tokens_measured",
]
# NOTE(review): presumably caps on the CSV "notes" cell length and on the number of
# CSV rows; the enforcing code is not visible in this part of the file.
MAX_CSV_NOTE_CHARS = 500
MAX_CSV_ROWS = 100_000
# NOTE(review): presumably the leading characters treated as spreadsheet-formula
# triggers when writing CSV cells; confirm against the CSV writer.
CSV_FORMULA_PREFIXES = ("=", "+", "-", "@")
# Substring that marks a fixture success_command as a placeholder
# (checked by is_placeholder_success_command).
PLACEHOLDER_SUCCESS_COMMAND_MARKER = "fixture-only placeholder: replace success_command before real benchmark runs"
# Flags that variant extra_args may not contain; validate_variant_extra_args rejects
# any argument whose text before the first "=" is in this set.
PROTECTED_VARIANT_FLAGS = frozenset({
    "--",
    "-p",
    "--print",
    "--model",
    "--max-turns",
    "--output-format",
    "--allowedTools",
    "--allowed-tools",
    "--max-budget-usd",
    "--effort",
})
125
# Regex fragment for key names that contain a secret-like word (api key, token,
# secret, password, client secret), optionally surrounded by other name characters.
SECRET_NOTE_KEY_RE = r"[A-Za-z0-9_.-]*(?:api[-_]?key|token|secret|password|client[-_]?secret)[A-Za-z0-9_.-]*"
# Regex fragment for the value that follows such a key: a single-quoted string,
# a double-quoted string, or a bare run up to whitespace or a delimiter.
SECRET_NOTE_VALUE_RE = r"(?:'[^']*'|\"[^\"]*\"|[^\s,}&#;]+)"
# (pattern, replacement) pairs. Each replacement substitutes "[REDACTED]" for the
# matched secret while keeping any captured prefix (key name, flag, URL scheme).
# NOTE(review): presumably applied to CSV note text; the caller is not visible here.
SECRET_NOTE_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = (
    # HTTP Authorization-style credentials.
    (re.compile(r"(?i)\bBearer\s+[A-Za-z0-9._~+/=-]+"), "[REDACTED]"),
    (re.compile(r"(?i)\bBasic\s+[A-Za-z0-9._~+/=-]+"), "[REDACTED]"),
    # key=value pairs following ?, &, # or ; (query-string style).
    (re.compile(rf"(?i)([?&#;]({SECRET_NOTE_KEY_RE})=)[^\s?&#;]+"), r"\1[REDACTED]"),
    # key: value / key=value assignments, with optionally quoted key.
    (re.compile(rf"(?i)(^|[\s{{,?&#;])([\"']?(?:{SECRET_NOTE_KEY_RE})[\"']?\s*[:=]\s*){SECRET_NOTE_VALUE_RE}"), r"\1\2[REDACTED]"),
    # --secret-like-flag VALUE or --secret-like-flag=VALUE.
    (re.compile(rf"(?i)(^|[\s\"'])(--(?:{SECRET_NOTE_KEY_RE})(?:\s+|=))(?:'[^']*'|\"[^\"]*\"|[^\s\"']+)"), r"\1\2[REDACTED]"),
    # -u VALUE / --user VALUE.
    (re.compile(r"(?i)(^|[\s\"'])((?:-u|--user)(?:\s+|=))(?:'[^']*'|\"[^\"]*\"|[^\s\"']+)"), r"\1\2[REDACTED]"),
    # Fixed-prefix token shapes (prefix followed by a long opaque run).
    (re.compile(r"gh[pousr]_[A-Za-z0-9_]{20,}"), "[REDACTED]"),
    (re.compile(r"github_pat_[A-Za-z0-9_]{20,}"), "[REDACTED]"),
    (re.compile(r"glpat-[A-Za-z0-9_-]{12,}"), "[REDACTED]"),
    (re.compile(r"xox[abprs]-[A-Za-z0-9-]{10,}"), "[REDACTED]"),
    (re.compile(r"(?:AKIA|ASIA)[0-9A-Z]{16}"), "[REDACTED]"),
    (re.compile(r"(?:sk|pk|rk)_(?:live|test)_[A-Za-z0-9]{16,}"), "[REDACTED]"),
    (re.compile(r"sk-(?:ant|proj)-[A-Za-z0-9_-]{12,}"), "[REDACTED]"),
    (re.compile(r"npm_[A-Za-z0-9]{20,}"), "[REDACTED]"),
    (re.compile(r"AIza[0-9A-Za-z_\-]{20,}"), "[REDACTED]"),
    (re.compile(r"SG\.[A-Za-z0-9_-]{16,}\.[A-Za-z0-9_-]{16,}"), "[REDACTED]"),
    # Three dot-separated base64url segments starting with "eyJ".
    (re.compile(r"eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+"), "[REDACTED]"),
    # userinfo part of scheme://userinfo@host URLs; the scheme is kept.
    (re.compile(r"([a-z][a-z0-9+.-]*://)[^/\s@]+@", re.IGNORECASE), r"\1[REDACTED]@"),
)
147
-
148
# Candidate usage keys for `claude -p --output-format json` and compatible benchmark
# providers. Anthropic SDK, Claude Code, and OpenAI-style JSON output formats may
# change over time, so keys are matched best-effort against several candidates.
USAGE_KEY_GROUPS: tuple[tuple[str, tuple[str, ...]], ...] = (
    ("input_tokens", ("input_tokens", "inputTokens", "prompt_tokens", "promptTokens")),
    ("output_tokens", ("output_tokens", "outputTokens", "completion_tokens", "completionTokens")),
    ("cache_read", ("cache_read_input_tokens", "cacheRead")),
    ("cache_creation", ("cache_creation_input_tokens", "cacheCreation")),
)
# Keys whose dict value may hold provider cached-token detail
# (read by collect_provider_cache_telemetry).
PROVIDER_CACHE_DETAIL_KEYS = (
    "prompt_tokens_details",
    "promptTokensDetails",
    "input_tokens_details",
    "inputTokensDetails",
)
PROVIDER_CACHED_TOKEN_KEYS = ("cached_tokens", "cachedTokens")
# Candidate keys for the run cost (read by collect_usage).
COST_KEYS = ("total_cost_usd", "cost_usd", "costUSD")
# (metric name, candidate payload keys) for optional shift / byte-saving metrics.
SHIFT_METRIC_KEY_GROUPS: tuple[tuple[str, tuple[str, ...]], ...] = (
    ("turns", ("turns", "num_turns", "total_turns")),
    ("hook_triggers", ("hook_triggers", "hookTriggerCount", "hook_trigger_count")),
    ("bytes_before", ("bytes_before", "bytesBefore", "raw_bytes_before")),
    ("bytes_after", ("bytes_after", "bytesAfter", "visible_bytes_after")),
    ("artifacts_used", ("artifacts_used", "artifact_count", "artifactsUsed")),
)
EXTERNAL_TOKEN_AGGREGATE_KEYS = ("external_tokens",)
EXTERNAL_COST_AGGREGATE_KEYS = ("external_cost_usd",)
# (source label, token keys, cost keys) for per-source external usage.
EXTERNAL_SOURCE_KEY_GROUPS: tuple[tuple[str, tuple[str, ...], tuple[str, ...]], ...] = (
    ("auxiliary", ("auxiliary_tokens",), ("auxiliary_cost_usd",)),
    ("subagent", ("subagent_tokens",), ("subagent_cost_usd",)),
    ("provider", ("provider_tokens",), ("provider_cost_usd",)),
)
# Upper bounds used by normalize_usage_token / normalize_usage_cost; larger values
# are treated as invalid metrics.
MAX_USAGE_TOKEN_COUNT = 10**12
MAX_USAGE_COST_USD = 10**9
# Byte -> token proxy conversion factor. This is a conservative estimate derived from
# byte deltas, not measured model tokens, and reports label it clearly as
# evidence="inferred". It uses the commonly cited approximation of ~4 bytes/token
# for English text.
TOKEN_PROXY_BYTES_PER_TOKEN = 4
BENCH_RUN_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.run-evidence.v1"
MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.matched-pair.v1"
SELF_HOSTED_METRICS_SCHEMA_VERSION = "contextguard.bench.self-hosted-metrics.v1"
SELF_HOSTED_METRICS_KEY = "self_hosted_metrics"
SELF_HOSTED_METRICS_CLAIM_BOUNDARY = "self_hosted_metrics_only_not_hosted_api_token_or_cost_savings"
MAX_SELF_HOSTED_LABEL_CHARS = 120
MAX_SELF_HOSTED_LATENCY_MS = 7 * 24 * 60 * 60 * 1000  # seven days in milliseconds
MAX_SELF_HOSTED_MEMORY_MB = 10_000_000
# Size cap enforced by read_variant_prompt_file on prompt files and decoded text.
MAX_VARIANT_PROMPT_FILE_BYTES = 128_000
# NOTE(review): presumably output-capture caps and a termination grace period for
# subprocesses; the code that uses them is not visible in this part of the file.
CLAUDE_OUTPUT_MAX_BYTES = 1_000_000
SUCCESS_COMMAND_OUTPUT_MAX_BYTES = 64_000
VERSION_OUTPUT_MAX_BYTES = 16_000
PROCESS_TERMINATE_GRACE_SECONDS = 2.0
# First path component -> expected symlink target. When /<name> is a symlink to
# exactly this target, _normalize_allowed_first_absolute_symlink rewrites the path
# to go through the target instead of rejecting the symlink.
ALLOWED_FIRST_ABSOLUTE_SYMLINKS = {
    "tmp": Path("/private/tmp"),
    "var": Path("/private/var"),
}
202
-
203
-
204
- def _base_open_flags() -> int:
205
- flags = os.O_RDONLY
206
- if hasattr(os, "O_CLOEXEC"):
207
- flags |= os.O_CLOEXEC
208
- return flags
209
-
210
-
211
- def _no_follow_flag() -> int:
212
- if hasattr(os, "O_NOFOLLOW"):
213
- return os.O_NOFOLLOW
214
- raise OSError("platform does not support no-follow file opens")
215
-
216
-
217
def no_follow_file_ops_supported() -> bool:
    """Report whether O_NOFOLLOW exists and both os.open and os.mkdir accept ``dir_fd``."""
    if not hasattr(os, "O_NOFOLLOW"):
        return False
    dir_fd_capable = os.supports_dir_fd
    return os.open in dir_fd_capable and os.mkdir in dir_fd_capable
219
-
220
-
221
def require_no_follow_file_ops_supported() -> None:
    """Exit with an explanatory message unless no-follow file operations are available."""
    if no_follow_file_ops_supported():
        return
    message = (
        "benchmark runner requires POSIX no-follow file operations for safe fixture and CSV paths; "
        "this platform is not supported yet."
    )
    raise SystemExit(message)
227
-
228
-
229
- def _directory_flag() -> int:
230
- return getattr(os, "O_DIRECTORY", 0)
231
-
232
-
233
- def _normalized_link_target(parent: Path, raw_target: str) -> Path:
234
- target = Path(raw_target)
235
- if not target.is_absolute():
236
- target = parent / target
237
- return Path(os.path.normpath(str(target)))
238
-
239
-
240
def _normalize_allowed_first_absolute_symlink(path: Path) -> Path:
    """Rewrite ``/<name>/...`` through an allow-listed top-level symlink's target.

    The path is returned unchanged unless it is absolute, its first component is a
    key of ``ALLOWED_FIRST_ABSOLUTE_SYMLINKS``, that component is a symlink on disk,
    and the symlink's normalized target equals the allow-listed target. Any
    ``OSError`` while inspecting the link also returns the path unchanged.
    """
    parts = path.parts
    if not path.is_absolute() or len(parts) < 2:
        return path
    top_level = parts[1]
    canonical = ALLOWED_FIRST_ABSOLUTE_SYMLINKS.get(top_level)
    if canonical is None:
        return path
    anchor = Path(path.anchor)
    link = anchor / top_level
    try:
        # readlink is only attempted once lstat has confirmed a symlink.
        matches = (
            stat.S_ISLNK(os.lstat(link).st_mode)
            and _normalized_link_target(anchor, os.readlink(link)) == canonical
        )
    except OSError:
        return path
    if not matches:
        return path
    return canonical.joinpath(*parts[2:])
256
-
257
-
258
def _open_directory_at(dir_fd: int, component: str, path: Path) -> int:
    """Open ``component`` relative to ``dir_fd`` without following symlinks.

    Returns the new descriptor when it refers to a directory. Otherwise the
    descriptor is closed and ``OSError`` is raised; ``path`` is used only in that
    error message.
    """
    open_flags = _base_open_flags() | _directory_flag() | _no_follow_flag()
    fd = os.open(component, open_flags, dir_fd=dir_fd)
    try:
        if stat.S_ISDIR(os.fstat(fd).st_mode):
            return fd
        raise OSError(f"not a directory: {path}")
    except Exception:
        os.close(fd)
        raise
267
-
268
-
269
def _ensure_directory_no_symlink(path: Path, *, create: bool = False) -> int:
    """Walk ``path`` one component at a time without following symlinks.

    Args:
        path: Directory to open. An allow-listed top-level symlink is first
            rewritten by ``_normalize_allowed_first_absolute_symlink``.
        create: When true, missing components are created with ``os.mkdir``.

    Returns:
        An open file descriptor for the final directory; the caller must close it.

    Raises:
        OSError: If the platform lacks ``dir_fd`` support, a component is missing
            (and ``create`` is false), or a component is not a plain directory.
    """
    if os.open not in os.supports_dir_fd or os.mkdir not in os.supports_dir_fd:
        raise OSError("platform does not support directory-relative no-follow directory access")
    path = _normalize_allowed_first_absolute_symlink(path)
    components = list(path.parts)
    if path.is_absolute() and components:
        # Drop the anchor; it is opened separately as the walk's starting point.
        components = components[1:]
    root = path.anchor if path.is_absolute() else "."
    dir_fd = os.open(root or ".", _base_open_flags() | _directory_flag())
    try:
        for component in components:
            try:
                next_fd = _open_directory_at(dir_fd, component, path)
            except FileNotFoundError:
                if not create:
                    raise
                try:
                    os.mkdir(component, 0o777, dir_fd=dir_fd)
                except FileExistsError:
                    # Another process created the entry between our failed open
                    # and this mkdir. The no-follow open below still rejects it
                    # if it is a symlink or not a directory.
                    pass
                next_fd = _open_directory_at(dir_fd, component, path)
            os.close(dir_fd)
            dir_fd = next_fd
        return dir_fd
    except Exception:
        os.close(dir_fd)
        raise
293
-
294
-
295
def _open_regular_no_symlink(
    path: Path,
    flags: int | None = None,
    mode: int = 0o666,
    *,
    create_parent: bool = False,
) -> int:
    """Open ``path`` as a regular file without following symlinks anywhere on the way.

    Args:
        path: File to open.
        flags: ``os.open`` flags; defaults to ``_base_open_flags()``. The no-follow
            flag is always added.
        mode: Permission bits passed to ``os.open``.
        create_parent: When true, missing parent directories are created.

    Returns:
        An open file descriptor for the regular file; the caller must close it.

    Raises:
        OSError: If the platform lacks the required support, the parent cannot be
            opened, or the opened object is not a regular file.
    """
    if os.open not in os.supports_dir_fd:
        raise OSError("platform does not support directory-relative no-follow opens")
    path = _normalize_allowed_first_absolute_symlink(path)
    # Compute the flags before acquiring the parent descriptor so that a failure in
    # _no_follow_flag() cannot leak an already-open parent fd.
    open_flags = (flags if flags is not None else _base_open_flags()) | _no_follow_flag()
    parent_fd = _ensure_directory_no_symlink(path.parent, create=create_parent)
    try:
        fd = os.open(path.name, open_flags, mode, dir_fd=parent_fd)
        try:
            if not stat.S_ISREG(os.fstat(fd).st_mode):
                raise OSError(f"not a regular file: {path}")
            return fd
        except Exception:
            os.close(fd)
            raise
    finally:
        os.close(parent_fd)
318
-
319
-
320
def _read_text_no_follow(path: Path) -> str:
    """Read ``path`` as UTF-8 text via a no-follow regular-file open."""
    fd = _open_regular_no_symlink(path)
    try:
        handle = os.fdopen(fd, "r", encoding="utf-8")
    except BaseException:
        # The descriptor was never handed to a file object, so close it here.
        os.close(fd)
        raise
    with handle:
        return handle.read()
329
-
330
-
331
@contextmanager
def csv_file_lock(csv_path: Path, *, create_parent: bool) -> Any:
    """Hold an exclusive ``flock`` on ``<csv name>.lock`` next to ``csv_path``.

    The sidecar lock file is opened (and created if needed) with no-follow
    semantics. The lock is released and the descriptor closed when the ``with``
    body exits. Raises ``OSError`` when ``fcntl`` is unavailable.
    """
    if fcntl is None:
        raise OSError("platform does not support advisory CSV locks")
    sidecar = csv_path.with_name(f"{csv_path.name}.lock")
    fd = _open_regular_no_symlink(sidecar, os.O_CREAT | os.O_RDWR, 0o600, create_parent=create_parent)
    try:
        fcntl.flock(fd, fcntl.LOCK_EX)
        try:
            yield
        finally:
            # Only reached once the lock was actually acquired.
            fcntl.flock(fd, fcntl.LOCK_UN)
    finally:
        os.close(fd)
349
-
350
-
351
# Reproducibility first: fields not specified in the fixture are not passed via argv.
# So that a variant the user intends as a baseline is not distorted by an implicit
# default (e.g. effort="medium"), parsing preserves whether each field was given
# explicitly.
@dataclass
class TaskFixture:
    """One benchmark task entry as parsed from the tasks JSON fixture."""

    id: str
    prompt: str
    model: str = "sonnet"
    effort: str | None = None  # None when the fixture did not give an effort
    max_turns: int = 3
    max_budget_usd: float | None = None  # None when the fixture gives null / no budget
    allowed_tools: list[str] = field(default_factory=list)
    success_command: str | None = None
    success_cwd: str = "."
    # Variant name -> relative prompt-file path, as written in the fixture.
    variant_prompt_files: dict[str, str] = field(default_factory=dict)
    # Variant name -> prompt text, filled by load_variant_prompt_files_for_targets.
    variant_prompt_texts: dict[str, str] = field(default_factory=dict)
367
-
368
-
369
@dataclass
class Variant:
    """A named benchmark variant: extra options added to the `claude -p` invocation."""

    name: str
    # Validated by validate_variant_extra_args so runner-controlled flags are not overridden.
    extra_args: list[str] = field(default_factory=list)
373
-
374
-
375
@dataclass
class RunResult:
    """Metrics recorded for one (task, variant) run.

    Most field names mirror entries in ``CSV_COLUMNS``.
    NOTE(review): the code that builds and serializes this record is not visible
    here; confirm field semantics against it.
    """

    task_id: str
    variant: str
    model: str
    effort: str
    tokens: dict[str, int]
    cost_usd: float
    success: bool
    notes: str
    corrections: int = 0
    # The *_measured flags presumably distinguish observed values from defaulted zeroes.
    cost_measured: bool = False
    wall_time_seconds: float = 0.0
    turns: int = 0
    hook_triggers: int = 0
    bytes_before: int = 0
    bytes_after: int = 0
    artifacts_used: int = 0
    external_tokens: int = 0
    external_tokens_measured: bool = False
    external_cost_usd: float = 0.0
    external_cost_measured: bool = False
    provider_cached_tokens: int = 0
    provider_cached_tokens_measured: bool = False
    primary_tokens_measured: bool = False
    self_hosted_metrics: dict[str, Any] | None = None
401
-
402
-
403
@dataclass
class BoundedProcessResult:
    """Result record for a subprocess run: exit status, captured text, and status flags.

    NOTE(review): presumably produced by a size- and time-bounded process runner
    that is not visible in this part of the file.
    """

    returncode: int
    stdout: str
    stderr: str
    timed_out: bool = False
    output_truncated: bool = False
410
-
411
-
412
def is_placeholder_success_command(command: str | None) -> bool:
    """Return True when ``command`` is non-empty and contains the placeholder marker."""
    if not command:
        return False
    return PLACEHOLDER_SUCCESS_COMMAND_MARKER in command
414
-
415
-
416
def parse_positive_int(value: Any, *, field: str, owner: str) -> int:
    """Parse a JSON fixture field that must be a positive integer.

    Args:
        value: Raw JSON value. Accepted forms are an ``int`` (but not ``bool``) or a
            string of ASCII digits, optionally surrounded by whitespace.
        field: Field name, used in error messages.
        owner: Description of the owning fixture entry, used in error messages.

    Returns:
        The parsed integer, guaranteed to be greater than zero.

    Raises:
        SystemExit: If the value has the wrong type or shape, or is not > 0.
    """
    type_error = f"{owner} {field} must be a positive integer"
    # bool is a subclass of int; reject it explicitly so true/false are not counts.
    if isinstance(value, bool):
        raise SystemExit(type_error)
    if isinstance(value, int):
        parsed = value
    elif isinstance(value, str) and re.fullmatch(r"[0-9]+", value.strip()):
        try:
            parsed = int(value.strip())
        except ValueError:
            # Digit strings longer than the interpreter's int-conversion limit
            # raise ValueError; report them as a fixture error, not a traceback.
            raise SystemExit(type_error) from None
    else:
        raise SystemExit(type_error)
    if parsed <= 0:
        raise SystemExit(f"{owner} {field} must be > 0")
    return parsed
429
-
430
-
431
def parse_string_list(value: Any, *, field: str, owner: str) -> list[str]:
    """Validate that ``value`` is a list of non-blank strings and return a copy of it.

    Raises ``SystemExit`` naming ``owner`` and ``field`` (and the offending index)
    when the value is not a list or an item is not a non-blank string.
    """
    # None is not a list either, so this single check covers both cases.
    if not isinstance(value, list):
        raise SystemExit(f"{owner} {field} must be a JSON list of strings")
    for position, entry in enumerate(value):
        if not isinstance(entry, str):
            raise SystemExit(f"{owner} {field}[{position}] must be a string")
        if entry.strip() == "":
            raise SystemExit(f"{owner} {field}[{position}] must be non-empty")
    return list(value)
445
-
446
-
447
def parse_string_map(value: Any, *, field: str, owner: str) -> dict[str, str]:
    """Validate a JSON object of non-blank string keys and values.

    ``None`` yields an empty dict. Any other non-dict value, or a key or value that
    is not a non-blank string, raises ``SystemExit``.
    """
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise SystemExit(f"{owner} {field} must be a JSON object of strings")

    def _is_text(candidate: Any) -> bool:
        return isinstance(candidate, str) and bool(candidate.strip())

    parsed: dict[str, str] = {}
    for name, text in value.items():
        if not _is_text(name):
            raise SystemExit(f"{owner} {field} keys must be non-empty strings")
        if not _is_text(text):
            raise SystemExit(f"{owner} {field}.{name} must be a non-empty string")
        parsed[name] = text
    return parsed
461
-
462
-
463
def validate_variant_extra_args(extra_args: list[str], *, owner: str) -> list[str]:
    """Reject variant arguments that would override runner-controlled Claude flags.

    Each argument is compared by its text before the first ``=`` against
    ``PROTECTED_VARIANT_FLAGS``. Returns ``extra_args`` unchanged when all pass.
    """
    for position, argument in enumerate(extra_args):
        flag, _sep, _rest = argument.partition("=")
        if flag not in PROTECTED_VARIANT_FLAGS:
            continue
        raise SystemExit(
            f"{owner} extra_args[{position}] must not override runner-controlled Claude flags: {flag}"
        )
    return extra_args
471
-
472
-
473
def validate_variant_prompt_file_path(raw_path: str, *, owner: str) -> Path:
    """Return ``raw_path`` as a safe relative ``Path``, or exit before any file read.

    Absolute paths, paths that name no file, and paths containing ``.``, ``..`` or
    empty components are rejected with ``SystemExit``.
    """
    candidate = Path(raw_path)
    if candidate.is_absolute():
        raise SystemExit(f"{owner} variant_prompt_files path must be relative: {raw_path}")
    segments = candidate.parts
    if not segments or candidate == Path("."):
        raise SystemExit(f"{owner} variant_prompt_files path must name a file")
    forbidden = ("", ".", "..")
    for segment in segments:
        if segment in forbidden:
            raise SystemExit(f"{owner} variant_prompt_files path must not contain '.', '..', or empty components: {raw_path}")
    return candidate
483
-
484
-
485
def validate_variant_prompt_file_references(
    tasks: list[TaskFixture],
    variants: list["Variant"],
) -> None:
    """Check variant prompt-file keys and paths without opening any file.

    A task that references a variant name absent from ``variants``, or an unsafe
    relative path, causes ``SystemExit``. Whether the files exist is deliberately
    not checked here, so a run narrowed by --task-id/--variant is not blocked by
    prompt files of unselected targets.
    """
    known_names = {variant.name for variant in variants}
    for task in tasks:
        unknown = sorted(name for name in set(task.variant_prompt_files) if name not in known_names)
        if unknown:
            raise SystemExit(
                f"task {task.id} variant_prompt_files references unknown variant(s): {', '.join(unknown)}"
            )
        for variant_name, raw_path in task.variant_prompt_files.items():
            owner = f"task {task.id} variant {variant_name}"
            validate_variant_prompt_file_path(raw_path, owner=owner)
507
-
508
-
509
def read_variant_prompt_file(path: Path, *, owner: str, display_path: str | None = None) -> str:
    """Read one prompt file with no-follow IO, enforcing the prompt size cap.

    Args:
        path: File to read.
        owner: Description of the owning task/variant, used in error messages.
        display_path: Label shown in error messages; defaults to ``path.name``.

    Returns:
        The file content decoded as UTF-8.

    Raises:
        SystemExit: If the file cannot be opened or read, is not UTF-8, or exceeds
            ``MAX_VARIANT_PROMPT_FILE_BYTES`` on disk or after decoding.
    """
    label = display_path or path.name

    def _unreadable(error: OSError) -> SystemExit:
        detail = error.strerror or error.__class__.__name__
        return SystemExit(f"{owner} variant_prompt_files could not read prompt file: {label}: {detail}")

    try:
        fd = _open_regular_no_symlink(path)
    except OSError as exc:
        raise _unreadable(exc) from None
    try:
        if os.fstat(fd).st_size > MAX_VARIANT_PROMPT_FILE_BYTES:
            raise SystemExit(
                f"{owner} variant_prompt_files prompt file exceeds "
                f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes: {label}"
            )
        try:
            with os.fdopen(fd, "r", encoding="utf-8") as handle:
                # The file object now owns the descriptor; skip the manual close.
                fd = -1
                text = handle.read()
        except UnicodeDecodeError as exc:
            raise SystemExit(
                f"{owner} variant_prompt_files prompt file must be UTF-8 text: "
                f"{label}: {exc.reason}"
            ) from None
        except OSError as exc:
            raise _unreadable(exc) from None
    finally:
        if fd != -1:
            os.close(fd)
    encoded_size = len(text.encode("utf-8", errors="replace"))
    if encoded_size > MAX_VARIANT_PROMPT_FILE_BYTES:
        raise SystemExit(
            f"{owner} variant_prompt_files prompt text exceeds "
            f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes after decoding: {label}"
        )
    return text
545
-
546
-
547
def load_variant_prompt_files_for_targets(
    targets: list[tuple[TaskFixture, "Variant"]],
    *,
    task_file_dir: Path,
) -> None:
    """Load file-backed prompts for the selected (task, variant) pairs only.

    For each pair whose task maps the variant name to a prompt file, the path is
    re-validated, resolved against ``task_file_dir``, read, and stored in
    ``task.variant_prompt_texts`` under the variant name.
    """
    for task, variant in targets:
        if variant.name not in task.variant_prompt_files:
            continue
        raw_path = task.variant_prompt_files[variant.name]
        if raw_path is None:
            continue
        owner = f"task {task.id} variant {variant.name}"
        relative = validate_variant_prompt_file_path(raw_path, owner=owner)
        task.variant_prompt_texts[variant.name] = read_variant_prompt_file(
            task_file_dir / relative,
            owner=owner,
            display_path=str(relative),
        )
566
-
567
-
568
def normalize_usage_token(value: Any) -> int | None:
    """Return ``value`` as a non-negative token count, or None when it is not usable.

    Only ``int`` and ``float`` (not ``bool``) are accepted. The value must be
    finite and within ``[0, MAX_USAGE_TOKEN_COUNT]``; floats are truncated by ``int``.
    """
    if isinstance(value, bool):
        return None
    if not isinstance(value, (int, float)):
        return None
    try:
        as_float = float(value)
    except (OverflowError, ValueError):
        return None
    usable = math.isfinite(as_float) and 0 <= as_float <= MAX_USAGE_TOKEN_COUNT
    return int(as_float) if usable else None
579
-
580
-
581
def normalize_usage_cost(value: Any) -> float | None:
    """Return ``value`` as a non-negative cost, or None when it is not usable.

    Only ``int`` and ``float`` (not ``bool``) are accepted. The value must be
    finite and within ``[0, MAX_USAGE_COST_USD]``.
    """
    if isinstance(value, bool):
        return None
    if not isinstance(value, (int, float)):
        return None
    try:
        as_float = float(value)
    except (OverflowError, ValueError):
        return None
    usable = math.isfinite(as_float) and 0 <= as_float <= MAX_USAGE_COST_USD
    return as_float if usable else None
592
-
593
-
594
def parse_tasks(path: Path, variants: list["Variant"] | None = None) -> list[TaskFixture]:
    """Load and validate the tasks JSON fixture.

    Args:
        path: Tasks file, read with no-follow IO.
        variants: When given, each task's ``variant_prompt_files`` keys and paths
            are validated against these variants.

    Returns:
        One ``TaskFixture`` per entry, in file order.

    Raises:
        SystemExit: If the file is not a JSON list of objects, a required field
            (``id``, ``prompt``) is missing, or a field fails validation.
    """
    raw = json.loads(_read_text_no_follow(path))
    if not isinstance(raw, list):
        raise SystemExit(f"tasks file must be a JSON list: {path}")
    fixtures: list[TaskFixture] = []
    for item in raw:
        if not isinstance(item, dict):
            raise SystemExit(f"task entry must be a JSON object: {item}")
        # Report missing required fields as fixture errors, not KeyError tracebacks.
        if "id" not in item:
            raise SystemExit('task entry must define "id"')
        task_id = str(item["id"])
        if "prompt" not in item:
            raise SystemExit(f'task {task_id} must define "prompt"')
        effort_raw = item.get("effort")
        budget_raw = item.get("max_budget_usd")
        if budget_raw is None:
            budget = None
        else:
            budget_type_error = f"task {task_id} max_budget_usd must be number or null"
            # bool is an int subclass; float(True) would silently become a 1.0 budget.
            if isinstance(budget_raw, bool):
                raise SystemExit(budget_type_error)
            try:
                budget = float(budget_raw)
            except (TypeError, ValueError, OverflowError):
                raise SystemExit(budget_type_error) from None
            if not math.isfinite(budget) or budget <= 0:
                raise SystemExit(f"task {task_id} max_budget_usd must be finite and > 0 (use null for unlimited)")
        if "variant_prompts" in item:
            raise SystemExit(
                f"task {task_id} variant_prompts is not supported; use file-backed variant_prompt_files"
            )
        owner = f"task {task_id}"
        fixtures.append(TaskFixture(
            id=task_id,
            prompt=str(item["prompt"]),
            model=str(item.get("model", "sonnet")),
            # Keep None when effort is absent so no implicit default reaches argv.
            effort=str(effort_raw) if effort_raw is not None else None,
            max_turns=parse_positive_int(item.get("max_turns", 3), field="max_turns", owner=owner),
            max_budget_usd=budget,
            allowed_tools=parse_string_list(
                item.get("allowed_tools", []),
                field="allowed_tools",
                owner=owner,
            ),
            success_command=item.get("success_command"),
            success_cwd=str(item.get("success_cwd", ".")),
            variant_prompt_files=parse_string_map(
                item.get("variant_prompt_files"),
                field="variant_prompt_files",
                owner=owner,
            ),
        ))
    if variants is not None:
        validate_variant_prompt_file_references(fixtures, variants)
    return fixtures
641
-
642
-
643
def parse_variants(path: Path) -> list[Variant]:
    """Load and validate the variants JSON fixture.

    Args:
        path: Variants file, read with no-follow IO.

    Returns:
        One ``Variant`` per entry, in file order.

    Raises:
        SystemExit: If the file is not a JSON list of objects, an entry has no
            ``name``, or ``extra_args`` is malformed or overrides a protected flag.
    """
    raw = json.loads(_read_text_no_follow(path))
    if not isinstance(raw, list):
        raise SystemExit(f"variants file must be a JSON list: {path}")
    variants: list[Variant] = []
    for item in raw:
        if not isinstance(item, dict):
            raise SystemExit(f"variant entry must be a JSON object: {item}")
        # Report a missing name as a fixture error, not a KeyError traceback.
        if "name" not in item:
            raise SystemExit('variant entry must define "name"')
        name = str(item["name"])
        owner = f"variant {name}"
        extra_args = parse_string_list(item.get("extra_args", []), field="extra_args", owner=owner)
        variants.append(Variant(
            name=name,
            extra_args=validate_variant_extra_args(extra_args, owner=owner),
        ))
    return variants
663
-
664
-
665
def collect_usage(payload: Any) -> tuple[dict[str, int], float, bool, bool]:
    """Extract token counts and cost from a `claude -p --output-format json` response.

    Intended policy: when a response carries both top-level usage and nested
    per-message usage, summing them would double count and over-report cost. So
    for every token bucket and for cost, only the first match is taken, in
    breadth-first order starting at the top level. If the response structure
    changes and the first match is no longer the intended one, check the measured
    results per fixture/variant.

    Returns:
        ``(tokens, cost, cost_seen, primary_tokens_measured)``. The last flag is
        true only when both input and output token buckets were observed.
    """
    tokens: dict[str, int] = {bucket: 0 for bucket, _ in USAGE_KEY_GROUPS}
    measured: set[str] = set()
    cost = 0.0
    cost_seen = False
    # Breadth-first walk so a top-level dict is evaluated before nested dicts.
    pending: collections.deque[Any] = collections.deque([payload])
    while pending:
        node = pending.popleft()
        if isinstance(node, dict):
            for bucket, aliases in USAGE_KEY_GROUPS:
                if bucket in measured:
                    continue
                count = first_normalized_token(node, aliases)
                if count is not None:
                    tokens[bucket] = count
                    measured.add(bucket)
            if not cost_seen:
                amount = first_normalized_cost(node, COST_KEYS)
                if amount is not None:
                    cost = amount
                    cost_seen = True
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)
    # Token-savings claims require a comparable primary-token total. Cache buckets
    # are optional zeroes in normal provider payloads, but the core input/output
    # buckets must both be observed; otherwise an output-only or input-only partial
    # payload would be treated as measured zero for the missing side and could
    # overstate savings.
    primary_tokens_measured = "input_tokens" in measured and "output_tokens" in measured
    return tokens, cost, cost_seen, primary_tokens_measured
708
-
709
-
710
def collect_provider_cache_telemetry(payload: Any) -> tuple[int, bool]:
    """Find provider prompt-cache telemetry without touching the token totals.

    OpenAI-style responses expose cached prompt tokens under
    `usage.prompt_tokens_details.cached_tokens`. That figure is useful cache
    telemetry, but `prompt_tokens` may already include cached tokens, so it is
    kept apart from the primary token buckets and from ContextGuard savings
    claims. Anthropic-style `cache_read_input_tokens` stays in the regular
    `cache_read` bucket handled by `collect_usage`.

    Returns:
        ``(cached_tokens, True)`` for the first valid value found in breadth-first
        order, or ``(0, False)`` when none is present.
    """
    pending: collections.deque[Any] = collections.deque([payload])
    while pending:
        node = pending.popleft()
        if isinstance(node, list):
            pending.extend(node)
            continue
        if not isinstance(node, dict):
            continue
        for details_key in PROVIDER_CACHE_DETAIL_KEYS:
            details = node.get(details_key)
            if not isinstance(details, dict):
                continue
            for cached_key in PROVIDER_CACHED_TOKEN_KEYS:
                cached = normalize_usage_token(details.get(cached_key))
                if cached is not None:
                    return cached, True
        pending.extend(node.values())
    return 0, False
736
-
737
-
738
def collect_provider_cached_tokens(payload: Any) -> int:
    """Return only the cached-token count from the provider cache telemetry.

    Convenience wrapper over `collect_provider_cache_telemetry` that drops the
    "measured" flag for callers that need just the number.
    """
    return collect_provider_cache_telemetry(payload)[0]
742
-
743
-
744
def elapsed_seconds_since(start: float) -> float:
    """Return the monotonic-clock seconds elapsed since ``start``, never below 0.0."""
    delta = time.monotonic() - start
    return delta if delta > 0.0 else 0.0
746
-
747
-
748
- def first_normalized_token(cur: dict[str, Any], keys: tuple[str, ...]) -> int | None:
749
- for key in keys:
750
- value = normalize_usage_token(cur.get(key))
751
- if value is not None:
752
- return value
753
- return None
754
-
755
-
756
- def first_normalized_cost(cur: dict[str, Any], keys: tuple[str, ...]) -> float | None:
757
- for key in keys:
758
- value = normalize_usage_cost(cur.get(key))
759
- if value is not None:
760
- return value
761
- return None
762
-
763
-
764
def contains_external_source_tokens(value: Any) -> bool:
    """Report whether any dict nested inside ``value`` carries a source-token key.

    Walks ``value`` breadth-first through dicts and lists and returns ``True``
    as soon as one dict yields a normalized token for any group in
    ``EXTERNAL_SOURCE_KEY_GROUPS``.
    """
    pending: collections.deque[Any] = collections.deque((value,))
    while pending:
        node = pending.popleft()
        if isinstance(node, dict):
            if any(
                first_normalized_token(node, token_keys) is not None
                for _source, token_keys, _cost_keys in EXTERNAL_SOURCE_KEY_GROUPS
            ):
                return True
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)
    return False
776
-
777
-
778
def collect_shift_metrics(payload: Any) -> dict[str, int | float | bool]:
    """Collect optional cost-shift / byte-saving metrics without requiring them.

    External work is reported by evolving Claude/runner payloads either as one
    aggregate (`external_tokens` + `external_cost_usd`) or as explicit source
    records (`auxiliary_*`, `subagent_*`, `provider_*`). Do not mix those two
    shapes: if an aggregate token count exists, it is authoritative; otherwise
    sum only source-token records and mark cost measured only when every
    positive source-token record carries its matching source cost.

    The payload is walked breadth-first. The result holds one entry per
    ``SHIFT_METRIC_KEY_GROUPS`` bucket (0 when never observed) plus
    ``external_tokens``, ``external_tokens_measured``, ``external_cost_usd``
    and ``external_cost_measured``.
    """
    metrics: dict[str, int | float | bool] = {key: 0 for key, _ in SHIFT_METRIC_KEY_GROUPS}
    # Tracks which simple buckets already have a value; the first hit wins.
    seen: dict[str, bool] = {key: False for key, _ in SHIFT_METRIC_KEY_GROUPS}
    aggregate_tokens: int | None = None
    aggregate_cost = 0.0
    aggregate_cost_measured = False
    source_tokens = 0
    source_tokens_measured = False
    source_cost = 0.0
    # Stays True until a positive source-token record lacks its matching cost.
    source_cost_covered = True
    metrics["external_cost_usd"] = 0.0
    metrics["external_cost_measured"] = False
    metrics["external_tokens"] = 0
    metrics["external_tokens_measured"] = False
    queue: collections.deque[Any] = collections.deque([payload])
    while queue:
        cur = queue.popleft()
        if isinstance(cur, dict):
            for bucket, keys in SHIFT_METRIC_KEY_GROUPS:
                if seen[bucket]:
                    continue
                value = first_normalized_token(cur, keys)
                if value is not None:
                    metrics[bucket] = value
                    seen[bucket] = True

            # Only the first aggregate token count encountered is kept.
            # NOTE(review): the aggregate cost is read from the same dict that
            # supplied the aggregate token count — confirm this nesting.
            if aggregate_tokens is None:
                value = first_normalized_token(cur, EXTERNAL_TOKEN_AGGREGATE_KEYS)
                if value is not None:
                    aggregate_tokens = value
                    cost = first_normalized_cost(cur, EXTERNAL_COST_AGGREGATE_KEYS)
                    if cost is not None:
                        aggregate_cost = cost
                        aggregate_cost_measured = True

            source_values = [
                (value, cost_keys)
                for _source, token_keys, cost_keys in EXTERNAL_SOURCE_KEY_GROUPS
                for value in [first_normalized_token(cur, token_keys)]
                if value is not None
            ]
            # Count this dict's source tokens only when nothing nested beneath
            # it also reports source tokens; such a dict is skipped here and its
            # nested records are counted when the walk reaches them.
            if source_values and not any(contains_external_source_tokens(value) for value in cur.values()):
                for value, cost_keys in source_values:
                    source_tokens += value
                    source_tokens_measured = True
                    cost = first_normalized_cost(cur, cost_keys)
                    if cost is not None:
                        source_cost += cost
                    elif value > 0:
                        # A zero-token record without a cost does not break coverage.
                        source_cost_covered = False
            queue.extend(cur.values())
        elif isinstance(cur, list):
            queue.extend(cur)

    # The aggregate shape takes precedence over summed source records.
    if aggregate_tokens is not None:
        metrics["external_tokens"] = aggregate_tokens
        metrics["external_tokens_measured"] = True
        metrics["external_cost_usd"] = aggregate_cost if aggregate_cost_measured else 0.0
        metrics["external_cost_measured"] = aggregate_cost_measured
    elif source_tokens_measured:
        metrics["external_tokens"] = source_tokens
        metrics["external_tokens_measured"] = True
        metrics["external_cost_usd"] = source_cost
        metrics["external_cost_measured"] = source_cost_covered
    return metrics
852
-
853
-
854
- def normalize_self_hosted_metric(value: Any, *, maximum: float) -> float | None:
855
- if isinstance(value, bool) or not isinstance(value, (int, float)):
856
- return None
857
- number = float(value)
858
- if not math.isfinite(number) or number < 0 or number > maximum:
859
- return None
860
- return number
861
-
862
-
863
- def sanitize_self_hosted_label(value: Any) -> str | None:
864
- if not isinstance(value, str):
865
- return None
866
- text = sanitize_note_text(value)
867
- if not text:
868
- return None
869
- if len(text) > MAX_SELF_HOSTED_LABEL_CHARS:
870
- text = text[:MAX_SELF_HOSTED_LABEL_CHARS - 12].rstrip() + "…[truncated]"
871
- return text
872
-
873
-
874
- def normalize_self_hosted_metrics(raw: Any, *, source: str) -> dict[str, Any] | None:
875
- if not isinstance(raw, dict):
876
- return None
877
- metrics: dict[str, float] = {}
878
- labels: dict[str, str] = {}
879
- availability = {
880
- "latency_ms": False,
881
- "peak_memory_mb": False,
882
- "quality_score": False,
883
- }
884
- latency = normalize_self_hosted_metric(raw.get("latency_ms"), maximum=MAX_SELF_HOSTED_LATENCY_MS)
885
- if latency is not None:
886
- metrics["latency_ms"] = latency
887
- availability["latency_ms"] = True
888
- peak_memory = normalize_self_hosted_metric(raw.get("peak_memory_mb"), maximum=MAX_SELF_HOSTED_MEMORY_MB)
889
- if peak_memory is not None:
890
- metrics["peak_memory_mb"] = peak_memory
891
- availability["peak_memory_mb"] = True
892
- quality = normalize_self_hosted_metric(raw.get("quality_score"), maximum=1.0)
893
- if quality is not None:
894
- metrics["quality_score"] = quality
895
- availability["quality_score"] = True
896
- for key in ("model_server", "optimization", "quality_metric"):
897
- label = sanitize_self_hosted_label(raw.get(key))
898
- if label is not None:
899
- labels[key] = label
900
- if not metrics:
901
- return None
902
- return {
903
- "schema_version": SELF_HOSTED_METRICS_SCHEMA_VERSION,
904
- "source": source,
905
- "metrics": metrics,
906
- "labels": labels,
907
- "measurement_availability": availability,
908
- "claim_boundary": {
909
- "id": SELF_HOSTED_METRICS_CLAIM_BOUNDARY,
910
- "hosted_api_token_savings_claim_allowed": False,
911
- "hosted_api_cost_savings_claim_allowed": False,
912
- "requires_provider_measured_matched_tasks_for_hosted_claims": True,
913
- "reason": (
914
- "Self-hosted local/model-server latency, memory, and quality metrics "
915
- "are not hosted API token or cost telemetry."
916
- ),
917
- },
918
- }
919
-
920
-
921
- def collect_self_hosted_metrics(payload: Any) -> dict[str, Any] | None:
922
- """Collect explicit self-hosted metric sidecars without broad key inference.
923
-
924
- Only explicit top-level telemetry envelopes are considered. Do not infer
925
- from incidental keys like `self_hosted_latency_ms` or arbitrary nested model
926
- message content: that would make local/model-server telemetry too easy to
927
- mix into hosted API claim surfaces.
928
- """
929
- if not isinstance(payload, dict):
930
- return None
931
- candidates = [
932
- (
933
- payload.get(SELF_HOSTED_METRICS_KEY),
934
- f"explicit_provider_payload.{SELF_HOSTED_METRICS_KEY}",
935
- )
936
- ]
937
- metrics_envelope = payload.get("metrics")
938
- if isinstance(metrics_envelope, dict):
939
- candidates.append((
940
- metrics_envelope.get(SELF_HOSTED_METRICS_KEY),
941
- f"explicit_provider_payload.metrics.{SELF_HOSTED_METRICS_KEY}",
942
- ))
943
- for raw, source in candidates:
944
- normalized = normalize_self_hosted_metrics(raw, source=source)
945
- if normalized is not None:
946
- return normalized
947
- return None
948
-
949
-
950
def claude_version(claude_bin: str) -> str:
    """Return the first line printed by ``<claude_bin> --version``.

    The command runs through `run_bounded_command` with a 5 second timeout and
    an output cap of ``VERSION_OUTPUT_MAX_BYTES``.

    Returns:
        The first line of the stripped stdout, or ``"unknown"`` when the
        command cannot be launched or prints nothing but whitespace.
    """
    try:
        proc = run_bounded_command(
            [claude_bin, "--version"],
            cwd=Path.cwd(),
            timeout_seconds=5,
            max_output_bytes=VERSION_OUTPUT_MAX_BYTES,
        )
    except (OSError, subprocess.TimeoutExpired, ValueError):
        return "unknown"
    # Whitespace-only output strips to "" and splits to [], so indexing [0]
    # directly would raise an uncaught IndexError.
    lines = (proc.stdout or "").strip().splitlines()
    return lines[0] if lines else "unknown"
961
-
962
-
963
def build_claude_argv(claude_bin: str, task: TaskFixture, variant: Variant) -> list[str]:
    """Build the `claude -p` argv for one task/variant pair.

    Options the fixture does not specify (effort, max_budget_usd) are left out
    of argv. That keeps the real meaning of the baseline variant (= defaults
    as they are) from being distorted by implicit runner defaults.

    The prompt comes last, after a ``--`` separator. A variant-specific prompt
    from ``task.variant_prompt_texts`` is preferred over ``task.prompt``.
    """
    optional: list[str] = []
    if task.effort:
        optional += ["--effort", task.effort]
    if task.max_budget_usd is not None:
        optional += ["--max-budget-usd", str(task.max_budget_usd)]
    if task.allowed_tools:
        optional += ["--allowedTools", ",".join(task.allowed_tools)]

    prompt = task.variant_prompt_texts.get(variant.name, task.prompt)
    return [
        claude_bin, "-p",
        "--model", task.model,
        "--max-turns", str(task.max_turns),
        "--output-format", "json",
        *optional,
        *variant.extra_args,
        "--",
        prompt,
    ]
982
-
983
-
984
def executable_argv0(command: str) -> str:
    """Return an absolute path string to use as ``argv[0]`` for ``command``.

    When `shutil.which` finds the command, its fully resolved path is used.
    Otherwise the command is treated as a path: returned as-is if absolute
    after ``~`` expansion, else resolved against the current directory.
    """
    located = shutil.which(command)
    if located:
        return str(Path(located).expanduser().resolve())
    candidate = Path(command).expanduser()
    return str(candidate if candidate.is_absolute() else candidate.resolve())
992
-
993
-
994
- def _signal_process_group(proc: subprocess.Popen[bytes], sig: int, pgid: int | None) -> None:
995
- if pgid is not None:
996
- try:
997
- os.killpg(pgid, sig)
998
- return
999
- except (AttributeError, ProcessLookupError):
1000
- pass
1001
- except OSError:
1002
- pass
1003
- try:
1004
- if sig == signal.SIGKILL:
1005
- proc.kill()
1006
- else:
1007
- proc.terminate()
1008
- except OSError:
1009
- pass
1010
-
1011
-
1012
def run_bounded_command(
    argv: list[str],
    *,
    cwd: Path,
    timeout_seconds: int,
    max_output_bytes: int,
) -> BoundedProcessResult:
    """Run ``argv`` with a wall-clock deadline and a per-stream output cap.

    The child starts in a new session so its whole process group can be
    signalled. stdout and stderr are read through a selector; each stream keeps
    at most ``max_output_bytes`` bytes.

    Args:
        argv: Command and arguments, executed without a shell.
        cwd: Working directory for the child.
        timeout_seconds: Seconds after which the group receives SIGTERM.
        max_output_bytes: Cap applied separately to stdout and stderr.

    Returns:
        A ``BoundedProcessResult`` with output decoded as UTF-8 (invalid bytes
        replaced). ``returncode`` is forced to 124 on timeout and to 125 when
        output was truncated without a timeout.
    """
    proc = subprocess.Popen(
        argv,
        cwd=cwd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        start_new_session=True,
    )
    try:
        pgid = os.getpgid(proc.pid)
    except OSError:
        # Fall back to the child's pid as the group id.
        pgid = proc.pid
    selector = selectors.DefaultSelector()
    buffers: dict[str, bytearray] = {"stdout": bytearray(), "stderr": bytearray()}
    streams = {"stdout": proc.stdout, "stderr": proc.stderr}
    for name, stream in streams.items():
        if stream is None:
            continue
        try:
            # Non-blocking reads keep one quiet pipe from stalling the loop.
            os.set_blocking(stream.fileno(), False)
        except (AttributeError, OSError):
            pass
        selector.register(stream, selectors.EVENT_READ, name)

    timed_out = False
    output_truncated = False
    # Time at which SIGTERM was first sent (deadline or output overflow).
    terminated_at: float | None = None
    sent_kill = False
    deadline = time.monotonic() + timeout_seconds
    try:
        # Loop until both pipes reach EOF and are unregistered.
        while selector.get_map():
            now = time.monotonic()
            if now >= deadline:
                timed_out = True
                if terminated_at is None:
                    _signal_process_group(proc, signal.SIGTERM, pgid)
                    terminated_at = now
            # Escalate to SIGKILL once the grace period after SIGTERM passes.
            if terminated_at is not None and not sent_kill:
                if now - terminated_at >= PROCESS_TERMINATE_GRACE_SECONDS:
                    _signal_process_group(proc, signal.SIGKILL, pgid)
                    sent_kill = True
            # Pipes still open two grace periods after SIGTERM: stop waiting
            # for EOF and report a timeout.
            if sent_kill and terminated_at is not None:
                if now - terminated_at >= PROCESS_TERMINATE_GRACE_SECONDS * 2:
                    timed_out = True
                    break
            events = selector.select(timeout=0.05)
            for key, _ in events:
                name = key.data
                stream = key.fileobj
                try:
                    chunk = os.read(stream.fileno(), 65536)
                except BlockingIOError:
                    continue
                if not chunk:
                    # EOF on this pipe.
                    selector.unregister(stream)
                    try:
                        stream.close()
                    except OSError:
                        pass
                    continue
                buffer = buffers[name]
                remaining = max_output_bytes - len(buffer)
                if remaining > 0:
                    buffer.extend(chunk[:remaining])
                if len(chunk) > remaining:
                    # Over the cap: drop the excess and start terminating the
                    # child; the loop keeps reading until EOF.
                    output_truncated = True
                    if terminated_at is None:
                        _signal_process_group(proc, signal.SIGTERM, pgid)
                        terminated_at = time.monotonic()
    finally:
        selector.close()

    try:
        returncode = proc.wait(timeout=PROCESS_TERMINATE_GRACE_SECONDS)
    except subprocess.TimeoutExpired:
        # Still running after its pipes closed (or the loop gave up): kill it.
        _signal_process_group(proc, signal.SIGKILL, pgid)
        try:
            returncode = proc.wait(timeout=PROCESS_TERMINATE_GRACE_SECONDS)
        except subprocess.TimeoutExpired:
            returncode = 124
            timed_out = True
    # Synthetic exit codes: 124 for timeout, 125 for truncated output.
    if timed_out:
        returncode = 124
    elif output_truncated:
        returncode = 125
    return BoundedProcessResult(
        returncode=returncode,
        stdout=bytes(buffers["stdout"]).decode("utf-8", "replace"),
        stderr=bytes(buffers["stderr"]).decode("utf-8", "replace"),
        timed_out=timed_out,
        output_truncated=output_truncated,
    )
1110
-
1111
-
1112
# shlex.split prevents shell injection, but it splits an input such as
# `true ; echo pwned` into `["true", ";", "echo", "pwned"]` as-is. /usr/bin/true
# then ignores ";", "echo" and "pwned" as plain arguments and exits with
# success=true, which produces a false positive. So any shlex-split token that
# looks like it was meant for shell composition causes a rejection.
_SHELL_META_TOKENS = frozenset({";", "&&", "||", "|", "&", "<", ">", ">>", "<<", "<<<"})


def _has_shell_meta(argv: list[str]) -> bool:
    """Report whether any token in ``argv`` shows shell-composition intent.

    A token matches when it is exactly one of ``_SHELL_META_TOKENS`` or when it
    contains command-substitution traces (``$(`` or a backtick).
    """
    return any(
        tok in _SHELL_META_TOKENS or "$(" in tok or "`" in tok
        for tok in argv
    )
1127
-
1128
-
1129
def run_success_command(task: TaskFixture, project_root: Path) -> tuple[bool, str]:
    """Run the fixture's success_command and report ``(passed, note)``.

    - Only a single argv is executed, via `shlex.split + shell=False`.
    - If the split tokens show shell-composition intent (`;`, `&&`, `|`,
      `$()`, backticks, ...), the command is rejected. `success_command` must
      be one verification command or the path of one helper script.
    - If `success_cwd` escapes project_root (cases like ..//../etc), the
      command is rejected.

    A fixture without a success_command counts as passed.
    """
    command = task.success_command
    if not command:
        return True, "no success_command configured"

    try:
        argv = shlex.split(command)
    except ValueError as exc:
        return False, f"success_command parse error: {exc}"
    if not argv:
        return False, "success_command parsed to empty argv"
    if _has_shell_meta(argv):
        return False, "success_command contains shell-composition tokens (use a helper script)"

    # Compare fully resolved paths so ".." segments cannot leave the root.
    root = project_root.resolve()
    cwd = (project_root / task.success_cwd).resolve()
    if not cwd.is_relative_to(root):
        return False, f"success_cwd escapes project_root: {cwd}"

    try:
        proc = run_bounded_command(
            argv,
            cwd=cwd,
            timeout_seconds=600,
            max_output_bytes=SUCCESS_COMMAND_OUTPUT_MAX_BYTES,
        )
    except (OSError, subprocess.TimeoutExpired, ValueError) as exc:
        return False, f"success_command failed to launch: {exc}"

    if proc.timed_out:
        return False, "success_command timed out after 600s"
    if proc.output_truncated:
        return False, f"success_command output limit exceeded ({SUCCESS_COMMAND_OUTPUT_MAX_BYTES} bytes)"
    return proc.returncode == 0, f"exit={proc.returncode}"
1167
-
1168
-
1169
def run_fixture(task: TaskFixture, variant: Variant, claude_bin: str,
                project_root: Path, dry_run: bool) -> RunResult:
    """Run one task/variant pair through the Claude CLI and collect its metrics.

    A dry run returns a successful zero-usage result that only records the
    argv. A placeholder success_command is refused before the provider is
    invoked. Launch failure, timeout, truncated output, a non-zero exit and
    non-JSON stdout each give a failed zero-usage result with an explanatory
    note. Otherwise usage, cache telemetry, shift metrics and self-hosted
    metrics are read from the JSON payload and success is decided by
    `run_success_command`.
    """
    argv = build_claude_argv(claude_bin, task, variant)
    started_at = time.monotonic()

    def zero_usage_result(*, success: bool, notes: str, wall_time: float) -> RunResult:
        # Shared shape for every result that carries no measured usage.
        return RunResult(
            task_id=task.id, variant=variant.name, model=task.model, effort=task.effort,
            tokens={k: 0 for k, _ in USAGE_KEY_GROUPS}, cost_usd=0.0,
            success=success, notes=notes,
            wall_time_seconds=wall_time,
        )

    def failed(notes: str) -> RunResult:
        return zero_usage_result(
            success=False, notes=notes,
            wall_time=elapsed_seconds_since(started_at),
        )

    if dry_run:
        return zero_usage_result(
            success=True, notes=f"dry-run: {shlex.join(argv)}", wall_time=0.0,
        )
    if is_placeholder_success_command(task.success_command):
        return failed(f"{PLACEHOLDER_SUCCESS_COMMAND_MARKER}; refusing to invoke provider")

    argv[0] = executable_argv0(argv[0])
    try:
        proc = run_bounded_command(
            argv,
            cwd=project_root,
            timeout_seconds=1800,
            max_output_bytes=CLAUDE_OUTPUT_MAX_BYTES,
        )
    except (OSError, subprocess.TimeoutExpired, ValueError) as exc:
        return failed(f"claude launch failed: {exc}")

    if proc.timed_out:
        return failed("claude timed out after 1800s")
    if proc.output_truncated:
        return failed(f"claude output limit exceeded ({CLAUDE_OUTPUT_MAX_BYTES} bytes)")
    if proc.returncode != 0:
        # Only the tail of stderr is kept in the note.
        return failed(f"claude exit={proc.returncode}: {proc.stderr[-200:].strip()}")

    try:
        payload = json.loads(proc.stdout)
    except json.JSONDecodeError as exc:
        return failed(f"claude returned non-JSON: {exc.msg}")

    tokens, cost, cost_measured, primary_tokens_measured = collect_usage(payload)
    cached_tokens, cached_tokens_measured = collect_provider_cache_telemetry(payload)
    shift = collect_shift_metrics(payload)
    self_hosted = collect_self_hosted_metrics(payload)
    success, success_note = run_success_command(task, project_root)
    return RunResult(
        task_id=task.id, variant=variant.name, model=task.model, effort=task.effort,
        tokens=tokens, cost_usd=cost, success=success, notes=success_note,
        cost_measured=cost_measured,
        primary_tokens_measured=primary_tokens_measured,
        wall_time_seconds=elapsed_seconds_since(started_at),
        turns=int(shift["turns"]),
        hook_triggers=int(shift["hook_triggers"]),
        bytes_before=int(shift["bytes_before"]),
        bytes_after=int(shift["bytes_after"]),
        artifacts_used=int(shift["artifacts_used"]),
        external_tokens=int(shift["external_tokens"]),
        external_tokens_measured=bool(shift["external_tokens_measured"]),
        external_cost_usd=float(shift["external_cost_usd"]),
        external_cost_measured=bool(shift["external_cost_measured"]),
        provider_cached_tokens=cached_tokens,
        provider_cached_tokens_measured=cached_tokens_measured,
        self_hosted_metrics=self_hosted,
    )
1257
-
1258
-
1259
def append_csv(csv_path: Path, claude_ver: str, result: RunResult, *, skip_existing: bool = False) -> bool:
    """Append one benchmark result row to the CSV at ``csv_path``.

    The whole operation runs under ``csv_file_lock``. The header is written
    when the file is empty; otherwise the existing header is validated first.

    Args:
        csv_path: Target CSV file.
        claude_ver: Value for the ``claude_version`` column.
        result: Run result to serialize.
        skip_existing: When true, nothing is written if a row with the same
            ``(task_id, variant)`` is already present.

    Returns:
        ``True`` when a row was written, ``False`` when it was skipped.
    """
    with csv_file_lock(csv_path, create_parent=True):
        if skip_existing and (result.task_id, result.variant) in _read_existing_keys_unlocked(csv_path):
            return False
        flags = os.O_CREAT | os.O_APPEND | os.O_WRONLY
        fd = _open_regular_no_symlink(csv_path, flags, 0o600, create_parent=True)
        try:
            # An empty file needs a header; a non-empty one must match the schema.
            new_file = os.fstat(fd).st_size == 0
            if not new_file:
                validate_csv_schema(csv_path, read_csv_header_unlocked(csv_path))
            with os.fdopen(fd, "a", encoding="utf-8", newline="") as f:
                # The file object now owns the descriptor; -1 tells the
                # finally block not to close it a second time.
                fd = -1
                writer = csv.DictWriter(f, fieldnames=CSV_COLUMNS)
                if new_file:
                    writer.writeheader()
                tokens = result.tokens
                total = sum(tokens.values())
                shifted_cost_known = cost_shift_measured(result)
                writer.writerow({
                    "date": sanitize_csv_cell(_dt.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")),
                    "claude_version": sanitize_csv_cell(claude_ver),
                    "task_id": sanitize_csv_cell(result.task_id),
                    "variant": sanitize_csv_cell(result.variant),
                    "model": sanitize_csv_cell(result.model),
                    "effort": sanitize_csv_cell(result.effort),
                    "total_tokens": total,
                    "input_tokens": tokens.get("input_tokens", 0),
                    "output_tokens": tokens.get("output_tokens", 0),
                    "cache_read": tokens.get("cache_read", 0),
                    "cache_creation": tokens.get("cache_creation", 0),
                    "provider_cached_tokens": result.provider_cached_tokens,
                    "provider_cached_tokens_measured": (
                        "true" if result.provider_cached_tokens_measured else "false"
                    ),
                    "cost_usd": f"{result.cost_usd:.6f}",
                    "cost_measured": "true" if result.cost_measured else "false",
                    "wall_time_seconds": f"{result.wall_time_seconds:.6f}",
                    "turns": result.turns,
                    "hook_triggers": result.hook_triggers,
                    "bytes_before": result.bytes_before,
                    "bytes_after": result.bytes_after,
                    "artifacts_used": result.artifacts_used,
                    "external_tokens": result.external_tokens,
                    "external_tokens_measured": "true" if result.external_tokens_measured else "false",
                    "external_cost_usd": f"{result.external_cost_usd:.6f}",
                    "external_cost_measured": "true" if result.external_cost_measured else "false",
                    # Left blank when the shifted total is not fully measured.
                    "total_cost_with_shift_usd": (
                        f"{(result.cost_usd + result.external_cost_usd):.6f}" if shifted_cost_known else ""
                    ),
                    "success": "true" if result.success else "false",
                    "corrections": result.corrections,
                    "notes": sanitize_csv_note(result.notes),
                    "primary_tokens_measured": "true" if result.primary_tokens_measured else "false",
                })
        finally:
            if fd != -1:
                os.close(fd)
    return True
1317
-
1318
-
1319
def cost_shift_measured(result: RunResult) -> bool:
    """Report whether the shifted total cost of ``result`` is fully measured.

    That requires a measured primary cost, measured external tokens, and
    either zero external tokens or a measured external cost.
    """
    if not (result.cost_measured and result.external_tokens_measured):
        return False
    return result.external_tokens == 0 or result.external_cost_measured
1325
-
1326
-
1327
def read_csv_header_unlocked(csv_path: Path) -> list[str] | None:
    """Return the first CSV record of ``csv_path``, or ``None`` if it has none.

    The file is opened through ``_open_regular_no_symlink``. This function
    takes no lock itself.
    """
    fd = _open_regular_no_symlink(csv_path)
    try:
        with os.fdopen(fd, "r", encoding="utf-8", newline="") as handle:
            # The file object owns the descriptor from here on.
            fd = -1
            return next(csv.reader(handle), None)
    finally:
        if fd != -1:
            os.close(fd)
1340
-
1341
-
1342
- def validate_csv_schema(csv_path: Path, fieldnames: list[str] | None) -> None:
1343
- """Fail loudly instead of appending/reporting across incompatible CSV schemas."""
1344
- if fieldnames is None:
1345
- return
1346
- if fieldnames != CSV_COLUMNS:
1347
- raise SystemExit(
1348
- f"CSV schema mismatch for {csv_path}; start a new --csv file or migrate the header "
1349
- f"to: {','.join(CSV_COLUMNS)}"
1350
- )
1351
-
1352
-
1353
def write_text_no_follow(path: Path, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8, creating or truncating the file.

    The file is opened through ``_open_regular_no_symlink`` with mode ``0o600``
    and ``create_parent=True``.
    """
    open_flags = os.O_CREAT | os.O_TRUNC | os.O_WRONLY
    fd = _open_regular_no_symlink(path, open_flags, 0o600, create_parent=True)
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as out:
            # The file object owns the descriptor from here on.
            fd = -1
            out.write(text)
    finally:
        if fd != -1:
            os.close(fd)
1362
-
1363
-
1364
def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) -> None:
    """Append one JSON line of run evidence for ``result`` to the ledger at ``path``.

    The record carries measured values, per-measurement availability flags
    and a proxy-metrics section whose claim boundary marks byte metrics as
    proxy-only. The write happens under ``csv_file_lock`` in append mode.
    """
    shifted_known = cost_shift_measured(result)
    bytes_observed = bool(result.bytes_before or result.bytes_after)
    hosted = result.self_hosted_metrics

    # ``None`` when the shifted total is not fully measured.
    shifted_total = (
        round(result.cost_usd + result.external_cost_usd, 6) if shifted_known else None
    )
    availability = {
        "primary_tokens": result.primary_tokens_measured,
        "primary_cost": result.cost_measured,
        "external_tokens": result.external_tokens_measured,
        "external_cost": result.external_cost_measured,
        "shifted_cost": shifted_known,
        "provider_cache": result.provider_cached_tokens_measured,
        "byte_metrics": bytes_observed,
        "wall_time": result.wall_time_seconds >= 0,
        "self_hosted_metrics": hosted is not None,
    }
    proxy = {
        "byte_metrics_observed": bytes_observed,
        "token_proxy": "chars_div_4",
        "bytes_per_token": TOKEN_PROXY_BYTES_PER_TOKEN,
        "claim_boundary": "proxy_only_not_hosted_token_savings",
    }
    record = {
        "schema_version": BENCH_RUN_EVIDENCE_SCHEMA_VERSION,
        "date": _dt.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
        "claude_version": claude_ver,
        "task_id": result.task_id,
        "variant": result.variant,
        "transform_id": result.variant,
        "success": result.success,
        "primary_cost_measured": result.cost_measured,
        "primary_cost_usd": round(result.cost_usd, 6),
        "primary_tokens_measured": result.primary_tokens_measured,
        "provider_cached_tokens": result.provider_cached_tokens,
        "provider_cached_tokens_measured": result.provider_cached_tokens_measured,
        "wall_time_seconds": round(result.wall_time_seconds, 6),
        "external_tokens_measured": result.external_tokens_measured,
        "external_cost_measured": result.external_cost_measured,
        "external_cost_usd": round(result.external_cost_usd, 6),
        "total_cost_with_shift_usd": shifted_total,
        "primary_tokens": sum(result.tokens.values()),
        "external_tokens": result.external_tokens,
        "artifacts_used": result.artifacts_used,
        "bytes_before": result.bytes_before,
        "bytes_after": result.bytes_after,
        "hook_triggers": result.hook_triggers,
        "turns": result.turns,
        "notes": sanitize_csv_note(result.notes),
        "measurement_availability": availability,
        "proxy_metrics": proxy,
    }
    if hosted is not None:
        record["self_hosted_metrics"] = hosted

    with csv_file_lock(path, create_parent=True):
        open_flags = os.O_CREAT | os.O_APPEND | os.O_WRONLY
        fd = _open_regular_no_symlink(path, open_flags, 0o600, create_parent=True)
        try:
            with os.fdopen(fd, "a", encoding="utf-8") as ledger:
                # The file object owns the descriptor from here on.
                fd = -1
                ledger.write(json.dumps(record, ensure_ascii=False, sort_keys=True) + "\n")
        finally:
            if fd != -1:
                os.close(fd)
1424
-
1425
-
1426
def _read_existing_keys_unlocked(csv_path: Path) -> set[tuple[str, str]]:
    """Return every ``(task_id, variant)`` pair recorded in the CSV.

    A missing file gives an empty set. The header is validated before rows are
    read, and rows with an empty task id or variant are ignored. This function
    takes no lock itself.
    """
    try:
        fd = _open_regular_no_symlink(csv_path)
    except FileNotFoundError:
        return set()
    try:
        with os.fdopen(fd, "r", encoding="utf-8", newline="") as handle:
            # The file object owns the descriptor from here on.
            fd = -1
            reader = csv.DictReader(handle)
            header = reader.fieldnames
            validate_csv_schema(csv_path, None if header is None else list(header))
            pairs = (
                (row.get("task_id") or "", row.get("variant") or "")
                for row in reader
            )
            return {(tid, var) for tid, var in pairs if tid and var}
    finally:
        if fd != -1:
            os.close(fd)
1447
-
1448
-
1449
def _csv_exists_no_follow(csv_path: Path) -> bool:
    """Probe the CSV itself without following symlinks or creating a sidecar lock."""
    try:
        fd = _open_regular_no_symlink(csv_path)
    except FileNotFoundError:
        return False
    # The descriptor was opened only as an existence probe.
    os.close(fd)
    return True
1458
-
1459
-
1460
def existing_keys(csv_path: Path) -> set[tuple[str, str]]:
    """Return the already-recorded (task_id, variant) pairs; used to decide skips on resume.

    A missing CSV gives an empty set without creating a lock; otherwise the
    keys are read under ``csv_file_lock``.
    """
    if _csv_exists_no_follow(csv_path):
        with csv_file_lock(csv_path, create_parent=False):
            return _read_existing_keys_unlocked(csv_path)
    return set()
1466
-
1467
-
1468
def read_csv_rows(csv_path: Path) -> list[dict[str, str]]:
    """Load all data rows of the benchmark CSV as dicts.

    A missing file gives an empty list. The header is validated first, and
    ``SystemExit`` is raised once the file holds more than ``MAX_CSV_ROWS``
    rows.
    """
    try:
        fd = _open_regular_no_symlink(csv_path)
    except FileNotFoundError:
        return []
    try:
        with os.fdopen(fd, "r", encoding="utf-8", newline="") as handle:
            # The file object owns the descriptor from here on.
            fd = -1
            reader = csv.DictReader(handle)
            header = reader.fieldnames
            validate_csv_schema(csv_path, None if header is None else list(header))
            collected: list[dict[str, str]] = []
            for row in reader:
                # Reaching this row means the file exceeds the cap.
                if len(collected) >= MAX_CSV_ROWS:
                    raise SystemExit(f"CSV row limit exceeded for {csv_path}: > {MAX_CSV_ROWS}")
                collected.append(row)
            return collected
    finally:
        if fd != -1:
            os.close(fd)
1488
-
1489
-
1490
def row_int(row: dict[str, str], key: str) -> int:
    """Read ``row[key]`` as an int, defaulting to 0.

    Plain integer text is parsed exactly. Anything else goes through ``float``
    and is truncated toward zero (``"12.9"`` -> 12, ``"1e3"`` -> 1000).
    Missing, empty, non-numeric and non-finite values give 0.
    """
    raw = row.get(key) or 0
    try:
        # Parse integer text directly: routing it through float first would
        # silently round values above 2**53.
        return int(raw)
    except (TypeError, ValueError, OverflowError):
        pass
    try:
        return int(float(raw))
    except (TypeError, ValueError, OverflowError):
        return 0
1495
-
1496
-
1497
- def row_optional_nonnegative_int(row: dict[str, str], key: str) -> int | None:
1498
- raw = row.get(key)
1499
- if raw is None:
1500
- return None
1501
- text = str(raw).strip()
1502
- if not re.fullmatch(r"[0-9]+", text):
1503
- return None
1504
- try:
1505
- return int(text)
1506
- except (TypeError, ValueError, OverflowError):
1507
- return None
1508
-
1509
-
1510
def row_float(row: dict[str, str], key: str) -> float:
    """Read ``row[key]`` as a finite float, defaulting to 0.0.

    Missing, empty, unparsable and non-finite (NaN/inf) values all give 0.0.
    """
    try:
        number = float(row.get(key) or 0)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    if math.isfinite(number):
        return number
    return 0.0
1516
-
1517
-
1518
- def row_optional_float(row: dict[str, str], key: str) -> float | None:
1519
- raw = row.get(key)
1520
- if raw is None or str(raw).strip() == "":
1521
- return None
1522
- try:
1523
- value = float(raw)
1524
- except (TypeError, ValueError, OverflowError):
1525
- return None
1526
- return value if math.isfinite(value) else None
1527
-
1528
-
1529
def row_has_finite_float(row: dict[str, str], key: str) -> bool:
    """Report whether `row_optional_float` finds a usable value under ``key``."""
    parsed = row_optional_float(row, key)
    return parsed is not None
1531
-
1532
-
1533
def row_bool(row: dict[str, str], key: str) -> bool:
    """Return ``True`` only when ``row[key]`` reads "true" after trimming, in any case."""
    raw = row.get(key) or ""
    flag = str(raw).strip().lower()
    return flag == "true"
1535
-
1536
-
1537
def row_success(row: dict[str, str]) -> bool:
    """Return ``True`` only when the row's ``success`` cell reads "true" after trimming, in any case."""
    raw = row.get("success") or ""
    outcome = str(raw).strip().lower()
    return outcome == "true"
1539
-
1540
-
1541
def row_cost_shift_measured(row: dict[str, str]) -> bool:
    """Report whether a CSV row's shifted total cost is fully measured.

    Applies the same rule as `cost_shift_measured` to CSV cells: measured
    primary cost, measured external tokens, and either zero external tokens or
    a measured external cost.
    """
    if not row_bool(row, "cost_measured"):
        return False
    if not row_bool(row, "external_tokens_measured"):
        return False
    return row_int(row, "external_tokens") == 0 or row_bool(row, "external_cost_measured")
1547
-
1548
-
1549
def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str) -> dict[str, Any]:
    """Aggregate benchmark rows per variant and compare each variant to the baseline.

    Args:
        rows: Benchmark rows as string-keyed mappings. The columns read here
            include ``variant``, ``task_id``, ``success``, token/cost/wall-time
            values and their ``*_measured`` flags, ``bytes_before``,
            ``bytes_after`` and ``corrections``.
        baseline_variant: Name of the variant every other variant is compared
            against.

    Returns:
        A report dict with the keys ``schema``, ``baseline_variant``,
        ``row_count``, ``summary_by_variant``, ``comparisons``,
        ``matched_pair_evidence``, ``claim_status`` and ``caveat``.
    """
    by_variant: dict[str, dict[str, Any]] = {}
    successful_rows_by_variant_task: dict[str, dict[str, list[dict[str, str]]]] = {}
    seen_tasks_by_variant: dict[str, set[str]] = {}
    successful_tasks_by_variant: dict[str, set[str]] = {}

    # Pass 1: accumulate raw counters per variant.
    for row_index, raw_row in enumerate(rows, start=1):
        # Work on a copy so the caller's row is not mutated; the 1-based index
        # is stored so matched-pair evidence can cite the contributing rows.
        row = dict(raw_row)
        row["_row_index"] = str(row_index)
        variant = row.get("variant") or "unknown"
        task_id = row.get("task_id") or "unknown"
        seen_tasks_by_variant.setdefault(variant, set()).add(task_id)
        bucket = by_variant.setdefault(
            variant,
            {
                "runs": 0,
                "successful_runs": 0,
                "failed_runs": 0,
                "total_tokens_all_runs": 0,
                "primary_tokens_measured_runs": 0,
                "primary_cost_all_runs_usd": 0.0,
                "primary_cost_measured_runs": 0,
                "wall_time_seconds_all_runs": 0.0,
                "wall_time_seconds_measured_runs": 0,
                "provider_cached_tokens_all_runs": 0,
                "provider_cached_tokens_measured_runs": 0,
                "total_cost_with_shift_all_runs_usd": 0.0,
                "total_cost_with_shift_measured_runs": 0,
                "total_tokens_successful": 0,
                "primary_tokens_measured_successful": 0,
                "primary_cost_successful_usd": 0.0,
                "primary_cost_measured_successful": 0,
                "wall_time_seconds_successful": 0.0,
                "wall_time_seconds_measured_successful": 0,
                "provider_cached_tokens_successful": 0,
                "provider_cached_tokens_measured_successful": 0,
                "external_cost_successful_usd": 0.0,
                "external_cost_unknown_successful": 0,
                "total_cost_with_shift_successful_usd": 0.0,
                "total_cost_with_shift_measured_successful": 0,
                "external_tokens_successful": 0,
                "external_tokens_measured_successful": 0,
                "artifacts_used_successful": 0,
                "corrections_successful": 0,
                "bytes_before_successful": 0,
                "bytes_after_successful": 0,
                "turns_successful": 0,
                "hook_triggers_successful": 0,
            },
        )
        # "All runs" counters are updated before the success check, so failed
        # runs contribute to them as well.
        bucket["runs"] += 1
        bucket["total_tokens_all_runs"] += row_int(row, "total_tokens")
        if row_bool(row, "primary_tokens_measured"):
            bucket["primary_tokens_measured_runs"] += 1
        bucket["wall_time_seconds_all_runs"] += row_float(row, "wall_time_seconds")
        if row_has_finite_float(row, "wall_time_seconds"):
            bucket["wall_time_seconds_measured_runs"] += 1
        bucket["provider_cached_tokens_all_runs"] += row_int(row, "provider_cached_tokens")
        if row_bool(row, "provider_cached_tokens_measured"):
            bucket["provider_cached_tokens_measured_runs"] += 1
        if row_bool(row, "cost_measured"):
            bucket["primary_cost_all_runs_usd"] += row_float(row, "cost_usd")
            bucket["primary_cost_measured_runs"] += 1
        shifted_cost = row_optional_float(row, "total_cost_with_shift_usd")
        if row_cost_shift_measured(row) and shifted_cost is not None:
            bucket["total_cost_with_shift_all_runs_usd"] += shifted_cost
            bucket["total_cost_with_shift_measured_runs"] += 1
        if not row_success(row):
            bucket["failed_runs"] += 1
            continue
        # Everything below only counts successful runs.
        bucket["successful_runs"] += 1
        successful_tasks_by_variant.setdefault(variant, set()).add(task_id)
        successful_rows_by_variant_task.setdefault(variant, {}).setdefault(task_id, []).append(row)
        bucket["total_tokens_successful"] += row_int(row, "total_tokens")
        if row_bool(row, "primary_tokens_measured"):
            bucket["primary_tokens_measured_successful"] += 1
        bucket["wall_time_seconds_successful"] += row_float(row, "wall_time_seconds")
        if row_has_finite_float(row, "wall_time_seconds"):
            bucket["wall_time_seconds_measured_successful"] += 1
        bucket["provider_cached_tokens_successful"] += row_int(row, "provider_cached_tokens")
        if row_bool(row, "provider_cached_tokens_measured"):
            bucket["provider_cached_tokens_measured_successful"] += 1
        if row_bool(row, "cost_measured"):
            bucket["primary_cost_successful_usd"] += row_float(row, "cost_usd")
            bucket["primary_cost_measured_successful"] += 1
        # External cost is treated as known when external tokens were measured
        # and either none were used or their cost was measured too.
        if row_bool(row, "external_tokens_measured") and (
            row_int(row, "external_tokens") == 0 or row_bool(row, "external_cost_measured")
        ):
            bucket["external_cost_successful_usd"] += row_float(row, "external_cost_usd")
        else:
            bucket["external_cost_unknown_successful"] += 1
        if row_cost_shift_measured(row) and shifted_cost is not None:
            bucket["total_cost_with_shift_successful_usd"] += shifted_cost
            bucket["total_cost_with_shift_measured_successful"] += 1
        if row_bool(row, "external_tokens_measured"):
            bucket["external_tokens_successful"] += row_int(row, "external_tokens")
            bucket["external_tokens_measured_successful"] += 1
        bucket["artifacts_used_successful"] += row_int(row, "artifacts_used")
        bucket["corrections_successful"] += row_int(row, "corrections")
        bucket["bytes_before_successful"] += row_int(row, "bytes_before")
        bucket["bytes_after_successful"] += row_int(row, "bytes_after")
        bucket["turns_successful"] += row_int(row, "turns")
        bucket["hook_triggers_successful"] += row_int(row, "hook_triggers")

    # Pass 2: derive per-variant rates and averages from the raw counters.
    for variant, bucket in by_variant.items():
        successes = bucket["successful_runs"]
        runs = bucket["runs"]
        bucket["failure_rate"] = (bucket["failed_runs"] / runs) if runs else None
        bucket["task_count"] = len(seen_tasks_by_variant.get(variant, set()))
        bucket["successful_task_count"] = len(successful_tasks_by_variant.get(variant, set()))
        if bucket["task_count"]:
            # Token and cost averages are only reported when every run was
            # measured; otherwise they are None. Wall time and provider-cache
            # averages are reported unconditionally.
            bucket["tokens_per_task_including_failures"] = (
                bucket["total_tokens_all_runs"] / bucket["task_count"]
                if bucket["primary_tokens_measured_runs"] == runs
                else None
            )
            bucket["wall_time_seconds_per_task_including_failures"] = (
                bucket["wall_time_seconds_all_runs"] / bucket["task_count"]
            )
            bucket["provider_cached_tokens_per_task_including_failures"] = (
                bucket["provider_cached_tokens_all_runs"] / bucket["task_count"]
            )
            if bucket["primary_cost_measured_runs"] == runs:
                bucket["primary_cost_per_task_including_failures_usd"] = (
                    bucket["primary_cost_all_runs_usd"] / bucket["task_count"]
                )
            else:
                bucket["primary_cost_per_task_including_failures_usd"] = None
            if bucket["total_cost_with_shift_measured_runs"] == runs:
                bucket["total_cost_with_shift_per_task_including_failures_usd"] = (
                    bucket["total_cost_with_shift_all_runs_usd"] / bucket["task_count"]
                )
            else:
                bucket["total_cost_with_shift_per_task_including_failures_usd"] = None
        else:
            bucket["tokens_per_task_including_failures"] = None
            bucket["wall_time_seconds_per_task_including_failures"] = None
            bucket["provider_cached_tokens_per_task_including_failures"] = None
            bucket["primary_cost_per_task_including_failures_usd"] = None
            bucket["total_cost_with_shift_per_task_including_failures_usd"] = None
        if successes:
            # Note: these "per successful task" values divide by the number of
            # successful runs, not by the number of distinct successful tasks.
            bucket["tokens_per_successful_task"] = (
                bucket["total_tokens_successful"] / successes
                if bucket["primary_tokens_measured_successful"] == successes
                else None
            )
            bucket["wall_time_seconds_per_successful_task"] = bucket["wall_time_seconds_successful"] / successes
            bucket["provider_cached_tokens_per_successful_task"] = (
                bucket["provider_cached_tokens_successful"] / successes
            )
            if bucket["primary_cost_measured_successful"] == successes:
                bucket["primary_cost_per_successful_task_usd"] = (
                    bucket["primary_cost_successful_usd"] / successes
                )
            else:
                bucket["primary_cost_per_successful_task_usd"] = None
            if bucket["total_cost_with_shift_measured_successful"] == successes:
                bucket["total_cost_with_shift_per_successful_task_usd"] = (
                    bucket["total_cost_with_shift_successful_usd"] / successes
                )
            else:
                bucket["total_cost_with_shift_per_successful_task_usd"] = None
            bucket["external_tokens_per_successful_task"] = (
                bucket["external_tokens_successful"] / successes
                if bucket["external_tokens_measured_successful"] == successes
                else None
            )
            bucket["artifacts_used_per_successful_task"] = bucket["artifacts_used_successful"] / successes
            bucket["corrections_per_successful_task"] = bucket["corrections_successful"] / successes
            before = bucket["bytes_before_successful"]
            after = bucket["bytes_after_successful"]
            # Despite the name this is after/before, i.e. the remaining
            # fraction of bytes (smaller means more reduction).
            bucket["byte_reduction_ratio"] = (after / before) if before else None
        else:
            bucket["tokens_per_successful_task"] = None
            bucket["wall_time_seconds_per_successful_task"] = None
            bucket["provider_cached_tokens_per_successful_task"] = None
            bucket["primary_cost_per_successful_task_usd"] = None
            bucket["total_cost_with_shift_per_successful_task_usd"] = None
            bucket["external_tokens_per_successful_task"] = None
            bucket["artifacts_used_per_successful_task"] = None
            bucket["corrections_per_successful_task"] = None
            bucket["byte_reduction_ratio"] = None

        # Each variant represents one compression strategy. Byte savings, the
        # token proxy and the telemetry evidence grades are exposed
        # conservatively (additively). The token proxy is an estimate derived
        # from the byte delta, not measured model tokens, so its evidence grade
        # is left as "inferred".
        bucket["compression_strategy"] = variant
        bucket["is_baseline_strategy"] = variant == baseline_variant
        bytes_before = bucket["bytes_before_successful"]
        bytes_after = bucket["bytes_after_successful"]
        byte_metrics_present = bool(bytes_before or bytes_after)
        if successes and byte_metrics_present:
            # bytes_saved is clamped at zero, while byte_savings_pct below is
            # not clamped and can go negative when output grew.
            bytes_saved = max(0, bytes_before - bytes_after)
            # TOKEN_PROXY_BYTES_PER_TOKEN is defined outside this chunk; the
            # "chars_div_4_proxy_only" label used below suggests it is 4 —
            # TODO confirm.
            token_proxy_saved = bytes_saved // TOKEN_PROXY_BYTES_PER_TOKEN
            bucket["bytes_saved_successful"] = bytes_saved
            bucket["bytes_saved_per_successful_task"] = bytes_saved / successes
            bucket["byte_savings_pct"] = ((bytes_before - bytes_after) / bytes_before * 100.0) if bytes_before else None
            bucket["token_proxy_saved_successful"] = token_proxy_saved
            bucket["token_proxy_saved_per_successful_task"] = token_proxy_saved / successes
        else:
            bucket["bytes_saved_successful"] = None
            bucket["bytes_saved_per_successful_task"] = None
            bucket["byte_savings_pct"] = None
            bucket["token_proxy_saved_successful"] = None
            bucket["token_proxy_saved_per_successful_task"] = None
        # Evidence grade per metric: "observed" when every relevant run was
        # measured, "partial" when only some were, otherwise "unavailable".
        bucket["observed_telemetry"] = {
            "tokens": (
                "observed" if runs and bucket["primary_tokens_measured_runs"] == runs
                else ("partial" if bucket["primary_tokens_measured_runs"] else "unavailable")
            ),
            "primary_cost": (
                "observed" if runs and bucket["primary_cost_measured_runs"] == runs
                else ("partial" if bucket["primary_cost_measured_runs"] else "unavailable")
            ),
            "external_tokens": (
                "observed" if successes and bucket["external_tokens_measured_successful"] == successes
                else ("partial" if bucket["external_tokens_measured_successful"] else "unavailable")
            ),
            "byte_savings": "observed" if byte_metrics_present else "unavailable",
            "token_proxy": "inferred" if (successes and byte_metrics_present) else "unavailable",
            "wall_time": (
                "observed" if runs and bucket["wall_time_seconds_measured_runs"] == runs
                else ("partial" if bucket["wall_time_seconds_measured_runs"] else "unavailable")
            ),
            "provider_cache": (
                "observed" if runs and bucket["provider_cached_tokens_measured_runs"] == runs
                else ("partial" if bucket["provider_cached_tokens_measured_runs"] else "unavailable")
            ),
        }

    # Mean of a float column over one task's successful rows; rows without a
    # usable value are skipped, and None is returned when none are usable.
    def average_task_metric(variant: str, task_id: str, key: str) -> float | None:
        values = [
            row_optional_float(row, key)
            for row in successful_rows_by_variant_task.get(variant, {}).get(task_id, [])
        ]
        known = [value for value in values if value is not None]
        return (sum(known) / len(known)) if known else None

    # Stricter integer variant: a single missing or invalid value makes the
    # whole task average None instead of being skipped.
    def average_task_int_metric(variant: str, task_id: str, key: str) -> float | None:
        rows_for_task = successful_rows_by_variant_task.get(variant, {}).get(task_id, [])
        if not rows_for_task:
            return None
        values = [row_optional_nonnegative_int(row, key) for row in rows_for_task]
        if any(value is None for value in values):
            return None
        return sum(value for value in values if value is not None) / len(values)

    # Average the per-task means over the tasks where both the baseline and the
    # variant have a value. Returns (baseline mean, variant mean, paired task
    # count), or (None, None, 0) when no task could be paired.
    def average_paired_metric(
        variant: str,
        task_ids: set[str],
        key: str,
    ) -> tuple[float | None, float | None, int]:
        baseline_values: list[float] = []
        variant_values: list[float] = []
        for task_id in sorted(task_ids):
            baseline_value = average_task_metric(baseline_variant, task_id, key)
            variant_value = average_task_metric(variant, task_id, key)
            if baseline_value is None or variant_value is None:
                continue
            baseline_values.append(baseline_value)
            variant_values.append(variant_value)
        if not baseline_values:
            return None, None, 0
        return (
            sum(baseline_values) / len(baseline_values),
            sum(variant_values) / len(variant_values),
            len(baseline_values),
        )

    # Same pairing as average_paired_metric, built on the strict integer
    # per-task average.
    def average_paired_int_metric(
        variant: str,
        task_ids: set[str],
        key: str,
    ) -> tuple[float | None, float | None, int]:
        baseline_values: list[float] = []
        variant_values: list[float] = []
        for task_id in sorted(task_ids):
            baseline_value = average_task_int_metric(baseline_variant, task_id, key)
            variant_value = average_task_int_metric(variant, task_id, key)
            if baseline_value is None or variant_value is None:
                continue
            baseline_values.append(baseline_value)
            variant_values.append(variant_value)
        if not baseline_values:
            return None, None, 0
        return (
            sum(baseline_values) / len(baseline_values),
            sum(variant_values) / len(variant_values),
            len(baseline_values),
        )

    # Recover the 1-based input positions stored under "_row_index" in pass 1.
    def row_indices_for(rows_for_task: list[dict[str, str]]) -> list[int]:
        out: list[int] = []
        for row in rows_for_task:
            index = row_optional_nonnegative_int(row, "_row_index")
            if index is not None:
                out.append(index)
        return out

    # True only for a non-empty row list in which every row has the flag set.
    def all_rows_bool(rows_for_task: list[dict[str, str]], key: str) -> bool:
        return bool(rows_for_task) and all(row_bool(row, key) for row in rows_for_task)

    # All-or-nothing column readers: None when the list is empty or any row
    # lacks a valid value.
    def all_rows_optional_int(rows_for_task: list[dict[str, str]], key: str) -> list[int] | None:
        values = [row_optional_nonnegative_int(row, key) for row in rows_for_task]
        if not values or any(value is None for value in values):
            return None
        return [value for value in values if value is not None]

    def all_rows_optional_float(rows_for_task: list[dict[str, str]], key: str) -> list[float] | None:
        values = [row_optional_float(row, key) for row in rows_for_task]
        if not values or any(value is None for value in values):
            return None
        return [value for value in values if value is not None]

    def average_optional_int(rows_for_task: list[dict[str, str]], key: str) -> float | None:
        values = all_rows_optional_int(rows_for_task, key)
        return (sum(values) / len(values)) if values else None

    def average_optional_float(rows_for_task: list[dict[str, str]], key: str) -> float | None:
        values = all_rows_optional_float(rows_for_task, key)
        return (sum(values) / len(values)) if values else None

    def total_optional_int(rows_for_task: list[dict[str, str]], key: str) -> int | None:
        values = all_rows_optional_int(rows_for_task, key)
        return sum(values) if values is not None else None

    # Every row must pass row_cost_shift_measured and carry a finite
    # total_cost_with_shift_usd value.
    def all_rows_shifted_cost_measured(rows_for_task: list[dict[str, str]]) -> bool:
        return bool(rows_for_task) and all(
            row_cost_shift_measured(row) and row_optional_float(row, "total_cost_with_shift_usd") is not None
            for row in rows_for_task
        )

    # Build the evidence record for one side (baseline or variant) of a matched
    # task. Each metric carries a "measured" flag, and its value is None when
    # not every row was measured.
    def matched_side_evidence(variant: str, task_id: str, rows_for_task: list[dict[str, str]]) -> dict[str, Any]:
        primary_tokens_measured = all_rows_bool(rows_for_task, "primary_tokens_measured")
        primary_cost_measured = all_rows_bool(rows_for_task, "cost_measured")
        shifted_cost_measured = all_rows_shifted_cost_measured(rows_for_task)
        provider_cache_measured = all_rows_bool(rows_for_task, "provider_cached_tokens_measured")
        external_tokens_measured = all_rows_bool(rows_for_task, "external_tokens_measured")
        external_cost_measured = all_rows_bool(rows_for_task, "external_cost_measured")
        corrections_values = all_rows_optional_int(rows_for_task, "corrections")
        bytes_before_values = [row_optional_nonnegative_int(row, "bytes_before") for row in rows_for_task]
        bytes_after_values = [row_optional_nonnegative_int(row, "bytes_after") for row in rows_for_task]
        # Byte metrics count as observed only when every row has both values.
        byte_metrics_observed = bool(rows_for_task) and not any(
            value is None for value in [*bytes_before_values, *bytes_after_values]
        )
        bytes_before_total = sum(value for value in bytes_before_values if value is not None)
        bytes_after_total = sum(value for value in bytes_after_values if value is not None)
        # The delta is after - before, so a negative value means fewer bytes.
        byte_delta = bytes_after_total - bytes_before_total if byte_metrics_observed else None
        # int() truncates toward zero, unlike the floor division used for the
        # per-variant token proxy above.
        token_proxy_delta = (
            int(byte_delta / TOKEN_PROXY_BYTES_PER_TOKEN) if byte_delta is not None else None
        )
        return {
            "variant": variant,
            "task_id": task_id,
            "run_count": len(rows_for_task),
            "row_indices": row_indices_for(rows_for_task),
            "primary_tokens": {
                "measured": primary_tokens_measured,
                "average": average_optional_int(rows_for_task, "total_tokens") if primary_tokens_measured else None,
                "total": total_optional_int(rows_for_task, "total_tokens") if primary_tokens_measured else None,
            },
            "primary_cost_usd": {
                "measured": primary_cost_measured,
                "average": average_optional_float(rows_for_task, "cost_usd") if primary_cost_measured else None,
            },
            "total_cost_with_shift_usd": {
                "measured": shifted_cost_measured,
                "average": (
                    average_optional_float(rows_for_task, "total_cost_with_shift_usd")
                    if shifted_cost_measured else None
                ),
            },
            "external_tokens": {
                "measured": external_tokens_measured,
                "total": total_optional_int(rows_for_task, "external_tokens") if external_tokens_measured else None,
            },
            "external_cost_usd": {
                "measured": external_cost_measured,
                "total": (
                    sum(row_float(row, "external_cost_usd") for row in rows_for_task)
                    if external_cost_measured else None
                ),
            },
            "bytes": {
                "measurement": "observed" if byte_metrics_observed else "unavailable",
                "before_total": bytes_before_total if byte_metrics_observed else None,
                "after_total": bytes_after_total if byte_metrics_observed else None,
                "delta_total": byte_delta,
                "token_proxy_delta": token_proxy_delta,
                "token_proxy": "chars_div_4_proxy_only" if byte_metrics_observed else "unavailable",
            },
            "wall_time_seconds": {
                "measured": all_rows_optional_float(rows_for_task, "wall_time_seconds") is not None,
                "average": average_optional_float(rows_for_task, "wall_time_seconds"),
            },
            "provider_cached_tokens": {
                "measured": provider_cache_measured,
                "average": (
                    average_optional_int(rows_for_task, "provider_cached_tokens")
                    if provider_cache_measured else None
                ),
            },
            "corrections": {
                "measured": corrections_values is not None,
                "average": (sum(corrections_values) / len(corrections_values)) if corrections_values else None,
            },
        }

    # Build one evidence entry for a task that succeeded under both the
    # baseline and the variant. Callers must pass only such task ids, because
    # the lookups below index the success map directly.
    def matched_pair_evidence_entry(
        variant: str,
        task_id: str,
        quality_gate: str,
    ) -> dict[str, Any]:
        baseline_rows = successful_rows_by_variant_task[baseline_variant][task_id]
        variant_rows = successful_rows_by_variant_task[variant][task_id]
        baseline_evidence = matched_side_evidence(baseline_variant, task_id, baseline_rows)
        variant_evidence = matched_side_evidence(variant, task_id, variant_rows)
        baseline_token_avg = baseline_evidence["primary_tokens"]["average"]
        variant_token_avg = variant_evidence["primary_tokens"]["average"]
        # A savings claim needs a passing quality gate, both sides fully
        # measured, and a positive baseline to divide by.
        token_claim_allowed = (
            quality_gate == "pass"
            and bool(baseline_evidence["primary_tokens"]["measured"])
            and bool(variant_evidence["primary_tokens"]["measured"])
            and isinstance(baseline_token_avg, (int, float))
            and baseline_token_avg > 0
            and isinstance(variant_token_avg, (int, float))
        )
        baseline_cost_avg = baseline_evidence["total_cost_with_shift_usd"]["average"]
        variant_cost_avg = variant_evidence["total_cost_with_shift_usd"]["average"]
        shifted_cost_claim_allowed = (
            quality_gate == "pass"
            and bool(baseline_evidence["total_cost_with_shift_usd"]["measured"])
            and bool(variant_evidence["total_cost_with_shift_usd"]["measured"])
            and isinstance(baseline_cost_avg, (int, float))
            and baseline_cost_avg > 0
            and isinstance(variant_cost_avg, (int, float))
        )
        token_delta = (
            variant_token_avg - baseline_token_avg
            if token_claim_allowed
            else None
        )
        token_savings_pct = (
            (baseline_token_avg - variant_token_avg) / baseline_token_avg * 100.0
            if token_delta is not None
            else None
        )
        cost_delta = (
            variant_cost_avg - baseline_cost_avg
            if shifted_cost_claim_allowed
            else None
        )
        cost_savings_pct = (
            (baseline_cost_avg - variant_cost_avg) / baseline_cost_avg * 100.0
            if cost_delta is not None
            else None
        )
        base_after = baseline_evidence["bytes"]["after_total"]
        variant_after = variant_evidence["bytes"]["after_total"]
        byte_after_delta = (
            variant_after - base_after
            if isinstance(base_after, int) and isinstance(variant_after, int)
            else None
        )
        return {
            "schema_version": MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION,
            "task_id": task_id,
            "baseline_variant": baseline_variant,
            "variant": variant,
            "transform_id": variant,
            "quality_gate": quality_gate,
            "evidence_kind": "matched_successful_task_bucket",
            "measurements": {
                "baseline": baseline_evidence,
                "variant": variant_evidence,
            },
            "delta": {
                "primary_tokens_average": token_delta,
                "token_savings_pct": token_savings_pct,
                "total_cost_with_shift_usd_average": cost_delta,
                "cost_savings_pct_with_shift": cost_savings_pct,
                "bytes_after_total": byte_after_delta,
                "token_proxy_after_total": (
                    int(byte_after_delta / TOKEN_PROXY_BYTES_PER_TOKEN)
                    if byte_after_delta is not None else None
                ),
                "proxy_measurement": "chars_div_4_proxy_only",
            },
            "claim_boundary": {
                "quality_gate": quality_gate,
                "token_savings_claim_allowed": token_claim_allowed,
                "shifted_cost_claim_allowed": shifted_cost_claim_allowed,
                "byte_proxy_only": True,
                "requires_matched_successful_tasks": True,
                "raw_estimate_only_claim_allowed": False,
            },
        }

    # Pass 3: compare every non-baseline variant with the baseline, in sorted
    # variant order.
    comparisons: list[dict[str, Any]] = []
    matched_pair_evidence: list[dict[str, Any]] = []
    baseline = by_variant.get(baseline_variant)
    baseline_successful_tasks = successful_tasks_by_variant.get(baseline_variant, set())
    baseline_failure_rate = baseline.get("failure_rate") if baseline else None
    for variant, bucket in sorted(by_variant.items()):
        if variant == baseline_variant:
            continue
        variant_successful_tasks = successful_tasks_by_variant.get(variant, set())
        # Only tasks that succeeded under both variants are compared.
        matched_tasks = baseline_successful_tasks & variant_successful_tasks
        token_matched_tasks = {
            task_id for task_id in matched_tasks
            if all(
                row_bool(row, "primary_tokens_measured")
                for row in successful_rows_by_variant_task[baseline_variant][task_id]
            )
            and all(
                row_bool(row, "primary_tokens_measured")
                for row in successful_rows_by_variant_task[variant][task_id]
            )
        }
        base_tokens, variant_tokens, token_task_count = average_paired_metric(
            variant,
            token_matched_tasks,
            "total_tokens",
        )
        base_wall_time, variant_wall_time, wall_time_task_count = average_paired_metric(
            variant,
            matched_tasks,
            "wall_time_seconds",
        )
        base_corrections, variant_corrections, corrections_task_count = average_paired_int_metric(
            variant,
            matched_tasks,
            "corrections",
        )
        base_cost, variant_cost, cost_task_count = average_paired_metric(
            variant,
            {
                task_id for task_id in matched_tasks
                if all(
                    row_cost_shift_measured(row)
                    for row in successful_rows_by_variant_task[baseline_variant][task_id]
                )
                and all(
                    row_cost_shift_measured(row)
                    for row in successful_rows_by_variant_task[variant][task_id]
                )
            },
            "total_cost_with_shift_usd",
        )
        failure_rate = bucket.get("failure_rate")
        failure_delta = None
        if isinstance(baseline_failure_rate, (int, float)) and isinstance(failure_rate, (int, float)):
            # Failure rates are fractions; the delta is in percentage points.
            failure_delta = (failure_rate - baseline_failure_rate) * 100.0
        missing_baseline_success_tasks = sorted(baseline_successful_tasks - variant_successful_tasks)
        # The first matching condition determines the gate; "pass" remains only
        # when none of them apply.
        quality_gate = "pass"
        if not baseline or not baseline.get("successful_runs"):
            quality_gate = "insufficient_baseline"
        elif not bucket.get("successful_runs"):
            quality_gate = "insufficient_success"
        elif missing_baseline_success_tasks:
            quality_gate = "matched_task_regression"
        elif failure_delta is not None and failure_delta >= 10.0:
            quality_gate = "failure_rate_regression"
        elif matched_tasks and corrections_task_count < len(matched_tasks):
            quality_gate = "insufficient_corrections_data"
        elif (
            isinstance(base_corrections, (int, float))
            and isinstance(variant_corrections, (int, float))
            and variant_corrections > base_corrections
        ):
            quality_gate = "corrections_regression"
        comparison: dict[str, Any] = {
            "variant": variant,
            "baseline_variant": baseline_variant,
            "quality_gate": quality_gate,
            "baseline_failure_rate": baseline_failure_rate,
            "variant_failure_rate": failure_rate,
            "failure_rate_delta_pp": failure_delta,
            "matched_successful_task_count": len(matched_tasks),
            "baseline_successful_task_count": len(baseline_successful_tasks),
            "missing_baseline_success_tasks": missing_baseline_success_tasks,
            "baseline_corrections_per_successful_task": base_corrections,
            "variant_corrections_per_successful_task": variant_corrections,
            "paired_corrections_task_count": corrections_task_count,
        }
        if isinstance(base_corrections, (int, float)) and isinstance(variant_corrections, (int, float)):
            comparison["corrections_delta_per_successful_task"] = variant_corrections - base_corrections
        if isinstance(base_tokens, (int, float)) and isinstance(variant_tokens, (int, float)) and base_tokens:
            comparison["token_delta_per_successful_task"] = variant_tokens - base_tokens
            comparison["token_savings_pct"] = (base_tokens - variant_tokens) / base_tokens * 100.0
            comparison["paired_token_task_count"] = token_task_count
        else:
            # Unlike the wall-time and cost branches, the token fallback
            # reports a paired count of 0 and leaves the delta key unset.
            comparison["token_savings_pct"] = None
            comparison["paired_token_task_count"] = 0
        if (
            isinstance(base_wall_time, (int, float))
            and isinstance(variant_wall_time, (int, float))
            and base_wall_time
        ):
            comparison["wall_time_delta_seconds_per_successful_task"] = variant_wall_time - base_wall_time
            comparison["wall_time_change_pct"] = (variant_wall_time - base_wall_time) / base_wall_time * 100.0
            comparison["paired_wall_time_task_count"] = wall_time_task_count
        else:
            comparison["wall_time_delta_seconds_per_successful_task"] = None
            comparison["wall_time_change_pct"] = None
            comparison["paired_wall_time_task_count"] = wall_time_task_count
        if isinstance(base_cost, (int, float)) and isinstance(variant_cost, (int, float)) and base_cost:
            comparison["total_cost_with_shift_delta_usd"] = variant_cost - base_cost
            comparison["cost_savings_pct_with_shift"] = (base_cost - variant_cost) / base_cost * 100.0
            comparison["paired_cost_task_count"] = cost_task_count
        else:
            comparison["cost_savings_pct_with_shift"] = None
            comparison["paired_cost_task_count"] = cost_task_count
        for task_id in sorted(matched_tasks):
            matched_pair_evidence.append(matched_pair_evidence_entry(variant, task_id, quality_gate))
        comparisons.append(comparison)

    # Overall claim status, starting from the most pessimistic value.
    # NOTE(review): the nesting of `if comparisons:` under the baseline check
    # is reconstructed; the original indentation is not recoverable from the
    # reviewed text — confirm against the upstream file.
    claim_status = "insufficient_baseline"
    if baseline and baseline.get("successful_runs"):
        claim_status = "compare_variants" if comparisons else "baseline_only"
        if comparisons:
            quality_ok = all(item.get("quality_gate") == "pass" for item in comparisons)
            paired_token_data = all((item.get("paired_token_task_count") or 0) > 0 for item in comparisons)
            token_savings_observed = all((item.get("token_savings_pct") or 0) > 0 for item in comparisons)
            shifted_cost_savings = [
                item.get("cost_savings_pct_with_shift")
                for item in comparisons
                if isinstance(item.get("cost_savings_pct_with_shift"), (int, float))
            ]
            all_shifted_cost_measured = len(shifted_cost_savings) == len(comparisons)
            shifted_cost_ok = all_shifted_cost_measured and all(value > 0 for value in shifted_cost_savings)
            if not quality_ok:
                claim_status = "quality_gate_watch"
            elif not paired_token_data:
                claim_status = "insufficient_paired_data"
            elif token_savings_observed and shifted_cost_ok:
                claim_status = "token_and_shifted_cost_savings_observed"
            elif token_savings_observed and not all_shifted_cost_measured:
                claim_status = "token_savings_observed_cost_unmeasured"
            elif token_savings_observed:
                claim_status = "token_savings_observed_cost_shift_watch"
    return {
        "schema": "context-guard-bench-report-v1",
        "baseline_variant": baseline_variant,
        "row_count": len(rows),
        "summary_by_variant": by_variant,
        "comparisons": comparisons,
        "matched_pair_evidence": matched_pair_evidence,
        "claim_status": claim_status,
        "caveat": (
            "Proxy byte reductions are reported separately from matched-task token/cost metrics; "
            "shifted cost savings require measured primary cost and measured external cost when "
            "external tokens are present. Wall time and provider cached-token fields are diagnostic "
            "telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache "
            "discounts must stay separate from token-reduction claims."
        ),
    }
2206
-
2207
def write_report_json(csv_path: Path, report_path: Path, baseline_variant: str) -> dict[str, Any]:
    """Summarize the benchmark CSV and write the summary as a JSON report.

    The report is built from the rows read from *csv_path*, written to
    *report_path* as indented, key-sorted JSON with a trailing newline, and
    also returned to the caller.
    """
    # Lock ordering is part of the contract: always take the source CSV lock
    # first and the derived report lock second. A report -> CSV ordering
    # anywhere else could deadlock concurrent report generation.
    with csv_file_lock(csv_path, create_parent=True):
        source_rows = read_csv_rows(csv_path)
        summary = summarize_benchmark_rows(source_rows, baseline_variant)
        with csv_file_lock(report_path, create_parent=True):
            serialized = json.dumps(summary, ensure_ascii=False, indent=2, sort_keys=True)
            write_text_no_follow(report_path, serialized + "\n")
    return summary
2219
-
2220
-
2221
- def sanitize_note_text(value: Any) -> str:
2222
- """Normalize untrusted benchmark note text without output-length policy."""
2223
- text = "" if value is None else str(value)
2224
- text = "".join(" " if unicodedata.category(ch)[0] == "C" else ch for ch in text)
2225
- text = " ".join(text.split())
2226
- for pattern, replacement in SECRET_NOTE_PATTERNS:
2227
- text = pattern.sub(replacement, text)
2228
- return text
2229
-
2230
-
2231
- def sanitize_csv_note(value: Any) -> str:
2232
- """Normalize untrusted notes before writing them to benchmark CSV output."""
2233
- text = sanitize_note_text(value)
2234
- if text.startswith(CSV_FORMULA_PREFIXES):
2235
- text = "'" + text
2236
- if len(text) > MAX_CSV_NOTE_CHARS:
2237
- text = text[:MAX_CSV_NOTE_CHARS - 12].rstrip() + "…[truncated]"
2238
- return text
2239
-
2240
-
2241
- def sanitize_csv_cell(value: Any) -> str:
2242
- """Normalize short untrusted CSV labels and block spreadsheet formulas."""
2243
- text = sanitize_note_text(value)
2244
- if text.startswith(CSV_FORMULA_PREFIXES):
2245
- text = "'" + text
2246
- return text
2247
-
2248
-
2249
- def filter_targets(tasks: list[TaskFixture], variants: list[Variant],
2250
- only_task: str | None, only_variant: str | None) -> list[tuple[TaskFixture, Variant]]:
2251
- targets: list[tuple[TaskFixture, Variant]] = []
2252
- for task in tasks:
2253
- if only_task and task.id != only_task:
2254
- continue
2255
- for variant in variants:
2256
- if only_variant and variant.name != only_variant:
2257
- continue
2258
- targets.append((task, variant))
2259
- return targets
2260
-
2261
-
2262
- def normalized_output_path(path: Path) -> Path:
2263
- expanded = path.expanduser()
2264
- if not expanded.is_absolute():
2265
- expanded = Path.cwd() / expanded
2266
- return Path(os.path.normpath(str(_normalize_allowed_first_absolute_symlink(expanded))))
2267
-
2268
-
2269
- def existing_file_identity(path: Path) -> tuple[int, int] | None:
2270
- try:
2271
- fd = _open_regular_no_symlink(normalized_output_path(path))
2272
- except FileNotFoundError:
2273
- return None
2274
- try:
2275
- st = os.fstat(fd)
2276
- return (int(st.st_dev), int(st.st_ino))
2277
- finally:
2278
- os.close(fd)
2279
-
2280
-
2281
- def validate_distinct_output_paths(csv_path: Path, ledger_path: Path | None, report_path: Path | None) -> None:
2282
- outputs = [("csv", csv_path), ("ledger-jsonl", ledger_path), ("report-json", report_path)]
2283
- seen: dict[Path, str] = {}
2284
- seen_identity: dict[tuple[int, int], str] = {}
2285
- for label, path in outputs:
2286
- if path is None:
2287
- continue
2288
- normalized = normalized_output_path(path)
2289
- previous = seen.get(normalized)
2290
- if previous is not None:
2291
- raise SystemExit(f"--{label} must not point to the same path as --{previous}: {normalized}")
2292
- seen[normalized] = label
2293
- identity = existing_file_identity(normalized)
2294
- if identity is not None:
2295
- previous_identity = seen_identity.get(identity)
2296
- if previous_identity is not None:
2297
- raise SystemExit(f"--{label} must not point to the same file as --{previous_identity}: {normalized}")
2298
- seen_identity[identity] = label
2299
-
2300
-
2301
- def main() -> int:
2302
- parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
2303
- parser.add_argument("--tasks", required=True, type=Path, help="task fixture JSON")
2304
- parser.add_argument("--variants", required=True, type=Path, help="variant fixture JSON")
2305
- parser.add_argument("--csv", default=Path("bench/results.csv"), type=Path,
2306
- help="results CSV path (header is added on first write)")
2307
- parser.add_argument("--task-id", default=None, help="run only the named task id")
2308
- parser.add_argument("--variant", default=None, help="run only the named variant")
2309
- parser.add_argument("--claude-bin", default=os.environ.get("CLAUDE_BIN", "claude"),
2310
- help="claude CLI executable (default: $CLAUDE_BIN or 'claude')")
2311
- parser.add_argument("--project-root", default=Path("."), type=Path,
2312
- help="working directory used for success_command (default: cwd)")
2313
- parser.add_argument("--dry-run", action="store_true",
2314
- help="print the claude command without invoking it")
2315
- parser.add_argument("--resume", action="store_true",
2316
- help="skip (task_id, variant) rows already present in --csv")
2317
- parser.add_argument("--ledger-jsonl", default=None, type=Path,
2318
- help="optional JSONL ledger path for cost-shift accounting per run")
2319
- parser.add_argument("--report-json", default=None, type=Path,
2320
- help="optional A/B summary report JSON path generated from --csv after real runs")
2321
- parser.add_argument("--baseline-variant", default="baseline",
2322
- help="variant name used as the report baseline (default: baseline)")
2323
- args = parser.parse_args()
2324
-
2325
- require_no_follow_file_ops_supported()
2326
- validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json)
2327
-
2328
- variants = parse_variants(args.variants)
2329
- tasks = parse_tasks(args.tasks, variants=variants)
2330
- targets = filter_targets(tasks, variants, args.task_id, args.variant)
2331
- if not targets:
2332
- print("no (task, variant) targets matched the filters", file=sys.stderr)
2333
- return 1
2334
-
2335
- skip_keys = existing_keys(args.csv) if args.resume else set()
2336
- runnable_targets = [
2337
- (task, variant)
2338
- for task, variant in targets
2339
- if (task.id, variant.name) not in skip_keys
2340
- ]
2341
- placeholder_targets = [
2342
- f"{task.id}/{variant.name}"
2343
- for task, variant in runnable_targets
2344
- if is_placeholder_success_command(task.success_command)
2345
- ]
2346
- if placeholder_targets and not args.dry_run:
2347
- print(
2348
- f"{PLACEHOLDER_SUCCESS_COMMAND_MARKER}; refusing non-dry-run provider invocation for: "
2349
- f"{', '.join(placeholder_targets)}",
2350
- file=sys.stderr,
2351
- )
2352
- return 2
2353
-
2354
- if runnable_targets and not args.dry_run and shutil.which(args.claude_bin) is None:
2355
- # claude_bin 이 절대경로면 shutil.which 가 None 일 수 있으므로 추가 검사.
2356
- if not Path(args.claude_bin).exists():
2357
- print(f"claude binary not found: {args.claude_bin}", file=sys.stderr)
2358
- return 2
2359
-
2360
- if runnable_targets:
2361
- load_variant_prompt_files_for_targets(runnable_targets, task_file_dir=args.tasks.parent)
2362
-
2363
- project_root = args.project_root.resolve()
2364
- claude_ver = "dry-run" if args.dry_run else (claude_version(args.claude_bin) if runnable_targets else "skipped")
2365
-
2366
- completed = 0
2367
- for task, variant in targets:
2368
- if (task.id, variant.name) in skip_keys:
2369
- print(f"skip {task.id}/{variant.name} (already in {args.csv})")
2370
- continue
2371
- print(f"run {task.id}/{variant.name} ...", flush=True)
2372
- result = run_fixture(task, variant, args.claude_bin, project_root, args.dry_run)
2373
- # dry-run row 는 CSV 에 적재하지 않는다. 적재하면 (a) tokens=0/cost=0 이 평균을
2374
- # 깎고, (b) --resume 이 그 (task, variant) 를 skip 해 실제 측정값이 영구 누락된다.
2375
- wrote = True
2376
- if not args.dry_run:
2377
- wrote = append_csv(args.csv, claude_ver, result, skip_existing=args.resume)
2378
- if wrote and args.ledger_jsonl is not None:
2379
- append_cost_shift_ledger(args.ledger_jsonl, claude_ver, result)
2380
- completed += 1
2381
- status = "ok" if result.success else "FAIL"
2382
- if args.dry_run:
2383
- suffix = " (dry-run; CSV not updated)"
2384
- elif not wrote:
2385
- suffix = " (CSV not updated; row already present)"
2386
- else:
2387
- suffix = ""
2388
- print(
2389
- f" {status} tokens={sum(result.tokens.values())} cost=${result.cost_usd:.4f} "
2390
- f"wall_time={result.wall_time_seconds:.3f}s {sanitize_note_text(result.notes)}{suffix}"
2391
- )
2392
- target = args.csv if not args.dry_run else "(dry-run; no CSV writes)"
2393
- if args.report_json is not None and not args.dry_run:
2394
- report = write_report_json(args.csv, args.report_json, args.baseline_variant)
2395
- print(f"report {args.report_json}: {report['claim_status']}")
2396
- print(f"completed {completed} run(s); results in {target}")
2397
- return 0
2398
-
2399
-
2400
- if __name__ == "__main__":
2401
- raise SystemExit(main())