@ictechgy/context-guard 0.4.8 → 0.4.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +29 -0
- package/README.ko.md +92 -37
- package/README.md +111 -37
- package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
- package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
- package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
- package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
- package/docs/distribution.md +10 -7
- package/docs/experimental-benchmark-fixtures.md +8 -1
- package/package.json +3 -6
- package/packaging/homebrew/context-guard.rb.template +1 -1
- package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
- package/plugins/context-guard/README.ko.md +9 -6
- package/plugins/context-guard/README.md +27 -12
- package/plugins/context-guard/bin/context-guard +113 -26
- package/plugins/context-guard/bin/context-guard-artifact +542 -46
- package/plugins/context-guard/bin/context-guard-cache-score +380 -0
- package/plugins/context-guard/bin/context-guard-compress +146 -1
- package/plugins/context-guard/bin/context-guard-cost +783 -4
- package/plugins/context-guard/bin/context-guard-experiments +2211 -121
- package/plugins/context-guard/bin/context-guard-failed-nudge +3 -0
- package/plugins/context-guard/bin/context-guard-filter +163 -7
- package/plugins/context-guard/bin/context-guard-guard-read +3 -0
- package/plugins/context-guard/bin/context-guard-pack +602 -43
- package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
- package/plugins/context-guard/bin/context-guard-setup +165 -31
- package/plugins/context-guard/bin/context-guard-statusline +490 -283
- package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
- package/plugins/context-guard/bin/context-guard-tool-prune +241 -1
- package/plugins/context-guard/lib/context_guard_commands.py +206 -0
- package/plugins/context-guard/skills/setup/SKILL.md +1 -0
- package/context-guard-kit/README.md +0 -91
- package/context-guard-kit/benchmark_runner.py +0 -2401
- package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
- package/context-guard-kit/context_compress.py +0 -695
- package/context-guard-kit/context_escrow.py +0 -935
- package/context-guard-kit/context_filter.py +0 -637
- package/context-guard-kit/context_guard_cli.py +0 -325
- package/context-guard-kit/context_guard_diet.py +0 -1711
- package/context-guard-kit/context_pack.py +0 -2713
- package/context-guard-kit/cost_guard.py +0 -2349
- package/context-guard-kit/experimental_registry.py +0 -2339
- package/context-guard-kit/failed_attempt_nudge.py +0 -567
- package/context-guard-kit/guard_large_read.py +0 -690
- package/context-guard-kit/hook_secret_patterns.py +0 -43
- package/context-guard-kit/read_symbol.py +0 -483
- package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
- package/context-guard-kit/sanitize_output.py +0 -725
- package/context-guard-kit/settings.example.json +0 -67
- package/context-guard-kit/setup_wizard.py +0 -2515
- package/context-guard-kit/statusline.sh +0 -362
- package/context-guard-kit/statusline_merged.sh +0 -157
- package/context-guard-kit/tool_schema_pruner.py +0 -837
- package/context-guard-kit/trim_command_output.py +0 -1449
|
@@ -1,2401 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Claude Code 토큰 절감 벤치마크 자동 실행 runner.
|
|
3
|
-
|
|
4
|
-
`research/benchmark-plan.md` 의 task set × variant 조합을 비대화형 `claude -p`
|
|
5
|
-
호출로 실행하고, `tokens_per_successful_task` 측정에 필요한 컬럼을 CSV 에 적재한다.
|
|
6
|
-
|
|
7
|
-
사용 예:
|
|
8
|
-
|
|
9
|
-
```bash
|
|
10
|
-
context-guard-kit/benchmark_runner.py \
|
|
11
|
-
--tasks bench/tasks.json --variants bench/variants.json \
|
|
12
|
-
--csv bench/results.csv
|
|
13
|
-
|
|
14
|
-
context-guard-kit/benchmark_runner.py --tasks bench/tasks.json \
|
|
15
|
-
--variants bench/variants.json --task-id t01 --variant baseline --dry-run
|
|
16
|
-
```
|
|
17
|
-
|
|
18
|
-
Task fixture (`tasks.json`): 각 task 는 다음 필드를 가진다.
|
|
19
|
-
|
|
20
|
-
```json
|
|
21
|
-
[
|
|
22
|
-
{
|
|
23
|
-
"id": "t01",
|
|
24
|
-
"prompt": "Add validation to src/auth/session.ts ...",
|
|
25
|
-
"model": "sonnet",
|
|
26
|
-
"effort": "medium",
|
|
27
|
-
"max_turns": 3,
|
|
28
|
-
"max_budget_usd": 1.0,
|
|
29
|
-
"allowed_tools": ["Read", "Edit", "Bash(npm test*)"],
|
|
30
|
-
"variant_prompt_files": {"context_hygiene": "t01.context_hygiene.prompt.md"},
|
|
31
|
-
"success_command": "npm test -- auth/session",
|
|
32
|
-
"success_cwd": "."
|
|
33
|
-
}
|
|
34
|
-
]
|
|
35
|
-
```
|
|
36
|
-
|
|
37
|
-
Variant fixture (`variants.json`): 각 variant 는 `claude -p` 에 추가할 옵션 묶음을 정의한다.
|
|
38
|
-
|
|
39
|
-
```json
|
|
40
|
-
[
|
|
41
|
-
{"name": "baseline", "extra_args": []},
|
|
42
|
-
{"name": "context_hygiene", "extra_args": ["--strict-mcp-config", "--mcp-config", "bench/minimal-mcp.json"]}
|
|
43
|
-
]
|
|
44
|
-
```
|
|
45
|
-
|
|
46
|
-
dry-run 모드는 실제 호출은 하지 않고 어떤 명령이 실행될지만 출력한다.
|
|
47
|
-
"""
|
|
48
|
-
from __future__ import annotations
|
|
49
|
-
|
|
50
|
-
import argparse
|
|
51
|
-
import collections
|
|
52
|
-
from contextlib import contextmanager
|
|
53
|
-
import csv
|
|
54
|
-
import datetime as _dt
|
|
55
|
-
import json
|
|
56
|
-
import math
|
|
57
|
-
import os
|
|
58
|
-
import re
|
|
59
|
-
import selectors
|
|
60
|
-
import shlex
|
|
61
|
-
import shutil
|
|
62
|
-
import signal
|
|
63
|
-
import stat
|
|
64
|
-
import subprocess
|
|
65
|
-
import sys
|
|
66
|
-
import time
|
|
67
|
-
import unicodedata
|
|
68
|
-
from dataclasses import dataclass, field
|
|
69
|
-
from pathlib import Path
|
|
70
|
-
from typing import Any
|
|
71
|
-
|
|
72
|
-
try:
|
|
73
|
-
import fcntl
|
|
74
|
-
except ImportError: # pragma: no cover - benchmark runner already requires POSIX no-follow IO.
|
|
75
|
-
fcntl = None # type: ignore[assignment]
|
|
76
|
-
|
|
77
|
-
CSV_COLUMNS = [
|
|
78
|
-
"date",
|
|
79
|
-
"claude_version",
|
|
80
|
-
"task_id",
|
|
81
|
-
"variant",
|
|
82
|
-
"model",
|
|
83
|
-
"effort",
|
|
84
|
-
"total_tokens",
|
|
85
|
-
"input_tokens",
|
|
86
|
-
"output_tokens",
|
|
87
|
-
"cache_read",
|
|
88
|
-
"cache_creation",
|
|
89
|
-
"provider_cached_tokens",
|
|
90
|
-
"provider_cached_tokens_measured",
|
|
91
|
-
"cost_usd",
|
|
92
|
-
"cost_measured",
|
|
93
|
-
"wall_time_seconds",
|
|
94
|
-
"turns",
|
|
95
|
-
"hook_triggers",
|
|
96
|
-
"bytes_before",
|
|
97
|
-
"bytes_after",
|
|
98
|
-
"artifacts_used",
|
|
99
|
-
"external_tokens",
|
|
100
|
-
"external_tokens_measured",
|
|
101
|
-
"external_cost_usd",
|
|
102
|
-
"external_cost_measured",
|
|
103
|
-
"total_cost_with_shift_usd",
|
|
104
|
-
"success",
|
|
105
|
-
"corrections",
|
|
106
|
-
"notes",
|
|
107
|
-
"primary_tokens_measured",
|
|
108
|
-
]
|
|
109
|
-
MAX_CSV_NOTE_CHARS = 500
|
|
110
|
-
MAX_CSV_ROWS = 100_000
|
|
111
|
-
CSV_FORMULA_PREFIXES = ("=", "+", "-", "@")
|
|
112
|
-
PLACEHOLDER_SUCCESS_COMMAND_MARKER = "fixture-only placeholder: replace success_command before real benchmark runs"
|
|
113
|
-
# Flags that variant `extra_args` may not supply; validate_variant_extra_args
# rejects any argument whose text before the first "=" is in this set, reporting
# it as an attempt to override a runner-controlled Claude flag.
PROTECTED_VARIANT_FLAGS = frozenset({
    "--",
    "-p",
    "--print",
    "--model",
    "--max-turns",
    "--output-format",
    "--allowedTools",
    "--allowed-tools",
    "--max-budget-usd",
    "--effort",
})
|
|
125
|
-
SECRET_NOTE_KEY_RE = r"[A-Za-z0-9_.-]*(?:api[-_]?key|token|secret|password|client[-_]?secret)[A-Za-z0-9_.-]*"
|
|
126
|
-
SECRET_NOTE_VALUE_RE = r"(?:'[^']*'|\"[^\"]*\"|[^\s,}&#;]+)"
|
|
127
|
-
SECRET_NOTE_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = (
|
|
128
|
-
(re.compile(r"(?i)\bBearer\s+[A-Za-z0-9._~+/=-]+"), "[REDACTED]"),
|
|
129
|
-
(re.compile(r"(?i)\bBasic\s+[A-Za-z0-9._~+/=-]+"), "[REDACTED]"),
|
|
130
|
-
(re.compile(rf"(?i)([?&#;]({SECRET_NOTE_KEY_RE})=)[^\s?&#;]+"), r"\1[REDACTED]"),
|
|
131
|
-
(re.compile(rf"(?i)(^|[\s{{,?&#;])([\"']?(?:{SECRET_NOTE_KEY_RE})[\"']?\s*[:=]\s*){SECRET_NOTE_VALUE_RE}"), r"\1\2[REDACTED]"),
|
|
132
|
-
(re.compile(rf"(?i)(^|[\s\"'])(--(?:{SECRET_NOTE_KEY_RE})(?:\s+|=))(?:'[^']*'|\"[^\"]*\"|[^\s\"']+)"), r"\1\2[REDACTED]"),
|
|
133
|
-
(re.compile(r"(?i)(^|[\s\"'])((?:-u|--user)(?:\s+|=))(?:'[^']*'|\"[^\"]*\"|[^\s\"']+)"), r"\1\2[REDACTED]"),
|
|
134
|
-
(re.compile(r"gh[pousr]_[A-Za-z0-9_]{20,}"), "[REDACTED]"),
|
|
135
|
-
(re.compile(r"github_pat_[A-Za-z0-9_]{20,}"), "[REDACTED]"),
|
|
136
|
-
(re.compile(r"glpat-[A-Za-z0-9_-]{12,}"), "[REDACTED]"),
|
|
137
|
-
(re.compile(r"xox[abprs]-[A-Za-z0-9-]{10,}"), "[REDACTED]"),
|
|
138
|
-
(re.compile(r"(?:AKIA|ASIA)[0-9A-Z]{16}"), "[REDACTED]"),
|
|
139
|
-
(re.compile(r"(?:sk|pk|rk)_(?:live|test)_[A-Za-z0-9]{16,}"), "[REDACTED]"),
|
|
140
|
-
(re.compile(r"sk-(?:ant|proj)-[A-Za-z0-9_-]{12,}"), "[REDACTED]"),
|
|
141
|
-
(re.compile(r"npm_[A-Za-z0-9]{20,}"), "[REDACTED]"),
|
|
142
|
-
(re.compile(r"AIza[0-9A-Za-z_\-]{20,}"), "[REDACTED]"),
|
|
143
|
-
(re.compile(r"SG\.[A-Za-z0-9_-]{16,}\.[A-Za-z0-9_-]{16,}"), "[REDACTED]"),
|
|
144
|
-
(re.compile(r"eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+"), "[REDACTED]"),
|
|
145
|
-
(re.compile(r"([a-z][a-z0-9+.-]*://)[^/\s@]+@", re.IGNORECASE), r"\1[REDACTED]@"),
|
|
146
|
-
)
|
|
147
|
-
|
|
148
|
-
# Candidate usage keys for `claude -p --output-format json` and compatible
# benchmark providers. The Anthropic SDK, Claude Code, and OpenAI-style JSON
# output formats may change over time, so several candidate key names are
# matched per bucket on a best-effort basis.
USAGE_KEY_GROUPS: tuple[tuple[str, tuple[str, ...]], ...] = (
    ("input_tokens", ("input_tokens", "inputTokens", "prompt_tokens", "promptTokens")),
    ("output_tokens", ("output_tokens", "outputTokens", "completion_tokens", "completionTokens")),
    ("cache_read", ("cache_read_input_tokens", "cacheRead")),
    ("cache_creation", ("cache_creation_input_tokens", "cacheCreation")),
)
|
|
157
|
-
PROVIDER_CACHE_DETAIL_KEYS = (
|
|
158
|
-
"prompt_tokens_details",
|
|
159
|
-
"promptTokensDetails",
|
|
160
|
-
"input_tokens_details",
|
|
161
|
-
"inputTokensDetails",
|
|
162
|
-
)
|
|
163
|
-
PROVIDER_CACHED_TOKEN_KEYS = ("cached_tokens", "cachedTokens")
|
|
164
|
-
COST_KEYS = ("total_cost_usd", "cost_usd", "costUSD")
|
|
165
|
-
SHIFT_METRIC_KEY_GROUPS: tuple[tuple[str, tuple[str, ...]], ...] = (
|
|
166
|
-
("turns", ("turns", "num_turns", "total_turns")),
|
|
167
|
-
("hook_triggers", ("hook_triggers", "hookTriggerCount", "hook_trigger_count")),
|
|
168
|
-
("bytes_before", ("bytes_before", "bytesBefore", "raw_bytes_before")),
|
|
169
|
-
("bytes_after", ("bytes_after", "bytesAfter", "visible_bytes_after")),
|
|
170
|
-
("artifacts_used", ("artifacts_used", "artifact_count", "artifactsUsed")),
|
|
171
|
-
)
|
|
172
|
-
EXTERNAL_TOKEN_AGGREGATE_KEYS = ("external_tokens",)
|
|
173
|
-
EXTERNAL_COST_AGGREGATE_KEYS = ("external_cost_usd",)
|
|
174
|
-
EXTERNAL_SOURCE_KEY_GROUPS: tuple[tuple[str, tuple[str, ...], tuple[str, ...]], ...] = (
|
|
175
|
-
("auxiliary", ("auxiliary_tokens",), ("auxiliary_cost_usd",)),
|
|
176
|
-
("subagent", ("subagent_tokens",), ("subagent_cost_usd",)),
|
|
177
|
-
("provider", ("provider_tokens",), ("provider_cost_usd",)),
|
|
178
|
-
)
|
|
179
|
-
MAX_USAGE_TOKEN_COUNT = 10**12
|
|
180
|
-
MAX_USAGE_COST_USD = 10**9
|
|
181
|
-
# Byte -> token proxy conversion factor. This is not a measured model token
# count but a conservative estimate derived from byte deltas, and reports label
# it explicitly with evidence="inferred". It uses the commonly cited
# approximation of ~4 bytes/token for English text.
TOKEN_PROXY_BYTES_PER_TOKEN = 4
|
|
185
|
-
BENCH_RUN_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.run-evidence.v1"
|
|
186
|
-
MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.matched-pair.v1"
|
|
187
|
-
SELF_HOSTED_METRICS_SCHEMA_VERSION = "contextguard.bench.self-hosted-metrics.v1"
|
|
188
|
-
SELF_HOSTED_METRICS_KEY = "self_hosted_metrics"
|
|
189
|
-
SELF_HOSTED_METRICS_CLAIM_BOUNDARY = "self_hosted_metrics_only_not_hosted_api_token_or_cost_savings"
|
|
190
|
-
MAX_SELF_HOSTED_LABEL_CHARS = 120
|
|
191
|
-
MAX_SELF_HOSTED_LATENCY_MS = 7 * 24 * 60 * 60 * 1000
|
|
192
|
-
MAX_SELF_HOSTED_MEMORY_MB = 10_000_000
|
|
193
|
-
MAX_VARIANT_PROMPT_FILE_BYTES = 128_000
|
|
194
|
-
CLAUDE_OUTPUT_MAX_BYTES = 1_000_000
|
|
195
|
-
SUCCESS_COMMAND_OUTPUT_MAX_BYTES = 64_000
|
|
196
|
-
VERSION_OUTPUT_MAX_BYTES = 16_000
|
|
197
|
-
PROCESS_TERMINATE_GRACE_SECONDS = 2.0
|
|
198
|
-
# Maps the first component of an absolute path to the only symlink target that
# _normalize_allowed_first_absolute_symlink will rewrite it to.
# NOTE(review): these look like the macOS /tmp and /var symlinks — confirm.
ALLOWED_FIRST_ABSOLUTE_SYMLINKS = {
    "tmp": Path("/private/tmp"),
    "var": Path("/private/var"),
}
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
def _base_open_flags() -> int:
    """Return read-only open flags, with close-on-exec added where the platform defines it."""
    return os.O_RDONLY | getattr(os, "O_CLOEXEC", 0)
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
def _no_follow_flag() -> int:
    """Return ``os.O_NOFOLLOW``.

    Raises:
        OSError: if the platform does not define ``O_NOFOLLOW``.
    """
    nofollow = getattr(os, "O_NOFOLLOW", None)
    if nofollow is None:
        raise OSError("platform does not support no-follow file opens")
    return nofollow
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
def no_follow_file_ops_supported() -> bool:
    """Report whether O_NOFOLLOW and dir_fd-relative ``os.open``/``os.mkdir`` are all available."""
    if not hasattr(os, "O_NOFOLLOW"):
        return False
    dir_fd_capable = os.supports_dir_fd
    return os.open in dir_fd_capable and os.mkdir in dir_fd_capable
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
def require_no_follow_file_ops_supported() -> None:
    """Exit with an explanatory message when no-follow file operations are unavailable.

    Raises:
        SystemExit: if ``no_follow_file_ops_supported()`` is false.
    """
    if no_follow_file_ops_supported():
        return
    message = (
        "benchmark runner requires POSIX no-follow file operations for safe fixture and CSV paths; "
        "this platform is not supported yet."
    )
    raise SystemExit(message)
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
def _directory_flag() -> int:
    """Return ``os.O_DIRECTORY`` when the platform defines it, otherwise 0."""
    if hasattr(os, "O_DIRECTORY"):
        return os.O_DIRECTORY
    return 0
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
def _normalized_link_target(parent: Path, raw_target: str) -> Path:
    """Resolve a symlink target string lexically.

    A relative target is joined onto ``parent``; the result is passed through
    ``os.path.normpath`` (no filesystem access).
    """
    candidate = Path(raw_target)
    joined = candidate if candidate.is_absolute() else parent / candidate
    return Path(os.path.normpath(str(joined)))
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
def _normalize_allowed_first_absolute_symlink(path: Path) -> Path:
    """Rewrite an allow-listed leading symlink of an absolute path to its expected target.

    The path is returned unchanged unless it is absolute, its first component
    is a key of ``ALLOWED_FIRST_ABSOLUTE_SYMLINKS``, that component is a symlink
    on disk, and the link's normalized target equals the expected value. Any
    ``OSError`` while inspecting the link also leaves the path unchanged.
    """
    parts = path.parts
    if not path.is_absolute() or len(parts) < 2:
        return path
    head = parts[1]
    expected = ALLOWED_FIRST_ABSOLUTE_SYMLINKS.get(head)
    if expected is None:
        return path
    anchor = Path(path.anchor)
    link = anchor / head
    try:
        points_elsewhere = (
            not stat.S_ISLNK(os.lstat(link).st_mode)
            or _normalized_link_target(anchor, os.readlink(link)) != expected
        )
    except OSError:
        return path
    if points_elsewhere:
        return path
    return expected.joinpath(*parts[2:])
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
def _open_directory_at(dir_fd: int, component: str, path: Path) -> int:
    """Open ``component`` relative to ``dir_fd`` without following symlinks.

    ``path`` is used only for the error message. The returned descriptor is
    owned by the caller; it is closed here if the directory check fails.

    Raises:
        OSError: if the open fails or the opened object is not a directory.
    """
    open_flags = _base_open_flags() | _directory_flag() | _no_follow_flag()
    child_fd = os.open(component, open_flags, dir_fd=dir_fd)
    try:
        if stat.S_ISDIR(os.fstat(child_fd).st_mode):
            return child_fd
        raise OSError(f"not a directory: {path}")
    except Exception:
        os.close(child_fd)
        raise
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
def _ensure_directory_no_symlink(path: Path, *, create: bool = False) -> int:
    """Walk ``path`` one component at a time without following symlinks.

    Each component is opened relative to the previous directory descriptor, so
    a symlink anywhere along the way makes the open fail instead of being
    followed (apart from the allow-listed leading symlink normalization).

    Args:
        path: Directory to open; absolute paths start at the anchor, relative
            paths at the current directory.
        create: When true, missing components are created with ``os.mkdir``.

    Returns:
        An open descriptor for the final directory; the caller must close it.

    Raises:
        OSError: if dir_fd-relative operations are unsupported, a component is
            missing (and ``create`` is false), or a component is not a directory.
    """
    if os.open not in os.supports_dir_fd or os.mkdir not in os.supports_dir_fd:
        raise OSError("platform does not support directory-relative no-follow directory access")
    path = _normalize_allowed_first_absolute_symlink(path)
    components = list(path.parts)
    if path.is_absolute() and components:
        # Drop the anchor; it is opened separately as the walk's starting point.
        components = components[1:]
    root = path.anchor if path.is_absolute() else "."
    dir_fd = os.open(root or ".", _base_open_flags() | _directory_flag())
    try:
        for component in components:
            try:
                next_fd = _open_directory_at(dir_fd, component, path)
            except FileNotFoundError:
                if not create:
                    raise
                try:
                    os.mkdir(component, 0o777, dir_fd=dir_fd)
                except FileExistsError:
                    # Another process created the entry between our failed open
                    # and mkdir. The no-follow re-open below still rejects it
                    # if it is anything other than a real directory.
                    pass
                next_fd = _open_directory_at(dir_fd, component, path)
            os.close(dir_fd)
            dir_fd = next_fd
        return dir_fd
    except Exception:
        os.close(dir_fd)
        raise
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
def _open_regular_no_symlink(
    path: Path,
    flags: int | None = None,
    mode: int = 0o666,
    *,
    create_parent: bool = False,
) -> int:
    """Open a regular file without following symlinks in any path component.

    Args:
        path: File to open.
        flags: ``os.open`` flags; defaults to the read-only base flags.
            ``O_NOFOLLOW`` is always added.
        mode: Permission bits passed to ``os.open`` (relevant with ``O_CREAT``).
        create_parent: Create missing parent directories when true.

    Returns:
        An open descriptor for the file; the caller must close it.

    Raises:
        OSError: if dir_fd-relative opens or ``O_NOFOLLOW`` are unsupported, the
            open fails, or the opened object is not a regular file.
    """
    if os.open not in os.supports_dir_fd:
        raise OSError("platform does not support directory-relative no-follow opens")
    # Resolve the flags before acquiring the parent descriptor so a failure in
    # _no_follow_flag() cannot leak parent_fd.
    open_flags = (flags if flags is not None else _base_open_flags()) | _no_follow_flag()
    path = _normalize_allowed_first_absolute_symlink(path)
    parent_fd = _ensure_directory_no_symlink(path.parent, create=create_parent)
    try:
        fd = os.open(path.name, open_flags, mode, dir_fd=parent_fd)
        try:
            if not stat.S_ISREG(os.fstat(fd).st_mode):
                raise OSError(f"not a regular file: {path}")
            return fd
        except Exception:
            os.close(fd)
            raise
    finally:
        os.close(parent_fd)
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
def _read_text_no_follow(path: Path) -> str:
    """Return the UTF-8 text of ``path``, opened via the no-symlink helper."""
    raw_fd = _open_regular_no_symlink(path)
    still_owned = True
    try:
        with os.fdopen(raw_fd, "r", encoding="utf-8") as reader:
            # The file object now owns the descriptor and closes it on exit.
            still_owned = False
            return reader.read()
    finally:
        if still_owned:
            os.close(raw_fd)
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
@contextmanager
def csv_file_lock(csv_path: Path, *, create_parent: bool) -> Any:
    """Hold an exclusive advisory lock on ``<csv name>.lock`` for the duration of the block.

    The sidecar lock file sits next to ``csv_path`` and is opened (created if
    needed, mode 0o600) through the no-symlink helper.

    Raises:
        OSError: if ``fcntl`` is unavailable or the lock file cannot be opened.
    """
    if fcntl is None:
        raise OSError("platform does not support advisory CSV locks")
    sidecar = csv_path.with_name(f"{csv_path.name}.lock")
    lock_fd = _open_regular_no_symlink(sidecar, os.O_CREAT | os.O_RDWR, 0o600, create_parent=create_parent)
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX)
        try:
            yield
        finally:
            # Only reached once the lock was acquired.
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
    finally:
        os.close(lock_fd)
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
# Reproducibility first: fields that the fixture does not specify are not passed
# via argv. The parsing stage preserves whether a field was given explicitly so
# that a variant the user intended as a baseline is not distorted by an implicit
# default (e.g. effort="medium").
@dataclass
class TaskFixture:
    """One benchmark task as loaded from the tasks JSON file."""

    # Task identifier (stringified by parse_tasks).
    id: str
    # Prompt text for the task.
    prompt: str
    model: str = "sonnet"
    # None means the fixture did not set an effort.
    effort: str | None = None
    max_turns: int = 3
    # None means no budget was set (the fixture value was absent or null).
    max_budget_usd: float | None = None
    allowed_tools: list[str] = field(default_factory=list)
    success_command: str | None = None
    success_cwd: str = "."
    # Variant name -> relative prompt-file path, as given in the fixture.
    variant_prompt_files: dict[str, str] = field(default_factory=dict)
    # Variant name -> prompt text loaded from the file above; filled in by
    # load_variant_prompt_files_for_targets for selected targets only.
    variant_prompt_texts: dict[str, str] = field(default_factory=dict)
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
@dataclass
class Variant:
    """One benchmark variant as loaded from the variants JSON file."""

    name: str
    # Extra command-line arguments for this variant; parse_variants rejects
    # entries that collide with PROTECTED_VARIANT_FLAGS.
    extra_args: list[str] = field(default_factory=list)
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
@dataclass
class RunResult:
    """Metrics recorded for one (task, variant) run.

    NOTE(review): most field names mirror entries in CSV_COLUMNS, so this is
    presumably the in-memory form of one CSV row — confirm against the writer.
    """

    task_id: str
    variant: str
    model: str
    effort: str
    # Token counts by bucket name; presumably keyed like USAGE_KEY_GROUPS — verify.
    tokens: dict[str, int]
    cost_usd: float
    success: bool
    notes: str
    corrections: int = 0
    # The *_measured flags default to False alongside zero-valued metrics.
    # NOTE(review): presumably they distinguish "reported as 0" from "not reported" — confirm.
    cost_measured: bool = False
    wall_time_seconds: float = 0.0
    turns: int = 0
    hook_triggers: int = 0
    bytes_before: int = 0
    bytes_after: int = 0
    artifacts_used: int = 0
    external_tokens: int = 0
    external_tokens_measured: bool = False
    external_cost_usd: float = 0.0
    external_cost_measured: bool = False
    provider_cached_tokens: int = 0
    provider_cached_tokens_measured: bool = False
    primary_tokens_measured: bool = False
    # Optional self-hosted metrics payload; None when absent.
    self_hosted_metrics: dict[str, Any] | None = None
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
@dataclass
class BoundedProcessResult:
    """Outcome of one child-process invocation.

    NOTE(review): the producer is outside this view; field meanings below are
    inferred from names and should be confirmed there.
    """

    returncode: int
    stdout: str
    stderr: str
    # Presumably set when the process was stopped for exceeding its timeout.
    timed_out: bool = False
    # Presumably set when captured output hit a byte cap and was cut short.
    output_truncated: bool = False
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
def is_placeholder_success_command(command: str | None) -> bool:
    """Return True when ``command`` is non-empty and contains the fixture placeholder marker."""
    if not command:
        return False
    return PLACEHOLDER_SUCCESS_COMMAND_MARKER in command
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
def parse_positive_int(value: Any, *, field: str, owner: str) -> int:
    """Validate a fixture value as a positive integer and return it.

    Accepts an ``int`` (but not ``bool``) or a string of ASCII digits with
    optional surrounding whitespace.

    Raises:
        SystemExit: if the value has the wrong type/format or is not > 0.
    """
    not_an_integer = f"{owner} {field} must be a positive integer"
    if isinstance(value, bool):
        # bool is an int subclass; reject it explicitly.
        raise SystemExit(not_an_integer)
    if isinstance(value, int):
        number = value
    elif isinstance(value, str):
        digits = value.strip()
        if re.fullmatch(r"[0-9]+", digits) is None:
            raise SystemExit(not_an_integer)
        number = int(digits)
    else:
        raise SystemExit(not_an_integer)
    if number <= 0:
        raise SystemExit(f"{owner} {field} must be > 0")
    return number
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
def parse_string_list(value: Any, *, field: str, owner: str) -> list[str]:
    """Validate a fixture value as a list of non-blank strings and return a copy.

    Raises:
        SystemExit: if ``value`` is not a list, or an element is not a string
            or is empty/whitespace-only.
    """
    # None is not a list, so the single isinstance check covers it too.
    if not isinstance(value, list):
        raise SystemExit(f"{owner} {field} must be a JSON list of strings")
    for position, entry in enumerate(value):
        if not isinstance(entry, str):
            raise SystemExit(f"{owner} {field}[{position}] must be a string")
        if not entry.strip():
            raise SystemExit(f"{owner} {field}[{position}] must be non-empty")
    return list(value)
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
def parse_string_map(value: Any, *, field: str, owner: str) -> dict[str, str]:
    """Validate a fixture value as an object with non-blank string keys and values.

    ``None`` yields an empty dict; otherwise a shallow copy is returned.

    Raises:
        SystemExit: if ``value`` is not a dict or any key/value is not a
            non-blank string.
    """
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise SystemExit(f"{owner} {field} must be a JSON object of strings")
    for raw_key, raw_value in value.items():
        key_ok = isinstance(raw_key, str) and bool(raw_key.strip())
        if not key_ok:
            raise SystemExit(f"{owner} {field} keys must be non-empty strings")
        value_ok = isinstance(raw_value, str) and bool(raw_value.strip())
        if not value_ok:
            raise SystemExit(f"{owner} {field}.{raw_key} must be a non-empty string")
    return dict(value)
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
def validate_variant_extra_args(extra_args: list[str], *, owner: str) -> list[str]:
    """Reject variant arguments that use a protected flag; return the list unchanged otherwise.

    Only the text before the first ``=`` of each argument is compared against
    ``PROTECTED_VARIANT_FLAGS``.

    Raises:
        SystemExit: on the first argument whose flag is protected.
    """
    for position, argument in enumerate(extra_args):
        flag, _, _ = argument.partition("=")
        if flag not in PROTECTED_VARIANT_FLAGS:
            continue
        raise SystemExit(
            f"{owner} extra_args[{position}] must not override runner-controlled Claude flags: {flag}"
        )
    return extra_args
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
def validate_variant_prompt_file_path(raw_path: str, *, owner: str) -> Path:
    """Check that ``raw_path`` is a safe relative file path and return it as a ``Path``.

    No filesystem access happens here.

    Raises:
        SystemExit: if the path is absolute, names no file, or contains
            ``.``, ``..``, or empty components.
    """
    candidate = Path(raw_path)
    if candidate.is_absolute():
        raise SystemExit(f"{owner} variant_prompt_files path must be relative: {raw_path}")
    segments = candidate.parts
    if not segments or candidate == Path("."):
        raise SystemExit(f"{owner} variant_prompt_files path must name a file")
    forbidden = {"", ".", ".."}
    if not forbidden.isdisjoint(segments):
        raise SystemExit(f"{owner} variant_prompt_files path must not contain '.', '..', or empty components: {raw_path}")
    return candidate
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
def validate_variant_prompt_file_references(
    tasks: list[TaskFixture],
    variants: list["Variant"],
) -> None:
    """Check every task's ``variant_prompt_files`` keys and paths without opening files.

    Keys naming unknown variants and unsafe relative paths are rejected up
    front. Whether the files exist is deliberately not checked here, so a run
    narrowed by --task-id/--variant is not blocked by prompt files it never uses.

    Raises:
        SystemExit: on an unknown variant key or an unsafe path.
    """
    known_names = {variant.name for variant in variants}
    for task in tasks:
        unknown = sorted(name for name in set(task.variant_prompt_files) if name not in known_names)
        if unknown:
            raise SystemExit(
                f"task {task.id} variant_prompt_files references unknown variant(s): {', '.join(unknown)}"
            )
        for variant_name, raw_path in task.variant_prompt_files.items():
            owner = f"task {task.id} variant {variant_name}"
            validate_variant_prompt_file_path(raw_path, owner=owner)
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
def read_variant_prompt_file(path: Path, *, owner: str, display_path: str | None = None) -> str:
    """Read one selected prompt file with no-follow IO and an argv-safe size cap.

    Args:
        path: Prompt file to read.
        owner: Prefix used in error messages.
        display_path: Label shown in error messages; defaults to ``path.name``.

    Returns:
        The decoded UTF-8 text of the file.

    Raises:
        SystemExit: if the file cannot be opened or read, exceeds
            ``MAX_VARIANT_PROMPT_FILE_BYTES``, or is not valid UTF-8.
    """
    label = display_path or path.name
    try:
        fd = _open_regular_no_symlink(path)
    except OSError as exc:
        detail = exc.strerror or exc.__class__.__name__
        raise SystemExit(f"{owner} variant_prompt_files could not read prompt file: {label}: {detail}") from None
    try:
        # Reject oversized files from metadata before reading any content.
        size = os.fstat(fd).st_size
        if size > MAX_VARIANT_PROMPT_FILE_BYTES:
            raise SystemExit(
                f"{owner} variant_prompt_files prompt file exceeds "
                f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes: {label}"
            )
        try:
            with os.fdopen(fd, "r", encoding="utf-8") as handle:
                # The file object owns the descriptor now; -1 tells the
                # finally clause below not to close it a second time.
                fd = -1
                text = handle.read()
        except UnicodeDecodeError as exc:
            raise SystemExit(
                f"{owner} variant_prompt_files prompt file must be UTF-8 text: "
                f"{label}: {exc.reason}"
            ) from None
    except OSError as exc:
        detail = exc.strerror or exc.__class__.__name__
        raise SystemExit(f"{owner} variant_prompt_files could not read prompt file: {label}: {detail}") from None
    finally:
        if fd != -1:
            os.close(fd)
    # Second cap on the bytes actually read, independent of the fstat size above.
    if len(text.encode("utf-8", errors="replace")) > MAX_VARIANT_PROMPT_FILE_BYTES:
        raise SystemExit(
            f"{owner} variant_prompt_files prompt text exceeds "
            f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes after decoding: {label}"
        )
    return text
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
def load_variant_prompt_files_for_targets(
    targets: list[tuple[TaskFixture, "Variant"]],
    *,
    task_file_dir: Path,
) -> None:
    """Read prompt files for the selected (task, variant) pairs only.

    For each pair whose task maps the variant name to a prompt file, the path
    is re-validated, resolved against ``task_file_dir``, read, and stored in
    ``task.variant_prompt_texts`` under the variant name.
    """
    for task, variant in targets:
        if variant.name not in task.variant_prompt_files:
            continue
        owner = f"task {task.id} variant {variant.name}"
        relative = validate_variant_prompt_file_path(
            task.variant_prompt_files[variant.name],
            owner=owner,
        )
        prompt_text = read_variant_prompt_file(
            task_file_dir / relative,
            owner=owner,
            display_path=str(relative),
        )
        task.variant_prompt_texts[variant.name] = prompt_text
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
def normalize_usage_token(value: Any) -> int | None:
    """Convert a reported token metric to an ``int``, or return None if unusable.

    Only non-bool ``int``/``float`` values that are finite and within
    ``[0, MAX_USAGE_TOKEN_COUNT]`` are accepted; floats are truncated by ``int()``.
    """
    if isinstance(value, bool):
        return None
    if not isinstance(value, (int, float)):
        return None
    try:
        as_float = float(value)
    except (OverflowError, ValueError):
        return None
    usable = math.isfinite(as_float) and 0 <= as_float <= MAX_USAGE_TOKEN_COUNT
    return int(as_float) if usable else None
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
def normalize_usage_cost(value: Any) -> float | None:
    """Convert a reported cost metric to a ``float``, or return None if unusable.

    Only non-bool ``int``/``float`` values that are finite and within
    ``[0, MAX_USAGE_COST_USD]`` are accepted.
    """
    if isinstance(value, bool):
        return None
    if not isinstance(value, (int, float)):
        return None
    try:
        as_float = float(value)
    except (OverflowError, ValueError):
        return None
    usable = math.isfinite(as_float) and 0 <= as_float <= MAX_USAGE_COST_USD
    return as_float if usable else None
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
def parse_tasks(path: Path, variants: list["Variant"] | None = None) -> list[TaskFixture]:
    """Load and validate the tasks fixture file.

    Args:
        path: JSON file containing a list of task objects.
        variants: When given, each task's ``variant_prompt_files`` keys and
            paths are validated against these variants.

    Returns:
        One ``TaskFixture`` per entry, in file order.

    Raises:
        SystemExit: if the file is not a list of objects, a task lacks ``id``
            or ``prompt``, or any field fails validation.
    """
    raw = json.loads(_read_text_no_follow(path))
    if not isinstance(raw, list):
        raise SystemExit(f"tasks file must be a JSON list: {path}")
    fixtures: list[TaskFixture] = []
    for item in raw:
        if not isinstance(item, dict):
            raise SystemExit(f"task entry must be a JSON object: {item}")
        # Report missing required fields as a fixture error, not a KeyError traceback.
        missing = [key for key in ("id", "prompt") if key not in item]
        if missing:
            raise SystemExit(
                f"task {item.get('id')} is missing required field(s): {', '.join(missing)}"
            )
        task_id = str(item["id"])
        owner = f"task {task_id}"
        effort_raw = item.get("effort")
        budget_raw = item.get("max_budget_usd")
        budget: float | None = None
        if budget_raw is not None:
            # bool is an int subclass; float(True) would silently become a $1.0 budget.
            if isinstance(budget_raw, bool):
                raise SystemExit(f"{owner} max_budget_usd must be number or null")
            try:
                budget = float(budget_raw)
            except (TypeError, ValueError, OverflowError):
                # OverflowError: JSON integers too large to represent as a float.
                raise SystemExit(f"{owner} max_budget_usd must be number or null") from None
            if not math.isfinite(budget) or budget <= 0:
                raise SystemExit(f"{owner} max_budget_usd must be finite and > 0 (use null for unlimited)")
        if "variant_prompts" in item:
            raise SystemExit(
                f"task {task_id} variant_prompts is not supported; use file-backed variant_prompt_files"
            )
        fixtures.append(TaskFixture(
            id=task_id,
            prompt=str(item["prompt"]),
            model=str(item.get("model", "sonnet")),
            # Keep None when effort was not specified so no implicit default is injected.
            effort=str(effort_raw) if effort_raw is not None else None,
            max_turns=parse_positive_int(item.get("max_turns", 3), field="max_turns", owner=owner),
            max_budget_usd=budget,
            allowed_tools=parse_string_list(
                item.get("allowed_tools", []),
                field="allowed_tools",
                owner=owner,
            ),
            success_command=item.get("success_command"),
            success_cwd=str(item.get("success_cwd", ".")),
            variant_prompt_files=parse_string_map(
                item.get("variant_prompt_files"),
                field="variant_prompt_files",
                owner=owner,
            ),
        ))
    if variants is not None:
        validate_variant_prompt_file_references(fixtures, variants)
    return fixtures
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
def parse_variants(path: Path) -> list[Variant]:
    """Load the benchmark variant definitions from a JSON file.

    The file must contain a JSON list of objects. Each object needs a
    ``name`` and may carry an ``extra_args`` list, which is passed through
    ``parse_string_list`` and ``validate_variant_extra_args``.

    Raises:
        SystemExit: if the top-level value is not a list, an entry is not a
            JSON object, or an entry has no ``name`` key.
    """
    raw = json.loads(_read_text_no_follow(path))
    if not isinstance(raw, list):
        raise SystemExit(f"variants file must be a JSON list: {path}")
    variants: list[Variant] = []
    for item in raw:
        if not isinstance(item, dict):
            raise SystemExit(f"variant entry must be a JSON object: {item}")
        # Report a missing name the same way as every other schema problem
        # instead of leaking a bare KeyError traceback to the CLI user.
        if "name" not in item:
            raise SystemExit(f"variant entry must include a name: {item}")
        name = str(item["name"])
        owner = f"variant {name}"
        extra_args = parse_string_list(
            item.get("extra_args", []),
            field="extra_args",
            owner=owner,
        )
        variants.append(Variant(
            name=name,
            extra_args=validate_variant_extra_args(extra_args, owner=owner),
        ))
    return variants
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
def collect_usage(payload: Any) -> tuple[dict[str, int], float, bool, bool]:
    """Extract token counts and cost from a `claude -p --output-format json` response.

    Intended policy: when one response carries both top-level usage and
    nested per-message usage, adding them up would double count and overstate
    cost. Therefore only the **first match** is taken for every bucket and
    for the cost (top-level first, then breadth-first order). If the response
    structure changes so that the first match is no longer the intended one,
    review the measured results per fixture/variant.

    Returns:
        ``(tokens, cost, cost_seen, primary_tokens_measured)`` where the last
        flag is true only when both the input and the output bucket were found.
    """
    tokens: dict[str, int] = {bucket: 0 for bucket, _ in USAGE_KEY_GROUPS}
    found: dict[str, bool] = {bucket: False for bucket, _ in USAGE_KEY_GROUPS}
    cost = 0.0
    cost_found = False
    # Breadth-first walk so a top-level dict is evaluated before nested ones.
    pending: collections.deque[Any] = collections.deque([payload])
    while pending:
        node = pending.popleft()
        if isinstance(node, list):
            pending.extend(node)
            continue
        if not isinstance(node, dict):
            continue
        for bucket, keys in USAGE_KEY_GROUPS:
            if found[bucket]:
                continue
            count = first_normalized_token(node, keys)
            if count is not None:
                tokens[bucket] = count
                found[bucket] = True
        if not cost_found:
            candidate = first_normalized_cost(node, COST_KEYS)
            if candidate is not None:
                cost = candidate
                cost_found = True
        pending.extend(node.values())
    # Token-savings claims need a comparable primary-token total. Cache
    # buckets may legitimately be absent, but input and output must both be
    # observed; otherwise a one-sided payload would count the missing side as
    # a measured zero and could overstate savings.
    primary_tokens_measured = found["input_tokens"] and found["output_tokens"]
    return tokens, cost, cost_found, primary_tokens_measured
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
def collect_provider_cache_telemetry(payload: Any) -> tuple[int, bool]:
    """Find provider-specific prompt-cache telemetry without touching token totals.

    OpenAI-style responses report cached prompt tokens under
    `usage.prompt_tokens_details.cached_tokens`. The figure is useful cache
    telemetry, but `prompt_tokens` may already include it, so it is kept apart
    from the primary token buckets and from ContextGuard savings claims.
    Anthropic-style `cache_read_input_tokens` stays in the regular
    `cache_read` bucket handled by `collect_usage`.

    Returns:
        ``(cached_tokens, True)`` for the first usable value met in
        breadth-first order, or ``(0, False)`` when none is present.
    """
    pending: collections.deque[Any] = collections.deque([payload])
    while pending:
        node = pending.popleft()
        if isinstance(node, list):
            pending.extend(node)
            continue
        if not isinstance(node, dict):
            continue
        for details_key in PROVIDER_CACHE_DETAIL_KEYS:
            details = node.get(details_key)
            if isinstance(details, dict):
                for cached_key in PROVIDER_CACHED_TOKEN_KEYS:
                    cached = normalize_usage_token(details.get(cached_key))
                    if cached is not None:
                        return cached, True
        pending.extend(node.values())
    return 0, False
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
def collect_provider_cached_tokens(payload: Any) -> int:
    """Return only the cached-token count from the provider cache telemetry."""
    return collect_provider_cache_telemetry(payload)[0]
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
def elapsed_seconds_since(start: float) -> float:
    """Return the non-negative monotonic-clock seconds elapsed since *start*."""
    elapsed = time.monotonic() - start
    return elapsed if elapsed > 0.0 else 0.0
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
def first_normalized_token(cur: dict[str, Any], keys: tuple[str, ...]) -> int | None:
    """Return the first value under *keys* that `normalize_usage_token` accepts.

    Keys are tried in order; ``None`` is returned when none of them yields a
    usable value.
    """
    normalized = (normalize_usage_token(cur.get(key)) for key in keys)
    return next((count for count in normalized if count is not None), None)
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
def first_normalized_cost(cur: dict[str, Any], keys: tuple[str, ...]) -> float | None:
    """Return the first value under *keys* that `normalize_usage_cost` accepts.

    Keys are tried in order; ``None`` is returned when none of them yields a
    usable value.
    """
    normalized = (normalize_usage_cost(cur.get(key)) for key in keys)
    return next((amount for amount in normalized if amount is not None), None)
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
def contains_external_source_tokens(value: Any) -> bool:
    """Report whether any dict reachable from *value* holds a source-token record.

    The structure is walked breadth-first through dict values and list items;
    a dict counts as a hit when one of the token key groups in
    ``EXTERNAL_SOURCE_KEY_GROUPS`` resolves to a usable token count.
    """
    pending: collections.deque[Any] = collections.deque([value])
    while pending:
        node = pending.popleft()
        if isinstance(node, dict):
            if any(
                first_normalized_token(node, token_keys) is not None
                for _source, token_keys, _cost_keys in EXTERNAL_SOURCE_KEY_GROUPS
            ):
                return True
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)
    return False
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
def collect_shift_metrics(payload: Any) -> dict[str, int | float | bool]:
    """Gather optional cost-shift / byte-saving metrics; none of them is required.

    Payloads report external work in one of two shapes: a single aggregate
    (`external_tokens` + `external_cost_usd`) or explicit per-source records
    (`auxiliary_*`, `subagent_*`, `provider_*`). The shapes are never mixed:
    an aggregate token count, when present, is authoritative; otherwise only
    source-token records are summed, and the cost is flagged as measured only
    when every positive source-token record carries its matching source cost.
    """
    metrics: dict[str, int | float | bool] = {bucket: 0 for bucket, _ in SHIFT_METRIC_KEY_GROUPS}
    resolved: dict[str, bool] = {bucket: False for bucket, _ in SHIFT_METRIC_KEY_GROUPS}
    metrics.update({
        "external_cost_usd": 0.0,
        "external_cost_measured": False,
        "external_tokens": 0,
        "external_tokens_measured": False,
    })
    aggregate_tokens: int | None = None
    aggregate_cost: float | None = None
    source_tokens = 0
    source_seen = False
    source_cost = 0.0
    source_cost_complete = True

    pending: collections.deque[Any] = collections.deque([payload])
    while pending:
        node = pending.popleft()
        if isinstance(node, list):
            pending.extend(node)
            continue
        if not isinstance(node, dict):
            continue

        # Plain shift metrics: the first usable value per bucket wins.
        for bucket, keys in SHIFT_METRIC_KEY_GROUPS:
            if not resolved[bucket]:
                found = first_normalized_token(node, keys)
                if found is not None:
                    metrics[bucket] = found
                    resolved[bucket] = True

        # Aggregate shape: the cost is read only from the dict that supplied
        # the aggregate token count.
        if aggregate_tokens is None:
            aggregate_tokens = first_normalized_token(node, EXTERNAL_TOKEN_AGGREGATE_KEYS)
            if aggregate_tokens is not None:
                aggregate_cost = first_normalized_cost(node, EXTERNAL_COST_AGGREGATE_KEYS)

        # Per-source shape: a dict is counted only when nothing nested below
        # it also carries source tokens.
        records = []
        for _source, token_keys, cost_keys in EXTERNAL_SOURCE_KEY_GROUPS:
            count = first_normalized_token(node, token_keys)
            if count is not None:
                records.append((count, cost_keys))
        if records and not any(contains_external_source_tokens(child) for child in node.values()):
            source_seen = True
            for count, cost_keys in records:
                source_tokens += count
                record_cost = first_normalized_cost(node, cost_keys)
                if record_cost is not None:
                    source_cost += record_cost
                elif count > 0:
                    source_cost_complete = False
        pending.extend(node.values())

    if aggregate_tokens is not None:
        metrics["external_tokens"] = aggregate_tokens
        metrics["external_tokens_measured"] = True
        metrics["external_cost_usd"] = 0.0 if aggregate_cost is None else aggregate_cost
        metrics["external_cost_measured"] = aggregate_cost is not None
    elif source_seen:
        metrics["external_tokens"] = source_tokens
        metrics["external_tokens_measured"] = True
        metrics["external_cost_usd"] = source_cost
        metrics["external_cost_measured"] = source_cost_complete
    return metrics
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
def normalize_self_hosted_metric(value: Any, *, maximum: float) -> float | None:
    """Validate one self-hosted metric value and return it as a float.

    Args:
        value: Raw value taken from a decoded JSON payload.
        maximum: Largest accepted value (inclusive).

    Returns:
        The value as ``float`` when it is a real number (not ``bool``), finite
        and within ``0..maximum``; otherwise ``None``.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    try:
        number = float(value)
    except OverflowError:
        # JSON integers are unbounded in Python; one too large for a float is
        # simply out of range, not a reason to crash the run.
        return None
    if not math.isfinite(number) or number < 0 or number > maximum:
        return None
    return number
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
def sanitize_self_hosted_label(value: Any) -> str | None:
    """Clean a self-hosted label string and cap its length.

    Non-string input, or text that is empty after ``sanitize_note_text``,
    yields ``None``. Text longer than ``MAX_SELF_HOSTED_LABEL_CHARS`` is cut
    and suffixed with a 12-character truncation marker.
    """
    if not isinstance(value, str):
        return None
    cleaned = sanitize_note_text(value)
    if not cleaned:
        return None
    if len(cleaned) <= MAX_SELF_HOSTED_LABEL_CHARS:
        return cleaned
    kept = cleaned[:MAX_SELF_HOSTED_LABEL_CHARS - 12].rstrip()
    return kept + "…[truncated]"
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
def normalize_self_hosted_metrics(raw: Any, *, source: str) -> dict[str, Any] | None:
    """Turn a raw self-hosted metrics envelope into the normalized record.

    Args:
        raw: Candidate envelope; anything that is not a dict is rejected.
        source: Label recorded in the result to say where the envelope came from.

    Returns:
        ``None`` when *raw* is not a dict or none of the three numeric metrics
        (latency, peak memory, quality score) validates. Otherwise a dict with
        the accepted metrics, the sanitized labels, a per-metric availability
        map and a claim boundary that disallows hosted-API savings claims.
    """
    if not isinstance(raw, dict):
        return None
    limits = (
        ("latency_ms", MAX_SELF_HOSTED_LATENCY_MS),
        ("peak_memory_mb", MAX_SELF_HOSTED_MEMORY_MB),
        ("quality_score", 1.0),
    )
    metrics: dict[str, float] = {}
    availability: dict[str, bool] = {}
    for metric_name, maximum in limits:
        number = normalize_self_hosted_metric(raw.get(metric_name), maximum=maximum)
        availability[metric_name] = number is not None
        if number is not None:
            metrics[metric_name] = number
    labels: dict[str, str] = {}
    for label_key in ("model_server", "optimization", "quality_metric"):
        label = sanitize_self_hosted_label(raw.get(label_key))
        if label is not None:
            labels[label_key] = label
    if not metrics:
        return None
    claim_boundary = {
        "id": SELF_HOSTED_METRICS_CLAIM_BOUNDARY,
        "hosted_api_token_savings_claim_allowed": False,
        "hosted_api_cost_savings_claim_allowed": False,
        "requires_provider_measured_matched_tasks_for_hosted_claims": True,
        "reason": (
            "Self-hosted local/model-server latency, memory, and quality metrics "
            "are not hosted API token or cost telemetry."
        ),
    }
    return {
        "schema_version": SELF_HOSTED_METRICS_SCHEMA_VERSION,
        "source": source,
        "metrics": metrics,
        "labels": labels,
        "measurement_availability": availability,
        "claim_boundary": claim_boundary,
    }
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
def collect_self_hosted_metrics(payload: Any) -> dict[str, Any] | None:
    """Pick up an explicit self-hosted metrics sidecar, with no broad key inference.

    Only two explicit envelopes are inspected: the payload's own
    ``SELF_HOSTED_METRICS_KEY`` entry and the same key inside a top-level
    ``metrics`` dict. Incidental keys such as `self_hosted_latency_ms` and
    arbitrary nested message content are deliberately ignored, because
    inferring from them would make it too easy to mix local/model-server
    telemetry into hosted API claim surfaces.

    Returns:
        The first envelope that normalizes successfully, or ``None``.
    """
    if not isinstance(payload, dict):
        return None
    envelopes = [("explicit_provider_payload", payload)]
    nested = payload.get("metrics")
    if isinstance(nested, dict):
        envelopes.append(("explicit_provider_payload.metrics", nested))
    for prefix, envelope in envelopes:
        normalized = normalize_self_hosted_metrics(
            envelope.get(SELF_HOSTED_METRICS_KEY),
            source=f"{prefix}.{SELF_HOSTED_METRICS_KEY}",
        )
        if normalized is not None:
            return normalized
    return None
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
def claude_version(claude_bin: str) -> str:
    """Return the first line printed by ``<claude_bin> --version``.

    The command runs through ``run_bounded_command`` with a 5 second timeout
    and an output cap of ``VERSION_OUTPUT_MAX_BYTES``.

    Returns:
        The first line of the stripped stdout, or ``"unknown"`` when the
        command cannot be run or prints nothing but whitespace.
    """
    try:
        proc = run_bounded_command(
            [claude_bin, "--version"],
            cwd=Path.cwd(),
            timeout_seconds=5,
            max_output_bytes=VERSION_OUTPUT_MAX_BYTES,
        )
    except (OSError, subprocess.TimeoutExpired, ValueError):
        return "unknown"
    # Whitespace-only output strips to "" and has no lines; indexing [0]
    # there would raise IndexError, so fall back explicitly.
    lines = (proc.stdout or "").strip().splitlines()
    return lines[0] if lines else "unknown"
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
def build_claude_argv(claude_bin: str, task: TaskFixture, variant: Variant) -> list[str]:
    """Build the `claude -p` argv for one task/variant pair.

    Options the fixture does not specify (effort, max_budget_usd) are left
    out of argv. That keeps the real meaning of the baseline variant
    (= provider defaults as they are) from being distorted by implicit
    runner defaults.

    The prompt is the variant-specific text when the task has one for this
    variant name, otherwise the task's own prompt; it is placed after ``--``.
    """
    optional_flags: list[str] = []
    if task.effort:
        optional_flags += ["--effort", task.effort]
    if task.max_budget_usd is not None:
        optional_flags += ["--max-budget-usd", str(task.max_budget_usd)]
    if task.allowed_tools:
        optional_flags += ["--allowedTools", ",".join(task.allowed_tools)]
    prompt = task.variant_prompt_texts.get(variant.name, task.prompt)
    return [
        claude_bin, "-p",
        "--model", task.model,
        "--max-turns", str(task.max_turns),
        "--output-format", "json",
        *optional_flags,
        *variant.extra_args,
        "--",
        prompt,
    ]
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
def executable_argv0(command: str) -> str:
    """Resolve *command* to the path string used as ``argv[0]``.

    A command found by ``shutil.which`` is returned as its fully resolved
    path. Otherwise the user-expanded path is returned unchanged when it is
    absolute, or resolved against the current directory when it is relative.
    """
    located = shutil.which(command)
    if located:
        return str(Path(located).expanduser().resolve())
    candidate = Path(command).expanduser()
    return str(candidate if candidate.is_absolute() else candidate.resolve())
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
def _signal_process_group(proc: subprocess.Popen[bytes], sig: int, pgid: int | None) -> None:
    """Send *sig* to the process group, falling back to the process itself.

    When *pgid* is given, ``os.killpg`` is tried first and the function
    returns once it succeeds. If that is unavailable or fails, the direct
    child gets ``proc.kill()`` for SIGKILL and ``proc.terminate()`` for any
    other signal. OS-level failures are ignored throughout (best effort).
    """
    if pgid is not None:
        try:
            os.killpg(pgid, sig)
        except (AttributeError, OSError):
            # No killpg on this platform, or the group is gone / not
            # signalable: try the child process directly instead.
            pass
        else:
            return
    fallback = proc.kill if sig == signal.SIGKILL else proc.terminate
    try:
        fallback()
    except OSError:
        pass
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
def run_bounded_command(
    argv: list[str],
    *,
    cwd: Path,
    timeout_seconds: int,
    max_output_bytes: int,
) -> BoundedProcessResult:
    """Run *argv* with a wall-clock timeout and a per-stream output cap.

    The child is started in its own session so the whole process group can be
    signalled. stdout and stderr are drained through a selector; each stream
    keeps at most *max_output_bytes* bytes. Going over the cap or past the
    deadline sends SIGTERM, followed by SIGKILL once
    ``PROCESS_TERMINATE_GRACE_SECONDS`` has passed.

    Args:
        argv: Command and arguments, executed without a shell.
        cwd: Working directory for the child.
        timeout_seconds: Deadline measured on the monotonic clock.
        max_output_bytes: Cap applied separately to stdout and stderr.

    Returns:
        A ``BoundedProcessResult`` with decoded output. ``returncode`` is
        forced to 124 on timeout and to 125 when output was truncated.

    Raises:
        OSError: if the process cannot be started or a pipe read fails.
    """
    proc = subprocess.Popen(
        argv,
        cwd=cwd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        start_new_session=True,
    )
    try:
        pgid = os.getpgid(proc.pid)
    except OSError:
        pgid = proc.pid
    selector = selectors.DefaultSelector()
    buffers: dict[str, bytearray] = {"stdout": bytearray(), "stderr": bytearray()}
    streams = {"stdout": proc.stdout, "stderr": proc.stderr}
    for name, stream in streams.items():
        if stream is None:
            continue
        try:
            os.set_blocking(stream.fileno(), False)
        except (AttributeError, OSError):
            pass
        selector.register(stream, selectors.EVENT_READ, name)

    timed_out = False
    output_truncated = False
    terminated_at: float | None = None
    sent_kill = False
    deadline = time.monotonic() + timeout_seconds
    try:
        # Loop until both pipes hit EOF (each is unregistered when it does).
        while selector.get_map():
            now = time.monotonic()
            if now >= deadline:
                timed_out = True
                if terminated_at is None:
                    _signal_process_group(proc, signal.SIGTERM, pgid)
                    terminated_at = now
            if terminated_at is not None and not sent_kill:
                if now - terminated_at >= PROCESS_TERMINATE_GRACE_SECONDS:
                    _signal_process_group(proc, signal.SIGKILL, pgid)
                    sent_kill = True
            if sent_kill and terminated_at is not None:
                # Pipes still open a full grace period after SIGKILL: stop
                # waiting for EOF and report the run as timed out.
                if now - terminated_at >= PROCESS_TERMINATE_GRACE_SECONDS * 2:
                    timed_out = True
                    break
            events = selector.select(timeout=0.05)
            for key, _ in events:
                name = key.data
                stream = key.fileobj
                try:
                    chunk = os.read(stream.fileno(), 65536)
                except BlockingIOError:
                    continue
                if not chunk:
                    selector.unregister(stream)
                    try:
                        stream.close()
                    except OSError:
                        pass
                    continue
                buffer = buffers[name]
                remaining = max_output_bytes - len(buffer)
                if remaining > 0:
                    buffer.extend(chunk[:remaining])
                if len(chunk) > remaining:
                    # Keep draining (and discarding) so the child is not
                    # blocked on a full pipe while it is being terminated.
                    output_truncated = True
                    if terminated_at is None:
                        _signal_process_group(proc, signal.SIGTERM, pgid)
                        terminated_at = time.monotonic()
    finally:
        selector.close()
        # Streams that never reached EOF (loop abandoned, or an error while
        # reading) are still open here; close them so their descriptors are
        # not leaked.
        for stream in streams.values():
            if stream is not None and not stream.closed:
                try:
                    stream.close()
                except OSError:
                    pass

    try:
        returncode = proc.wait(timeout=PROCESS_TERMINATE_GRACE_SECONDS)
    except subprocess.TimeoutExpired:
        _signal_process_group(proc, signal.SIGKILL, pgid)
        try:
            returncode = proc.wait(timeout=PROCESS_TERMINATE_GRACE_SECONDS)
        except subprocess.TimeoutExpired:
            returncode = 124
            timed_out = True
    if timed_out:
        returncode = 124
    elif output_truncated:
        returncode = 125
    return BoundedProcessResult(
        returncode=returncode,
        stdout=bytes(buffers["stdout"]).decode("utf-8", "replace"),
        stderr=bytes(buffers["stderr"]).decode("utf-8", "replace"),
        timed_out=timed_out,
        output_truncated=output_truncated,
    )
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
# shlex.split prevents shell injection, but it splits an input such as
# `true ; echo pwned` verbatim into `["true", ";", "echo", "pwned"]`;
# /usr/bin/true then ignores ";", "echo" and "pwned" as ordinary arguments and
# finishes with success=true, which is a false positive. So a command is
# rejected when a token of the shlex result looks like shell composition.
_SHELL_META_TOKENS = frozenset({";", "&&", "||", "|", "&", "<", ">", ">>", "<<", "<<<"})


def _has_shell_meta(argv: list[str]) -> bool:
    """Tell whether any argv token signals shell-composition intent.

    A token qualifies when it is exactly one of ``_SHELL_META_TOKENS`` or when
    it contains a command-substitution trace (``$(`` or a backtick).
    """
    return any(
        tok in _SHELL_META_TOKENS or "$(" in tok or "`" in tok
        for tok in argv
    )
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
def run_success_command(task: TaskFixture, project_root: Path) -> tuple[bool, str]:
    """Run the fixture's success_command and report ``(passed, note)``.

    - Only a single argv is executed, via `shlex.split` and no shell.
    - The command is rejected when a split token shows shell-composition
      intent (`;`, `&&`, `|`, `$()`, backticks, ...). `success_command` must
      be one verification command or the path of one helper script.
    - The command is rejected when `success_cwd` escapes project_root
      (cases like ..//../etc).

    A task without a success_command counts as passed.
    """
    command = task.success_command
    if not command:
        return True, "no success_command configured"
    try:
        argv = shlex.split(command)
    except ValueError as exc:
        return False, f"success_command parse error: {exc}"
    if not argv:
        return False, "success_command parsed to empty argv"
    if _has_shell_meta(argv):
        return False, "success_command contains shell-composition tokens (use a helper script)"

    root = project_root.resolve()
    cwd = (project_root / task.success_cwd).resolve()
    try:
        cwd.relative_to(root)
    except ValueError:
        return False, f"success_cwd escapes project_root: {cwd}"

    try:
        proc = run_bounded_command(
            argv,
            cwd=cwd,
            timeout_seconds=600,
            max_output_bytes=SUCCESS_COMMAND_OUTPUT_MAX_BYTES,
        )
    except (OSError, subprocess.TimeoutExpired, ValueError) as exc:
        return False, f"success_command failed to launch: {exc}"
    if proc.timed_out:
        return False, "success_command timed out after 600s"
    if proc.output_truncated:
        return False, f"success_command output limit exceeded ({SUCCESS_COMMAND_OUTPUT_MAX_BYTES} bytes)"
    passed = proc.returncode == 0
    return passed, f"exit={proc.returncode}"
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
def run_fixture(task: TaskFixture, variant: Variant, claude_bin: str,
                project_root: Path, dry_run: bool) -> RunResult:
    """Execute one task/variant pair and collect its measurements.

    Flow: build the argv; on dry-run return a successful zero-usage result
    that only records the command line; refuse to call the provider when the
    success_command is a placeholder; otherwise run the command with an
    1800 second timeout and an output cap, parse its JSON output, collect
    usage, cache, shift and self-hosted metrics, and finally run the task's
    success_command to decide ``success``.

    Every failure before metrics are collected yields a result with zeroed
    tokens and cost, ``success=False`` and an explanatory note.
    """
    argv = build_claude_argv(claude_bin, task, variant)
    started_at = time.monotonic()

    def unmeasured(success: bool, notes: str, wall_time: float) -> RunResult:
        # Result shape shared by dry-run and every early failure: no usage data.
        return RunResult(
            task_id=task.id, variant=variant.name, model=task.model, effort=task.effort,
            tokens={k: 0 for k, _ in USAGE_KEY_GROUPS}, cost_usd=0.0,
            success=success, notes=notes,
            wall_time_seconds=wall_time,
        )

    def failed(notes: str) -> RunResult:
        return unmeasured(False, notes, elapsed_seconds_since(started_at))

    if dry_run:
        return unmeasured(True, f"dry-run: {shlex.join(argv)}", 0.0)
    if is_placeholder_success_command(task.success_command):
        return failed(f"{PLACEHOLDER_SUCCESS_COMMAND_MARKER}; refusing to invoke provider")

    argv[0] = executable_argv0(argv[0])
    try:
        proc = run_bounded_command(
            argv,
            cwd=project_root,
            timeout_seconds=1800,
            max_output_bytes=CLAUDE_OUTPUT_MAX_BYTES,
        )
    except (OSError, subprocess.TimeoutExpired, ValueError) as exc:
        return failed(f"claude launch failed: {exc}")
    if proc.timed_out:
        return failed("claude timed out after 1800s")
    if proc.output_truncated:
        return failed(f"claude output limit exceeded ({CLAUDE_OUTPUT_MAX_BYTES} bytes)")
    if proc.returncode != 0:
        # Only the tail of stderr goes into the note.
        return failed(f"claude exit={proc.returncode}: {proc.stderr[-200:].strip()}")
    try:
        payload = json.loads(proc.stdout)
    except json.JSONDecodeError as exc:
        return failed(f"claude returned non-JSON: {exc.msg}")

    tokens, cost, cost_measured, primary_tokens_measured = collect_usage(payload)
    provider_cached_tokens, provider_cached_tokens_measured = collect_provider_cache_telemetry(payload)
    shift = collect_shift_metrics(payload)
    self_hosted_metrics = collect_self_hosted_metrics(payload)
    success, success_note = run_success_command(task, project_root)
    return RunResult(
        task_id=task.id, variant=variant.name, model=task.model, effort=task.effort,
        tokens=tokens, cost_usd=cost, success=success, notes=success_note,
        cost_measured=cost_measured,
        primary_tokens_measured=primary_tokens_measured,
        wall_time_seconds=elapsed_seconds_since(started_at),
        turns=int(shift["turns"]),
        hook_triggers=int(shift["hook_triggers"]),
        bytes_before=int(shift["bytes_before"]),
        bytes_after=int(shift["bytes_after"]),
        artifacts_used=int(shift["artifacts_used"]),
        external_tokens=int(shift["external_tokens"]),
        external_tokens_measured=bool(shift["external_tokens_measured"]),
        external_cost_usd=float(shift["external_cost_usd"]),
        external_cost_measured=bool(shift["external_cost_measured"]),
        provider_cached_tokens=provider_cached_tokens,
        provider_cached_tokens_measured=provider_cached_tokens_measured,
        self_hosted_metrics=self_hosted_metrics,
    )
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
def append_csv(csv_path: Path, claude_ver: str, result: RunResult, *, skip_existing: bool = False) -> bool:
    """Append one run result as a row of the results CSV.

    Runs under ``csv_file_lock``. With *skip_existing*, nothing is written
    when a row for the same ``(task_id, variant)`` is already present. An
    empty file gets the header first; a non-empty file has its header checked
    against ``CSV_COLUMNS`` before the row is appended.

    Returns:
        ``True`` when a row was written, ``False`` when it was skipped.
    """
    def flag(value: bool) -> str:
        return "true" if value else "false"

    with csv_file_lock(csv_path, create_parent=True):
        if skip_existing:
            existing = _read_existing_keys_unlocked(csv_path)
            if (result.task_id, result.variant) in existing:
                return False
        fd = _open_regular_no_symlink(
            csv_path,
            os.O_CREAT | os.O_APPEND | os.O_WRONLY,
            0o600,
            create_parent=True,
        )
        try:
            is_empty = os.fstat(fd).st_size == 0
            if not is_empty:
                validate_csv_schema(csv_path, read_csv_header_unlocked(csv_path))
            with os.fdopen(fd, "a", encoding="utf-8", newline="") as handle:
                # The file object owns the descriptor from here on; -1 tells
                # the finally clause not to close it a second time.
                fd = -1
                writer = csv.DictWriter(handle, fieldnames=CSV_COLUMNS)
                if is_empty:
                    writer.writeheader()
                counts = result.tokens
                total = sum(counts.values())
                shift_known = cost_shift_measured(result)
                # The combined cost stays blank unless both sides were measured.
                if shift_known:
                    combined_cost = f"{(result.cost_usd + result.external_cost_usd):.6f}"
                else:
                    combined_cost = ""
                writer.writerow({
                    "date": sanitize_csv_cell(_dt.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")),
                    "claude_version": sanitize_csv_cell(claude_ver),
                    "task_id": sanitize_csv_cell(result.task_id),
                    "variant": sanitize_csv_cell(result.variant),
                    "model": sanitize_csv_cell(result.model),
                    "effort": sanitize_csv_cell(result.effort),
                    "total_tokens": total,
                    "input_tokens": counts.get("input_tokens", 0),
                    "output_tokens": counts.get("output_tokens", 0),
                    "cache_read": counts.get("cache_read", 0),
                    "cache_creation": counts.get("cache_creation", 0),
                    "provider_cached_tokens": result.provider_cached_tokens,
                    "provider_cached_tokens_measured": flag(result.provider_cached_tokens_measured),
                    "cost_usd": f"{result.cost_usd:.6f}",
                    "cost_measured": flag(result.cost_measured),
                    "wall_time_seconds": f"{result.wall_time_seconds:.6f}",
                    "turns": result.turns,
                    "hook_triggers": result.hook_triggers,
                    "bytes_before": result.bytes_before,
                    "bytes_after": result.bytes_after,
                    "artifacts_used": result.artifacts_used,
                    "external_tokens": result.external_tokens,
                    "external_tokens_measured": flag(result.external_tokens_measured),
                    "external_cost_usd": f"{result.external_cost_usd:.6f}",
                    "external_cost_measured": flag(result.external_cost_measured),
                    "total_cost_with_shift_usd": combined_cost,
                    "success": flag(result.success),
                    "corrections": result.corrections,
                    "notes": sanitize_csv_note(result.notes),
                    "primary_tokens_measured": flag(result.primary_tokens_measured),
                })
        finally:
            if fd != -1:
                os.close(fd)
    return True
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
def cost_shift_measured(result: RunResult) -> bool:
    """Tell whether primary plus external cost can be reported as one total.

    That needs a measured primary cost, a measured external token count, and,
    unless that count is zero, a measured external cost.
    """
    no_external_work = result.external_tokens == 0
    return (
        result.cost_measured
        and result.external_tokens_measured
        and (no_external_work or result.external_cost_measured)
    )
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
def read_csv_header_unlocked(csv_path: Path) -> list[str] | None:
    """Read the first CSV row of *csv_path*, or ``None`` when the file has no rows.

    The file is opened through ``_open_regular_no_symlink``. As the name
    says, no lock is taken here.
    """
    descriptor = _open_regular_no_symlink(csv_path)
    owned = True
    try:
        with os.fdopen(descriptor, "r", encoding="utf-8", newline="") as handle:
            # The file object closes the descriptor from now on.
            owned = False
            return next(csv.reader(handle), None)
    finally:
        if owned:
            os.close(descriptor)
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
def validate_csv_schema(csv_path: Path, fieldnames: list[str] | None) -> None:
    """Fail loudly instead of appending/reporting across incompatible CSV schemas.

    A missing header (``fieldnames is None``) is accepted silently. Any header
    that differs from ``CSV_COLUMNS`` terminates the program via ``SystemExit``
    with a message naming the file and the expected header.
    """
    if fieldnames is None or fieldnames == CSV_COLUMNS:
        return
    expected_header = ",".join(CSV_COLUMNS)
    raise SystemExit(
        f"CSV schema mismatch for {csv_path}; start a new --csv file or migrate the header "
        f"to: {expected_header}"
    )
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
def write_text_no_follow(path: Path, text: str) -> None:
    """Replace the contents of ``path`` with ``text`` encoded as UTF-8.

    The descriptor comes from ``_open_regular_no_symlink`` with create and
    truncate flags, mode ``0o600`` and ``create_parent=True``.
    """
    open_flags = os.O_CREAT | os.O_TRUNC | os.O_WRONLY
    raw_fd = _open_regular_no_symlink(path, open_flags, 0o600, create_parent=True)
    try:
        with os.fdopen(raw_fd, "w", encoding="utf-8") as sink:
            # Ownership moved to the file object; skip the manual close below.
            raw_fd = -1
            sink.write(text)
    finally:
        if raw_fd != -1:
            os.close(raw_fd)
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) -> None:
    """Append one benchmark run to the ledger at ``path`` as a single JSON line.

    The record carries the run's identifiers, costs, token counts, byte
    metrics and a ``measurement_availability`` map saying which of those
    figures were actually measured. ``total_cost_with_shift_usd`` is ``None``
    unless ``cost_shift_measured(result)`` holds, and ``self_hosted_metrics``
    is only included when the run provides it. The write happens while
    holding ``csv_file_lock(path)``, in append mode with file mode ``0o600``.
    """
    shifted_cost_known = cost_shift_measured(result)
    byte_metrics_observed = bool(result.bytes_before or result.bytes_after)
    has_self_hosted = result.self_hosted_metrics is not None

    if shifted_cost_known:
        total_with_shift = round(result.cost_usd + result.external_cost_usd, 6)
    else:
        total_with_shift = None

    availability = {
        "primary_tokens": result.primary_tokens_measured,
        "primary_cost": result.cost_measured,
        "external_tokens": result.external_tokens_measured,
        "external_cost": result.external_cost_measured,
        "shifted_cost": shifted_cost_known,
        "provider_cache": result.provider_cached_tokens_measured,
        "byte_metrics": byte_metrics_observed,
        "wall_time": result.wall_time_seconds >= 0,
        "self_hosted_metrics": has_self_hosted,
    }
    # Byte-derived figures are a proxy, not hosted token savings; the record
    # states that boundary explicitly.
    proxy_metrics = {
        "byte_metrics_observed": byte_metrics_observed,
        "token_proxy": "chars_div_4",
        "bytes_per_token": TOKEN_PROXY_BYTES_PER_TOKEN,
        "claim_boundary": "proxy_only_not_hosted_token_savings",
    }
    payload = {
        "schema_version": BENCH_RUN_EVIDENCE_SCHEMA_VERSION,
        "date": _dt.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
        "claude_version": claude_ver,
        "task_id": result.task_id,
        "variant": result.variant,
        "transform_id": result.variant,
        "success": result.success,
        "primary_cost_measured": result.cost_measured,
        "primary_cost_usd": round(result.cost_usd, 6),
        "primary_tokens_measured": result.primary_tokens_measured,
        "provider_cached_tokens": result.provider_cached_tokens,
        "provider_cached_tokens_measured": result.provider_cached_tokens_measured,
        "wall_time_seconds": round(result.wall_time_seconds, 6),
        "external_tokens_measured": result.external_tokens_measured,
        "external_cost_measured": result.external_cost_measured,
        "external_cost_usd": round(result.external_cost_usd, 6),
        "total_cost_with_shift_usd": total_with_shift,
        "primary_tokens": sum(result.tokens.values()),
        "external_tokens": result.external_tokens,
        "artifacts_used": result.artifacts_used,
        "bytes_before": result.bytes_before,
        "bytes_after": result.bytes_after,
        "hook_triggers": result.hook_triggers,
        "turns": result.turns,
        "notes": sanitize_csv_note(result.notes),
        "measurement_availability": availability,
        "proxy_metrics": proxy_metrics,
    }
    if has_self_hosted:
        payload["self_hosted_metrics"] = result.self_hosted_metrics

    append_flags = os.O_CREAT | os.O_APPEND | os.O_WRONLY
    with csv_file_lock(path, create_parent=True):
        raw_fd = _open_regular_no_symlink(path, append_flags, 0o600, create_parent=True)
        try:
            with os.fdopen(raw_fd, "a", encoding="utf-8") as ledger:
                # The file object owns the descriptor from here on.
                raw_fd = -1
                ledger.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n")
        finally:
            if raw_fd != -1:
                os.close(raw_fd)
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
def _read_existing_keys_unlocked(csv_path: Path) -> set[tuple[str, str]]:
    """Collect the ``(task_id, variant)`` pairs present in ``csv_path``.

    A missing file yields an empty set. The header is checked with
    ``validate_csv_schema`` before any row is read, and rows with an empty
    ``task_id`` or ``variant`` are ignored. No lock is taken here.
    """
    try:
        raw_fd = _open_regular_no_symlink(csv_path)
    except FileNotFoundError:
        return set()

    seen: set[tuple[str, str]] = set()
    try:
        with os.fdopen(raw_fd, "r", encoding="utf-8", newline="") as stream:
            # The file object owns the descriptor now.
            raw_fd = -1
            reader = csv.DictReader(stream)
            header = reader.fieldnames
            validate_csv_schema(csv_path, None if header is None else list(header))
            for record in reader:
                task = record.get("task_id")
                variant = record.get("variant")
                if task and variant:
                    seen.add((task, variant))
    finally:
        if raw_fd != -1:
            os.close(raw_fd)
    return seen
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
def _csv_exists_no_follow(csv_path: Path) -> bool:
    """Probe the CSV itself without following symlinks or creating a sidecar lock.

    Returns ``False`` only when opening raises ``FileNotFoundError``; any
    other error from the open propagates to the caller.
    """
    try:
        probe_fd = _open_regular_no_symlink(csv_path)
    except FileNotFoundError:
        return False
    # The descriptor was only needed as an existence probe.
    os.close(probe_fd)
    return True
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
def existing_keys(csv_path: Path) -> set[tuple[str, str]]:
    """Return the (task_id, variant) pairs already recorded in the CSV.

    Used on resume to decide which runs can be skipped. A CSV that does not
    exist yields an empty set without creating a lock for it.
    """
    if _csv_exists_no_follow(csv_path):
        with csv_file_lock(csv_path, create_parent=False):
            return _read_existing_keys_unlocked(csv_path)
    return set()
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
def read_csv_rows(csv_path: Path) -> list[dict[str, str]]:
    """Load every data row of ``csv_path`` as a dict keyed by column name.

    A missing file yields an empty list. The header is checked with
    ``validate_csv_schema`` first, and the program exits via ``SystemExit``
    as soon as the file turns out to hold more than ``MAX_CSV_ROWS`` rows.
    """
    try:
        raw_fd = _open_regular_no_symlink(csv_path)
    except FileNotFoundError:
        return []

    try:
        with os.fdopen(raw_fd, "r", encoding="utf-8", newline="") as stream:
            # The file object owns the descriptor now.
            raw_fd = -1
            reader = csv.DictReader(stream)
            header = reader.fieldnames
            validate_csv_schema(csv_path, None if header is None else list(header))
            collected: list[dict[str, str]] = []
            for record in reader:
                # Reject the row that would push the list past the cap.
                if len(collected) >= MAX_CSV_ROWS:
                    raise SystemExit(f"CSV row limit exceeded for {csv_path}: > {MAX_CSV_ROWS}")
                collected.append(record)
            return collected
    finally:
        if raw_fd != -1:
            os.close(raw_fd)
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
def row_int(row: dict[str, str], key: str) -> int:
    """Read ``row[key]`` as an integer, falling back to ``0``.

    The cell is parsed as a float first and then truncated toward zero, so
    values such as ``"3.9"`` or ``"1e3"`` are accepted. Missing, empty,
    unparsable, NaN and infinite cells all give ``0``.
    """
    try:
        number = float(row.get(key) or 0)
        # int() raises for NaN (ValueError) and infinities (OverflowError).
        return int(number)
    except (TypeError, ValueError, OverflowError):
        return 0
|
|
1495
|
-
|
|
1496
|
-
|
|
1497
|
-
def row_optional_nonnegative_int(row: dict[str, str], key: str) -> int | None:
    """Read ``row[key]`` as a non-negative integer, or ``None`` if it is not one.

    Only cells that, after trimming surrounding whitespace, consist solely of
    the ASCII digits 0-9 are accepted. Signs, decimal points, exponents,
    empty cells and missing keys all give ``None``.
    """
    cell = row.get(key)
    if cell is None:
        return None
    digits = str(cell).strip()
    # ASCII-only plus isdigit() means: non-empty and made of 0-9 exclusively.
    if not (digits.isascii() and digits.isdigit()):
        return None
    try:
        return int(digits)
    except (TypeError, ValueError, OverflowError):
        return None
|
|
1508
|
-
|
|
1509
|
-
|
|
1510
|
-
def row_float(row: dict[str, str], key: str) -> float:
    """Read ``row[key]`` as a finite float, falling back to ``0.0``.

    Missing, empty and unparsable cells, as well as NaN and infinities, all
    give ``0.0``.
    """
    try:
        parsed = float(row.get(key) or 0)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    if math.isfinite(parsed):
        return parsed
    return 0.0
|
|
1516
|
-
|
|
1517
|
-
|
|
1518
|
-
def row_optional_float(row: dict[str, str], key: str) -> float | None:
    """Read ``row[key]`` as a finite float, or ``None`` when no such value exists.

    Unlike ``row_float`` this keeps "absent" distinct from zero: missing keys,
    blank cells, unparsable text, NaN and infinities all give ``None``.
    """
    cell = row.get(key)
    if cell is None or not str(cell).strip():
        return None
    try:
        parsed = float(cell)
    except (TypeError, ValueError, OverflowError):
        return None
    if math.isfinite(parsed):
        return parsed
    return None
|
|
1527
|
-
|
|
1528
|
-
|
|
1529
|
-
def row_has_finite_float(row: dict[str, str], key: str) -> bool:
    """Tell whether ``row[key]`` holds a value ``row_optional_float`` can parse."""
    parsed = row_optional_float(row, key)
    return parsed is not None
|
|
1531
|
-
|
|
1532
|
-
|
|
1533
|
-
def row_bool(row: dict[str, str], key: str) -> bool:
    """Read ``row[key]`` as a flag: true only for the text ``true``.

    The comparison ignores case and surrounding whitespace. Every other
    value, including a missing or empty cell, is false.
    """
    cell = row.get(key) or ""
    normalized = str(cell).strip().lower()
    return normalized == "true"
|
|
1535
|
-
|
|
1536
|
-
|
|
1537
|
-
def row_success(row: dict[str, str]) -> bool:
    """Tell whether the row's ``success`` column reads ``true``.

    The comparison ignores case and surrounding whitespace. A missing or
    empty cell counts as not successful.
    """
    cell = row.get("success") or ""
    normalized = str(cell).strip().lower()
    return normalized == "true"
|
|
1539
|
-
|
|
1540
|
-
|
|
1541
|
-
def row_cost_shift_measured(row: dict[str, str]) -> bool:
    """Tell whether a CSV row's combined (primary + external) cost is known.

    This applies the same rule as ``cost_shift_measured`` to a CSV row: the
    primary cost and the external token count must both be flagged as
    measured, and either no external tokens were recorded or the external
    cost is flagged as measured too.
    """
    if not row_bool(row, "cost_measured"):
        return False
    if not row_bool(row, "external_tokens_measured"):
        return False
    if row_int(row, "external_tokens") == 0:
        # Nothing was spent externally, so there is no external cost to know.
        return True
    return row_bool(row, "external_cost_measured")
|
|
1547
|
-
|
|
1548
|
-
|
|
1549
|
-
def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str) -> dict[str, Any]:
|
|
1550
|
-
by_variant: dict[str, dict[str, Any]] = {}
|
|
1551
|
-
successful_rows_by_variant_task: dict[str, dict[str, list[dict[str, str]]]] = {}
|
|
1552
|
-
seen_tasks_by_variant: dict[str, set[str]] = {}
|
|
1553
|
-
successful_tasks_by_variant: dict[str, set[str]] = {}
|
|
1554
|
-
|
|
1555
|
-
for row_index, raw_row in enumerate(rows, start=1):
|
|
1556
|
-
row = dict(raw_row)
|
|
1557
|
-
row["_row_index"] = str(row_index)
|
|
1558
|
-
variant = row.get("variant") or "unknown"
|
|
1559
|
-
task_id = row.get("task_id") or "unknown"
|
|
1560
|
-
seen_tasks_by_variant.setdefault(variant, set()).add(task_id)
|
|
1561
|
-
bucket = by_variant.setdefault(
|
|
1562
|
-
variant,
|
|
1563
|
-
{
|
|
1564
|
-
"runs": 0,
|
|
1565
|
-
"successful_runs": 0,
|
|
1566
|
-
"failed_runs": 0,
|
|
1567
|
-
"total_tokens_all_runs": 0,
|
|
1568
|
-
"primary_tokens_measured_runs": 0,
|
|
1569
|
-
"primary_cost_all_runs_usd": 0.0,
|
|
1570
|
-
"primary_cost_measured_runs": 0,
|
|
1571
|
-
"wall_time_seconds_all_runs": 0.0,
|
|
1572
|
-
"wall_time_seconds_measured_runs": 0,
|
|
1573
|
-
"provider_cached_tokens_all_runs": 0,
|
|
1574
|
-
"provider_cached_tokens_measured_runs": 0,
|
|
1575
|
-
"total_cost_with_shift_all_runs_usd": 0.0,
|
|
1576
|
-
"total_cost_with_shift_measured_runs": 0,
|
|
1577
|
-
"total_tokens_successful": 0,
|
|
1578
|
-
"primary_tokens_measured_successful": 0,
|
|
1579
|
-
"primary_cost_successful_usd": 0.0,
|
|
1580
|
-
"primary_cost_measured_successful": 0,
|
|
1581
|
-
"wall_time_seconds_successful": 0.0,
|
|
1582
|
-
"wall_time_seconds_measured_successful": 0,
|
|
1583
|
-
"provider_cached_tokens_successful": 0,
|
|
1584
|
-
"provider_cached_tokens_measured_successful": 0,
|
|
1585
|
-
"external_cost_successful_usd": 0.0,
|
|
1586
|
-
"external_cost_unknown_successful": 0,
|
|
1587
|
-
"total_cost_with_shift_successful_usd": 0.0,
|
|
1588
|
-
"total_cost_with_shift_measured_successful": 0,
|
|
1589
|
-
"external_tokens_successful": 0,
|
|
1590
|
-
"external_tokens_measured_successful": 0,
|
|
1591
|
-
"artifacts_used_successful": 0,
|
|
1592
|
-
"corrections_successful": 0,
|
|
1593
|
-
"bytes_before_successful": 0,
|
|
1594
|
-
"bytes_after_successful": 0,
|
|
1595
|
-
"turns_successful": 0,
|
|
1596
|
-
"hook_triggers_successful": 0,
|
|
1597
|
-
},
|
|
1598
|
-
)
|
|
1599
|
-
bucket["runs"] += 1
|
|
1600
|
-
bucket["total_tokens_all_runs"] += row_int(row, "total_tokens")
|
|
1601
|
-
if row_bool(row, "primary_tokens_measured"):
|
|
1602
|
-
bucket["primary_tokens_measured_runs"] += 1
|
|
1603
|
-
bucket["wall_time_seconds_all_runs"] += row_float(row, "wall_time_seconds")
|
|
1604
|
-
if row_has_finite_float(row, "wall_time_seconds"):
|
|
1605
|
-
bucket["wall_time_seconds_measured_runs"] += 1
|
|
1606
|
-
bucket["provider_cached_tokens_all_runs"] += row_int(row, "provider_cached_tokens")
|
|
1607
|
-
if row_bool(row, "provider_cached_tokens_measured"):
|
|
1608
|
-
bucket["provider_cached_tokens_measured_runs"] += 1
|
|
1609
|
-
if row_bool(row, "cost_measured"):
|
|
1610
|
-
bucket["primary_cost_all_runs_usd"] += row_float(row, "cost_usd")
|
|
1611
|
-
bucket["primary_cost_measured_runs"] += 1
|
|
1612
|
-
shifted_cost = row_optional_float(row, "total_cost_with_shift_usd")
|
|
1613
|
-
if row_cost_shift_measured(row) and shifted_cost is not None:
|
|
1614
|
-
bucket["total_cost_with_shift_all_runs_usd"] += shifted_cost
|
|
1615
|
-
bucket["total_cost_with_shift_measured_runs"] += 1
|
|
1616
|
-
if not row_success(row):
|
|
1617
|
-
bucket["failed_runs"] += 1
|
|
1618
|
-
continue
|
|
1619
|
-
bucket["successful_runs"] += 1
|
|
1620
|
-
successful_tasks_by_variant.setdefault(variant, set()).add(task_id)
|
|
1621
|
-
successful_rows_by_variant_task.setdefault(variant, {}).setdefault(task_id, []).append(row)
|
|
1622
|
-
bucket["total_tokens_successful"] += row_int(row, "total_tokens")
|
|
1623
|
-
if row_bool(row, "primary_tokens_measured"):
|
|
1624
|
-
bucket["primary_tokens_measured_successful"] += 1
|
|
1625
|
-
bucket["wall_time_seconds_successful"] += row_float(row, "wall_time_seconds")
|
|
1626
|
-
if row_has_finite_float(row, "wall_time_seconds"):
|
|
1627
|
-
bucket["wall_time_seconds_measured_successful"] += 1
|
|
1628
|
-
bucket["provider_cached_tokens_successful"] += row_int(row, "provider_cached_tokens")
|
|
1629
|
-
if row_bool(row, "provider_cached_tokens_measured"):
|
|
1630
|
-
bucket["provider_cached_tokens_measured_successful"] += 1
|
|
1631
|
-
if row_bool(row, "cost_measured"):
|
|
1632
|
-
bucket["primary_cost_successful_usd"] += row_float(row, "cost_usd")
|
|
1633
|
-
bucket["primary_cost_measured_successful"] += 1
|
|
1634
|
-
if row_bool(row, "external_tokens_measured") and (
|
|
1635
|
-
row_int(row, "external_tokens") == 0 or row_bool(row, "external_cost_measured")
|
|
1636
|
-
):
|
|
1637
|
-
bucket["external_cost_successful_usd"] += row_float(row, "external_cost_usd")
|
|
1638
|
-
else:
|
|
1639
|
-
bucket["external_cost_unknown_successful"] += 1
|
|
1640
|
-
if row_cost_shift_measured(row) and shifted_cost is not None:
|
|
1641
|
-
bucket["total_cost_with_shift_successful_usd"] += shifted_cost
|
|
1642
|
-
bucket["total_cost_with_shift_measured_successful"] += 1
|
|
1643
|
-
if row_bool(row, "external_tokens_measured"):
|
|
1644
|
-
bucket["external_tokens_successful"] += row_int(row, "external_tokens")
|
|
1645
|
-
bucket["external_tokens_measured_successful"] += 1
|
|
1646
|
-
bucket["artifacts_used_successful"] += row_int(row, "artifacts_used")
|
|
1647
|
-
bucket["corrections_successful"] += row_int(row, "corrections")
|
|
1648
|
-
bucket["bytes_before_successful"] += row_int(row, "bytes_before")
|
|
1649
|
-
bucket["bytes_after_successful"] += row_int(row, "bytes_after")
|
|
1650
|
-
bucket["turns_successful"] += row_int(row, "turns")
|
|
1651
|
-
bucket["hook_triggers_successful"] += row_int(row, "hook_triggers")
|
|
1652
|
-
|
|
1653
|
-
for variant, bucket in by_variant.items():
|
|
1654
|
-
successes = bucket["successful_runs"]
|
|
1655
|
-
runs = bucket["runs"]
|
|
1656
|
-
bucket["failure_rate"] = (bucket["failed_runs"] / runs) if runs else None
|
|
1657
|
-
bucket["task_count"] = len(seen_tasks_by_variant.get(variant, set()))
|
|
1658
|
-
bucket["successful_task_count"] = len(successful_tasks_by_variant.get(variant, set()))
|
|
1659
|
-
if bucket["task_count"]:
|
|
1660
|
-
bucket["tokens_per_task_including_failures"] = (
|
|
1661
|
-
bucket["total_tokens_all_runs"] / bucket["task_count"]
|
|
1662
|
-
if bucket["primary_tokens_measured_runs"] == runs
|
|
1663
|
-
else None
|
|
1664
|
-
)
|
|
1665
|
-
bucket["wall_time_seconds_per_task_including_failures"] = (
|
|
1666
|
-
bucket["wall_time_seconds_all_runs"] / bucket["task_count"]
|
|
1667
|
-
)
|
|
1668
|
-
bucket["provider_cached_tokens_per_task_including_failures"] = (
|
|
1669
|
-
bucket["provider_cached_tokens_all_runs"] / bucket["task_count"]
|
|
1670
|
-
)
|
|
1671
|
-
if bucket["primary_cost_measured_runs"] == runs:
|
|
1672
|
-
bucket["primary_cost_per_task_including_failures_usd"] = (
|
|
1673
|
-
bucket["primary_cost_all_runs_usd"] / bucket["task_count"]
|
|
1674
|
-
)
|
|
1675
|
-
else:
|
|
1676
|
-
bucket["primary_cost_per_task_including_failures_usd"] = None
|
|
1677
|
-
if bucket["total_cost_with_shift_measured_runs"] == runs:
|
|
1678
|
-
bucket["total_cost_with_shift_per_task_including_failures_usd"] = (
|
|
1679
|
-
bucket["total_cost_with_shift_all_runs_usd"] / bucket["task_count"]
|
|
1680
|
-
)
|
|
1681
|
-
else:
|
|
1682
|
-
bucket["total_cost_with_shift_per_task_including_failures_usd"] = None
|
|
1683
|
-
else:
|
|
1684
|
-
bucket["tokens_per_task_including_failures"] = None
|
|
1685
|
-
bucket["wall_time_seconds_per_task_including_failures"] = None
|
|
1686
|
-
bucket["provider_cached_tokens_per_task_including_failures"] = None
|
|
1687
|
-
bucket["primary_cost_per_task_including_failures_usd"] = None
|
|
1688
|
-
bucket["total_cost_with_shift_per_task_including_failures_usd"] = None
|
|
1689
|
-
if successes:
|
|
1690
|
-
bucket["tokens_per_successful_task"] = (
|
|
1691
|
-
bucket["total_tokens_successful"] / successes
|
|
1692
|
-
if bucket["primary_tokens_measured_successful"] == successes
|
|
1693
|
-
else None
|
|
1694
|
-
)
|
|
1695
|
-
bucket["wall_time_seconds_per_successful_task"] = bucket["wall_time_seconds_successful"] / successes
|
|
1696
|
-
bucket["provider_cached_tokens_per_successful_task"] = (
|
|
1697
|
-
bucket["provider_cached_tokens_successful"] / successes
|
|
1698
|
-
)
|
|
1699
|
-
if bucket["primary_cost_measured_successful"] == successes:
|
|
1700
|
-
bucket["primary_cost_per_successful_task_usd"] = (
|
|
1701
|
-
bucket["primary_cost_successful_usd"] / successes
|
|
1702
|
-
)
|
|
1703
|
-
else:
|
|
1704
|
-
bucket["primary_cost_per_successful_task_usd"] = None
|
|
1705
|
-
if bucket["total_cost_with_shift_measured_successful"] == successes:
|
|
1706
|
-
bucket["total_cost_with_shift_per_successful_task_usd"] = (
|
|
1707
|
-
bucket["total_cost_with_shift_successful_usd"] / successes
|
|
1708
|
-
)
|
|
1709
|
-
else:
|
|
1710
|
-
bucket["total_cost_with_shift_per_successful_task_usd"] = None
|
|
1711
|
-
bucket["external_tokens_per_successful_task"] = (
|
|
1712
|
-
bucket["external_tokens_successful"] / successes
|
|
1713
|
-
if bucket["external_tokens_measured_successful"] == successes
|
|
1714
|
-
else None
|
|
1715
|
-
)
|
|
1716
|
-
bucket["artifacts_used_per_successful_task"] = bucket["artifacts_used_successful"] / successes
|
|
1717
|
-
bucket["corrections_per_successful_task"] = bucket["corrections_successful"] / successes
|
|
1718
|
-
before = bucket["bytes_before_successful"]
|
|
1719
|
-
after = bucket["bytes_after_successful"]
|
|
1720
|
-
bucket["byte_reduction_ratio"] = (after / before) if before else None
|
|
1721
|
-
else:
|
|
1722
|
-
bucket["tokens_per_successful_task"] = None
|
|
1723
|
-
bucket["wall_time_seconds_per_successful_task"] = None
|
|
1724
|
-
bucket["provider_cached_tokens_per_successful_task"] = None
|
|
1725
|
-
bucket["primary_cost_per_successful_task_usd"] = None
|
|
1726
|
-
bucket["total_cost_with_shift_per_successful_task_usd"] = None
|
|
1727
|
-
bucket["external_tokens_per_successful_task"] = None
|
|
1728
|
-
bucket["artifacts_used_per_successful_task"] = None
|
|
1729
|
-
bucket["corrections_per_successful_task"] = None
|
|
1730
|
-
bucket["byte_reduction_ratio"] = None
|
|
1731
|
-
|
|
1732
|
-
# 각 variant는 하나의 compression strategy를 대표한다. byte 절감/토큰 proxy/
|
|
1733
|
-
# 텔레메트리 증거 등급을 보수적으로(additive) 노출한다. 토큰 proxy는 측정된
|
|
1734
|
-
# 모델 토큰이 아니라 byte delta 기반 추정치이므로 evidence="inferred"로 둔다.
|
|
1735
|
-
bucket["compression_strategy"] = variant
|
|
1736
|
-
bucket["is_baseline_strategy"] = variant == baseline_variant
|
|
1737
|
-
bytes_before = bucket["bytes_before_successful"]
|
|
1738
|
-
bytes_after = bucket["bytes_after_successful"]
|
|
1739
|
-
byte_metrics_present = bool(bytes_before or bytes_after)
|
|
1740
|
-
if successes and byte_metrics_present:
|
|
1741
|
-
bytes_saved = max(0, bytes_before - bytes_after)
|
|
1742
|
-
token_proxy_saved = bytes_saved // TOKEN_PROXY_BYTES_PER_TOKEN
|
|
1743
|
-
bucket["bytes_saved_successful"] = bytes_saved
|
|
1744
|
-
bucket["bytes_saved_per_successful_task"] = bytes_saved / successes
|
|
1745
|
-
bucket["byte_savings_pct"] = ((bytes_before - bytes_after) / bytes_before * 100.0) if bytes_before else None
|
|
1746
|
-
bucket["token_proxy_saved_successful"] = token_proxy_saved
|
|
1747
|
-
bucket["token_proxy_saved_per_successful_task"] = token_proxy_saved / successes
|
|
1748
|
-
else:
|
|
1749
|
-
bucket["bytes_saved_successful"] = None
|
|
1750
|
-
bucket["bytes_saved_per_successful_task"] = None
|
|
1751
|
-
bucket["byte_savings_pct"] = None
|
|
1752
|
-
bucket["token_proxy_saved_successful"] = None
|
|
1753
|
-
bucket["token_proxy_saved_per_successful_task"] = None
|
|
1754
|
-
bucket["observed_telemetry"] = {
|
|
1755
|
-
"tokens": (
|
|
1756
|
-
"observed" if runs and bucket["primary_tokens_measured_runs"] == runs
|
|
1757
|
-
else ("partial" if bucket["primary_tokens_measured_runs"] else "unavailable")
|
|
1758
|
-
),
|
|
1759
|
-
"primary_cost": (
|
|
1760
|
-
"observed" if runs and bucket["primary_cost_measured_runs"] == runs
|
|
1761
|
-
else ("partial" if bucket["primary_cost_measured_runs"] else "unavailable")
|
|
1762
|
-
),
|
|
1763
|
-
"external_tokens": (
|
|
1764
|
-
"observed" if successes and bucket["external_tokens_measured_successful"] == successes
|
|
1765
|
-
else ("partial" if bucket["external_tokens_measured_successful"] else "unavailable")
|
|
1766
|
-
),
|
|
1767
|
-
"byte_savings": "observed" if byte_metrics_present else "unavailable",
|
|
1768
|
-
"token_proxy": "inferred" if (successes and byte_metrics_present) else "unavailable",
|
|
1769
|
-
"wall_time": (
|
|
1770
|
-
"observed" if runs and bucket["wall_time_seconds_measured_runs"] == runs
|
|
1771
|
-
else ("partial" if bucket["wall_time_seconds_measured_runs"] else "unavailable")
|
|
1772
|
-
),
|
|
1773
|
-
"provider_cache": (
|
|
1774
|
-
"observed" if runs and bucket["provider_cached_tokens_measured_runs"] == runs
|
|
1775
|
-
else ("partial" if bucket["provider_cached_tokens_measured_runs"] else "unavailable")
|
|
1776
|
-
),
|
|
1777
|
-
}
|
|
1778
|
-
|
|
1779
|
-
def average_task_metric(variant: str, task_id: str, key: str) -> float | None:
|
|
1780
|
-
values = [
|
|
1781
|
-
row_optional_float(row, key)
|
|
1782
|
-
for row in successful_rows_by_variant_task.get(variant, {}).get(task_id, [])
|
|
1783
|
-
]
|
|
1784
|
-
known = [value for value in values if value is not None]
|
|
1785
|
-
return (sum(known) / len(known)) if known else None
|
|
1786
|
-
|
|
1787
|
-
def average_task_int_metric(variant: str, task_id: str, key: str) -> float | None:
|
|
1788
|
-
rows_for_task = successful_rows_by_variant_task.get(variant, {}).get(task_id, [])
|
|
1789
|
-
if not rows_for_task:
|
|
1790
|
-
return None
|
|
1791
|
-
values = [row_optional_nonnegative_int(row, key) for row in rows_for_task]
|
|
1792
|
-
if any(value is None for value in values):
|
|
1793
|
-
return None
|
|
1794
|
-
return sum(value for value in values if value is not None) / len(values)
|
|
1795
|
-
|
|
1796
|
-
def average_paired_metric(
|
|
1797
|
-
variant: str,
|
|
1798
|
-
task_ids: set[str],
|
|
1799
|
-
key: str,
|
|
1800
|
-
) -> tuple[float | None, float | None, int]:
|
|
1801
|
-
baseline_values: list[float] = []
|
|
1802
|
-
variant_values: list[float] = []
|
|
1803
|
-
for task_id in sorted(task_ids):
|
|
1804
|
-
baseline_value = average_task_metric(baseline_variant, task_id, key)
|
|
1805
|
-
variant_value = average_task_metric(variant, task_id, key)
|
|
1806
|
-
if baseline_value is None or variant_value is None:
|
|
1807
|
-
continue
|
|
1808
|
-
baseline_values.append(baseline_value)
|
|
1809
|
-
variant_values.append(variant_value)
|
|
1810
|
-
if not baseline_values:
|
|
1811
|
-
return None, None, 0
|
|
1812
|
-
return (
|
|
1813
|
-
sum(baseline_values) / len(baseline_values),
|
|
1814
|
-
sum(variant_values) / len(variant_values),
|
|
1815
|
-
len(baseline_values),
|
|
1816
|
-
)
|
|
1817
|
-
|
|
1818
|
-
def average_paired_int_metric(
|
|
1819
|
-
variant: str,
|
|
1820
|
-
task_ids: set[str],
|
|
1821
|
-
key: str,
|
|
1822
|
-
) -> tuple[float | None, float | None, int]:
|
|
1823
|
-
baseline_values: list[float] = []
|
|
1824
|
-
variant_values: list[float] = []
|
|
1825
|
-
for task_id in sorted(task_ids):
|
|
1826
|
-
baseline_value = average_task_int_metric(baseline_variant, task_id, key)
|
|
1827
|
-
variant_value = average_task_int_metric(variant, task_id, key)
|
|
1828
|
-
if baseline_value is None or variant_value is None:
|
|
1829
|
-
continue
|
|
1830
|
-
baseline_values.append(baseline_value)
|
|
1831
|
-
variant_values.append(variant_value)
|
|
1832
|
-
if not baseline_values:
|
|
1833
|
-
return None, None, 0
|
|
1834
|
-
return (
|
|
1835
|
-
sum(baseline_values) / len(baseline_values),
|
|
1836
|
-
sum(variant_values) / len(variant_values),
|
|
1837
|
-
len(baseline_values),
|
|
1838
|
-
)
|
|
1839
|
-
|
|
1840
|
-
def row_indices_for(rows_for_task: list[dict[str, str]]) -> list[int]:
|
|
1841
|
-
out: list[int] = []
|
|
1842
|
-
for row in rows_for_task:
|
|
1843
|
-
index = row_optional_nonnegative_int(row, "_row_index")
|
|
1844
|
-
if index is not None:
|
|
1845
|
-
out.append(index)
|
|
1846
|
-
return out
|
|
1847
|
-
|
|
1848
|
-
def all_rows_bool(rows_for_task: list[dict[str, str]], key: str) -> bool:
|
|
1849
|
-
return bool(rows_for_task) and all(row_bool(row, key) for row in rows_for_task)
|
|
1850
|
-
|
|
1851
|
-
def all_rows_optional_int(rows_for_task: list[dict[str, str]], key: str) -> list[int] | None:
|
|
1852
|
-
values = [row_optional_nonnegative_int(row, key) for row in rows_for_task]
|
|
1853
|
-
if not values or any(value is None for value in values):
|
|
1854
|
-
return None
|
|
1855
|
-
return [value for value in values if value is not None]
|
|
1856
|
-
|
|
1857
|
-
def all_rows_optional_float(rows_for_task: list[dict[str, str]], key: str) -> list[float] | None:
|
|
1858
|
-
values = [row_optional_float(row, key) for row in rows_for_task]
|
|
1859
|
-
if not values or any(value is None for value in values):
|
|
1860
|
-
return None
|
|
1861
|
-
return [value for value in values if value is not None]
|
|
1862
|
-
|
|
1863
|
-
def average_optional_int(rows_for_task: list[dict[str, str]], key: str) -> float | None:
|
|
1864
|
-
values = all_rows_optional_int(rows_for_task, key)
|
|
1865
|
-
return (sum(values) / len(values)) if values else None
|
|
1866
|
-
|
|
1867
|
-
def average_optional_float(rows_for_task: list[dict[str, str]], key: str) -> float | None:
|
|
1868
|
-
values = all_rows_optional_float(rows_for_task, key)
|
|
1869
|
-
return (sum(values) / len(values)) if values else None
|
|
1870
|
-
|
|
1871
|
-
def total_optional_int(rows_for_task: list[dict[str, str]], key: str) -> int | None:
|
|
1872
|
-
values = all_rows_optional_int(rows_for_task, key)
|
|
1873
|
-
return sum(values) if values is not None else None
|
|
1874
|
-
|
|
1875
|
-
def all_rows_shifted_cost_measured(rows_for_task: list[dict[str, str]]) -> bool:
|
|
1876
|
-
return bool(rows_for_task) and all(
|
|
1877
|
-
row_cost_shift_measured(row) and row_optional_float(row, "total_cost_with_shift_usd") is not None
|
|
1878
|
-
for row in rows_for_task
|
|
1879
|
-
)
|
|
1880
|
-
|
|
1881
|
-
def matched_side_evidence(variant: str, task_id: str, rows_for_task: list[dict[str, str]]) -> dict[str, Any]:
|
|
1882
|
-
primary_tokens_measured = all_rows_bool(rows_for_task, "primary_tokens_measured")
|
|
1883
|
-
primary_cost_measured = all_rows_bool(rows_for_task, "cost_measured")
|
|
1884
|
-
shifted_cost_measured = all_rows_shifted_cost_measured(rows_for_task)
|
|
1885
|
-
provider_cache_measured = all_rows_bool(rows_for_task, "provider_cached_tokens_measured")
|
|
1886
|
-
external_tokens_measured = all_rows_bool(rows_for_task, "external_tokens_measured")
|
|
1887
|
-
external_cost_measured = all_rows_bool(rows_for_task, "external_cost_measured")
|
|
1888
|
-
corrections_values = all_rows_optional_int(rows_for_task, "corrections")
|
|
1889
|
-
bytes_before_values = [row_optional_nonnegative_int(row, "bytes_before") for row in rows_for_task]
|
|
1890
|
-
bytes_after_values = [row_optional_nonnegative_int(row, "bytes_after") for row in rows_for_task]
|
|
1891
|
-
byte_metrics_observed = bool(rows_for_task) and not any(
|
|
1892
|
-
value is None for value in [*bytes_before_values, *bytes_after_values]
|
|
1893
|
-
)
|
|
1894
|
-
bytes_before_total = sum(value for value in bytes_before_values if value is not None)
|
|
1895
|
-
bytes_after_total = sum(value for value in bytes_after_values if value is not None)
|
|
1896
|
-
byte_delta = bytes_after_total - bytes_before_total if byte_metrics_observed else None
|
|
1897
|
-
token_proxy_delta = (
|
|
1898
|
-
int(byte_delta / TOKEN_PROXY_BYTES_PER_TOKEN) if byte_delta is not None else None
|
|
1899
|
-
)
|
|
1900
|
-
return {
|
|
1901
|
-
"variant": variant,
|
|
1902
|
-
"task_id": task_id,
|
|
1903
|
-
"run_count": len(rows_for_task),
|
|
1904
|
-
"row_indices": row_indices_for(rows_for_task),
|
|
1905
|
-
"primary_tokens": {
|
|
1906
|
-
"measured": primary_tokens_measured,
|
|
1907
|
-
"average": average_optional_int(rows_for_task, "total_tokens") if primary_tokens_measured else None,
|
|
1908
|
-
"total": total_optional_int(rows_for_task, "total_tokens") if primary_tokens_measured else None,
|
|
1909
|
-
},
|
|
1910
|
-
"primary_cost_usd": {
|
|
1911
|
-
"measured": primary_cost_measured,
|
|
1912
|
-
"average": average_optional_float(rows_for_task, "cost_usd") if primary_cost_measured else None,
|
|
1913
|
-
},
|
|
1914
|
-
"total_cost_with_shift_usd": {
|
|
1915
|
-
"measured": shifted_cost_measured,
|
|
1916
|
-
"average": (
|
|
1917
|
-
average_optional_float(rows_for_task, "total_cost_with_shift_usd")
|
|
1918
|
-
if shifted_cost_measured else None
|
|
1919
|
-
),
|
|
1920
|
-
},
|
|
1921
|
-
"external_tokens": {
|
|
1922
|
-
"measured": external_tokens_measured,
|
|
1923
|
-
"total": total_optional_int(rows_for_task, "external_tokens") if external_tokens_measured else None,
|
|
1924
|
-
},
|
|
1925
|
-
"external_cost_usd": {
|
|
1926
|
-
"measured": external_cost_measured,
|
|
1927
|
-
"total": (
|
|
1928
|
-
sum(row_float(row, "external_cost_usd") for row in rows_for_task)
|
|
1929
|
-
if external_cost_measured else None
|
|
1930
|
-
),
|
|
1931
|
-
},
|
|
1932
|
-
"bytes": {
|
|
1933
|
-
"measurement": "observed" if byte_metrics_observed else "unavailable",
|
|
1934
|
-
"before_total": bytes_before_total if byte_metrics_observed else None,
|
|
1935
|
-
"after_total": bytes_after_total if byte_metrics_observed else None,
|
|
1936
|
-
"delta_total": byte_delta,
|
|
1937
|
-
"token_proxy_delta": token_proxy_delta,
|
|
1938
|
-
"token_proxy": "chars_div_4_proxy_only" if byte_metrics_observed else "unavailable",
|
|
1939
|
-
},
|
|
1940
|
-
"wall_time_seconds": {
|
|
1941
|
-
"measured": all_rows_optional_float(rows_for_task, "wall_time_seconds") is not None,
|
|
1942
|
-
"average": average_optional_float(rows_for_task, "wall_time_seconds"),
|
|
1943
|
-
},
|
|
1944
|
-
"provider_cached_tokens": {
|
|
1945
|
-
"measured": provider_cache_measured,
|
|
1946
|
-
"average": (
|
|
1947
|
-
average_optional_int(rows_for_task, "provider_cached_tokens")
|
|
1948
|
-
if provider_cache_measured else None
|
|
1949
|
-
),
|
|
1950
|
-
},
|
|
1951
|
-
"corrections": {
|
|
1952
|
-
"measured": corrections_values is not None,
|
|
1953
|
-
"average": (sum(corrections_values) / len(corrections_values)) if corrections_values else None,
|
|
1954
|
-
},
|
|
1955
|
-
}
|
|
1956
|
-
|
|
1957
|
-
def matched_pair_evidence_entry(
    variant: str,
    task_id: str,
    quality_gate: str,
) -> dict[str, Any]:
    """Build one matched-pair evidence record for ``variant`` vs. the baseline.

    The successful rows of both sides are looked up in
    ``successful_rows_by_variant_task`` (a name defined outside this
    function, as are ``baseline_variant`` and ``matched_side_evidence``),
    summarized per side, and then compared.

    Args:
        variant: Name of the non-baseline variant being compared.
        task_id: Task whose successful rows are compared on both sides.
        quality_gate: Gate label computed by the caller; only ``"pass"``
            allows token or shifted-cost claims.

    Returns:
        A dict holding both per-side summaries, the deltas (``None`` where a
        claim is not allowed or data is missing) and the claim-boundary flags.
    """
    rows_base = successful_rows_by_variant_task[baseline_variant][task_id]
    rows_other = successful_rows_by_variant_task[variant][task_id]
    side_base = matched_side_evidence(baseline_variant, task_id, rows_base)
    side_other = matched_side_evidence(variant, task_id, rows_other)

    def claim_for(metric: str) -> tuple[bool, Any, Any]:
        """Return (claim_allowed, variant-minus-baseline delta, savings pct)."""
        base_part = side_base[metric]
        other_part = side_other[metric]
        base_avg = base_part["average"]
        other_avg = other_part["average"]
        allowed = False
        if quality_gate == "pass" and base_part["measured"] and other_part["measured"]:
            if isinstance(base_avg, (int, float)) and isinstance(other_avg, (int, float)):
                # A non-positive baseline average cannot anchor a percentage.
                allowed = base_avg > 0
        if not allowed:
            return False, None, None
        delta = other_avg - base_avg
        savings_pct = (base_avg - other_avg) / base_avg * 100.0
        return True, delta, savings_pct

    token_ok, token_delta, token_pct = claim_for("primary_tokens")
    cost_ok, cost_delta, cost_pct = claim_for("total_cost_with_shift_usd")

    # Byte totals are only comparable when both sides report an integer.
    after_base = side_base["bytes"]["after_total"]
    after_other = side_other["bytes"]["after_total"]
    if isinstance(after_base, int) and isinstance(after_other, int):
        bytes_delta = after_other - after_base
        proxy_delta = int(bytes_delta / TOKEN_PROXY_BYTES_PER_TOKEN)
    else:
        bytes_delta = None
        proxy_delta = None

    measurements = {
        "baseline": side_base,
        "variant": side_other,
    }
    delta_section = {
        "primary_tokens_average": token_delta,
        "token_savings_pct": token_pct,
        "total_cost_with_shift_usd_average": cost_delta,
        "cost_savings_pct_with_shift": cost_pct,
        "bytes_after_total": bytes_delta,
        "token_proxy_after_total": proxy_delta,
        "proxy_measurement": "chars_div_4_proxy_only",
    }
    boundary = {
        "quality_gate": quality_gate,
        "token_savings_claim_allowed": token_ok,
        "shifted_cost_claim_allowed": cost_ok,
        "byte_proxy_only": True,
        "requires_matched_successful_tasks": True,
        "raw_estimate_only_claim_allowed": False,
    }
    return {
        "schema_version": MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION,
        "task_id": task_id,
        "baseline_variant": baseline_variant,
        "variant": variant,
        "transform_id": variant,
        "quality_gate": quality_gate,
        "evidence_kind": "matched_successful_task_bucket",
        "measurements": measurements,
        "delta": delta_section,
        "claim_boundary": boundary,
    }
|
|
2046
|
-
|
|
2047
|
-
comparisons: list[dict[str, Any]] = []
|
|
2048
|
-
matched_pair_evidence: list[dict[str, Any]] = []
|
|
2049
|
-
baseline = by_variant.get(baseline_variant)
|
|
2050
|
-
baseline_successful_tasks = successful_tasks_by_variant.get(baseline_variant, set())
|
|
2051
|
-
baseline_failure_rate = baseline.get("failure_rate") if baseline else None
|
|
2052
|
-
for variant, bucket in sorted(by_variant.items()):
|
|
2053
|
-
if variant == baseline_variant:
|
|
2054
|
-
continue
|
|
2055
|
-
variant_successful_tasks = successful_tasks_by_variant.get(variant, set())
|
|
2056
|
-
matched_tasks = baseline_successful_tasks & variant_successful_tasks
|
|
2057
|
-
token_matched_tasks = {
|
|
2058
|
-
task_id for task_id in matched_tasks
|
|
2059
|
-
if all(
|
|
2060
|
-
row_bool(row, "primary_tokens_measured")
|
|
2061
|
-
for row in successful_rows_by_variant_task[baseline_variant][task_id]
|
|
2062
|
-
)
|
|
2063
|
-
and all(
|
|
2064
|
-
row_bool(row, "primary_tokens_measured")
|
|
2065
|
-
for row in successful_rows_by_variant_task[variant][task_id]
|
|
2066
|
-
)
|
|
2067
|
-
}
|
|
2068
|
-
base_tokens, variant_tokens, token_task_count = average_paired_metric(
|
|
2069
|
-
variant,
|
|
2070
|
-
token_matched_tasks,
|
|
2071
|
-
"total_tokens",
|
|
2072
|
-
)
|
|
2073
|
-
base_wall_time, variant_wall_time, wall_time_task_count = average_paired_metric(
|
|
2074
|
-
variant,
|
|
2075
|
-
matched_tasks,
|
|
2076
|
-
"wall_time_seconds",
|
|
2077
|
-
)
|
|
2078
|
-
base_corrections, variant_corrections, corrections_task_count = average_paired_int_metric(
|
|
2079
|
-
variant,
|
|
2080
|
-
matched_tasks,
|
|
2081
|
-
"corrections",
|
|
2082
|
-
)
|
|
2083
|
-
base_cost, variant_cost, cost_task_count = average_paired_metric(
|
|
2084
|
-
variant,
|
|
2085
|
-
{
|
|
2086
|
-
task_id for task_id in matched_tasks
|
|
2087
|
-
if all(
|
|
2088
|
-
row_cost_shift_measured(row)
|
|
2089
|
-
for row in successful_rows_by_variant_task[baseline_variant][task_id]
|
|
2090
|
-
)
|
|
2091
|
-
and all(
|
|
2092
|
-
row_cost_shift_measured(row)
|
|
2093
|
-
for row in successful_rows_by_variant_task[variant][task_id]
|
|
2094
|
-
)
|
|
2095
|
-
},
|
|
2096
|
-
"total_cost_with_shift_usd",
|
|
2097
|
-
)
|
|
2098
|
-
failure_rate = bucket.get("failure_rate")
|
|
2099
|
-
failure_delta = None
|
|
2100
|
-
if isinstance(baseline_failure_rate, (int, float)) and isinstance(failure_rate, (int, float)):
|
|
2101
|
-
failure_delta = (failure_rate - baseline_failure_rate) * 100.0
|
|
2102
|
-
missing_baseline_success_tasks = sorted(baseline_successful_tasks - variant_successful_tasks)
|
|
2103
|
-
quality_gate = "pass"
|
|
2104
|
-
if not baseline or not baseline.get("successful_runs"):
|
|
2105
|
-
quality_gate = "insufficient_baseline"
|
|
2106
|
-
elif not bucket.get("successful_runs"):
|
|
2107
|
-
quality_gate = "insufficient_success"
|
|
2108
|
-
elif missing_baseline_success_tasks:
|
|
2109
|
-
quality_gate = "matched_task_regression"
|
|
2110
|
-
elif failure_delta is not None and failure_delta >= 10.0:
|
|
2111
|
-
quality_gate = "failure_rate_regression"
|
|
2112
|
-
elif matched_tasks and corrections_task_count < len(matched_tasks):
|
|
2113
|
-
quality_gate = "insufficient_corrections_data"
|
|
2114
|
-
elif (
|
|
2115
|
-
isinstance(base_corrections, (int, float))
|
|
2116
|
-
and isinstance(variant_corrections, (int, float))
|
|
2117
|
-
and variant_corrections > base_corrections
|
|
2118
|
-
):
|
|
2119
|
-
quality_gate = "corrections_regression"
|
|
2120
|
-
comparison: dict[str, Any] = {
|
|
2121
|
-
"variant": variant,
|
|
2122
|
-
"baseline_variant": baseline_variant,
|
|
2123
|
-
"quality_gate": quality_gate,
|
|
2124
|
-
"baseline_failure_rate": baseline_failure_rate,
|
|
2125
|
-
"variant_failure_rate": failure_rate,
|
|
2126
|
-
"failure_rate_delta_pp": failure_delta,
|
|
2127
|
-
"matched_successful_task_count": len(matched_tasks),
|
|
2128
|
-
"baseline_successful_task_count": len(baseline_successful_tasks),
|
|
2129
|
-
"missing_baseline_success_tasks": missing_baseline_success_tasks,
|
|
2130
|
-
"baseline_corrections_per_successful_task": base_corrections,
|
|
2131
|
-
"variant_corrections_per_successful_task": variant_corrections,
|
|
2132
|
-
"paired_corrections_task_count": corrections_task_count,
|
|
2133
|
-
}
|
|
2134
|
-
if isinstance(base_corrections, (int, float)) and isinstance(variant_corrections, (int, float)):
|
|
2135
|
-
comparison["corrections_delta_per_successful_task"] = variant_corrections - base_corrections
|
|
2136
|
-
if isinstance(base_tokens, (int, float)) and isinstance(variant_tokens, (int, float)) and base_tokens:
|
|
2137
|
-
comparison["token_delta_per_successful_task"] = variant_tokens - base_tokens
|
|
2138
|
-
comparison["token_savings_pct"] = (base_tokens - variant_tokens) / base_tokens * 100.0
|
|
2139
|
-
comparison["paired_token_task_count"] = token_task_count
|
|
2140
|
-
else:
|
|
2141
|
-
comparison["token_savings_pct"] = None
|
|
2142
|
-
comparison["paired_token_task_count"] = 0
|
|
2143
|
-
if (
|
|
2144
|
-
isinstance(base_wall_time, (int, float))
|
|
2145
|
-
and isinstance(variant_wall_time, (int, float))
|
|
2146
|
-
and base_wall_time
|
|
2147
|
-
):
|
|
2148
|
-
comparison["wall_time_delta_seconds_per_successful_task"] = variant_wall_time - base_wall_time
|
|
2149
|
-
comparison["wall_time_change_pct"] = (variant_wall_time - base_wall_time) / base_wall_time * 100.0
|
|
2150
|
-
comparison["paired_wall_time_task_count"] = wall_time_task_count
|
|
2151
|
-
else:
|
|
2152
|
-
comparison["wall_time_delta_seconds_per_successful_task"] = None
|
|
2153
|
-
comparison["wall_time_change_pct"] = None
|
|
2154
|
-
comparison["paired_wall_time_task_count"] = wall_time_task_count
|
|
2155
|
-
if isinstance(base_cost, (int, float)) and isinstance(variant_cost, (int, float)) and base_cost:
|
|
2156
|
-
comparison["total_cost_with_shift_delta_usd"] = variant_cost - base_cost
|
|
2157
|
-
comparison["cost_savings_pct_with_shift"] = (base_cost - variant_cost) / base_cost * 100.0
|
|
2158
|
-
comparison["paired_cost_task_count"] = cost_task_count
|
|
2159
|
-
else:
|
|
2160
|
-
comparison["cost_savings_pct_with_shift"] = None
|
|
2161
|
-
comparison["paired_cost_task_count"] = cost_task_count
|
|
2162
|
-
for task_id in sorted(matched_tasks):
|
|
2163
|
-
matched_pair_evidence.append(matched_pair_evidence_entry(variant, task_id, quality_gate))
|
|
2164
|
-
comparisons.append(comparison)
|
|
2165
|
-
|
|
2166
|
-
claim_status = "insufficient_baseline"
|
|
2167
|
-
if baseline and baseline.get("successful_runs"):
|
|
2168
|
-
claim_status = "compare_variants" if comparisons else "baseline_only"
|
|
2169
|
-
if comparisons:
|
|
2170
|
-
quality_ok = all(item.get("quality_gate") == "pass" for item in comparisons)
|
|
2171
|
-
paired_token_data = all((item.get("paired_token_task_count") or 0) > 0 for item in comparisons)
|
|
2172
|
-
token_savings_observed = all((item.get("token_savings_pct") or 0) > 0 for item in comparisons)
|
|
2173
|
-
shifted_cost_savings = [
|
|
2174
|
-
item.get("cost_savings_pct_with_shift")
|
|
2175
|
-
for item in comparisons
|
|
2176
|
-
if isinstance(item.get("cost_savings_pct_with_shift"), (int, float))
|
|
2177
|
-
]
|
|
2178
|
-
all_shifted_cost_measured = len(shifted_cost_savings) == len(comparisons)
|
|
2179
|
-
shifted_cost_ok = all_shifted_cost_measured and all(value > 0 for value in shifted_cost_savings)
|
|
2180
|
-
if not quality_ok:
|
|
2181
|
-
claim_status = "quality_gate_watch"
|
|
2182
|
-
elif not paired_token_data:
|
|
2183
|
-
claim_status = "insufficient_paired_data"
|
|
2184
|
-
elif token_savings_observed and shifted_cost_ok:
|
|
2185
|
-
claim_status = "token_and_shifted_cost_savings_observed"
|
|
2186
|
-
elif token_savings_observed and not all_shifted_cost_measured:
|
|
2187
|
-
claim_status = "token_savings_observed_cost_unmeasured"
|
|
2188
|
-
elif token_savings_observed:
|
|
2189
|
-
claim_status = "token_savings_observed_cost_shift_watch"
|
|
2190
|
-
return {
|
|
2191
|
-
"schema": "context-guard-bench-report-v1",
|
|
2192
|
-
"baseline_variant": baseline_variant,
|
|
2193
|
-
"row_count": len(rows),
|
|
2194
|
-
"summary_by_variant": by_variant,
|
|
2195
|
-
"comparisons": comparisons,
|
|
2196
|
-
"matched_pair_evidence": matched_pair_evidence,
|
|
2197
|
-
"claim_status": claim_status,
|
|
2198
|
-
"caveat": (
|
|
2199
|
-
"Proxy byte reductions are reported separately from matched-task token/cost metrics; "
|
|
2200
|
-
"shifted cost savings require measured primary cost and measured external cost when "
|
|
2201
|
-
"external tokens are present. Wall time and provider cached-token fields are diagnostic "
|
|
2202
|
-
"telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache "
|
|
2203
|
-
"discounts must stay separate from token-reduction claims."
|
|
2204
|
-
),
|
|
2205
|
-
}
|
|
2206
|
-
|
|
2207
|
-
def write_report_json(csv_path: Path, report_path: Path, baseline_variant: str) -> dict[str, Any]:
    """Summarize the benchmark CSV and write the summary as a JSON report.

    Args:
        csv_path: Results CSV that is read and summarized.
        report_path: Destination of the JSON report.
        baseline_variant: Variant name passed to the summary as the baseline.

    Returns:
        The report dict that was serialized to ``report_path``.
    """
    # Lock ordering is fixed for every report write: the source CSV lock is
    # taken first, the derived report lock second. Never add a code path that
    # locks report -> CSV; concurrent report generation could then deadlock.
    # NOTE(review): the report lock is assumed to be taken while the CSV lock
    # is still held — confirm against the original indentation.
    with csv_file_lock(csv_path, create_parent=True):
        rows = read_csv_rows(csv_path)
        report = summarize_benchmark_rows(rows, baseline_variant)
        with csv_file_lock(report_path, create_parent=True):
            payload = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) + "\n"
            write_text_no_follow(report_path, payload)
    return report
|
|
2219
|
-
|
|
2220
|
-
|
|
2221
|
-
def sanitize_note_text(value: Any) -> str:
    """Clean untrusted benchmark note text; no length limit is applied here.

    ``None`` becomes the empty string and anything else is passed through
    ``str``. Characters whose Unicode general category starts with ``C`` are
    replaced by spaces, whitespace runs collapse to single spaces, and every
    ``(pattern, replacement)`` pair in ``SECRET_NOTE_PATTERNS`` is applied in
    order.
    """
    raw = "" if value is None else str(value)
    # Swap "C*" category characters for spaces so they cannot survive in
    # output; the split/join below then collapses the resulting gaps.
    pieces = [" " if unicodedata.category(ch).startswith("C") else ch for ch in raw]
    cleaned = " ".join("".join(pieces).split())
    for pattern, replacement in SECRET_NOTE_PATTERNS:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned
|
|
2229
|
-
|
|
2230
|
-
|
|
2231
|
-
def sanitize_csv_note(value: Any) -> str:
    """Prepare an untrusted note for the benchmark CSV.

    The note is cleaned with ``sanitize_note_text``, prefixed with a single
    quote when it starts with one of ``CSV_FORMULA_PREFIXES``, and cut with a
    ``…[truncated]`` marker when longer than ``MAX_CSV_NOTE_CHARS``.
    """
    note = sanitize_note_text(value)
    if note.startswith(CSV_FORMULA_PREFIXES):
        # A leading quote stops spreadsheet tools from evaluating the cell.
        note = f"'{note}"
    if len(note) <= MAX_CSV_NOTE_CHARS:
        return note
    # 12 is the length of the marker appended below.
    kept = note[:MAX_CSV_NOTE_CHARS - 12].rstrip()
    return kept + "…[truncated]"
|
|
2239
|
-
|
|
2240
|
-
|
|
2241
|
-
def sanitize_csv_cell(value: Any) -> str:
    """Clean a short untrusted CSV label and neutralize spreadsheet formulas.

    The value is cleaned with ``sanitize_note_text``; when the result starts
    with one of ``CSV_FORMULA_PREFIXES`` a single quote is prepended.
    """
    cell = sanitize_note_text(value)
    return f"'{cell}" if cell.startswith(CSV_FORMULA_PREFIXES) else cell
|
|
2247
|
-
|
|
2248
|
-
|
|
2249
|
-
def filter_targets(tasks: list[TaskFixture], variants: list[Variant],
                   only_task: str | None, only_variant: str | None) -> list[tuple[TaskFixture, Variant]]:
    """Return every (task, variant) pair that passes the optional filters.

    Pairs are produced task-major, in the input order of both lists. A falsy
    ``only_task`` / ``only_variant`` disables the corresponding filter;
    otherwise only tasks whose ``id`` (variants whose ``name``) equals the
    filter value are kept.
    """
    return [
        (task, variant)
        for task in tasks
        if not only_task or task.id == only_task
        for variant in variants
        if not only_variant or variant.name == only_variant
    ]
|
|
2260
|
-
|
|
2261
|
-
|
|
2262
|
-
def normalized_output_path(path: Path) -> Path:
    """Return an absolute, lexically normalized form of an output path.

    ``~`` is expanded, relative paths are anchored at the current working
    directory, the result is passed through
    ``_normalize_allowed_first_absolute_symlink`` and finally through
    ``os.path.normpath``.
    """
    candidate = path.expanduser()
    if not candidate.is_absolute():
        candidate = Path.cwd() / candidate
    anchored = _normalize_allowed_first_absolute_symlink(candidate)
    return Path(os.path.normpath(str(anchored)))
|
|
2267
|
-
|
|
2268
|
-
|
|
2269
|
-
def existing_file_identity(path: Path) -> tuple[int, int] | None:
    """Return ``(st_dev, st_ino)`` for an existing file, or ``None`` if absent.

    The normalized path is opened through ``_open_regular_no_symlink``; a
    ``FileNotFoundError`` from that call yields ``None``. The descriptor is
    always closed before returning.
    """
    try:
        handle = _open_regular_no_symlink(normalized_output_path(path))
    except FileNotFoundError:
        return None
    try:
        info = os.fstat(handle)
    finally:
        os.close(handle)
    return (int(info.st_dev), int(info.st_ino))
|
|
2279
|
-
|
|
2280
|
-
|
|
2281
|
-
def validate_distinct_output_paths(csv_path: Path, ledger_path: Path | None, report_path: Path | None) -> None:
    """Exit if two output options resolve to the same path or the same file.

    Each non-``None`` output is normalized and compared with the outputs seen
    before it, first by normalized path and then, for files that already
    exist, by ``(st_dev, st_ino)`` identity.

    Raises:
        SystemExit: When two options name the same path or the same file.
    """
    flag_by_path: dict[Path, str] = {}
    flag_by_identity: dict[tuple[int, int], str] = {}
    candidates = (
        ("csv", csv_path),
        ("ledger-jsonl", ledger_path),
        ("report-json", report_path),
    )
    for flag, raw_path in candidates:
        if raw_path is None:
            continue
        resolved = normalized_output_path(raw_path)
        if resolved in flag_by_path:
            raise SystemExit(f"--{flag} must not point to the same path as --{flag_by_path[resolved]}: {resolved}")
        flag_by_path[resolved] = flag
        # Distinct path strings can still name one file; compare file identity.
        file_id = existing_file_identity(resolved)
        if file_id is None:
            continue
        if file_id in flag_by_identity:
            raise SystemExit(f"--{flag} must not point to the same file as --{flag_by_identity[file_id]}: {resolved}")
        flag_by_identity[file_id] = flag
|
|
2299
|
-
|
|
2300
|
-
|
|
2301
|
-
def main() -> int:
    """Parse CLI options, run the selected (task, variant) fixtures, record results.

    Returns:
        ``0`` after the run loop finishes, ``1`` when the filters match no
        target, ``2`` when a non-dry-run would invoke a placeholder success
        command or when the claude executable cannot be found.
    """
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    add = parser.add_argument
    add("--tasks", required=True, type=Path, help="task fixture JSON")
    add("--variants", required=True, type=Path, help="variant fixture JSON")
    add("--csv", default=Path("bench/results.csv"), type=Path,
        help="results CSV path (header is added on first write)")
    add("--task-id", default=None, help="run only the named task id")
    add("--variant", default=None, help="run only the named variant")
    add("--claude-bin", default=os.environ.get("CLAUDE_BIN", "claude"),
        help="claude CLI executable (default: $CLAUDE_BIN or 'claude')")
    add("--project-root", default=Path("."), type=Path,
        help="working directory used for success_command (default: cwd)")
    add("--dry-run", action="store_true",
        help="print the claude command without invoking it")
    add("--resume", action="store_true",
        help="skip (task_id, variant) rows already present in --csv")
    add("--ledger-jsonl", default=None, type=Path,
        help="optional JSONL ledger path for cost-shift accounting per run")
    add("--report-json", default=None, type=Path,
        help="optional A/B summary report JSON path generated from --csv after real runs")
    add("--baseline-variant", default="baseline",
        help="variant name used as the report baseline (default: baseline)")
    args = parser.parse_args()

    require_no_follow_file_ops_supported()
    validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json)

    variants = parse_variants(args.variants)
    tasks = parse_tasks(args.tasks, variants=variants)
    targets = filter_targets(tasks, variants, args.task_id, args.variant)
    if not targets:
        print("no (task, variant) targets matched the filters", file=sys.stderr)
        return 1

    skip_keys = existing_keys(args.csv) if args.resume else set()
    runnable_targets = [
        pair for pair in targets
        if (pair[0].id, pair[1].name) not in skip_keys
    ]
    placeholder_targets = [
        f"{task.id}/{variant.name}"
        for task, variant in runnable_targets
        if is_placeholder_success_command(task.success_command)
    ]
    live_run = not args.dry_run
    if placeholder_targets and live_run:
        print(
            f"{PLACEHOLDER_SUCCESS_COMMAND_MARKER}; refusing non-dry-run provider invocation for: "
            f"{', '.join(placeholder_targets)}",
            file=sys.stderr,
        )
        return 2

    # shutil.which may return None when claude_bin is given as an absolute
    # path, so the path itself is checked as well before giving up.
    if (
        runnable_targets
        and live_run
        and shutil.which(args.claude_bin) is None
        and not Path(args.claude_bin).exists()
    ):
        print(f"claude binary not found: {args.claude_bin}", file=sys.stderr)
        return 2

    if runnable_targets:
        load_variant_prompt_files_for_targets(runnable_targets, task_file_dir=args.tasks.parent)

    project_root = args.project_root.resolve()
    if args.dry_run:
        claude_ver = "dry-run"
    elif runnable_targets:
        claude_ver = claude_version(args.claude_bin)
    else:
        claude_ver = "skipped"

    completed = 0
    for task, variant in targets:
        if (task.id, variant.name) in skip_keys:
            print(f"skip {task.id}/{variant.name} (already in {args.csv})")
            continue
        print(f"run {task.id}/{variant.name} ...", flush=True)
        result = run_fixture(task, variant, args.claude_bin, project_root, args.dry_run)
        # Dry-run rows are never stored in the CSV. Storing them would
        # (a) drag the averages down with tokens=0/cost=0 and (b) make
        # --resume skip that (task, variant), so the real measurement would
        # be lost for good.
        wrote = True
        if live_run:
            wrote = append_csv(args.csv, claude_ver, result, skip_existing=args.resume)
            # NOTE(review): the ledger append is assumed to be skipped on dry
            # runs, like the CSV write — confirm against the original indentation.
            if wrote and args.ledger_jsonl is not None:
                append_cost_shift_ledger(args.ledger_jsonl, claude_ver, result)
        completed += 1
        status = "ok" if result.success else "FAIL"
        if args.dry_run:
            suffix = " (dry-run; CSV not updated)"
        elif wrote:
            suffix = ""
        else:
            suffix = " (CSV not updated; row already present)"
        print(
            f" {status} tokens={sum(result.tokens.values())} cost=${result.cost_usd:.4f} "
            f"wall_time={result.wall_time_seconds:.3f}s {sanitize_note_text(result.notes)}{suffix}"
        )

    target = "(dry-run; no CSV writes)" if args.dry_run else args.csv
    if live_run and args.report_json is not None:
        report = write_report_json(args.csv, args.report_json, args.baseline_variant)
        print(f"report {args.report_json}: {report['claim_status']}")
    print(f"completed {completed} run(s); results in {target}")
    return 0
|
|
2398
|
-
|
|
2399
|
-
|
|
2400
|
-
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|