@ictechgy/context-guard 0.4.1 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/README.ko.md +62 -33
- package/README.md +91 -23
- package/context-guard-kit/README.md +39 -26
- package/context-guard-kit/benchmark_runner.py +273 -8
- package/context-guard-kit/claude_transcript_cost_audit.py +597 -12
- package/context-guard-kit/context_compress.py +153 -1
- package/context-guard-kit/context_filter.py +446 -0
- package/context-guard-kit/context_guard_cli.py +3 -0
- package/context-guard-kit/context_guard_diet.py +677 -2
- package/context-guard-kit/context_pack.py +1694 -2
- package/context-guard-kit/cost_guard.py +1870 -0
- package/context-guard-kit/setup_wizard.py +820 -29
- package/context-guard-kit/trim_command_output.py +396 -45
- package/docs/benchmark-fixtures/learned-compression.tasks.example.json +24 -0
- package/docs/benchmark-fixtures/learned-compression.variants.example.json +10 -0
- package/docs/benchmark-fixtures/visual-ocr.tasks.example.json +24 -0
- package/docs/benchmark-fixtures/visual-ocr.variants.example.json +10 -0
- package/docs/benchmark-workflow-examples.md +40 -0
- package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +169 -0
- package/docs/benchmark-workflows/measured-token-workflow.example.json +170 -0
- package/docs/benchmark-workflows/provider-cache-telemetry.example.json +170 -0
- package/docs/cache-diagnostics-schema.md +96 -0
- package/docs/cache-diagnostics.example.json +116 -0
- package/docs/cache-diagnostics.schema.json +460 -0
- package/docs/distribution.md +4 -2
- package/docs/experimental-benchmark-fixtures.md +36 -0
- package/package.json +11 -2
- package/packaging/homebrew/context-guard.rb.template +3 -2
- package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
- package/plugins/context-guard/README.ko.md +22 -14
- package/plugins/context-guard/README.md +24 -10
- package/plugins/context-guard/bin/context-guard +3 -0
- package/plugins/context-guard/bin/context-guard-audit +597 -12
- package/plugins/context-guard/bin/context-guard-bench +273 -8
- package/plugins/context-guard/bin/context-guard-compress +153 -1
- package/plugins/context-guard/bin/context-guard-cost +1870 -0
- package/plugins/context-guard/bin/context-guard-diet +677 -2
- package/plugins/context-guard/bin/context-guard-filter +446 -0
- package/plugins/context-guard/bin/context-guard-pack +1694 -2
- package/plugins/context-guard/bin/context-guard-setup +820 -29
- package/plugins/context-guard/bin/context-guard-trim-output +396 -45
- package/plugins/context-guard/brief/README.md +10 -3
- package/plugins/context-guard/skills/optimize/SKILL.md +5 -2
- package/plugins/context-guard/skills/setup/SKILL.md +3 -1
|
@@ -108,6 +108,7 @@ CSV_COLUMNS = [
|
|
|
108
108
|
MAX_CSV_NOTE_CHARS = 500
|
|
109
109
|
MAX_CSV_ROWS = 100_000
|
|
110
110
|
CSV_FORMULA_PREFIXES = ("=", "+", "-", "@")
|
|
111
|
+
PLACEHOLDER_SUCCESS_COMMAND_MARKER = "fixture-only placeholder: replace success_command before real benchmark runs"
|
|
111
112
|
PROTECTED_VARIANT_FLAGS = frozenset({
|
|
112
113
|
"--",
|
|
113
114
|
"-p",
|
|
@@ -180,6 +181,8 @@ MAX_USAGE_COST_USD = 10**9
|
|
|
180
181
|
# 추정치이며, report에서 evidence="inferred"로 분명히 라벨링한다. 영어 텍스트 기준
|
|
181
182
|
# ~4 bytes/token의 통용 근사값을 사용한다.
|
|
182
183
|
TOKEN_PROXY_BYTES_PER_TOKEN = 4
|
|
184
|
+
BENCH_RUN_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.run-evidence.v1"
|
|
185
|
+
MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.matched-pair.v1"
|
|
183
186
|
CLAUDE_OUTPUT_MAX_BYTES = 1_000_000
|
|
184
187
|
SUCCESS_COMMAND_OUTPUT_MAX_BYTES = 64_000
|
|
185
188
|
VERSION_OUTPUT_MAX_BYTES = 16_000
|
|
@@ -395,6 +398,10 @@ class BoundedProcessResult:
|
|
|
395
398
|
output_truncated: bool = False
|
|
396
399
|
|
|
397
400
|
|
|
401
|
+
def is_placeholder_success_command(command: str | None) -> bool:
    """Report whether *command* carries the fixture-only placeholder marker.

    Returns ``False`` for ``None`` or an empty command; otherwise returns
    whether ``PLACEHOLDER_SUCCESS_COMMAND_MARKER`` occurs anywhere inside it.
    """
    if not command:
        return False
    return PLACEHOLDER_SUCCESS_COMMAND_MARKER in command
|
|
403
|
+
|
|
404
|
+
|
|
398
405
|
def parse_positive_int(value: Any, *, field: str, owner: str) -> int:
|
|
399
406
|
"""Parse a JSON fixture field that must be a positive integer."""
|
|
400
407
|
if isinstance(value, bool):
|
|
@@ -940,6 +947,14 @@ def run_fixture(task: TaskFixture, variant: Variant, claude_bin: str,
|
|
|
940
947
|
success=True, notes=f"dry-run: {shlex.join(argv)}",
|
|
941
948
|
wall_time_seconds=0.0,
|
|
942
949
|
)
|
|
950
|
+
if is_placeholder_success_command(task.success_command):
|
|
951
|
+
return RunResult(
|
|
952
|
+
task_id=task.id, variant=variant.name, model=task.model, effort=task.effort,
|
|
953
|
+
tokens={k: 0 for k, _ in USAGE_KEY_GROUPS}, cost_usd=0.0,
|
|
954
|
+
success=False,
|
|
955
|
+
notes=f"{PLACEHOLDER_SUCCESS_COMMAND_MARKER}; refusing to invoke provider",
|
|
956
|
+
wall_time_seconds=elapsed_seconds_since(started_at),
|
|
957
|
+
)
|
|
943
958
|
argv[0] = executable_argv0(argv[0])
|
|
944
959
|
try:
|
|
945
960
|
proc = run_bounded_command(
|
|
@@ -1116,11 +1131,14 @@ def write_text_no_follow(path: Path, text: str) -> None:
|
|
|
1116
1131
|
|
|
1117
1132
|
def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) -> None:
|
|
1118
1133
|
shifted_cost_known = cost_shift_measured(result)
|
|
1134
|
+
byte_metrics_observed = bool(result.bytes_before or result.bytes_after)
|
|
1119
1135
|
payload = {
|
|
1136
|
+
"schema_version": BENCH_RUN_EVIDENCE_SCHEMA_VERSION,
|
|
1120
1137
|
"date": _dt.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
|
|
1121
1138
|
"claude_version": claude_ver,
|
|
1122
1139
|
"task_id": result.task_id,
|
|
1123
1140
|
"variant": result.variant,
|
|
1141
|
+
"transform_id": result.variant,
|
|
1124
1142
|
"success": result.success,
|
|
1125
1143
|
"primary_cost_measured": result.cost_measured,
|
|
1126
1144
|
"primary_cost_usd": round(result.cost_usd, 6),
|
|
@@ -1142,6 +1160,22 @@ def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) ->
|
|
|
1142
1160
|
"hook_triggers": result.hook_triggers,
|
|
1143
1161
|
"turns": result.turns,
|
|
1144
1162
|
"notes": sanitize_csv_note(result.notes),
|
|
1163
|
+
"measurement_availability": {
|
|
1164
|
+
"primary_tokens": result.primary_tokens_measured,
|
|
1165
|
+
"primary_cost": result.cost_measured,
|
|
1166
|
+
"external_tokens": result.external_tokens_measured,
|
|
1167
|
+
"external_cost": result.external_cost_measured,
|
|
1168
|
+
"shifted_cost": shifted_cost_known,
|
|
1169
|
+
"provider_cache": result.provider_cached_tokens_measured,
|
|
1170
|
+
"byte_metrics": byte_metrics_observed,
|
|
1171
|
+
"wall_time": result.wall_time_seconds >= 0,
|
|
1172
|
+
},
|
|
1173
|
+
"proxy_metrics": {
|
|
1174
|
+
"byte_metrics_observed": byte_metrics_observed,
|
|
1175
|
+
"token_proxy": "chars_div_4",
|
|
1176
|
+
"bytes_per_token": TOKEN_PROXY_BYTES_PER_TOKEN,
|
|
1177
|
+
"claim_boundary": "proxy_only_not_hosted_token_savings",
|
|
1178
|
+
},
|
|
1145
1179
|
}
|
|
1146
1180
|
with csv_file_lock(path, create_parent=True):
|
|
1147
1181
|
fd = _open_regular_no_symlink(path, os.O_CREAT | os.O_APPEND | os.O_WRONLY, 0o600, create_parent=True)
|
|
@@ -1283,7 +1317,9 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
|
|
|
1283
1317
|
seen_tasks_by_variant: dict[str, set[str]] = {}
|
|
1284
1318
|
successful_tasks_by_variant: dict[str, set[str]] = {}
|
|
1285
1319
|
|
|
1286
|
-
for row in rows:
|
|
1320
|
+
for row_index, raw_row in enumerate(rows, start=1):
|
|
1321
|
+
row = dict(raw_row)
|
|
1322
|
+
row["_row_index"] = str(row_index)
|
|
1287
1323
|
variant = row.get("variant") or "unknown"
|
|
1288
1324
|
task_id = row.get("task_id") or "unknown"
|
|
1289
1325
|
seen_tasks_by_variant.setdefault(variant, set()).add(task_id)
|
|
@@ -1566,7 +1602,215 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
|
|
|
1566
1602
|
len(baseline_values),
|
|
1567
1603
|
)
|
|
1568
1604
|
|
|
1605
|
+
def row_indices_for(rows_for_task: list[dict[str, str]]) -> list[int]:
    """Collect the ``_row_index`` values of the given rows, in row order.

    Rows whose ``_row_index`` cannot be read by
    ``row_optional_nonnegative_int`` (it returns ``None``) are left out.
    """
    parsed = (row_optional_nonnegative_int(row, "_row_index") for row in rows_for_task)
    return [index for index in parsed if index is not None]
|
|
1612
|
+
|
|
1613
|
+
def all_rows_bool(rows_for_task: list[dict[str, str]], key: str) -> bool:
    """Return True only when there is at least one row and ``row_bool`` is truthy for every row."""
    if not rows_for_task:
        # An empty bucket never counts as "measured".
        return False
    for row in rows_for_task:
        if not row_bool(row, key):
            return False
    return True
|
|
1615
|
+
|
|
1616
|
+
def all_rows_optional_int(rows_for_task: list[dict[str, str]], key: str) -> list[int] | None:
    """Return the integer value of *key* for every row, or ``None`` if incomplete.

    The result is ``None`` when there are no rows or when
    ``row_optional_nonnegative_int`` yields ``None`` for any row.
    """
    collected = []
    for row in rows_for_task:
        collected.append(row_optional_nonnegative_int(row, key))
    if not collected:
        return None
    if any(item is None for item in collected):
        return None
    return collected
|
|
1621
|
+
|
|
1622
|
+
def all_rows_optional_float(rows_for_task: list[dict[str, str]], key: str) -> list[float] | None:
    """Return the float value of *key* for every row, or ``None`` if incomplete.

    The result is ``None`` when there are no rows or when
    ``row_optional_float`` yields ``None`` for any row.
    """
    collected = []
    for row in rows_for_task:
        collected.append(row_optional_float(row, key))
    if not collected:
        return None
    if any(item is None for item in collected):
        return None
    return collected
|
|
1627
|
+
|
|
1628
|
+
def average_optional_int(rows_for_task: list[dict[str, str]], key: str) -> float | None:
    """Return the mean of *key* across the rows, or ``None`` when ``all_rows_optional_int`` has no complete data."""
    readings = all_rows_optional_int(rows_for_task, key)
    if not readings:
        return None
    return sum(readings) / len(readings)
|
|
1631
|
+
|
|
1632
|
+
def average_optional_float(rows_for_task: list[dict[str, str]], key: str) -> float | None:
    """Return the mean of *key* across the rows, or ``None`` when ``all_rows_optional_float`` has no complete data."""
    readings = all_rows_optional_float(rows_for_task, key)
    if not readings:
        return None
    return sum(readings) / len(readings)
|
|
1635
|
+
|
|
1636
|
+
def total_optional_int(rows_for_task: list[dict[str, str]], key: str) -> int | None:
    """Return the sum of *key* across the rows, or ``None`` when ``all_rows_optional_int`` returns ``None``."""
    readings = all_rows_optional_int(rows_for_task, key)
    if readings is None:
        return None
    return sum(readings)
|
|
1639
|
+
|
|
1640
|
+
def all_rows_shifted_cost_measured(rows_for_task: list[dict[str, str]]) -> bool:
    """Return True only when every row has a measured, readable shifted cost.

    A row qualifies when ``row_cost_shift_measured`` is truthy for it and
    ``row_optional_float`` can read its ``total_cost_with_shift_usd``.
    An empty bucket yields ``False``.
    """
    if not rows_for_task:
        return False
    for row in rows_for_task:
        if not row_cost_shift_measured(row):
            return False
        if row_optional_float(row, "total_cost_with_shift_usd") is None:
            return False
    return True
|
|
1645
|
+
|
|
1646
|
+
def matched_side_evidence(variant: str, task_id: str, rows_for_task: list[dict[str, str]]) -> dict[str, Any]:
    """Build the evidence record for one side (variant/task bucket) of a matched pair.

    Each metric section carries a ``measured`` flag (or, for bytes, a
    ``measurement`` label). When a metric was not measured on every row, its
    average/total is reported as ``None`` rather than a partial figure.
    """
    # Availability flags: each requires every row in the bucket to qualify.
    tokens_measured = all_rows_bool(rows_for_task, "primary_tokens_measured")
    cost_measured = all_rows_bool(rows_for_task, "cost_measured")
    shift_measured = all_rows_shifted_cost_measured(rows_for_task)
    cache_measured = all_rows_bool(rows_for_task, "provider_cached_tokens_measured")
    ext_tokens_measured = all_rows_bool(rows_for_task, "external_tokens_measured")
    ext_cost_measured = all_rows_bool(rows_for_task, "external_cost_measured")
    correction_counts = all_rows_optional_int(rows_for_task, "corrections")

    # Byte metrics count as observed only if every row has both readings.
    before_readings = [row_optional_nonnegative_int(row, "bytes_before") for row in rows_for_task]
    after_readings = [row_optional_nonnegative_int(row, "bytes_after") for row in rows_for_task]
    bytes_observed = bool(rows_for_task) and all(
        reading is not None for reading in (*before_readings, *after_readings)
    )
    if bytes_observed:
        before_total = sum(before_readings)
        after_total = sum(after_readings)
        delta_total = after_total - before_total
        bytes_section = {
            "measurement": "observed",
            "before_total": before_total,
            "after_total": after_total,
            "delta_total": delta_total,
            # Byte delta converted to a token proxy; int() truncates toward zero.
            "token_proxy_delta": int(delta_total / TOKEN_PROXY_BYTES_PER_TOKEN),
            "token_proxy": "chars_div_4_proxy_only",
        }
    else:
        bytes_section = {
            "measurement": "unavailable",
            "before_total": None,
            "after_total": None,
            "delta_total": None,
            "token_proxy_delta": None,
            "token_proxy": "unavailable",
        }

    evidence: dict[str, Any] = {
        "variant": variant,
        "task_id": task_id,
        "run_count": len(rows_for_task),
        "row_indices": row_indices_for(rows_for_task),
    }

    if tokens_measured:
        token_average = average_optional_int(rows_for_task, "total_tokens")
        token_total = total_optional_int(rows_for_task, "total_tokens")
    else:
        token_average = None
        token_total = None
    evidence["primary_tokens"] = {
        "measured": tokens_measured,
        "average": token_average,
        "total": token_total,
    }

    cost_average = None
    if cost_measured:
        cost_average = average_optional_float(rows_for_task, "cost_usd")
    evidence["primary_cost_usd"] = {
        "measured": cost_measured,
        "average": cost_average,
    }

    shift_average = None
    if shift_measured:
        shift_average = average_optional_float(rows_for_task, "total_cost_with_shift_usd")
    evidence["total_cost_with_shift_usd"] = {
        "measured": shift_measured,
        "average": shift_average,
    }

    ext_tokens_total = None
    if ext_tokens_measured:
        ext_tokens_total = total_optional_int(rows_for_task, "external_tokens")
    evidence["external_tokens"] = {
        "measured": ext_tokens_measured,
        "total": ext_tokens_total,
    }

    ext_cost_total = None
    if ext_cost_measured:
        ext_cost_total = sum(row_float(row, "external_cost_usd") for row in rows_for_task)
    evidence["external_cost_usd"] = {
        "measured": ext_cost_measured,
        "total": ext_cost_total,
    }

    evidence["bytes"] = bytes_section

    evidence["wall_time_seconds"] = {
        "measured": all_rows_optional_float(rows_for_task, "wall_time_seconds") is not None,
        "average": average_optional_float(rows_for_task, "wall_time_seconds"),
    }

    cache_average = None
    if cache_measured:
        cache_average = average_optional_int(rows_for_task, "provider_cached_tokens")
    evidence["provider_cached_tokens"] = {
        "measured": cache_measured,
        "average": cache_average,
    }

    corrections_average = None
    if correction_counts:
        corrections_average = sum(correction_counts) / len(correction_counts)
    evidence["corrections"] = {
        "measured": correction_counts is not None,
        "average": corrections_average,
    }
    return evidence
|
|
1721
|
+
|
|
1722
|
+
def matched_pair_evidence_entry(
    variant: str,
    task_id: str,
    quality_gate: str,
) -> dict[str, Any]:
    """Build one matched-pair evidence record comparing *variant* with the baseline on *task_id*.

    Reads the successful rows of both sides from the enclosing scope's
    ``successful_rows_by_variant_task`` (a missing variant/task key raises
    ``KeyError``). Token and shifted-cost deltas are only filled in when the
    quality gate is ``"pass"``, both sides were measured, and the baseline
    average is a positive number; otherwise they are ``None``.
    """
    baseline_rows = successful_rows_by_variant_task[baseline_variant][task_id]
    variant_rows = successful_rows_by_variant_task[variant][task_id]
    baseline_side = matched_side_evidence(baseline_variant, task_id, baseline_rows)
    variant_side = matched_side_evidence(variant, task_id, variant_rows)

    gate_passed = quality_gate == "pass"

    def claim_allowed(metric: str) -> bool:
        # A claim needs a passing gate, both sides measured, and numeric
        # averages with a strictly positive baseline (used as the divisor).
        base_section = baseline_side[metric]
        variant_section = variant_side[metric]
        if not gate_passed:
            return False
        if not bool(base_section["measured"]):
            return False
        if not bool(variant_section["measured"]):
            return False
        base_avg = base_section["average"]
        if not (isinstance(base_avg, (int, float)) and base_avg > 0):
            return False
        return isinstance(variant_section["average"], (int, float))

    def delta_and_savings(metric: str, allowed: bool) -> tuple[Any, Any]:
        # Returns (variant - baseline, savings percentage relative to baseline).
        if not allowed:
            return None, None
        base_avg = baseline_side[metric]["average"]
        variant_avg = variant_side[metric]["average"]
        return variant_avg - base_avg, (base_avg - variant_avg) / base_avg * 100.0

    tokens_allowed = claim_allowed("primary_tokens")
    shifted_cost_allowed = claim_allowed("total_cost_with_shift_usd")
    token_delta, token_savings_pct = delta_and_savings("primary_tokens", tokens_allowed)
    cost_delta, cost_savings_pct = delta_and_savings("total_cost_with_shift_usd", shifted_cost_allowed)

    # Byte comparison is proxy-only and needs observed totals on both sides.
    baseline_after = baseline_side["bytes"]["after_total"]
    variant_after = variant_side["bytes"]["after_total"]
    if isinstance(baseline_after, int) and isinstance(variant_after, int):
        bytes_after_delta = variant_after - baseline_after
        proxy_after_delta = int(bytes_after_delta / TOKEN_PROXY_BYTES_PER_TOKEN)
    else:
        bytes_after_delta = None
        proxy_after_delta = None

    measurements = {
        "baseline": baseline_side,
        "variant": variant_side,
    }
    delta = {
        "primary_tokens_average": token_delta,
        "token_savings_pct": token_savings_pct,
        "total_cost_with_shift_usd_average": cost_delta,
        "cost_savings_pct_with_shift": cost_savings_pct,
        "bytes_after_total": bytes_after_delta,
        "token_proxy_after_total": proxy_after_delta,
        "proxy_measurement": "chars_div_4_proxy_only",
    }
    claim_boundary = {
        "quality_gate": quality_gate,
        "token_savings_claim_allowed": tokens_allowed,
        "shifted_cost_claim_allowed": shifted_cost_allowed,
        "byte_proxy_only": True,
        "requires_matched_successful_tasks": True,
        "raw_estimate_only_claim_allowed": False,
    }
    return {
        "schema_version": MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION,
        "task_id": task_id,
        "baseline_variant": baseline_variant,
        "variant": variant,
        "transform_id": variant,
        "quality_gate": quality_gate,
        "evidence_kind": "matched_successful_task_bucket",
        "measurements": measurements,
        "delta": delta,
        "claim_boundary": claim_boundary,
    }
|
|
1811
|
+
|
|
1569
1812
|
comparisons: list[dict[str, Any]] = []
|
|
1813
|
+
matched_pair_evidence: list[dict[str, Any]] = []
|
|
1570
1814
|
baseline = by_variant.get(baseline_variant)
|
|
1571
1815
|
baseline_successful_tasks = successful_tasks_by_variant.get(baseline_variant, set())
|
|
1572
1816
|
baseline_failure_rate = baseline.get("failure_rate") if baseline else None
|
|
@@ -1680,6 +1924,8 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
|
|
|
1680
1924
|
else:
|
|
1681
1925
|
comparison["cost_savings_pct_with_shift"] = None
|
|
1682
1926
|
comparison["paired_cost_task_count"] = cost_task_count
|
|
1927
|
+
for task_id in sorted(matched_tasks):
|
|
1928
|
+
matched_pair_evidence.append(matched_pair_evidence_entry(variant, task_id, quality_gate))
|
|
1683
1929
|
comparisons.append(comparison)
|
|
1684
1930
|
|
|
1685
1931
|
claim_status = "insufficient_baseline"
|
|
@@ -1712,6 +1958,7 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
|
|
|
1712
1958
|
"row_count": len(rows),
|
|
1713
1959
|
"summary_by_variant": by_variant,
|
|
1714
1960
|
"comparisons": comparisons,
|
|
1961
|
+
"matched_pair_evidence": matched_pair_evidence,
|
|
1715
1962
|
"claim_status": claim_status,
|
|
1716
1963
|
"caveat": (
|
|
1717
1964
|
"Proxy byte reductions are reported separately from matched-task token/cost metrics; "
|
|
@@ -1843,12 +2090,6 @@ def main() -> int:
|
|
|
1843
2090
|
require_no_follow_file_ops_supported()
|
|
1844
2091
|
validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json)
|
|
1845
2092
|
|
|
1846
|
-
if not args.dry_run and shutil.which(args.claude_bin) is None:
|
|
1847
|
-
# claude_bin 이 절대경로면 shutil.which 가 None 일 수 있으므로 추가 검사.
|
|
1848
|
-
if not Path(args.claude_bin).exists():
|
|
1849
|
-
print(f"claude binary not found: {args.claude_bin}", file=sys.stderr)
|
|
1850
|
-
return 2
|
|
1851
|
-
|
|
1852
2093
|
tasks = parse_tasks(args.tasks)
|
|
1853
2094
|
variants = parse_variants(args.variants)
|
|
1854
2095
|
targets = filter_targets(tasks, variants, args.task_id, args.variant)
|
|
@@ -1857,8 +2098,32 @@ def main() -> int:
|
|
|
1857
2098
|
return 1
|
|
1858
2099
|
|
|
1859
2100
|
skip_keys = existing_keys(args.csv) if args.resume else set()
|
|
2101
|
+
runnable_targets = [
|
|
2102
|
+
(task, variant)
|
|
2103
|
+
for task, variant in targets
|
|
2104
|
+
if (task.id, variant.name) not in skip_keys
|
|
2105
|
+
]
|
|
2106
|
+
placeholder_targets = [
|
|
2107
|
+
f"{task.id}/{variant.name}"
|
|
2108
|
+
for task, variant in runnable_targets
|
|
2109
|
+
if is_placeholder_success_command(task.success_command)
|
|
2110
|
+
]
|
|
2111
|
+
if placeholder_targets and not args.dry_run:
|
|
2112
|
+
print(
|
|
2113
|
+
f"{PLACEHOLDER_SUCCESS_COMMAND_MARKER}; refusing non-dry-run provider invocation for: "
|
|
2114
|
+
f"{', '.join(placeholder_targets)}",
|
|
2115
|
+
file=sys.stderr,
|
|
2116
|
+
)
|
|
2117
|
+
return 2
|
|
2118
|
+
|
|
2119
|
+
if runnable_targets and not args.dry_run and shutil.which(args.claude_bin) is None:
|
|
2120
|
+
# claude_bin 이 절대경로면 shutil.which 가 None 일 수 있으므로 추가 검사.
|
|
2121
|
+
if not Path(args.claude_bin).exists():
|
|
2122
|
+
print(f"claude binary not found: {args.claude_bin}", file=sys.stderr)
|
|
2123
|
+
return 2
|
|
2124
|
+
|
|
1860
2125
|
project_root = args.project_root.resolve()
|
|
1861
|
-
claude_ver = "dry-run" if args.dry_run else claude_version(args.claude_bin)
|
|
2126
|
+
claude_ver = "dry-run" if args.dry_run else (claude_version(args.claude_bin) if runnable_targets else "skipped")
|
|
1862
2127
|
|
|
1863
2128
|
completed = 0
|
|
1864
2129
|
for task, variant in targets:
|