@ictechgy/context-guard 0.4.1 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +9 -0
- package/README.ko.md +61 -32
- package/README.md +90 -22
- package/context-guard-kit/README.md +39 -26
- package/context-guard-kit/benchmark_runner.py +273 -8
- package/context-guard-kit/claude_transcript_cost_audit.py +325 -12
- package/context-guard-kit/context_compress.py +153 -1
- package/context-guard-kit/context_filter.py +446 -0
- package/context-guard-kit/context_guard_cli.py +3 -0
- package/context-guard-kit/context_guard_diet.py +677 -2
- package/context-guard-kit/context_pack.py +1694 -2
- package/context-guard-kit/cost_guard.py +1870 -0
- package/context-guard-kit/setup_wizard.py +820 -29
- package/context-guard-kit/trim_command_output.py +396 -45
- package/docs/benchmark-fixtures/learned-compression.tasks.example.json +24 -0
- package/docs/benchmark-fixtures/learned-compression.variants.example.json +10 -0
- package/docs/benchmark-fixtures/visual-ocr.tasks.example.json +24 -0
- package/docs/benchmark-fixtures/visual-ocr.variants.example.json +10 -0
- package/docs/benchmark-workflow-examples.md +40 -0
- package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +169 -0
- package/docs/benchmark-workflows/measured-token-workflow.example.json +170 -0
- package/docs/benchmark-workflows/provider-cache-telemetry.example.json +170 -0
- package/docs/cache-diagnostics-schema.md +75 -0
- package/docs/cache-diagnostics.example.json +116 -0
- package/docs/cache-diagnostics.schema.json +460 -0
- package/docs/distribution.md +4 -2
- package/docs/experimental-benchmark-fixtures.md +36 -0
- package/package.json +11 -2
- package/packaging/homebrew/context-guard.rb.template +3 -2
- package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
- package/plugins/context-guard/README.ko.md +21 -13
- package/plugins/context-guard/README.md +24 -10
- package/plugins/context-guard/bin/context-guard +3 -0
- package/plugins/context-guard/bin/context-guard-audit +325 -12
- package/plugins/context-guard/bin/context-guard-bench +273 -8
- package/plugins/context-guard/bin/context-guard-compress +153 -1
- package/plugins/context-guard/bin/context-guard-cost +1870 -0
- package/plugins/context-guard/bin/context-guard-diet +677 -2
- package/plugins/context-guard/bin/context-guard-filter +446 -0
- package/plugins/context-guard/bin/context-guard-pack +1694 -2
- package/plugins/context-guard/bin/context-guard-setup +820 -29
- package/plugins/context-guard/bin/context-guard-trim-output +396 -45
- package/plugins/context-guard/brief/README.md +10 -3
- package/plugins/context-guard/skills/optimize/SKILL.md +5 -2
- package/plugins/context-guard/skills/setup/SKILL.md +3 -1
|
@@ -108,6 +108,7 @@ CSV_COLUMNS = [
|
|
|
108
108
|
MAX_CSV_NOTE_CHARS = 500
|
|
109
109
|
MAX_CSV_ROWS = 100_000
|
|
110
110
|
CSV_FORMULA_PREFIXES = ("=", "+", "-", "@")
|
|
111
|
+
PLACEHOLDER_SUCCESS_COMMAND_MARKER = "fixture-only placeholder: replace success_command before real benchmark runs"
|
|
111
112
|
PROTECTED_VARIANT_FLAGS = frozenset({
|
|
112
113
|
"--",
|
|
113
114
|
"-p",
|
|
@@ -180,6 +181,8 @@ MAX_USAGE_COST_USD = 10**9
|
|
|
180
181
|
# 추정치이며, report에서 evidence="inferred"로 분명히 라벨링한다. 영어 텍스트 기준
|
|
181
182
|
# ~4 bytes/token의 통용 근사값을 사용한다.
|
|
182
183
|
TOKEN_PROXY_BYTES_PER_TOKEN = 4
|
|
184
|
+
BENCH_RUN_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.run-evidence.v1"
|
|
185
|
+
MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.matched-pair.v1"
|
|
183
186
|
CLAUDE_OUTPUT_MAX_BYTES = 1_000_000
|
|
184
187
|
SUCCESS_COMMAND_OUTPUT_MAX_BYTES = 64_000
|
|
185
188
|
VERSION_OUTPUT_MAX_BYTES = 16_000
|
|
@@ -395,6 +398,10 @@ class BoundedProcessResult:
|
|
|
395
398
|
output_truncated: bool = False
|
|
396
399
|
|
|
397
400
|
|
|
401
|
+
def is_placeholder_success_command(command: str | None) -> bool:
    """Report whether *command* still contains the fixture placeholder marker.

    ``None`` and the empty string are never treated as placeholders.
    """
    if not command:
        return False
    return PLACEHOLDER_SUCCESS_COMMAND_MARKER in command
|
|
403
|
+
|
|
404
|
+
|
|
398
405
|
def parse_positive_int(value: Any, *, field: str, owner: str) -> int:
|
|
399
406
|
"""Parse a JSON fixture field that must be a positive integer."""
|
|
400
407
|
if isinstance(value, bool):
|
|
@@ -940,6 +947,14 @@ def run_fixture(task: TaskFixture, variant: Variant, claude_bin: str,
|
|
|
940
947
|
success=True, notes=f"dry-run: {shlex.join(argv)}",
|
|
941
948
|
wall_time_seconds=0.0,
|
|
942
949
|
)
|
|
950
|
+
if is_placeholder_success_command(task.success_command):
|
|
951
|
+
return RunResult(
|
|
952
|
+
task_id=task.id, variant=variant.name, model=task.model, effort=task.effort,
|
|
953
|
+
tokens={k: 0 for k, _ in USAGE_KEY_GROUPS}, cost_usd=0.0,
|
|
954
|
+
success=False,
|
|
955
|
+
notes=f"{PLACEHOLDER_SUCCESS_COMMAND_MARKER}; refusing to invoke provider",
|
|
956
|
+
wall_time_seconds=elapsed_seconds_since(started_at),
|
|
957
|
+
)
|
|
943
958
|
argv[0] = executable_argv0(argv[0])
|
|
944
959
|
try:
|
|
945
960
|
proc = run_bounded_command(
|
|
@@ -1116,11 +1131,14 @@ def write_text_no_follow(path: Path, text: str) -> None:
|
|
|
1116
1131
|
|
|
1117
1132
|
def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) -> None:
|
|
1118
1133
|
shifted_cost_known = cost_shift_measured(result)
|
|
1134
|
+
byte_metrics_observed = bool(result.bytes_before or result.bytes_after)
|
|
1119
1135
|
payload = {
|
|
1136
|
+
"schema_version": BENCH_RUN_EVIDENCE_SCHEMA_VERSION,
|
|
1120
1137
|
"date": _dt.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
|
|
1121
1138
|
"claude_version": claude_ver,
|
|
1122
1139
|
"task_id": result.task_id,
|
|
1123
1140
|
"variant": result.variant,
|
|
1141
|
+
"transform_id": result.variant,
|
|
1124
1142
|
"success": result.success,
|
|
1125
1143
|
"primary_cost_measured": result.cost_measured,
|
|
1126
1144
|
"primary_cost_usd": round(result.cost_usd, 6),
|
|
@@ -1142,6 +1160,22 @@ def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) ->
|
|
|
1142
1160
|
"hook_triggers": result.hook_triggers,
|
|
1143
1161
|
"turns": result.turns,
|
|
1144
1162
|
"notes": sanitize_csv_note(result.notes),
|
|
1163
|
+
"measurement_availability": {
|
|
1164
|
+
"primary_tokens": result.primary_tokens_measured,
|
|
1165
|
+
"primary_cost": result.cost_measured,
|
|
1166
|
+
"external_tokens": result.external_tokens_measured,
|
|
1167
|
+
"external_cost": result.external_cost_measured,
|
|
1168
|
+
"shifted_cost": shifted_cost_known,
|
|
1169
|
+
"provider_cache": result.provider_cached_tokens_measured,
|
|
1170
|
+
"byte_metrics": byte_metrics_observed,
|
|
1171
|
+
"wall_time": result.wall_time_seconds >= 0,
|
|
1172
|
+
},
|
|
1173
|
+
"proxy_metrics": {
|
|
1174
|
+
"byte_metrics_observed": byte_metrics_observed,
|
|
1175
|
+
"token_proxy": "chars_div_4",
|
|
1176
|
+
"bytes_per_token": TOKEN_PROXY_BYTES_PER_TOKEN,
|
|
1177
|
+
"claim_boundary": "proxy_only_not_hosted_token_savings",
|
|
1178
|
+
},
|
|
1145
1179
|
}
|
|
1146
1180
|
with csv_file_lock(path, create_parent=True):
|
|
1147
1181
|
fd = _open_regular_no_symlink(path, os.O_CREAT | os.O_APPEND | os.O_WRONLY, 0o600, create_parent=True)
|
|
@@ -1283,7 +1317,9 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
|
|
|
1283
1317
|
seen_tasks_by_variant: dict[str, set[str]] = {}
|
|
1284
1318
|
successful_tasks_by_variant: dict[str, set[str]] = {}
|
|
1285
1319
|
|
|
1286
|
-
for
|
|
1320
|
+
for row_index, raw_row in enumerate(rows, start=1):
|
|
1321
|
+
row = dict(raw_row)
|
|
1322
|
+
row["_row_index"] = str(row_index)
|
|
1287
1323
|
variant = row.get("variant") or "unknown"
|
|
1288
1324
|
task_id = row.get("task_id") or "unknown"
|
|
1289
1325
|
seen_tasks_by_variant.setdefault(variant, set()).add(task_id)
|
|
@@ -1566,7 +1602,215 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
|
|
|
1566
1602
|
len(baseline_values),
|
|
1567
1603
|
)
|
|
1568
1604
|
|
|
1605
|
+
def row_indices_for(rows_for_task: list[dict[str, str]]) -> list[int]:
    """Collect the parseable ``_row_index`` values of the given rows, in order.

    Rows whose ``_row_index`` does not parse are left out.
    """
    parsed = (row_optional_nonnegative_int(row, "_row_index") for row in rows_for_task)
    return [position for position in parsed if position is not None]
|
|
1612
|
+
|
|
1613
|
+
def all_rows_bool(rows_for_task: list[dict[str, str]], key: str) -> bool:
    """Return True only for a non-empty bucket in which every row is truthy for *key*."""
    if not rows_for_task:
        # An empty bucket never counts as "all measured".
        return False
    return all(row_bool(row, key) for row in rows_for_task)
|
|
1615
|
+
|
|
1616
|
+
def all_rows_optional_int(rows_for_task: list[dict[str, str]], key: str) -> list[int] | None:
    """Parse *key* from every row as a non-negative int, all-or-nothing.

    Returns the parsed values in row order, or ``None`` when the bucket is
    empty or any single row fails to yield a value.
    """
    parsed = [row_optional_nonnegative_int(row, key) for row in rows_for_task]
    if not parsed:
        return None
    if any(item is None for item in parsed):
        return None
    return list(parsed)
|
|
1621
|
+
|
|
1622
|
+
def all_rows_optional_float(rows_for_task: list[dict[str, str]], key: str) -> list[float] | None:
    """Parse *key* from every row as a float, all-or-nothing.

    Returns the parsed values in row order, or ``None`` when the bucket is
    empty or any single row fails to yield a value.
    """
    parsed = [row_optional_float(row, key) for row in rows_for_task]
    if not parsed:
        return None
    if any(item is None for item in parsed):
        return None
    return list(parsed)
|
|
1627
|
+
|
|
1628
|
+
def average_optional_int(rows_for_task: list[dict[str, str]], key: str) -> float | None:
    """Mean of the integer column *key*, or ``None`` unless every row supplies it."""
    samples = all_rows_optional_int(rows_for_task, key)
    if not samples:
        return None
    return sum(samples) / len(samples)
|
|
1631
|
+
|
|
1632
|
+
def average_optional_float(rows_for_task: list[dict[str, str]], key: str) -> float | None:
    """Mean of the float column *key*, or ``None`` unless every row supplies it."""
    samples = all_rows_optional_float(rows_for_task, key)
    if not samples:
        return None
    return sum(samples) / len(samples)
|
|
1635
|
+
|
|
1636
|
+
def total_optional_int(rows_for_task: list[dict[str, str]], key: str) -> int | None:
    """Sum of the integer column *key*, or ``None`` unless every row supplies it."""
    samples = all_rows_optional_int(rows_for_task, key)
    if samples is None:
        return None
    return sum(samples)
|
|
1639
|
+
|
|
1640
|
+
def all_rows_shifted_cost_measured(rows_for_task: list[dict[str, str]]) -> bool:
    """Return True when the bucket is non-empty and every row has a usable shifted cost.

    A row qualifies only if ``row_cost_shift_measured`` accepts it and its
    ``total_cost_with_shift_usd`` field parses to a float.
    """
    if not rows_for_task:
        return False
    for row in rows_for_task:
        if not row_cost_shift_measured(row):
            return False
        if row_optional_float(row, "total_cost_with_shift_usd") is None:
            return False
    return True
|
|
1645
|
+
|
|
1646
|
+
def matched_side_evidence(variant: str, task_id: str, rows_for_task: list[dict[str, str]]) -> dict[str, Any]:
    """Summarize one side (baseline or variant) of a matched task bucket.

    Every metric section carries a flag saying whether all rows in
    ``rows_for_task`` supplied that metric. Aggregates are ``None`` unless the
    flag holds, so a partially measured bucket never produces a number.
    """

    def gated(flag: bool, compute: Any) -> Any:
        # Evaluate the aggregate lazily, and only when the whole bucket is measured.
        return compute() if flag else None

    tokens_ok = all_rows_bool(rows_for_task, "primary_tokens_measured")
    cost_ok = all_rows_bool(rows_for_task, "cost_measured")
    shifted_ok = all_rows_shifted_cost_measured(rows_for_task)
    cache_ok = all_rows_bool(rows_for_task, "provider_cached_tokens_measured")
    ext_tokens_ok = all_rows_bool(rows_for_task, "external_tokens_measured")
    ext_cost_ok = all_rows_bool(rows_for_task, "external_cost_measured")
    corrections = all_rows_optional_int(rows_for_task, "corrections")

    before_sizes = [row_optional_nonnegative_int(row, "bytes_before") for row in rows_for_task]
    after_sizes = [row_optional_nonnegative_int(row, "bytes_after") for row in rows_for_task]
    bytes_observed = bool(rows_for_task) and all(
        size is not None for size in before_sizes + after_sizes
    )
    if bytes_observed:
        before_total = sum(before_sizes)
        after_total = sum(after_sizes)
        size_delta = after_total - before_total
        bytes_section = {
            "measurement": "observed",
            "before_total": before_total,
            "after_total": after_total,
            "delta_total": size_delta,
            # Byte delta divided by the bytes-per-token proxy, truncated toward zero.
            "token_proxy_delta": int(size_delta / TOKEN_PROXY_BYTES_PER_TOKEN),
            "token_proxy": "chars_div_4_proxy_only",
        }
    else:
        bytes_section = {
            "measurement": "unavailable",
            "before_total": None,
            "after_total": None,
            "delta_total": None,
            "token_proxy_delta": None,
            "token_proxy": "unavailable",
        }

    primary_tokens_section = {
        "measured": tokens_ok,
        "average": gated(tokens_ok, lambda: average_optional_int(rows_for_task, "total_tokens")),
        "total": gated(tokens_ok, lambda: total_optional_int(rows_for_task, "total_tokens")),
    }
    primary_cost_section = {
        "measured": cost_ok,
        "average": gated(cost_ok, lambda: average_optional_float(rows_for_task, "cost_usd")),
    }
    shifted_cost_section = {
        "measured": shifted_ok,
        "average": gated(
            shifted_ok,
            lambda: average_optional_float(rows_for_task, "total_cost_with_shift_usd"),
        ),
    }
    external_tokens_section = {
        "measured": ext_tokens_ok,
        "total": gated(ext_tokens_ok, lambda: total_optional_int(rows_for_task, "external_tokens")),
    }
    external_cost_section = {
        "measured": ext_cost_ok,
        "total": gated(
            ext_cost_ok,
            lambda: sum(row_float(row, "external_cost_usd") for row in rows_for_task),
        ),
    }
    wall_time_section = {
        "measured": all_rows_optional_float(rows_for_task, "wall_time_seconds") is not None,
        "average": average_optional_float(rows_for_task, "wall_time_seconds"),
    }
    provider_cache_section = {
        "measured": cache_ok,
        "average": gated(
            cache_ok,
            lambda: average_optional_int(rows_for_task, "provider_cached_tokens"),
        ),
    }
    if corrections:
        corrections_average = sum(corrections) / len(corrections)
    else:
        corrections_average = None
    corrections_section = {
        "measured": corrections is not None,
        "average": corrections_average,
    }

    return {
        "variant": variant,
        "task_id": task_id,
        "run_count": len(rows_for_task),
        "row_indices": row_indices_for(rows_for_task),
        "primary_tokens": primary_tokens_section,
        "primary_cost_usd": primary_cost_section,
        "total_cost_with_shift_usd": shifted_cost_section,
        "external_tokens": external_tokens_section,
        "external_cost_usd": external_cost_section,
        "bytes": bytes_section,
        "wall_time_seconds": wall_time_section,
        "provider_cached_tokens": provider_cache_section,
        "corrections": corrections_section,
    }
|
|
1721
|
+
|
|
1722
|
+
def matched_pair_evidence_entry(
    variant: str,
    task_id: str,
    quality_gate: str,
) -> dict[str, Any]:
    """Build the matched-pair evidence record for *variant* versus the baseline on one task.

    Token and shifted-cost deltas are filled in only when the quality gate is
    ``"pass"``, both sides are fully measured, and the baseline average is a
    positive number; otherwise they are ``None``. Byte deltas are reported
    separately and are always flagged as proxy-only.
    """
    gate_passed = quality_gate == "pass"

    def claim_allowed(base_metric: dict[str, Any], variant_metric: dict[str, Any]) -> bool:
        # A claim needs a passing gate, both sides measured, and a positive
        # numeric baseline average to divide by.
        if not gate_passed:
            return False
        if not base_metric["measured"] or not variant_metric["measured"]:
            return False
        base_average = base_metric["average"]
        if not isinstance(base_average, (int, float)):
            return False
        if not base_average > 0:
            return False
        return isinstance(variant_metric["average"], (int, float))

    def delta_and_savings(
        base_metric: dict[str, Any],
        variant_metric: dict[str, Any],
        allowed: bool,
    ) -> tuple[Any, Any]:
        # Returns (variant - baseline, percentage saved relative to baseline).
        if not allowed:
            return None, None
        base_average = base_metric["average"]
        variant_average = variant_metric["average"]
        difference = variant_average - base_average
        savings_pct = (base_average - variant_average) / base_average * 100.0
        return difference, savings_pct

    baseline_rows = successful_rows_by_variant_task[baseline_variant][task_id]
    variant_rows = successful_rows_by_variant_task[variant][task_id]
    baseline_side = matched_side_evidence(baseline_variant, task_id, baseline_rows)
    variant_side = matched_side_evidence(variant, task_id, variant_rows)

    tokens_allowed = claim_allowed(baseline_side["primary_tokens"], variant_side["primary_tokens"])
    shifted_allowed = claim_allowed(
        baseline_side["total_cost_with_shift_usd"],
        variant_side["total_cost_with_shift_usd"],
    )
    token_delta, token_savings_pct = delta_and_savings(
        baseline_side["primary_tokens"], variant_side["primary_tokens"], tokens_allowed
    )
    cost_delta, cost_savings_pct = delta_and_savings(
        baseline_side["total_cost_with_shift_usd"],
        variant_side["total_cost_with_shift_usd"],
        shifted_allowed,
    )

    baseline_bytes_after = baseline_side["bytes"]["after_total"]
    variant_bytes_after = variant_side["bytes"]["after_total"]
    if isinstance(baseline_bytes_after, int) and isinstance(variant_bytes_after, int):
        bytes_after_delta = variant_bytes_after - baseline_bytes_after
        proxy_after_delta = int(bytes_after_delta / TOKEN_PROXY_BYTES_PER_TOKEN)
    else:
        bytes_after_delta = None
        proxy_after_delta = None

    measurements = {
        "baseline": baseline_side,
        "variant": variant_side,
    }
    delta_section = {
        "primary_tokens_average": token_delta,
        "token_savings_pct": token_savings_pct,
        "total_cost_with_shift_usd_average": cost_delta,
        "cost_savings_pct_with_shift": cost_savings_pct,
        "bytes_after_total": bytes_after_delta,
        "token_proxy_after_total": proxy_after_delta,
        "proxy_measurement": "chars_div_4_proxy_only",
    }
    claim_boundary = {
        "quality_gate": quality_gate,
        "token_savings_claim_allowed": tokens_allowed,
        "shifted_cost_claim_allowed": shifted_allowed,
        "byte_proxy_only": True,
        "requires_matched_successful_tasks": True,
        "raw_estimate_only_claim_allowed": False,
    }
    return {
        "schema_version": MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION,
        "task_id": task_id,
        "baseline_variant": baseline_variant,
        "variant": variant,
        "transform_id": variant,
        "quality_gate": quality_gate,
        "evidence_kind": "matched_successful_task_bucket",
        "measurements": measurements,
        "delta": delta_section,
        "claim_boundary": claim_boundary,
    }
|
|
1811
|
+
|
|
1569
1812
|
comparisons: list[dict[str, Any]] = []
|
|
1813
|
+
matched_pair_evidence: list[dict[str, Any]] = []
|
|
1570
1814
|
baseline = by_variant.get(baseline_variant)
|
|
1571
1815
|
baseline_successful_tasks = successful_tasks_by_variant.get(baseline_variant, set())
|
|
1572
1816
|
baseline_failure_rate = baseline.get("failure_rate") if baseline else None
|
|
@@ -1680,6 +1924,8 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
|
|
|
1680
1924
|
else:
|
|
1681
1925
|
comparison["cost_savings_pct_with_shift"] = None
|
|
1682
1926
|
comparison["paired_cost_task_count"] = cost_task_count
|
|
1927
|
+
for task_id in sorted(matched_tasks):
|
|
1928
|
+
matched_pair_evidence.append(matched_pair_evidence_entry(variant, task_id, quality_gate))
|
|
1683
1929
|
comparisons.append(comparison)
|
|
1684
1930
|
|
|
1685
1931
|
claim_status = "insufficient_baseline"
|
|
@@ -1712,6 +1958,7 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
|
|
|
1712
1958
|
"row_count": len(rows),
|
|
1713
1959
|
"summary_by_variant": by_variant,
|
|
1714
1960
|
"comparisons": comparisons,
|
|
1961
|
+
"matched_pair_evidence": matched_pair_evidence,
|
|
1715
1962
|
"claim_status": claim_status,
|
|
1716
1963
|
"caveat": (
|
|
1717
1964
|
"Proxy byte reductions are reported separately from matched-task token/cost metrics; "
|
|
@@ -1843,12 +2090,6 @@ def main() -> int:
|
|
|
1843
2090
|
require_no_follow_file_ops_supported()
|
|
1844
2091
|
validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json)
|
|
1845
2092
|
|
|
1846
|
-
if not args.dry_run and shutil.which(args.claude_bin) is None:
|
|
1847
|
-
# claude_bin 이 절대경로면 shutil.which 가 None 일 수 있으므로 추가 검사.
|
|
1848
|
-
if not Path(args.claude_bin).exists():
|
|
1849
|
-
print(f"claude binary not found: {args.claude_bin}", file=sys.stderr)
|
|
1850
|
-
return 2
|
|
1851
|
-
|
|
1852
2093
|
tasks = parse_tasks(args.tasks)
|
|
1853
2094
|
variants = parse_variants(args.variants)
|
|
1854
2095
|
targets = filter_targets(tasks, variants, args.task_id, args.variant)
|
|
@@ -1857,8 +2098,32 @@ def main() -> int:
|
|
|
1857
2098
|
return 1
|
|
1858
2099
|
|
|
1859
2100
|
skip_keys = existing_keys(args.csv) if args.resume else set()
|
|
2101
|
+
runnable_targets = [
|
|
2102
|
+
(task, variant)
|
|
2103
|
+
for task, variant in targets
|
|
2104
|
+
if (task.id, variant.name) not in skip_keys
|
|
2105
|
+
]
|
|
2106
|
+
placeholder_targets = [
|
|
2107
|
+
f"{task.id}/{variant.name}"
|
|
2108
|
+
for task, variant in runnable_targets
|
|
2109
|
+
if is_placeholder_success_command(task.success_command)
|
|
2110
|
+
]
|
|
2111
|
+
if placeholder_targets and not args.dry_run:
|
|
2112
|
+
print(
|
|
2113
|
+
f"{PLACEHOLDER_SUCCESS_COMMAND_MARKER}; refusing non-dry-run provider invocation for: "
|
|
2114
|
+
f"{', '.join(placeholder_targets)}",
|
|
2115
|
+
file=sys.stderr,
|
|
2116
|
+
)
|
|
2117
|
+
return 2
|
|
2118
|
+
|
|
2119
|
+
if runnable_targets and not args.dry_run and shutil.which(args.claude_bin) is None:
|
|
2120
|
+
# claude_bin 이 절대경로면 shutil.which 가 None 일 수 있으므로 추가 검사.
|
|
2121
|
+
if not Path(args.claude_bin).exists():
|
|
2122
|
+
print(f"claude binary not found: {args.claude_bin}", file=sys.stderr)
|
|
2123
|
+
return 2
|
|
2124
|
+
|
|
1860
2125
|
project_root = args.project_root.resolve()
|
|
1861
|
-
claude_ver = "dry-run" if args.dry_run else claude_version(args.claude_bin)
|
|
2126
|
+
claude_ver = "dry-run" if args.dry_run else (claude_version(args.claude_bin) if runnable_targets else "skipped")
|
|
1862
2127
|
|
|
1863
2128
|
completed = 0
|
|
1864
2129
|
for task, variant in targets:
|
|
@@ -44,6 +44,55 @@ CODE_SIGNAL_RE = re.compile(
|
|
|
44
44
|
r"(^\s*(def |class |function |func |import |from \S+ import |public |private |const |let |var |#include|package )"
|
|
45
45
|
r"|[{};]\s*$|=>|::)"
|
|
46
46
|
)
|
|
47
|
+
CODE_FENCE_RE = re.compile(r"(?m)^\s*```")
|
|
48
|
+
JSON_KEY_RE = re.compile(r'"(?:[^"\\]|\\.)*"\s*:')
|
|
49
|
+
QUOTED_STRING_RE = re.compile(r"""(?x)
|
|
50
|
+
"(?:[^"\\]|\\.)*" |
|
|
51
|
+
'(?:[^'\\]|\\.)*'
|
|
52
|
+
""")
|
|
53
|
+
HASH_RE = re.compile(r"\b(?:[0-9a-fA-F]{32,}|sha256:[0-9a-fA-F]{32,})\b")
|
|
54
|
+
PATH_RE = re.compile(
|
|
55
|
+
r"(?x)(?:"
|
|
56
|
+
r"(?<![\w.-])/(?:[A-Za-z0-9._@%+=:-]+/)*[A-Za-z0-9._@%+=:-]+"
|
|
57
|
+
r"|"
|
|
58
|
+
r"\b[A-Za-z]:\\(?:[^\\\s:\"'<>|]+\\)*[^\\\s:\"'<>|]+"
|
|
59
|
+
r"|"
|
|
60
|
+
r"\b[A-Za-z0-9._-]+\#path:[0-9a-f]{12}\b"
|
|
61
|
+
r")"
|
|
62
|
+
)
|
|
63
|
+
STACK_FRAME_RE = re.compile(
|
|
64
|
+
r"(?m)^\s*(?:File\s+\"[^\"]+\",\s+line\s+\d+,\s+in\s+\S+|at\s+\S+.*\([^)]*:\d+(?::\d+)?\))"
|
|
65
|
+
)
|
|
66
|
+
IDENTIFIER_RE = re.compile(r"\b[A-Za-z_][A-Za-z0-9_]*(?:[A-Z][A-Za-z0-9_]*)?\b")
|
|
67
|
+
NUMERIC_CONSTANT_RE = re.compile(r"(?<![\w.])[-+]?(?:0x[0-9A-Fa-f]+|\d+(?:\.\d+)?)(?![\w.])")
|
|
68
|
+
PROTECTED_ZONE_KEYS = (
|
|
69
|
+
"code_fence",
|
|
70
|
+
"diff",
|
|
71
|
+
"identifier",
|
|
72
|
+
"numeric_constant",
|
|
73
|
+
"hash",
|
|
74
|
+
"path",
|
|
75
|
+
"stack_frame",
|
|
76
|
+
"quoted_string",
|
|
77
|
+
"json_key",
|
|
78
|
+
)
|
|
79
|
+
PROTECTED_ALLOWED_TRANSFORMS = (
|
|
80
|
+
"exact_dedupe",
|
|
81
|
+
"structural_window",
|
|
82
|
+
"line_truncate",
|
|
83
|
+
"whitespace_normalize",
|
|
84
|
+
"json_compact",
|
|
85
|
+
"artifact_retrieval",
|
|
86
|
+
)
|
|
87
|
+
PROTECTED_DENIED_TRANSFORMS = (
|
|
88
|
+
"semantic_compress",
|
|
89
|
+
"paraphrase",
|
|
90
|
+
"identifier_rewrite",
|
|
91
|
+
"numeric_rewrite",
|
|
92
|
+
"hash_rewrite",
|
|
93
|
+
"path_rewrite",
|
|
94
|
+
"quoted_literal_rewrite",
|
|
95
|
+
)
|
|
47
96
|
|
|
48
97
|
|
|
49
98
|
def bounded_int(value: object, default: int, minimum: int, maximum: int) -> int:
|
|
@@ -173,6 +222,85 @@ def classify_content(text: str) -> str:
|
|
|
173
222
|
return "prose"
|
|
174
223
|
|
|
175
224
|
|
|
225
|
+
def protected_zone_counts(text: str) -> dict[str, int]:
    """Conservatively count semantic-sensitive zones without storing raw spans.

    The counts intentionally over-approximate. They are policy signals for later
    transform gates, not a parser. Metadata must never include the matched path,
    identifier, hash, or string contents because receipts are safe to share.

    Returns a mapping from zone key (in ``PROTECTED_ZONE_KEYS`` order) to its
    count; keys whose count is zero are omitted.
    """
    lines = text.splitlines()
    fence_markers = len(CODE_FENCE_RE.findall(text))
    diff_lines = sum(
        1
        for line in lines
        if DIFF_FILE_HEADER_RE.match(line)
        or DIFF_HUNK_RE.match(line)
        # Use startswith() rather than `line[:1] in "+-"`: for a blank line
        # line[:1] is "", and "" is a substring of every string, so blank
        # lines were being counted as diff lines.
        or (line.startswith(("+", "-")) and not line.startswith(("+++", "---")))
    )
    counts = {
        # Fences come in open/close pairs; an unpaired marker still counts as one zone.
        "code_fence": (fence_markers + 1) // 2,
        "diff": diff_lines,
        "identifier": len(IDENTIFIER_RE.findall(text)),
        "numeric_constant": len(NUMERIC_CONSTANT_RE.findall(text)),
        "hash": len(HASH_RE.findall(text)),
        "path": len(PATH_RE.findall(text)),
        "stack_frame": len(STACK_FRAME_RE.findall(text)),
        "quoted_string": len(QUOTED_STRING_RE.findall(text)),
        "json_key": len(JSON_KEY_RE.findall(text)),
    }
    return {key: counts[key] for key in PROTECTED_ZONE_KEYS if counts.get(key, 0) > 0}
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def build_protected_policy(
    *,
    text: str,
    content_type: str,
    strategy_detail: dict[str, object],
    lossy: bool,
) -> dict[str, object]:
    """Build the opt-in protected-zone transform policy for a receipt.

    The policy describes which transforms are eligible and whether exact
    retrieval is expected. It makes no claim about provider-cache stability;
    cache ordering is handled by `context-guard-cost compile`.
    """
    zone_counts = protected_zone_counts(text)
    has_zones = len(zone_counts) > 0

    strategy_name = strategy_detail.get("strategy")
    if not strategy_name:
        strategy_name = "unknown"
    strategy_name = str(strategy_name)

    # Retrieval of the full input is only required when protected content was
    # found and the applied transform dropped data.
    needs_retrieval = has_zones and bool(lossy)
    if needs_retrieval:
        retrieval_scope = "sanitized_full_input"
    else:
        retrieval_scope = "compressed_output"

    return {
        "enabled": True,
        "detected": has_zones,
        "content_type": content_type,
        "zone_counts": zone_counts,
        "semantic_compress": False,
        "allowed_transforms": list(PROTECTED_ALLOWED_TRANSFORMS),
        "denied_transforms": list(PROTECTED_DENIED_TRANSFORMS),
        "retrieval_required": needs_retrieval,
        "retrieval_scope": retrieval_scope,
        "raw_spans_stored": False,
        "policy_note": "Protected zones permit structural transforms only; no semantic/paraphrase rewrites.",
        "strategy": {
            "name": strategy_name,
            "structural_only": True,
        },
    }
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def build_transform_policy(protected_policy: dict[str, object]) -> dict[str, object]:
    """Condense a protected-zone policy into a transform-eligibility summary.

    Only flags and the fixed allow/deny lists are emitted; no raw protected
    content is embedded.
    """
    if protected_policy.get("detected"):
        mode = "protected"
    else:
        mode = "structural_default"
    retrieval_needed = bool(protected_policy.get("retrieval_required"))
    return {
        "mode": mode,
        "semantic_transforms_allowed": False,
        "semantic_compress": False,
        "allowed": list(PROTECTED_ALLOWED_TRANSFORMS),
        "denied": list(PROTECTED_DENIED_TRANSFORMS),
        "exact_retrieval_required": retrieval_needed,
        "raw_spans_stored": False,
    }
|
|
302
|
+
|
|
303
|
+
|
|
176
304
|
def _looks_like_json(stripped: str) -> bool:
|
|
177
305
|
if stripped[0] not in "{[":
|
|
178
306
|
return False
|
|
@@ -353,6 +481,7 @@ def build_metadata(
|
|
|
353
481
|
input_truncated: bool,
|
|
354
482
|
input_bytes: int,
|
|
355
483
|
max_bytes: int,
|
|
484
|
+
protected_policy_enabled: bool = False,
|
|
356
485
|
) -> dict[str, object]:
|
|
357
486
|
"""Assemble the compress receipt: observed byte/line counts plus an estimated token proxy.
|
|
358
487
|
|
|
@@ -370,7 +499,7 @@ def build_metadata(
|
|
|
370
499
|
if lossy
|
|
371
500
|
else "Data-preserving: compact form is semantically equivalent to the sanitized input."
|
|
372
501
|
)
|
|
373
|
-
|
|
502
|
+
metadata: dict[str, object] = {
|
|
374
503
|
"tool": "context-guard-kit.context_compress",
|
|
375
504
|
"metadata_version": 1,
|
|
376
505
|
"content_type": content_type,
|
|
@@ -407,6 +536,21 @@ def build_metadata(
|
|
|
407
536
|
},
|
|
408
537
|
"retrieval_hint": retrieval_hint,
|
|
409
538
|
}
|
|
539
|
+
if protected_policy_enabled:
|
|
540
|
+
protected_policy = build_protected_policy(
|
|
541
|
+
text=original_text,
|
|
542
|
+
content_type=content_type,
|
|
543
|
+
strategy_detail=strategy_detail,
|
|
544
|
+
lossy=lossy,
|
|
545
|
+
)
|
|
546
|
+
metadata["protected_zone_policy"] = protected_policy
|
|
547
|
+
metadata["transform_policy"] = build_transform_policy(protected_policy)
|
|
548
|
+
if protected_policy.get("retrieval_required"):
|
|
549
|
+
metadata["retrieval_hint"] = (
|
|
550
|
+
"Protected lossy structural transform: store the full sanitized text with "
|
|
551
|
+
"`context-guard-artifact store` and retrieve exact slices before relying on omitted content."
|
|
552
|
+
)
|
|
553
|
+
return metadata
|
|
410
554
|
|
|
411
555
|
|
|
412
556
|
def compress_text(
|
|
@@ -417,6 +561,7 @@ def compress_text(
|
|
|
417
561
|
input_truncated: bool,
|
|
418
562
|
input_bytes: int,
|
|
419
563
|
max_bytes: int,
|
|
564
|
+
protected_policy_enabled: bool = False,
|
|
420
565
|
) -> tuple[str, dict[str, object]]:
|
|
421
566
|
"""Sanitize first, then classify and compress, then build the receipt.
|
|
422
567
|
|
|
@@ -446,6 +591,7 @@ def compress_text(
|
|
|
446
591
|
input_truncated=input_truncated,
|
|
447
592
|
input_bytes=input_bytes,
|
|
448
593
|
max_bytes=max_bytes,
|
|
594
|
+
protected_policy_enabled=protected_policy_enabled,
|
|
449
595
|
)
|
|
450
596
|
return compressed, metadata
|
|
451
597
|
|
|
@@ -489,6 +635,7 @@ def run_compress(args: argparse.Namespace) -> int:
|
|
|
489
635
|
input_truncated=input_truncated,
|
|
490
636
|
input_bytes=input_bytes,
|
|
491
637
|
max_bytes=max_bytes,
|
|
638
|
+
protected_policy_enabled=bool(args.protected_policy),
|
|
492
639
|
)
|
|
493
640
|
if args.json:
|
|
494
641
|
payload = {"metadata": metadata, "content": compressed}
|
|
@@ -513,6 +660,11 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
513
660
|
help="force a content type instead of auto-detecting (json/diff/log/search/code/prose)",
|
|
514
661
|
)
|
|
515
662
|
parser.add_argument("--json", action="store_true", help="emit JSON with metadata and compressed content")
|
|
663
|
+
parser.add_argument(
|
|
664
|
+
"--protected-policy",
|
|
665
|
+
action="store_true",
|
|
666
|
+
help="add opt-in protected-zone transform policy metadata to --json/--metadata-only receipts; default content is unchanged",
|
|
667
|
+
)
|
|
516
668
|
parser.add_argument(
|
|
517
669
|
"--metadata-only",
|
|
518
670
|
action="store_true",
|