npm - @ictechgy/context-guard - Versions diffs - 0.4.1 → 0.4.4 - Mend

@ictechgy/context-guard 0.4.1 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

package/CHANGELOG.md +15 -0
package/README.ko.md +62 -33
package/README.md +91 -23
package/context-guard-kit/README.md +39 -26
package/context-guard-kit/benchmark_runner.py +273 -8
package/context-guard-kit/claude_transcript_cost_audit.py +597 -12
package/context-guard-kit/context_compress.py +153 -1
package/context-guard-kit/context_filter.py +446 -0
package/context-guard-kit/context_guard_cli.py +3 -0
package/context-guard-kit/context_guard_diet.py +677 -2
package/context-guard-kit/context_pack.py +1694 -2
package/context-guard-kit/cost_guard.py +1870 -0
package/context-guard-kit/setup_wizard.py +820 -29
package/context-guard-kit/trim_command_output.py +396 -45
package/docs/benchmark-fixtures/learned-compression.tasks.example.json +24 -0
package/docs/benchmark-fixtures/learned-compression.variants.example.json +10 -0
package/docs/benchmark-fixtures/visual-ocr.tasks.example.json +24 -0
package/docs/benchmark-fixtures/visual-ocr.variants.example.json +10 -0
package/docs/benchmark-workflow-examples.md +40 -0
package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +169 -0
package/docs/benchmark-workflows/measured-token-workflow.example.json +170 -0
package/docs/benchmark-workflows/provider-cache-telemetry.example.json +170 -0
package/docs/cache-diagnostics-schema.md +96 -0
package/docs/cache-diagnostics.example.json +116 -0
package/docs/cache-diagnostics.schema.json +460 -0
package/docs/distribution.md +4 -2
package/docs/experimental-benchmark-fixtures.md +36 -0
package/package.json +11 -2
package/packaging/homebrew/context-guard.rb.template +3 -2
package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
package/plugins/context-guard/README.ko.md +22 -14
package/plugins/context-guard/README.md +24 -10
package/plugins/context-guard/bin/context-guard +3 -0
package/plugins/context-guard/bin/context-guard-audit +597 -12
package/plugins/context-guard/bin/context-guard-bench +273 -8
package/plugins/context-guard/bin/context-guard-compress +153 -1
package/plugins/context-guard/bin/context-guard-cost +1870 -0
package/plugins/context-guard/bin/context-guard-diet +677 -2
package/plugins/context-guard/bin/context-guard-filter +446 -0
package/plugins/context-guard/bin/context-guard-pack +1694 -2
package/plugins/context-guard/bin/context-guard-setup +820 -29
package/plugins/context-guard/bin/context-guard-trim-output +396 -45
package/plugins/context-guard/brief/README.md +10 -3
package/plugins/context-guard/skills/optimize/SKILL.md +5 -2
package/plugins/context-guard/skills/setup/SKILL.md +3 -1

package/context-guard-kit/benchmark_runner.py CHANGED Viewed

@@ -108,6 +108,7 @@ CSV_COLUMNS = [
 MAX_CSV_NOTE_CHARS = 500
 MAX_CSV_ROWS = 100_000
 CSV_FORMULA_PREFIXES = ("=", "+", "-", "@")
+PLACEHOLDER_SUCCESS_COMMAND_MARKER = "fixture-only placeholder: replace success_command before real benchmark runs"
 PROTECTED_VARIANT_FLAGS = frozenset({
     "--",
     "-p",
@@ -180,6 +181,8 @@ MAX_USAGE_COST_USD = 10**9
 # 추정치이며, report에서 evidence="inferred"로 분명히 라벨링한다. 영어 텍스트 기준
 # ~4 bytes/token의 통용 근사값을 사용한다.
 TOKEN_PROXY_BYTES_PER_TOKEN = 4
+BENCH_RUN_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.run-evidence.v1"
+MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.matched-pair.v1"
 CLAUDE_OUTPUT_MAX_BYTES = 1_000_000
 SUCCESS_COMMAND_OUTPUT_MAX_BYTES = 64_000
 VERSION_OUTPUT_MAX_BYTES = 16_000
@@ -395,6 +398,10 @@ class BoundedProcessResult:
     output_truncated: bool = False
+def is_placeholder_success_command(command: str | None) -> bool:
+    return bool(command and PLACEHOLDER_SUCCESS_COMMAND_MARKER in command)
 def parse_positive_int(value: Any, *, field: str, owner: str) -> int:
     """Parse a JSON fixture field that must be a positive integer."""
     if isinstance(value, bool):
@@ -940,6 +947,14 @@ def run_fixture(task: TaskFixture, variant: Variant, claude_bin: str,
             success=True, notes=f"dry-run: {shlex.join(argv)}",
             wall_time_seconds=0.0,
         )
+    if is_placeholder_success_command(task.success_command):
+        return RunResult(
+            task_id=task.id, variant=variant.name, model=task.model, effort=task.effort,
+            tokens={k: 0 for k, _ in USAGE_KEY_GROUPS}, cost_usd=0.0,
+            success=False,
+            notes=f"{PLACEHOLDER_SUCCESS_COMMAND_MARKER}; refusing to invoke provider",
+            wall_time_seconds=elapsed_seconds_since(started_at),
+        )
     argv[0] = executable_argv0(argv[0])
     try:
         proc = run_bounded_command(
@@ -1116,11 +1131,14 @@ def write_text_no_follow(path: Path, text: str) -> None:
 def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) -> None:
     shifted_cost_known = cost_shift_measured(result)
+    byte_metrics_observed = bool(result.bytes_before or result.bytes_after)
     payload = {
+        "schema_version": BENCH_RUN_EVIDENCE_SCHEMA_VERSION,
         "date": _dt.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
         "claude_version": claude_ver,
         "task_id": result.task_id,
         "variant": result.variant,
+        "transform_id": result.variant,
         "success": result.success,
         "primary_cost_measured": result.cost_measured,
         "primary_cost_usd": round(result.cost_usd, 6),
@@ -1142,6 +1160,22 @@ def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) ->
         "hook_triggers": result.hook_triggers,
         "turns": result.turns,
         "notes": sanitize_csv_note(result.notes),
+        "measurement_availability": {
+            "primary_tokens": result.primary_tokens_measured,
+            "primary_cost": result.cost_measured,
+            "external_tokens": result.external_tokens_measured,
+            "external_cost": result.external_cost_measured,
+            "shifted_cost": shifted_cost_known,
+            "provider_cache": result.provider_cached_tokens_measured,
+            "byte_metrics": byte_metrics_observed,
+            "wall_time": result.wall_time_seconds >= 0,
+        },
+        "proxy_metrics": {
+            "byte_metrics_observed": byte_metrics_observed,
+            "token_proxy": "chars_div_4",
+            "bytes_per_token": TOKEN_PROXY_BYTES_PER_TOKEN,
+            "claim_boundary": "proxy_only_not_hosted_token_savings",
+        },
     }
     with csv_file_lock(path, create_parent=True):
         fd = _open_regular_no_symlink(path, os.O_CREAT | os.O_APPEND | os.O_WRONLY, 0o600, create_parent=True)
@@ -1283,7 +1317,9 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
     seen_tasks_by_variant: dict[str, set[str]] = {}
     successful_tasks_by_variant: dict[str, set[str]] = {}
-    for row in rows:
+    for row_index, raw_row in enumerate(rows, start=1):
+        row = dict(raw_row)
+        row["_row_index"] = str(row_index)
         variant = row.get("variant") or "unknown"
         task_id = row.get("task_id") or "unknown"
         seen_tasks_by_variant.setdefault(variant, set()).add(task_id)
@@ -1566,7 +1602,215 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
             len(baseline_values),
         )
+    def row_indices_for(rows_for_task: list[dict[str, str]]) -> list[int]:
+        out: list[int] = []
+        for row in rows_for_task:
+            index = row_optional_nonnegative_int(row, "_row_index")
+            if index is not None:
+                out.append(index)
+        return out
+    def all_rows_bool(rows_for_task: list[dict[str, str]], key: str) -> bool:
+        return bool(rows_for_task) and all(row_bool(row, key) for row in rows_for_task)
+    def all_rows_optional_int(rows_for_task: list[dict[str, str]], key: str) -> list[int] | None:
+        values = [row_optional_nonnegative_int(row, key) for row in rows_for_task]
+        if not values or any(value is None for value in values):
+            return None
+        return [value for value in values if value is not None]
+    def all_rows_optional_float(rows_for_task: list[dict[str, str]], key: str) -> list[float] | None:
+        values = [row_optional_float(row, key) for row in rows_for_task]
+        if not values or any(value is None for value in values):
+            return None
+        return [value for value in values if value is not None]
+    def average_optional_int(rows_for_task: list[dict[str, str]], key: str) -> float | None:
+        values = all_rows_optional_int(rows_for_task, key)
+        return (sum(values) / len(values)) if values else None
+    def average_optional_float(rows_for_task: list[dict[str, str]], key: str) -> float | None:
+        values = all_rows_optional_float(rows_for_task, key)
+        return (sum(values) / len(values)) if values else None
+    def total_optional_int(rows_for_task: list[dict[str, str]], key: str) -> int | None:
+        values = all_rows_optional_int(rows_for_task, key)
+        return sum(values) if values is not None else None
+    def all_rows_shifted_cost_measured(rows_for_task: list[dict[str, str]]) -> bool:
+        return bool(rows_for_task) and all(
+            row_cost_shift_measured(row) and row_optional_float(row, "total_cost_with_shift_usd") is not None
+            for row in rows_for_task
+        )
+    def matched_side_evidence(variant: str, task_id: str, rows_for_task: list[dict[str, str]]) -> dict[str, Any]:
+        primary_tokens_measured = all_rows_bool(rows_for_task, "primary_tokens_measured")
+        primary_cost_measured = all_rows_bool(rows_for_task, "cost_measured")
+        shifted_cost_measured = all_rows_shifted_cost_measured(rows_for_task)
+        provider_cache_measured = all_rows_bool(rows_for_task, "provider_cached_tokens_measured")
+        external_tokens_measured = all_rows_bool(rows_for_task, "external_tokens_measured")
+        external_cost_measured = all_rows_bool(rows_for_task, "external_cost_measured")
+        corrections_values = all_rows_optional_int(rows_for_task, "corrections")
+        bytes_before_values = [row_optional_nonnegative_int(row, "bytes_before") for row in rows_for_task]
+        bytes_after_values = [row_optional_nonnegative_int(row, "bytes_after") for row in rows_for_task]
+        byte_metrics_observed = bool(rows_for_task) and not any(
+            value is None for value in [*bytes_before_values, *bytes_after_values]
+        )
+        bytes_before_total = sum(value for value in bytes_before_values if value is not None)
+        bytes_after_total = sum(value for value in bytes_after_values if value is not None)
+        byte_delta = bytes_after_total - bytes_before_total if byte_metrics_observed else None
+        token_proxy_delta = (
+            int(byte_delta / TOKEN_PROXY_BYTES_PER_TOKEN) if byte_delta is not None else None
+        )
+        return {
+            "variant": variant,
+            "task_id": task_id,
+            "run_count": len(rows_for_task),
+            "row_indices": row_indices_for(rows_for_task),
+            "primary_tokens": {
+                "measured": primary_tokens_measured,
+                "average": average_optional_int(rows_for_task, "total_tokens") if primary_tokens_measured else None,
+                "total": total_optional_int(rows_for_task, "total_tokens") if primary_tokens_measured else None,
+            },
+            "primary_cost_usd": {
+                "measured": primary_cost_measured,
+                "average": average_optional_float(rows_for_task, "cost_usd") if primary_cost_measured else None,
+            },
+            "total_cost_with_shift_usd": {
+                "measured": shifted_cost_measured,
+                "average": (
+                    average_optional_float(rows_for_task, "total_cost_with_shift_usd")
+                    if shifted_cost_measured else None
+                ),
+            },
+            "external_tokens": {
+                "measured": external_tokens_measured,
+                "total": total_optional_int(rows_for_task, "external_tokens") if external_tokens_measured else None,
+            },
+            "external_cost_usd": {
+                "measured": external_cost_measured,
+                "total": (
+                    sum(row_float(row, "external_cost_usd") for row in rows_for_task)
+                    if external_cost_measured else None
+                ),
+            },
+            "bytes": {
+                "measurement": "observed" if byte_metrics_observed else "unavailable",
+                "before_total": bytes_before_total if byte_metrics_observed else None,
+                "after_total": bytes_after_total if byte_metrics_observed else None,
+                "delta_total": byte_delta,
+                "token_proxy_delta": token_proxy_delta,
+                "token_proxy": "chars_div_4_proxy_only" if byte_metrics_observed else "unavailable",
+            },
+            "wall_time_seconds": {
+                "measured": all_rows_optional_float(rows_for_task, "wall_time_seconds") is not None,
+                "average": average_optional_float(rows_for_task, "wall_time_seconds"),
+            },
+            "provider_cached_tokens": {
+                "measured": provider_cache_measured,
+                "average": (
+                    average_optional_int(rows_for_task, "provider_cached_tokens")
+                    if provider_cache_measured else None
+                ),
+            },
+            "corrections": {
+                "measured": corrections_values is not None,
+                "average": (sum(corrections_values) / len(corrections_values)) if corrections_values else None,
+            },
+        }
+    def matched_pair_evidence_entry(
+        variant: str,
+        task_id: str,
+        quality_gate: str,
+    ) -> dict[str, Any]:
+        baseline_rows = successful_rows_by_variant_task[baseline_variant][task_id]
+        variant_rows = successful_rows_by_variant_task[variant][task_id]
+        baseline_evidence = matched_side_evidence(baseline_variant, task_id, baseline_rows)
+        variant_evidence = matched_side_evidence(variant, task_id, variant_rows)
+        baseline_token_avg = baseline_evidence["primary_tokens"]["average"]
+        variant_token_avg = variant_evidence["primary_tokens"]["average"]
+        token_claim_allowed = (
+            quality_gate == "pass"
+            and bool(baseline_evidence["primary_tokens"]["measured"])
+            and bool(variant_evidence["primary_tokens"]["measured"])
+            and isinstance(baseline_token_avg, (int, float))
+            and baseline_token_avg > 0
+            and isinstance(variant_token_avg, (int, float))
+        )
+        baseline_cost_avg = baseline_evidence["total_cost_with_shift_usd"]["average"]
+        variant_cost_avg = variant_evidence["total_cost_with_shift_usd"]["average"]
+        shifted_cost_claim_allowed = (
+            quality_gate == "pass"
+            and bool(baseline_evidence["total_cost_with_shift_usd"]["measured"])
+            and bool(variant_evidence["total_cost_with_shift_usd"]["measured"])
+            and isinstance(baseline_cost_avg, (int, float))
+            and baseline_cost_avg > 0
+            and isinstance(variant_cost_avg, (int, float))
+        )
+        token_delta = (
+            variant_token_avg - baseline_token_avg
+            if token_claim_allowed
+            else None
+        )
+        token_savings_pct = (
+            (baseline_token_avg - variant_token_avg) / baseline_token_avg * 100.0
+            if token_delta is not None
+            else None
+        )
+        cost_delta = (
+            variant_cost_avg - baseline_cost_avg
+            if shifted_cost_claim_allowed
+            else None
+        )
+        cost_savings_pct = (
+            (baseline_cost_avg - variant_cost_avg) / baseline_cost_avg * 100.0
+            if cost_delta is not None
+            else None
+        )
+        base_after = baseline_evidence["bytes"]["after_total"]
+        variant_after = variant_evidence["bytes"]["after_total"]
+        byte_after_delta = (
+            variant_after - base_after
+            if isinstance(base_after, int) and isinstance(variant_after, int)
+            else None
+        )
+        return {
+            "schema_version": MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION,
+            "task_id": task_id,
+            "baseline_variant": baseline_variant,
+            "variant": variant,
+            "transform_id": variant,
+            "quality_gate": quality_gate,
+            "evidence_kind": "matched_successful_task_bucket",
+            "measurements": {
+                "baseline": baseline_evidence,
+                "variant": variant_evidence,
+            },
+            "delta": {
+                "primary_tokens_average": token_delta,
+                "token_savings_pct": token_savings_pct,
+                "total_cost_with_shift_usd_average": cost_delta,
+                "cost_savings_pct_with_shift": cost_savings_pct,
+                "bytes_after_total": byte_after_delta,
+                "token_proxy_after_total": (
+                    int(byte_after_delta / TOKEN_PROXY_BYTES_PER_TOKEN)
+                    if byte_after_delta is not None else None
+                ),
+                "proxy_measurement": "chars_div_4_proxy_only",
+            },
+            "claim_boundary": {
+                "quality_gate": quality_gate,
+                "token_savings_claim_allowed": token_claim_allowed,
+                "shifted_cost_claim_allowed": shifted_cost_claim_allowed,
+                "byte_proxy_only": True,
+                "requires_matched_successful_tasks": True,
+                "raw_estimate_only_claim_allowed": False,
+            },
+        }
     comparisons: list[dict[str, Any]] = []
+    matched_pair_evidence: list[dict[str, Any]] = []
     baseline = by_variant.get(baseline_variant)
     baseline_successful_tasks = successful_tasks_by_variant.get(baseline_variant, set())
     baseline_failure_rate = baseline.get("failure_rate") if baseline else None
@@ -1680,6 +1924,8 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
         else:
             comparison["cost_savings_pct_with_shift"] = None
             comparison["paired_cost_task_count"] = cost_task_count
+        for task_id in sorted(matched_tasks):
+            matched_pair_evidence.append(matched_pair_evidence_entry(variant, task_id, quality_gate))
         comparisons.append(comparison)
     claim_status = "insufficient_baseline"
@@ -1712,6 +1958,7 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
         "row_count": len(rows),
         "summary_by_variant": by_variant,
         "comparisons": comparisons,
+        "matched_pair_evidence": matched_pair_evidence,
         "claim_status": claim_status,
         "caveat": (
             "Proxy byte reductions are reported separately from matched-task token/cost metrics; "
@@ -1843,12 +2090,6 @@ def main() -> int:
     require_no_follow_file_ops_supported()
     validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json)
-    if not args.dry_run and shutil.which(args.claude_bin) is None:
-        # claude_bin 이 절대경로면 shutil.which 가 None 일 수 있으므로 추가 검사.
-        if not Path(args.claude_bin).exists():
-            print(f"claude binary not found: {args.claude_bin}", file=sys.stderr)
-            return 2
     tasks = parse_tasks(args.tasks)
     variants = parse_variants(args.variants)
     targets = filter_targets(tasks, variants, args.task_id, args.variant)
@@ -1857,8 +2098,32 @@ def main() -> int:
         return 1
     skip_keys = existing_keys(args.csv) if args.resume else set()
+    runnable_targets = [
+        (task, variant)
+        for task, variant in targets
+        if (task.id, variant.name) not in skip_keys
+    ]
+    placeholder_targets = [
+        f"{task.id}/{variant.name}"
+        for task, variant in runnable_targets
+        if is_placeholder_success_command(task.success_command)
+    ]
+    if placeholder_targets and not args.dry_run:
+        print(
+            f"{PLACEHOLDER_SUCCESS_COMMAND_MARKER}; refusing non-dry-run provider invocation for: "
+            f"{', '.join(placeholder_targets)}",
+            file=sys.stderr,
+        )
+        return 2
+    if runnable_targets and not args.dry_run and shutil.which(args.claude_bin) is None:
+        # claude_bin 이 절대경로면 shutil.which 가 None 일 수 있으므로 추가 검사.
+        if not Path(args.claude_bin).exists():
+            print(f"claude binary not found: {args.claude_bin}", file=sys.stderr)
+            return 2
     project_root = args.project_root.resolve()
-    claude_ver = "dry-run" if args.dry_run else claude_version(args.claude_bin)
+    claude_ver = "dry-run" if args.dry_run else (claude_version(args.claude_bin) if runnable_targets else "skipped")
     completed = 0
     for task, variant in targets: