npm - @ictechgy/context-guard - Versions diffs - 0.4.4 → 0.4.5 - Mend

@ictechgy/context-guard 0.4.4 → 0.4.5

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.

Files changed (27) hide show

package/plugins/context-guard/bin/context-guard-bench CHANGED Viewed

@@ -27,6 +27,7 @@ Task fixture (`tasks.json`): 각 task 는 다음 필드를 가진다.
     "max_turns": 3,
     "max_budget_usd": 1.0,
     "allowed_tools": ["Read", "Edit", "Bash(npm test*)"],
+    "variant_prompt_files": {"context_hygiene": "t01.context_hygiene.prompt.md"},
     "success_command": "npm test -- auth/session",
     "success_cwd": "."
   }
@@ -183,6 +184,13 @@ MAX_USAGE_COST_USD = 10**9
 TOKEN_PROXY_BYTES_PER_TOKEN = 4
 BENCH_RUN_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.run-evidence.v1"
 MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.matched-pair.v1"
+SELF_HOSTED_METRICS_SCHEMA_VERSION = "contextguard.bench.self-hosted-metrics.v1"
+SELF_HOSTED_METRICS_KEY = "self_hosted_metrics"
+SELF_HOSTED_METRICS_CLAIM_BOUNDARY = "self_hosted_metrics_only_not_hosted_api_token_or_cost_savings"
+MAX_SELF_HOSTED_LABEL_CHARS = 120
+MAX_SELF_HOSTED_LATENCY_MS = 7 * 24 * 60 * 60 * 1000
+MAX_SELF_HOSTED_MEMORY_MB = 10_000_000
+MAX_VARIANT_PROMPT_FILE_BYTES = 128_000
 CLAUDE_OUTPUT_MAX_BYTES = 1_000_000
 SUCCESS_COMMAND_OUTPUT_MAX_BYTES = 64_000
 VERSION_OUTPUT_MAX_BYTES = 16_000
@@ -354,6 +362,8 @@ class TaskFixture:
     allowed_tools: list[str] = field(default_factory=list)
     success_command: str | None = None
     success_cwd: str = "."
+    variant_prompt_files: dict[str, str] = field(default_factory=dict)
+    variant_prompt_texts: dict[str, str] = field(default_factory=dict)
 @dataclass
@@ -387,6 +397,7 @@ class RunResult:
     provider_cached_tokens: int = 0
     provider_cached_tokens_measured: bool = False
     primary_tokens_measured: bool = False
+    self_hosted_metrics: dict[str, Any] | None = None
 @dataclass
@@ -433,6 +444,22 @@ def parse_string_list(value: Any, *, field: str, owner: str) -> list[str]:
     return items
+def parse_string_map(value: Any, *, field: str, owner: str) -> dict[str, str]:
+    """Parse a JSON fixture field that must be an object of non-empty string values."""
+    if value is None:
+        return {}
+    if not isinstance(value, dict):
+        raise SystemExit(f"{owner} {field} must be a JSON object of strings")
+    items: dict[str, str] = {}
+    for raw_key, raw_value in value.items():
+        if not isinstance(raw_key, str) or not raw_key.strip():
+            raise SystemExit(f"{owner} {field} keys must be non-empty strings")
+        if not isinstance(raw_value, str) or not raw_value.strip():
+            raise SystemExit(f"{owner} {field}.{raw_key} must be a non-empty string")
+        items[raw_key] = raw_value
+    return items
 def validate_variant_extra_args(extra_args: list[str], *, owner: str) -> list[str]:
     for index, arg in enumerate(extra_args):
         flag = arg.split("=", 1)[0]
@@ -443,6 +470,101 @@ def validate_variant_extra_args(extra_args: list[str], *, owner: str) -> list[st
     return extra_args
+def validate_variant_prompt_file_path(raw_path: str, *, owner: str) -> Path:
+    """Return a safe relative prompt-file path, or fail before any file read."""
+    rel_path = Path(raw_path)
+    if rel_path.is_absolute():
+        raise SystemExit(f"{owner} variant_prompt_files path must be relative: {raw_path}")
+    if not rel_path.parts or rel_path == Path("."):
+        raise SystemExit(f"{owner} variant_prompt_files path must name a file")
+    if any(part in ("", ".", "..") for part in rel_path.parts):
+        raise SystemExit(f"{owner} variant_prompt_files path must not contain '.', '..', or empty components: {raw_path}")
+    return rel_path
+def validate_variant_prompt_file_references(
+    tasks: list[TaskFixture],
+    variants: list["Variant"],
+) -> None:
+    """Validate variant prompt-file keys and paths without dereferencing files.
+    Unknown variant keys and unsafe relative paths are rejected before any file
+    read. Missing prompt files are intentionally not checked here so a run
+    narrowed by --task-id/--variant is not blocked by unselected prompt files.
+    """
+    known_variants = {variant.name for variant in variants}
+    for task in tasks:
+        unknown = sorted(set(task.variant_prompt_files) - known_variants)
+        if unknown:
+            raise SystemExit(
+                f"task {task.id} variant_prompt_files references unknown variant(s): {', '.join(unknown)}"
+            )
+        for variant_name, raw_path in task.variant_prompt_files.items():
+            validate_variant_prompt_file_path(
+                raw_path,
+                owner=f"task {task.id} variant {variant_name}",
+            )
+def read_variant_prompt_file(path: Path, *, owner: str, display_path: str | None = None) -> str:
+    """Read one selected prompt file with no-follow IO and an argv-safe size cap."""
+    label = display_path or path.name
+    try:
+        fd = _open_regular_no_symlink(path)
+    except OSError as exc:
+        detail = exc.strerror or exc.__class__.__name__
+        raise SystemExit(f"{owner} variant_prompt_files could not read prompt file: {label}: {detail}") from None
+    try:
+        size = os.fstat(fd).st_size
+        if size > MAX_VARIANT_PROMPT_FILE_BYTES:
+            raise SystemExit(
+                f"{owner} variant_prompt_files prompt file exceeds "
+                f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes: {label}"
+            )
+        try:
+            with os.fdopen(fd, "r", encoding="utf-8") as handle:
+                fd = -1
+                text = handle.read()
+        except UnicodeDecodeError as exc:
+            raise SystemExit(
+                f"{owner} variant_prompt_files prompt file must be UTF-8 text: "
+                f"{label}: {exc.reason}"
+            ) from None
+        except OSError as exc:
+            detail = exc.strerror or exc.__class__.__name__
+            raise SystemExit(f"{owner} variant_prompt_files could not read prompt file: {label}: {detail}") from None
+    finally:
+        if fd != -1:
+            os.close(fd)
+    if len(text.encode("utf-8", errors="replace")) > MAX_VARIANT_PROMPT_FILE_BYTES:
+        raise SystemExit(
+            f"{owner} variant_prompt_files prompt text exceeds "
+            f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes after decoding: {label}"
+        )
+    return text
+def load_variant_prompt_files_for_targets(
+    targets: list[tuple[TaskFixture, "Variant"]],
+    *,
+    task_file_dir: Path,
+) -> None:
+    """Load file-backed prompts only for selected (task, variant) targets."""
+    for task, variant in targets:
+        raw_path = task.variant_prompt_files.get(variant.name)
+        if raw_path is None:
+            continue
+        rel_path = validate_variant_prompt_file_path(
+            raw_path,
+            owner=f"task {task.id} variant {variant.name}",
+        )
+        task.variant_prompt_texts[variant.name] = read_variant_prompt_file(
+            task_file_dir / rel_path,
+            owner=f"task {task.id} variant {variant.name}",
+            display_path=str(rel_path),
+        )
 def normalize_usage_token(value: Any) -> int | None:
     """Return a safe non-negative token count, or None for invalid metrics."""
     if isinstance(value, bool) or not isinstance(value, (int, float)):
@@ -469,7 +591,7 @@ def normalize_usage_cost(value: Any) -> float | None:
     return numeric
-def parse_tasks(path: Path) -> list[TaskFixture]:
+def parse_tasks(path: Path, variants: list["Variant"] | None = None) -> list[TaskFixture]:
     raw = json.loads(_read_text_no_follow(path))
     if not isinstance(raw, list):
         raise SystemExit(f"tasks file must be a JSON list: {path}")
@@ -488,21 +610,33 @@ def parse_tasks(path: Path) -> list[TaskFixture]:
                 raise SystemExit(f"task {item.get('id')} max_budget_usd must be finite and > 0 (use null for unlimited)")
         else:
             budget = None
+        task_id = str(item["id"])
+        if "variant_prompts" in item:
+            raise SystemExit(
+                f"task {task_id} variant_prompts is not supported; use file-backed variant_prompt_files"
+            )
         fixtures.append(TaskFixture(
-            id=str(item["id"]),
+            id=task_id,
             prompt=str(item["prompt"]),
             model=str(item.get("model", "sonnet")),
             effort=str(effort_raw) if effort_raw is not None else None,
-            max_turns=parse_positive_int(item.get("max_turns", 3), field="max_turns", owner=f"task {item.get('id')}"),
+            max_turns=parse_positive_int(item.get("max_turns", 3), field="max_turns", owner=f"task {task_id}"),
             max_budget_usd=budget,
             allowed_tools=parse_string_list(
                 item.get("allowed_tools", []),
                 field="allowed_tools",
-                owner=f"task {item.get('id')}",
+                owner=f"task {task_id}",
             ),
             success_command=item.get("success_command"),
             success_cwd=str(item.get("success_cwd", ".")),
+            variant_prompt_files=parse_string_map(
+                item.get("variant_prompt_files"),
+                field="variant_prompt_files",
+                owner=f"task {task_id}",
+            ),
         ))
+    if variants is not None:
+        validate_variant_prompt_file_references(fixtures, variants)
     return fixtures
@@ -717,6 +851,102 @@ def collect_shift_metrics(payload: Any) -> dict[str, int | float | bool]:
     return metrics
+def normalize_self_hosted_metric(value: Any, *, maximum: float) -> float | None:
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        return None
+    number = float(value)
+    if not math.isfinite(number) or number < 0 or number > maximum:
+        return None
+    return number
+def sanitize_self_hosted_label(value: Any) -> str | None:
+    if not isinstance(value, str):
+        return None
+    text = sanitize_note_text(value)
+    if not text:
+        return None
+    if len(text) > MAX_SELF_HOSTED_LABEL_CHARS:
+        text = text[:MAX_SELF_HOSTED_LABEL_CHARS - 12].rstrip() + "…[truncated]"
+    return text
+def normalize_self_hosted_metrics(raw: Any, *, source: str) -> dict[str, Any] | None:
+    if not isinstance(raw, dict):
+        return None
+    metrics: dict[str, float] = {}
+    labels: dict[str, str] = {}
+    availability = {
+        "latency_ms": False,
+        "peak_memory_mb": False,
+        "quality_score": False,
+    }
+    latency = normalize_self_hosted_metric(raw.get("latency_ms"), maximum=MAX_SELF_HOSTED_LATENCY_MS)
+    if latency is not None:
+        metrics["latency_ms"] = latency
+        availability["latency_ms"] = True
+    peak_memory = normalize_self_hosted_metric(raw.get("peak_memory_mb"), maximum=MAX_SELF_HOSTED_MEMORY_MB)
+    if peak_memory is not None:
+        metrics["peak_memory_mb"] = peak_memory
+        availability["peak_memory_mb"] = True
+    quality = normalize_self_hosted_metric(raw.get("quality_score"), maximum=1.0)
+    if quality is not None:
+        metrics["quality_score"] = quality
+        availability["quality_score"] = True
+    for key in ("model_server", "optimization", "quality_metric"):
+        label = sanitize_self_hosted_label(raw.get(key))
+        if label is not None:
+            labels[key] = label
+    if not metrics:
+        return None
+    return {
+        "schema_version": SELF_HOSTED_METRICS_SCHEMA_VERSION,
+        "source": source,
+        "metrics": metrics,
+        "labels": labels,
+        "measurement_availability": availability,
+        "claim_boundary": {
+            "id": SELF_HOSTED_METRICS_CLAIM_BOUNDARY,
+            "hosted_api_token_savings_claim_allowed": False,
+            "hosted_api_cost_savings_claim_allowed": False,
+            "requires_provider_measured_matched_tasks_for_hosted_claims": True,
+            "reason": (
+                "Self-hosted local/model-server latency, memory, and quality metrics "
+                "are not hosted API token or cost telemetry."
+            ),
+        },
+    }
+def collect_self_hosted_metrics(payload: Any) -> dict[str, Any] | None:
+    """Collect explicit self-hosted metric sidecars without broad key inference.
+    Only explicit top-level telemetry envelopes are considered.  Do not infer
+    from incidental keys like `self_hosted_latency_ms` or arbitrary nested model
+    message content: that would make local/model-server telemetry too easy to
+    mix into hosted API claim surfaces.
+    """
+    if not isinstance(payload, dict):
+        return None
+    candidates = [
+        (
+            payload.get(SELF_HOSTED_METRICS_KEY),
+            f"explicit_provider_payload.{SELF_HOSTED_METRICS_KEY}",
+        )
+    ]
+    metrics_envelope = payload.get("metrics")
+    if isinstance(metrics_envelope, dict):
+        candidates.append((
+            metrics_envelope.get(SELF_HOSTED_METRICS_KEY),
+            f"explicit_provider_payload.metrics.{SELF_HOSTED_METRICS_KEY}",
+        ))
+    for raw, source in candidates:
+        normalized = normalize_self_hosted_metrics(raw, source=source)
+        if normalized is not None:
+            return normalized
+    return None
 def claude_version(claude_bin: str) -> str:
     try:
         proc = run_bounded_command(
@@ -747,7 +977,7 @@ def build_claude_argv(claude_bin: str, task: TaskFixture, variant: Variant) -> l
         argv.extend(["--allowedTools", ",".join(task.allowed_tools)])
     argv.extend(variant.extra_args)
     argv.append("--")
-    argv.append(task.prompt)
+    argv.append(task.variant_prompt_texts.get(variant.name, task.prompt))
     return argv
@@ -1003,6 +1233,7 @@ def run_fixture(task: TaskFixture, variant: Variant, claude_bin: str,
     tokens, cost, cost_measured, primary_tokens_measured = collect_usage(payload)
     provider_cached_tokens, provider_cached_tokens_measured = collect_provider_cache_telemetry(payload)
     shift_metrics = collect_shift_metrics(payload)
+    self_hosted_metrics = collect_self_hosted_metrics(payload)
     success, success_note = run_success_command(task, project_root)
     return RunResult(
         task_id=task.id, variant=variant.name, model=task.model, effort=task.effort,
@@ -1021,6 +1252,7 @@ def run_fixture(task: TaskFixture, variant: Variant, claude_bin: str,
         external_cost_measured=bool(shift_metrics["external_cost_measured"]),
         provider_cached_tokens=provider_cached_tokens,
         provider_cached_tokens_measured=provider_cached_tokens_measured,
+        self_hosted_metrics=self_hosted_metrics,
     )
@@ -1169,6 +1401,7 @@ def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) ->
             "provider_cache": result.provider_cached_tokens_measured,
             "byte_metrics": byte_metrics_observed,
             "wall_time": result.wall_time_seconds >= 0,
+            "self_hosted_metrics": result.self_hosted_metrics is not None,
         },
         "proxy_metrics": {
             "byte_metrics_observed": byte_metrics_observed,
@@ -1177,6 +1410,8 @@ def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) ->
             "claim_boundary": "proxy_only_not_hosted_token_savings",
         },
     }
+    if result.self_hosted_metrics is not None:
+        payload["self_hosted_metrics"] = result.self_hosted_metrics
     with csv_file_lock(path, create_parent=True):
         fd = _open_regular_no_symlink(path, os.O_CREAT | os.O_APPEND | os.O_WRONLY, 0o600, create_parent=True)
         try:
@@ -2090,8 +2325,8 @@ def main() -> int:
     require_no_follow_file_ops_supported()
     validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json)
-    tasks = parse_tasks(args.tasks)
     variants = parse_variants(args.variants)
+    tasks = parse_tasks(args.tasks, variants=variants)
     targets = filter_targets(tasks, variants, args.task_id, args.variant)
     if not targets:
         print("no (task, variant) targets matched the filters", file=sys.stderr)
@@ -2122,6 +2357,9 @@ def main() -> int:
             print(f"claude binary not found: {args.claude_bin}", file=sys.stderr)
             return 2
+    if runnable_targets:
+        load_variant_prompt_files_for_targets(runnable_targets, task_file_dir=args.tasks.parent)
     project_root = args.project_root.resolve()
     claude_ver = "dry-run" if args.dry_run else (claude_version(args.claude_bin) if runnable_targets else "skipped")