@ictechgy/context-guard 0.4.4 → 0.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/README.ko.md +15 -2
- package/README.md +12 -2
- package/context-guard-kit/README.md +2 -2
- package/context-guard-kit/benchmark_runner.py +244 -6
- package/context-guard-kit/claude_transcript_cost_audit.py +171 -1
- package/docs/benchmark-fixtures/learned-compression-baseline-context-pack.prompt.example.md +19 -0
- package/docs/benchmark-fixtures/learned-compression-candidate-digest.prompt.example.md +21 -0
- package/docs/benchmark-fixtures/learned-compression.tasks.example.json +5 -1
- package/docs/benchmark-fixtures/output-transform-baseline-raw-output.prompt.example.md +20 -0
- package/docs/benchmark-fixtures/output-transform-digest-receipt.prompt.example.md +23 -0
- package/docs/benchmark-fixtures/output-transform.tasks.example.json +28 -0
- package/docs/benchmark-fixtures/output-transform.variants.example.json +10 -0
- package/docs/benchmark-fixtures/visual-ocr-cropped-ocr.prompt.example.md +22 -0
- package/docs/benchmark-fixtures/visual-ocr-full-visual.prompt.example.md +19 -0
- package/docs/benchmark-fixtures/visual-ocr.tasks.example.json +5 -1
- package/docs/benchmark-workflow-examples.md +6 -2
- package/docs/benchmark-workflows/self-hosted-metrics-ledger.example.jsonl +1 -0
- package/docs/experimental-benchmark-fixtures.md +17 -6
- package/docs/mac-visibility-feasibility-schema.md +62 -0
- package/docs/mac-visibility-feasibility.example.json +130 -0
- package/package.json +5 -1
- package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
- package/plugins/context-guard/README.ko.md +1 -1
- package/plugins/context-guard/README.md +1 -1
- package/plugins/context-guard/bin/context-guard-audit +171 -1
- package/plugins/context-guard/bin/context-guard-bench +244 -6
|
@@ -27,6 +27,7 @@ Task fixture (`tasks.json`): 각 task 는 다음 필드를 가진다.
|
|
|
27
27
|
"max_turns": 3,
|
|
28
28
|
"max_budget_usd": 1.0,
|
|
29
29
|
"allowed_tools": ["Read", "Edit", "Bash(npm test*)"],
|
|
30
|
+
"variant_prompt_files": {"context_hygiene": "t01.context_hygiene.prompt.md"},
|
|
30
31
|
"success_command": "npm test -- auth/session",
|
|
31
32
|
"success_cwd": "."
|
|
32
33
|
}
|
|
@@ -183,6 +184,13 @@ MAX_USAGE_COST_USD = 10**9
|
|
|
183
184
|
TOKEN_PROXY_BYTES_PER_TOKEN = 4
|
|
184
185
|
BENCH_RUN_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.run-evidence.v1"
|
|
185
186
|
MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.matched-pair.v1"
|
|
187
|
+
SELF_HOSTED_METRICS_SCHEMA_VERSION = "contextguard.bench.self-hosted-metrics.v1"
|
|
188
|
+
SELF_HOSTED_METRICS_KEY = "self_hosted_metrics"
|
|
189
|
+
SELF_HOSTED_METRICS_CLAIM_BOUNDARY = "self_hosted_metrics_only_not_hosted_api_token_or_cost_savings"
|
|
190
|
+
MAX_SELF_HOSTED_LABEL_CHARS = 120
|
|
191
|
+
MAX_SELF_HOSTED_LATENCY_MS = 7 * 24 * 60 * 60 * 1000
|
|
192
|
+
MAX_SELF_HOSTED_MEMORY_MB = 10_000_000
|
|
193
|
+
MAX_VARIANT_PROMPT_FILE_BYTES = 128_000
|
|
186
194
|
CLAUDE_OUTPUT_MAX_BYTES = 1_000_000
|
|
187
195
|
SUCCESS_COMMAND_OUTPUT_MAX_BYTES = 64_000
|
|
188
196
|
VERSION_OUTPUT_MAX_BYTES = 16_000
|
|
@@ -354,6 +362,8 @@ class TaskFixture:
|
|
|
354
362
|
allowed_tools: list[str] = field(default_factory=list)
|
|
355
363
|
success_command: str | None = None
|
|
356
364
|
success_cwd: str = "."
|
|
365
|
+
variant_prompt_files: dict[str, str] = field(default_factory=dict)
|
|
366
|
+
variant_prompt_texts: dict[str, str] = field(default_factory=dict)
|
|
357
367
|
|
|
358
368
|
|
|
359
369
|
@dataclass
|
|
@@ -387,6 +397,7 @@ class RunResult:
|
|
|
387
397
|
provider_cached_tokens: int = 0
|
|
388
398
|
provider_cached_tokens_measured: bool = False
|
|
389
399
|
primary_tokens_measured: bool = False
|
|
400
|
+
self_hosted_metrics: dict[str, Any] | None = None
|
|
390
401
|
|
|
391
402
|
|
|
392
403
|
@dataclass
|
|
@@ -433,6 +444,22 @@ def parse_string_list(value: Any, *, field: str, owner: str) -> list[str]:
|
|
|
433
444
|
return items
|
|
434
445
|
|
|
435
446
|
|
|
447
|
+
def parse_string_map(value: Any, *, field: str, owner: str) -> dict[str, str]:
    """Validate an optional JSON object whose keys and values are non-blank strings.

    Args:
        value: Decoded JSON value; ``None`` is treated as "field absent".
        field: Fixture field name, used only in error messages.
        owner: Human-readable owner (for example ``"task t01"``) for messages.

    Returns:
        A new dict holding the original (unstripped) keys and values, in the
        order they appear in ``value``. Empty when ``value`` is ``None``.

    Raises:
        SystemExit: If ``value`` is not a dict, or any key or value is not a
            string or is blank after stripping whitespace.
    """
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise SystemExit(f"{owner} {field} must be a JSON object of strings")

    def _is_blank(candidate: Any) -> bool:
        # "Blank" covers both wrong type and whitespace-only strings.
        return not (isinstance(candidate, str) and candidate.strip())

    parsed: dict[str, str] = {}
    for raw_key, raw_value in value.items():
        if _is_blank(raw_key):
            raise SystemExit(f"{owner} {field} keys must be non-empty strings")
        if _is_blank(raw_value):
            raise SystemExit(f"{owner} {field}.{raw_key} must be a non-empty string")
        # Stripping is used for validation only; stored text is left untouched.
        parsed[raw_key] = raw_value
    return parsed
|
|
461
|
+
|
|
462
|
+
|
|
436
463
|
def validate_variant_extra_args(extra_args: list[str], *, owner: str) -> list[str]:
|
|
437
464
|
for index, arg in enumerate(extra_args):
|
|
438
465
|
flag = arg.split("=", 1)[0]
|
|
@@ -443,6 +470,101 @@ def validate_variant_extra_args(extra_args: list[str], *, owner: str) -> list[st
|
|
|
443
470
|
return extra_args
|
|
444
471
|
|
|
445
472
|
|
|
473
|
+
def validate_variant_prompt_file_path(raw_path: str, *, owner: str) -> Path:
    """Return a safe relative prompt-file path, or fail before any file read.

    Args:
        raw_path: Path string taken from a task's ``variant_prompt_files``.
        owner: Human-readable owner used as the prefix of error messages.

    Returns:
        ``Path(raw_path)`` when it is relative, names a file, and does not
        traverse upwards.

    Raises:
        SystemExit: If the path contains a NUL character, is absolute, is
            empty or ``"."``, or contains a ``".."`` component.
    """
    # A NUL can arrive via a JSON "\u0000" escape. Path() accepts it, but
    # OS-level open calls reject it with ValueError rather than OSError, so
    # refuse it here where the failure can be reported cleanly. The raw path
    # is left out of the message to avoid echoing control characters.
    if "\x00" in raw_path:
        raise SystemExit(f"{owner} variant_prompt_files path must not contain NUL characters")
    rel_path = Path(raw_path)
    if rel_path.is_absolute():
        raise SystemExit(f"{owner} variant_prompt_files path must be relative: {raw_path}")
    if not rel_path.parts or rel_path == Path("."):
        raise SystemExit(f"{owner} variant_prompt_files path must name a file")
    # pathlib collapses "." components and repeated slashes while parsing, so
    # in practice ".." is the component this check catches; the other two
    # entries are kept as a defensive backstop.
    if any(part in ("", ".", "..") for part in rel_path.parts):
        raise SystemExit(f"{owner} variant_prompt_files path must not contain '.', '..', or empty components: {raw_path}")
    return rel_path
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def validate_variant_prompt_file_references(
    tasks: list[TaskFixture],
    variants: list["Variant"],
) -> None:
    """Check prompt-file variant keys and path shapes without opening any file.

    Every key in a task's ``variant_prompt_files`` must match the ``name`` of
    one of ``variants``, and every path must pass
    ``validate_variant_prompt_file_path``. File existence is deliberately not
    checked here, so a run narrowed by --task-id/--variant is not blocked by
    prompt files belonging to unselected targets.

    Raises:
        SystemExit: On the first task that references an unknown variant, or
            whose path is rejected by the path validator.
    """
    declared_names = {variant.name for variant in variants}
    for task in tasks:
        prompt_files = task.variant_prompt_files
        # Sorted so the error message is stable regardless of JSON key order.
        strays = sorted(name for name in set(prompt_files) if name not in declared_names)
        if strays:
            listing = ", ".join(strays)
            raise SystemExit(
                f"task {task.id} variant_prompt_files references unknown variant(s): {listing}"
            )
        for variant_name in prompt_files:
            validate_variant_prompt_file_path(
                prompt_files[variant_name],
                owner=f"task {task.id} variant {variant_name}",
            )
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
def read_variant_prompt_file(path: Path, *, owner: str, display_path: str | None = None) -> str:
    """Read one selected prompt file with no-follow IO and an argv-safe size cap.

    Args:
        path: Location of the prompt file to open.
        owner: Human-readable owner used as the prefix of error messages.
        display_path: Label shown in error messages; defaults to ``path.name``.

    Returns:
        The file content decoded as UTF-8 text.

    Raises:
        SystemExit: If the file cannot be opened or read, is larger than
            ``MAX_VARIANT_PROMPT_FILE_BYTES`` (by ``fstat`` size or by the
            encoded size of the text actually read), or is not valid UTF-8.
    """
    label = display_path or path.name
    try:
        fd = _open_regular_no_symlink(path)
    except (OSError, ValueError) as exc:
        # ValueError is what OS-level open calls raise for a path with an
        # embedded NUL byte; report it like any other unreadable file rather
        # than letting a traceback escape. ValueError has no ``strerror``.
        detail = getattr(exc, "strerror", None) or exc.__class__.__name__
        raise SystemExit(f"{owner} variant_prompt_files could not read prompt file: {label}: {detail}") from None
    try:
        size = os.fstat(fd).st_size
        if size > MAX_VARIANT_PROMPT_FILE_BYTES:
            raise SystemExit(
                f"{owner} variant_prompt_files prompt file exceeds "
                f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes: {label}"
            )
        try:
            with os.fdopen(fd, "r", encoding="utf-8") as handle:
                # The file object now owns the descriptor; the sentinel stops
                # the ``finally`` below from closing it a second time.
                fd = -1
                # Bounded read: the file may grow after the fstat check. A
                # file within the cap has at most that many characters, so
                # cap + 1 characters returns it whole, while anything larger
                # is cut short and then rejected by the size check below.
                text = handle.read(MAX_VARIANT_PROMPT_FILE_BYTES + 1)
        except UnicodeDecodeError as exc:
            raise SystemExit(
                f"{owner} variant_prompt_files prompt file must be UTF-8 text: "
                f"{label}: {exc.reason}"
            ) from None
    except OSError as exc:
        detail = exc.strerror or exc.__class__.__name__
        raise SystemExit(f"{owner} variant_prompt_files could not read prompt file: {label}: {detail}") from None
    finally:
        if fd != -1:
            os.close(fd)
    # Re-check on the bytes actually read; this catches growth after fstat.
    if len(text.encode("utf-8", errors="replace")) > MAX_VARIANT_PROMPT_FILE_BYTES:
        raise SystemExit(
            f"{owner} variant_prompt_files prompt text exceeds "
            f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes after decoding: {label}"
        )
    return text
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
def load_variant_prompt_files_for_targets(
    targets: list[tuple[TaskFixture, "Variant"]],
    *,
    task_file_dir: Path,
) -> None:
    """Populate ``variant_prompt_texts`` for the selected (task, variant) pairs.

    For each pair whose task maps the variant's name in
    ``variant_prompt_files``, the path is validated, resolved against
    ``task_file_dir``, read, and stored in ``task.variant_prompt_texts`` under
    the variant name. Pairs without a mapping are left untouched.

    Args:
        targets: The (task, variant) pairs to load prompts for.
        task_file_dir: Directory the relative prompt paths are joined to.

    Raises:
        SystemExit: Propagated from path validation or from the file read.
    """
    for task, variant in targets:
        configured = task.variant_prompt_files.get(variant.name)
        if configured is not None:
            owner_label = f"task {task.id} variant {variant.name}"
            # Validated again here, immediately before the file is opened.
            safe_rel = validate_variant_prompt_file_path(configured, owner=owner_label)
            prompt_text = read_variant_prompt_file(
                task_file_dir / safe_rel,
                owner=owner_label,
                display_path=str(safe_rel),
            )
            task.variant_prompt_texts[variant.name] = prompt_text
|
|
566
|
+
|
|
567
|
+
|
|
446
568
|
def normalize_usage_token(value: Any) -> int | None:
|
|
447
569
|
"""Return a safe non-negative token count, or None for invalid metrics."""
|
|
448
570
|
if isinstance(value, bool) or not isinstance(value, (int, float)):
|
|
@@ -469,7 +591,7 @@ def normalize_usage_cost(value: Any) -> float | None:
|
|
|
469
591
|
return numeric
|
|
470
592
|
|
|
471
593
|
|
|
472
|
-
def parse_tasks(path: Path) -> list[TaskFixture]:
|
|
594
|
+
def parse_tasks(path: Path, variants: list["Variant"] | None = None) -> list[TaskFixture]:
|
|
473
595
|
raw = json.loads(_read_text_no_follow(path))
|
|
474
596
|
if not isinstance(raw, list):
|
|
475
597
|
raise SystemExit(f"tasks file must be a JSON list: {path}")
|
|
@@ -488,21 +610,33 @@ def parse_tasks(path: Path) -> list[TaskFixture]:
|
|
|
488
610
|
raise SystemExit(f"task {item.get('id')} max_budget_usd must be finite and > 0 (use null for unlimited)")
|
|
489
611
|
else:
|
|
490
612
|
budget = None
|
|
613
|
+
task_id = str(item["id"])
|
|
614
|
+
if "variant_prompts" in item:
|
|
615
|
+
raise SystemExit(
|
|
616
|
+
f"task {task_id} variant_prompts is not supported; use file-backed variant_prompt_files"
|
|
617
|
+
)
|
|
491
618
|
fixtures.append(TaskFixture(
|
|
492
|
-
id=
|
|
619
|
+
id=task_id,
|
|
493
620
|
prompt=str(item["prompt"]),
|
|
494
621
|
model=str(item.get("model", "sonnet")),
|
|
495
622
|
effort=str(effort_raw) if effort_raw is not None else None,
|
|
496
|
-
max_turns=parse_positive_int(item.get("max_turns", 3), field="max_turns", owner=f"task {
|
|
623
|
+
max_turns=parse_positive_int(item.get("max_turns", 3), field="max_turns", owner=f"task {task_id}"),
|
|
497
624
|
max_budget_usd=budget,
|
|
498
625
|
allowed_tools=parse_string_list(
|
|
499
626
|
item.get("allowed_tools", []),
|
|
500
627
|
field="allowed_tools",
|
|
501
|
-
owner=f"task {
|
|
628
|
+
owner=f"task {task_id}",
|
|
502
629
|
),
|
|
503
630
|
success_command=item.get("success_command"),
|
|
504
631
|
success_cwd=str(item.get("success_cwd", ".")),
|
|
632
|
+
variant_prompt_files=parse_string_map(
|
|
633
|
+
item.get("variant_prompt_files"),
|
|
634
|
+
field="variant_prompt_files",
|
|
635
|
+
owner=f"task {task_id}",
|
|
636
|
+
),
|
|
505
637
|
))
|
|
638
|
+
if variants is not None:
|
|
639
|
+
validate_variant_prompt_file_references(fixtures, variants)
|
|
506
640
|
return fixtures
|
|
507
641
|
|
|
508
642
|
|
|
@@ -717,6 +851,102 @@ def collect_shift_metrics(payload: Any) -> dict[str, int | float | bool]:
|
|
|
717
851
|
return metrics
|
|
718
852
|
|
|
719
853
|
|
|
854
|
+
def normalize_self_hosted_metric(value: Any, *, maximum: float) -> float | None:
|
|
855
|
+
if isinstance(value, bool) or not isinstance(value, (int, float)):
|
|
856
|
+
return None
|
|
857
|
+
number = float(value)
|
|
858
|
+
if not math.isfinite(number) or number < 0 or number > maximum:
|
|
859
|
+
return None
|
|
860
|
+
return number
|
|
861
|
+
|
|
862
|
+
|
|
863
|
+
def sanitize_self_hosted_label(value: Any) -> str | None:
|
|
864
|
+
if not isinstance(value, str):
|
|
865
|
+
return None
|
|
866
|
+
text = sanitize_note_text(value)
|
|
867
|
+
if not text:
|
|
868
|
+
return None
|
|
869
|
+
if len(text) > MAX_SELF_HOSTED_LABEL_CHARS:
|
|
870
|
+
text = text[:MAX_SELF_HOSTED_LABEL_CHARS - 12].rstrip() + "…[truncated]"
|
|
871
|
+
return text
|
|
872
|
+
|
|
873
|
+
|
|
874
|
+
def normalize_self_hosted_metrics(raw: Any, *, source: str) -> dict[str, Any] | None:
|
|
875
|
+
if not isinstance(raw, dict):
|
|
876
|
+
return None
|
|
877
|
+
metrics: dict[str, float] = {}
|
|
878
|
+
labels: dict[str, str] = {}
|
|
879
|
+
availability = {
|
|
880
|
+
"latency_ms": False,
|
|
881
|
+
"peak_memory_mb": False,
|
|
882
|
+
"quality_score": False,
|
|
883
|
+
}
|
|
884
|
+
latency = normalize_self_hosted_metric(raw.get("latency_ms"), maximum=MAX_SELF_HOSTED_LATENCY_MS)
|
|
885
|
+
if latency is not None:
|
|
886
|
+
metrics["latency_ms"] = latency
|
|
887
|
+
availability["latency_ms"] = True
|
|
888
|
+
peak_memory = normalize_self_hosted_metric(raw.get("peak_memory_mb"), maximum=MAX_SELF_HOSTED_MEMORY_MB)
|
|
889
|
+
if peak_memory is not None:
|
|
890
|
+
metrics["peak_memory_mb"] = peak_memory
|
|
891
|
+
availability["peak_memory_mb"] = True
|
|
892
|
+
quality = normalize_self_hosted_metric(raw.get("quality_score"), maximum=1.0)
|
|
893
|
+
if quality is not None:
|
|
894
|
+
metrics["quality_score"] = quality
|
|
895
|
+
availability["quality_score"] = True
|
|
896
|
+
for key in ("model_server", "optimization", "quality_metric"):
|
|
897
|
+
label = sanitize_self_hosted_label(raw.get(key))
|
|
898
|
+
if label is not None:
|
|
899
|
+
labels[key] = label
|
|
900
|
+
if not metrics:
|
|
901
|
+
return None
|
|
902
|
+
return {
|
|
903
|
+
"schema_version": SELF_HOSTED_METRICS_SCHEMA_VERSION,
|
|
904
|
+
"source": source,
|
|
905
|
+
"metrics": metrics,
|
|
906
|
+
"labels": labels,
|
|
907
|
+
"measurement_availability": availability,
|
|
908
|
+
"claim_boundary": {
|
|
909
|
+
"id": SELF_HOSTED_METRICS_CLAIM_BOUNDARY,
|
|
910
|
+
"hosted_api_token_savings_claim_allowed": False,
|
|
911
|
+
"hosted_api_cost_savings_claim_allowed": False,
|
|
912
|
+
"requires_provider_measured_matched_tasks_for_hosted_claims": True,
|
|
913
|
+
"reason": (
|
|
914
|
+
"Self-hosted local/model-server latency, memory, and quality metrics "
|
|
915
|
+
"are not hosted API token or cost telemetry."
|
|
916
|
+
),
|
|
917
|
+
},
|
|
918
|
+
}
|
|
919
|
+
|
|
920
|
+
|
|
921
|
+
def collect_self_hosted_metrics(payload: Any) -> dict[str, Any] | None:
|
|
922
|
+
"""Collect explicit self-hosted metric sidecars without broad key inference.
|
|
923
|
+
|
|
924
|
+
Only explicit top-level telemetry envelopes are considered. Do not infer
|
|
925
|
+
from incidental keys like `self_hosted_latency_ms` or arbitrary nested model
|
|
926
|
+
message content: that would make local/model-server telemetry too easy to
|
|
927
|
+
mix into hosted API claim surfaces.
|
|
928
|
+
"""
|
|
929
|
+
if not isinstance(payload, dict):
|
|
930
|
+
return None
|
|
931
|
+
candidates = [
|
|
932
|
+
(
|
|
933
|
+
payload.get(SELF_HOSTED_METRICS_KEY),
|
|
934
|
+
f"explicit_provider_payload.{SELF_HOSTED_METRICS_KEY}",
|
|
935
|
+
)
|
|
936
|
+
]
|
|
937
|
+
metrics_envelope = payload.get("metrics")
|
|
938
|
+
if isinstance(metrics_envelope, dict):
|
|
939
|
+
candidates.append((
|
|
940
|
+
metrics_envelope.get(SELF_HOSTED_METRICS_KEY),
|
|
941
|
+
f"explicit_provider_payload.metrics.{SELF_HOSTED_METRICS_KEY}",
|
|
942
|
+
))
|
|
943
|
+
for raw, source in candidates:
|
|
944
|
+
normalized = normalize_self_hosted_metrics(raw, source=source)
|
|
945
|
+
if normalized is not None:
|
|
946
|
+
return normalized
|
|
947
|
+
return None
|
|
948
|
+
|
|
949
|
+
|
|
720
950
|
def claude_version(claude_bin: str) -> str:
|
|
721
951
|
try:
|
|
722
952
|
proc = run_bounded_command(
|
|
@@ -747,7 +977,7 @@ def build_claude_argv(claude_bin: str, task: TaskFixture, variant: Variant) -> l
|
|
|
747
977
|
argv.extend(["--allowedTools", ",".join(task.allowed_tools)])
|
|
748
978
|
argv.extend(variant.extra_args)
|
|
749
979
|
argv.append("--")
|
|
750
|
-
argv.append(task.prompt)
|
|
980
|
+
argv.append(task.variant_prompt_texts.get(variant.name, task.prompt))
|
|
751
981
|
return argv
|
|
752
982
|
|
|
753
983
|
|
|
@@ -1003,6 +1233,7 @@ def run_fixture(task: TaskFixture, variant: Variant, claude_bin: str,
|
|
|
1003
1233
|
tokens, cost, cost_measured, primary_tokens_measured = collect_usage(payload)
|
|
1004
1234
|
provider_cached_tokens, provider_cached_tokens_measured = collect_provider_cache_telemetry(payload)
|
|
1005
1235
|
shift_metrics = collect_shift_metrics(payload)
|
|
1236
|
+
self_hosted_metrics = collect_self_hosted_metrics(payload)
|
|
1006
1237
|
success, success_note = run_success_command(task, project_root)
|
|
1007
1238
|
return RunResult(
|
|
1008
1239
|
task_id=task.id, variant=variant.name, model=task.model, effort=task.effort,
|
|
@@ -1021,6 +1252,7 @@ def run_fixture(task: TaskFixture, variant: Variant, claude_bin: str,
|
|
|
1021
1252
|
external_cost_measured=bool(shift_metrics["external_cost_measured"]),
|
|
1022
1253
|
provider_cached_tokens=provider_cached_tokens,
|
|
1023
1254
|
provider_cached_tokens_measured=provider_cached_tokens_measured,
|
|
1255
|
+
self_hosted_metrics=self_hosted_metrics,
|
|
1024
1256
|
)
|
|
1025
1257
|
|
|
1026
1258
|
|
|
@@ -1169,6 +1401,7 @@ def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) ->
|
|
|
1169
1401
|
"provider_cache": result.provider_cached_tokens_measured,
|
|
1170
1402
|
"byte_metrics": byte_metrics_observed,
|
|
1171
1403
|
"wall_time": result.wall_time_seconds >= 0,
|
|
1404
|
+
"self_hosted_metrics": result.self_hosted_metrics is not None,
|
|
1172
1405
|
},
|
|
1173
1406
|
"proxy_metrics": {
|
|
1174
1407
|
"byte_metrics_observed": byte_metrics_observed,
|
|
@@ -1177,6 +1410,8 @@ def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) ->
|
|
|
1177
1410
|
"claim_boundary": "proxy_only_not_hosted_token_savings",
|
|
1178
1411
|
},
|
|
1179
1412
|
}
|
|
1413
|
+
if result.self_hosted_metrics is not None:
|
|
1414
|
+
payload["self_hosted_metrics"] = result.self_hosted_metrics
|
|
1180
1415
|
with csv_file_lock(path, create_parent=True):
|
|
1181
1416
|
fd = _open_regular_no_symlink(path, os.O_CREAT | os.O_APPEND | os.O_WRONLY, 0o600, create_parent=True)
|
|
1182
1417
|
try:
|
|
@@ -2090,8 +2325,8 @@ def main() -> int:
|
|
|
2090
2325
|
require_no_follow_file_ops_supported()
|
|
2091
2326
|
validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json)
|
|
2092
2327
|
|
|
2093
|
-
tasks = parse_tasks(args.tasks)
|
|
2094
2328
|
variants = parse_variants(args.variants)
|
|
2329
|
+
tasks = parse_tasks(args.tasks, variants=variants)
|
|
2095
2330
|
targets = filter_targets(tasks, variants, args.task_id, args.variant)
|
|
2096
2331
|
if not targets:
|
|
2097
2332
|
print("no (task, variant) targets matched the filters", file=sys.stderr)
|
|
@@ -2122,6 +2357,9 @@ def main() -> int:
|
|
|
2122
2357
|
print(f"claude binary not found: {args.claude_bin}", file=sys.stderr)
|
|
2123
2358
|
return 2
|
|
2124
2359
|
|
|
2360
|
+
if runnable_targets:
|
|
2361
|
+
load_variant_prompt_files_for_targets(runnable_targets, task_file_dir=args.tasks.parent)
|
|
2362
|
+
|
|
2125
2363
|
project_root = args.project_root.resolve()
|
|
2126
2364
|
claude_ver = "dry-run" if args.dry_run else (claude_version(args.claude_bin) if runnable_targets else "skipped")
|
|
2127
2365
|
|