npm - @ictechgy/context-guard - Versions diffs - 0.4.10 → 0.4.11 - Mend

@ictechgy/context-guard 0.4.10 → 0.4.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

package/CHANGELOG.md +13 -1
package/README.ko.md +32 -21
package/README.md +38 -29
package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
package/docs/benchmark-workflow-examples.md +3 -0
package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
package/docs/experimental-benchmark-fixtures.md +24 -7
package/package.json +2 -1
package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
package/plugins/context-guard/README.ko.md +14 -11
package/plugins/context-guard/README.md +15 -14
package/plugins/context-guard/bin/context-guard +46 -11
package/plugins/context-guard/bin/context-guard-artifact +342 -33
package/plugins/context-guard/bin/context-guard-audit +33 -2
package/plugins/context-guard/bin/context-guard-bench +1542 -31
package/plugins/context-guard/bin/context-guard-cache-score +318 -33
package/plugins/context-guard/bin/context-guard-cost +7 -2
package/plugins/context-guard/bin/context-guard-experiments +364 -8
package/plugins/context-guard/bin/context-guard-failed-nudge +6 -2
package/plugins/context-guard/bin/context-guard-pack +301 -17
package/plugins/context-guard/bin/context-guard-sanitize-output +76 -12
package/plugins/context-guard/bin/context-guard-tool-prune +241 -54
package/plugins/context-guard/bin/context-guard-trim-output +288 -41
package/plugins/context-guard/brief/README.md +5 -5
package/plugins/context-guard/lib/context_guard_commands.py +214 -190

package/docs/benchmark-workflows/measured-token-workflow.example.json CHANGED Viewed

@@ -1,170 +1,311 @@
 {
-  "schema": "context-guard-bench-report-v1",
   "baseline_variant": "baseline",
+  "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims. Public hosted savings claims must use public_claim_readiness.claim_allowed; unsupported claims are forbidden.",
+  "claim_status": "token_savings_observed_cost_unmeasured",
+  "comparisons": [
+    {
+      "baseline_corrections_per_successful_task": 0.0,
+      "baseline_failure_rate": 0.0,
+      "baseline_successful_task_count": 1,
+      "baseline_variant": "baseline",
+      "corrections_delta_per_successful_task": 0.0,
+      "cost_savings_pct_with_shift": null,
+      "failure_rate_delta_pp": 0.0,
+      "matched_successful_task_count": 1,
+      "missing_baseline_success_tasks": [],
+      "paired_corrections_task_count": 1,
+      "paired_cost_task_count": 0,
+      "paired_token_task_count": 1,
+      "paired_wall_time_task_count": 1,
+      "quality_gate": "pass",
+      "token_delta_per_successful_task": -240.0,
+      "token_savings_pct": 24.0,
+      "variant": "brief_mode_standard",
+      "variant_corrections_per_successful_task": 0.0,
+      "variant_failure_rate": 0.0,
+      "wall_time_change_pct": -4.0000000000000036,
+      "wall_time_delta_seconds_per_successful_task": -0.40000000000000036
+    }
+  ],
+  "public_claim_readiness": {
+    "blocking_gate_ids": [
+      "matched_successful_tasks",
+      "provider_measured_token_cost",
+      "shifted_cost_accounting",
+      "confidence_failure_notes",
+      "provider_export_provenance"
+    ],
+    "claim_allowed": false,
+    "claim_boundary": {
+      "claim_allowed_field": "public_claim_readiness.claim_allowed",
+      "fixed_percent_savings_claim_without_matched_provider_report_forbidden": true,
+      "hosted_api_cost_savings_claim_without_claim_allowed_forbidden": true,
+      "hosted_api_token_savings_claim_without_claim_allowed_forbidden": true,
+      "id": "public_claim_readiness_authoritative_release_gate",
+      "reason": "Public hosted token/cost savings claims are forbidden unless every readiness gate passes and public_claim_readiness.claim_allowed is true.",
+      "reporting_only": true,
+      "requires_confidence_and_failure_notes": true,
+      "requires_matched_successful_tasks": true,
+      "requires_provider_export_provenance": true,
+      "requires_provider_measured_tokens_and_cost": true,
+      "requires_quality_non_inferiority": true,
+      "requires_shifted_cost_accounting": true,
+      "unsupported_claims_forbidden": true
+    },
+    "gates": [
+      {
+        "evidence": {
+          "comparison_count": 1,
+          "matched_pair_count": 0,
+          "min_matched_successful_task_count": 1.0,
+          "missing_baseline_success_task_count": 0,
+          "variants": [
+            "brief_mode_standard"
+          ]
+        },
+        "id": "matched_successful_tasks",
+        "label": "Matched successful tasks",
+        "passed": false,
+        "reason": "missing_or_regressed_matched_successful_tasks",
+        "required": true,
+        "status": "fail"
+      },
+      {
+        "evidence": {
+          "matched_pair_count": 0,
+          "required_fields": [
+            "matched_pair_evidence[*].measurements.baseline.primary_tokens.measured",
+            "matched_pair_evidence[*].measurements.variant.primary_tokens.measured",
+            "matched_pair_evidence[*].measurements.baseline.primary_cost_usd.measured",
+            "matched_pair_evidence[*].measurements.variant.primary_cost_usd.measured"
+          ]
+        },
+        "id": "provider_measured_token_cost",
+        "label": "Provider-measured token and primary cost",
+        "passed": false,
+        "reason": "missing_provider_measured_primary_tokens_or_cost",
+        "required": true,
+        "status": "fail"
+      },
+      {
+        "evidence": {
+          "max_corrections_delta_per_successful_task": 0.0,
+          "max_failure_rate_delta_pp": 0.0,
+          "quality_gates": [
+            "pass"
+          ]
+        },
+        "id": "quality_non_inferiority",
+        "label": "Quality non-inferiority",
+        "passed": true,
+        "reason": "all_quality_gates_pass",
+        "required": true,
+        "status": "pass"
+      },
+      {
+        "evidence": {
+          "matched_pair_count": 0,
+          "required_fields": [
+            "matched_pair_evidence[*].claim_boundary.shifted_cost_claim_allowed",
+            "matched_pair_evidence[*].measurements.baseline.total_cost_with_shift_usd.measured",
+            "matched_pair_evidence[*].measurements.variant.total_cost_with_shift_usd.measured"
+          ]
+        },
+        "id": "shifted_cost_accounting",
+        "label": "Shifted-cost accounting",
+        "passed": false,
+        "reason": "missing_shifted_cost_claim_accounting",
+        "required": true,
+        "status": "fail"
+      },
+      {
+        "evidence": {
+          "comparison_failure_fields_present": true,
+          "explicit_note_count": 0,
+          "failed_row_count": 0,
+          "failed_rows_with_notes": 0,
+          "replay_row_count": 0
+        },
+        "id": "confidence_failure_notes",
+        "label": "Confidence and failure notes",
+        "passed": false,
+        "reason": "missing_explicit_replay_notes_or_failure_evidence",
+        "required": true,
+        "status": "unknown"
+      },
+      {
+        "evidence": {
+          "mixed_csv": false,
+          "provider_names": [],
+          "replay_row_count": 0,
+          "report_row_count": 2,
+          "same_run_complete": false,
+          "source_types": []
+        },
+        "id": "provider_export_provenance",
+        "label": "Provider-export provenance",
+        "passed": false,
+        "reason": "missing_or_mixed_provider_export_provenance",
+        "required": true,
+        "status": "unknown"
+      }
+    ],
+    "generated_from": "matched_pair_evidence_and_replay_provenance",
+    "passed_required_gate_count": 1,
+    "public_claim_eligible_observed": null,
+    "public_claim_status_observed": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger",
+    "raw_metric_claim_status_observed": "token_savings_observed_cost_unmeasured",
+    "reason": "replay_evidence_required_for_public_claim",
+    "required_gate_count": 6,
+    "required_gate_ids": [
+      "matched_successful_tasks",
+      "provider_measured_token_cost",
+      "quality_non_inferiority",
+      "shifted_cost_accounting",
+      "confidence_failure_notes",
+      "provider_export_provenance"
+    ],
+    "schema_version": "contextguard.bench.public-claim-readiness.v1",
+    "status": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
+  },
   "row_count": 2,
+  "schema": "context-guard-bench-report-v1",
   "summary_by_variant": {
     "baseline": {
-      "runs": 1,
-      "successful_runs": 1,
+      "artifacts_used_per_successful_task": 0.0,
+      "artifacts_used_successful": 0,
+      "byte_reduction_ratio": 1.0,
+      "byte_savings_pct": 0.0,
+      "bytes_after_successful": 12000,
+      "bytes_before_successful": 12000,
+      "bytes_saved_per_successful_task": 0.0,
+      "bytes_saved_successful": 0,
+      "compression_strategy": "baseline",
+      "corrections_per_successful_task": 0.0,
+      "corrections_successful": 0,
+      "external_cost_successful_usd": 0.0,
+      "external_cost_unknown_successful": 1,
+      "external_tokens_measured_successful": 0,
+      "external_tokens_per_successful_task": null,
+      "external_tokens_successful": 0,
       "failed_runs": 0,
-      "total_tokens_all_runs": 1000,
-      "primary_tokens_measured_runs": 1,
+      "failure_rate": 0.0,
+      "hook_triggers_successful": 0,
+      "is_baseline_strategy": true,
+      "observed_telemetry": {
+        "byte_savings": "observed",
+        "external_tokens": "unavailable",
+        "primary_cost": "unavailable",
+        "provider_cache": "unavailable",
+        "token_proxy": "inferred",
+        "tokens": "observed",
+        "wall_time": "observed"
+      },
       "primary_cost_all_runs_usd": 0.0,
       "primary_cost_measured_runs": 0,
-      "wall_time_seconds_all_runs": 10.0,
-      "wall_time_seconds_measured_runs": 1,
+      "primary_cost_measured_successful": 0,
+      "primary_cost_per_successful_task_usd": null,
+      "primary_cost_per_task_including_failures_usd": null,
+      "primary_cost_successful_usd": 0.0,
+      "primary_tokens_measured_runs": 1,
+      "primary_tokens_measured_successful": 1,
       "provider_cached_tokens_all_runs": 0,
       "provider_cached_tokens_measured_runs": 0,
+      "provider_cached_tokens_measured_successful": 0,
+      "provider_cached_tokens_per_successful_task": 0.0,
+      "provider_cached_tokens_per_task_including_failures": 0.0,
+      "provider_cached_tokens_successful": 0,
+      "runs": 1,
+      "successful_runs": 1,
+      "successful_task_count": 1,
+      "task_count": 1,
+      "token_proxy_saved_per_successful_task": 0.0,
+      "token_proxy_saved_successful": 0,
+      "tokens_per_successful_task": 1000.0,
+      "tokens_per_task_including_failures": 1000.0,
       "total_cost_with_shift_all_runs_usd": 0.0,
       "total_cost_with_shift_measured_runs": 0,
+      "total_cost_with_shift_measured_successful": 0,
+      "total_cost_with_shift_per_successful_task_usd": null,
+      "total_cost_with_shift_per_task_including_failures_usd": null,
+      "total_cost_with_shift_successful_usd": 0.0,
+      "total_tokens_all_runs": 1000,
       "total_tokens_successful": 1000,
-      "primary_tokens_measured_successful": 1,
-      "primary_cost_successful_usd": 0.0,
-      "primary_cost_measured_successful": 0,
-      "wall_time_seconds_successful": 10.0,
+      "turns_successful": 0,
+      "wall_time_seconds_all_runs": 10.0,
+      "wall_time_seconds_measured_runs": 1,
       "wall_time_seconds_measured_successful": 1,
-      "provider_cached_tokens_successful": 0,
-      "provider_cached_tokens_measured_successful": 0,
+      "wall_time_seconds_per_successful_task": 10.0,
+      "wall_time_seconds_per_task_including_failures": 10.0,
+      "wall_time_seconds_successful": 10.0
+    },
+    "brief_mode_standard": {
+      "artifacts_used_per_successful_task": 0.0,
+      "artifacts_used_successful": 0,
+      "byte_reduction_ratio": 0.75,
+      "byte_savings_pct": 25.0,
+      "bytes_after_successful": 9000,
+      "bytes_before_successful": 12000,
+      "bytes_saved_per_successful_task": 3000.0,
+      "bytes_saved_successful": 3000,
+      "compression_strategy": "brief_mode_standard",
+      "corrections_per_successful_task": 0.0,
+      "corrections_successful": 0,
       "external_cost_successful_usd": 0.0,
       "external_cost_unknown_successful": 1,
-      "total_cost_with_shift_successful_usd": 0.0,
-      "total_cost_with_shift_measured_successful": 0,
-      "external_tokens_successful": 0,
       "external_tokens_measured_successful": 0,
-      "artifacts_used_successful": 0,
-      "corrections_successful": 0,
-      "bytes_before_successful": 12000,
-      "bytes_after_successful": 12000,
-      "turns_successful": 0,
-      "hook_triggers_successful": 0,
-      "failure_rate": 0.0,
-      "task_count": 1,
-      "successful_task_count": 1,
-      "tokens_per_task_including_failures": 1000.0,
-      "wall_time_seconds_per_task_including_failures": 10.0,
-      "provider_cached_tokens_per_task_including_failures": 0.0,
-      "primary_cost_per_task_including_failures_usd": null,
-      "total_cost_with_shift_per_task_including_failures_usd": null,
-      "tokens_per_successful_task": 1000.0,
-      "wall_time_seconds_per_successful_task": 10.0,
-      "provider_cached_tokens_per_successful_task": 0.0,
-      "primary_cost_per_successful_task_usd": null,
-      "total_cost_with_shift_per_successful_task_usd": null,
       "external_tokens_per_successful_task": null,
-      "artifacts_used_per_successful_task": 0.0,
-      "corrections_per_successful_task": 0.0,
-      "byte_reduction_ratio": 1.0,
-      "compression_strategy": "baseline",
-      "is_baseline_strategy": true,
-      "bytes_saved_successful": 0,
-      "bytes_saved_per_successful_task": 0.0,
-      "byte_savings_pct": 0.0,
-      "token_proxy_saved_successful": 0,
-      "token_proxy_saved_per_successful_task": 0.0,
+      "external_tokens_successful": 0,
+      "failed_runs": 0,
+      "failure_rate": 0.0,
+      "hook_triggers_successful": 0,
+      "is_baseline_strategy": false,
       "observed_telemetry": {
-        "tokens": "observed",
-        "primary_cost": "unavailable",
-        "external_tokens": "unavailable",
         "byte_savings": "observed",
+        "external_tokens": "unavailable",
+        "primary_cost": "unavailable",
+        "provider_cache": "unavailable",
         "token_proxy": "inferred",
-        "wall_time": "observed",
-        "provider_cache": "unavailable"
-      }
-    },
-    "brief_mode_standard": {
-      "runs": 1,
-      "successful_runs": 1,
-      "failed_runs": 0,
-      "total_tokens_all_runs": 760,
-      "primary_tokens_measured_runs": 1,
+        "tokens": "observed",
+        "wall_time": "observed"
+      },
       "primary_cost_all_runs_usd": 0.0,
       "primary_cost_measured_runs": 0,
-      "wall_time_seconds_all_runs": 9.6,
-      "wall_time_seconds_measured_runs": 1,
+      "primary_cost_measured_successful": 0,
+      "primary_cost_per_successful_task_usd": null,
+      "primary_cost_per_task_including_failures_usd": null,
+      "primary_cost_successful_usd": 0.0,
+      "primary_tokens_measured_runs": 1,
+      "primary_tokens_measured_successful": 1,
       "provider_cached_tokens_all_runs": 0,
       "provider_cached_tokens_measured_runs": 0,
-      "total_cost_with_shift_all_runs_usd": 0.0,
-      "total_cost_with_shift_measured_runs": 0,
-      "total_tokens_successful": 760,
-      "primary_tokens_measured_successful": 1,
-      "primary_cost_successful_usd": 0.0,
-      "primary_cost_measured_successful": 0,
-      "wall_time_seconds_successful": 9.6,
-      "wall_time_seconds_measured_successful": 1,
-      "provider_cached_tokens_successful": 0,
       "provider_cached_tokens_measured_successful": 0,
-      "external_cost_successful_usd": 0.0,
-      "external_cost_unknown_successful": 1,
-      "total_cost_with_shift_successful_usd": 0.0,
-      "total_cost_with_shift_measured_successful": 0,
-      "external_tokens_successful": 0,
-      "external_tokens_measured_successful": 0,
-      "artifacts_used_successful": 0,
-      "corrections_successful": 0,
-      "bytes_before_successful": 12000,
-      "bytes_after_successful": 9000,
-      "turns_successful": 0,
-      "hook_triggers_successful": 0,
-      "failure_rate": 0.0,
-      "task_count": 1,
+      "provider_cached_tokens_per_successful_task": 0.0,
+      "provider_cached_tokens_per_task_including_failures": 0.0,
+      "provider_cached_tokens_successful": 0,
+      "runs": 1,
+      "successful_runs": 1,
       "successful_task_count": 1,
+      "task_count": 1,
+      "token_proxy_saved_per_successful_task": 750.0,
+      "token_proxy_saved_successful": 750,
+      "tokens_per_successful_task": 760.0,
       "tokens_per_task_including_failures": 760.0,
-      "wall_time_seconds_per_task_including_failures": 9.6,
-      "provider_cached_tokens_per_task_including_failures": 0.0,
-      "primary_cost_per_task_including_failures_usd": null,
+      "total_cost_with_shift_all_runs_usd": 0.0,
+      "total_cost_with_shift_measured_runs": 0,
+      "total_cost_with_shift_measured_successful": 0,
+      "total_cost_with_shift_per_successful_task_usd": null,
       "total_cost_with_shift_per_task_including_failures_usd": null,
-      "tokens_per_successful_task": 760.0,
+      "total_cost_with_shift_successful_usd": 0.0,
+      "total_tokens_all_runs": 760,
+      "total_tokens_successful": 760,
+      "turns_successful": 0,
+      "wall_time_seconds_all_runs": 9.6,
+      "wall_time_seconds_measured_runs": 1,
+      "wall_time_seconds_measured_successful": 1,
       "wall_time_seconds_per_successful_task": 9.6,
-      "provider_cached_tokens_per_successful_task": 0.0,
-      "primary_cost_per_successful_task_usd": null,
-      "total_cost_with_shift_per_successful_task_usd": null,
-      "external_tokens_per_successful_task": null,
-      "artifacts_used_per_successful_task": 0.0,
-      "corrections_per_successful_task": 0.0,
-      "byte_reduction_ratio": 0.75,
-      "compression_strategy": "brief_mode_standard",
-      "is_baseline_strategy": false,
-      "bytes_saved_successful": 3000,
-      "bytes_saved_per_successful_task": 3000.0,
-      "byte_savings_pct": 25.0,
-      "token_proxy_saved_successful": 750,
-      "token_proxy_saved_per_successful_task": 750.0,
-      "observed_telemetry": {
-        "tokens": "observed",
-        "primary_cost": "unavailable",
-        "external_tokens": "unavailable",
-        "byte_savings": "observed",
-        "token_proxy": "inferred",
-        "wall_time": "observed",
-        "provider_cache": "unavailable"
-      }
-    }
-  },
-  "comparisons": [
-    {
-      "variant": "brief_mode_standard",
-      "baseline_variant": "baseline",
-      "quality_gate": "pass",
-      "baseline_failure_rate": 0.0,
-      "variant_failure_rate": 0.0,
-      "failure_rate_delta_pp": 0.0,
-      "matched_successful_task_count": 1,
-      "baseline_successful_task_count": 1,
-      "missing_baseline_success_tasks": [],
-      "baseline_corrections_per_successful_task": 0.0,
-      "variant_corrections_per_successful_task": 0.0,
-      "paired_corrections_task_count": 1,
-      "corrections_delta_per_successful_task": 0.0,
-      "token_delta_per_successful_task": -240.0,
-      "token_savings_pct": 24.0,
-      "paired_token_task_count": 1,
-      "wall_time_delta_seconds_per_successful_task": -0.40000000000000036,
-      "wall_time_change_pct": -4.0000000000000036,
-      "paired_wall_time_task_count": 1,
-      "cost_savings_pct_with_shift": null,
-      "paired_cost_task_count": 0
+      "wall_time_seconds_per_task_including_failures": 9.6,
+      "wall_time_seconds_successful": 9.6
     }
-  ],
-  "claim_status": "token_savings_observed_cost_unmeasured",
-  "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims."
+  }
 }