npm - @jaguilar87/gaia - Versions diffs - 5.0.4 → 5.0.5 - Mend

@jaguilar87/gaia 5.0.4 → 5.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (113) hide show

package/.claude-plugin/marketplace.json +2 -2
package/.claude-plugin/plugin.json +1 -1
package/CHANGELOG.md +56 -0
package/INSTALL.md +0 -2
package/README.md +1 -6
package/bin/README.md +0 -1
package/bin/cli/_install_helpers.py +1 -1
package/bin/cli/cleanup.py +0 -1
package/bin/cli/doctor.py +1 -1
package/bin/cli/memory.py +2 -0
package/bin/cli/update.py +1 -1
package/bin/pre-publish-validate.js +48 -5
package/config/README.md +22 -44
package/config/surface-routing.json +0 -1
package/dist/gaia-ops/.claude-plugin/plugin.json +1 -1
package/dist/gaia-ops/config/README.md +22 -44
package/dist/gaia-ops/config/surface-routing.json +0 -1
package/dist/gaia-ops/hooks/modules/agents/handoff_persister.py +2 -0
package/dist/gaia-ops/hooks/modules/security/approval_grants.py +2 -0
package/dist/gaia-ops/hooks/modules/tools/bash_validator.py +2 -0
package/dist/gaia-ops/hooks/modules/validation/commit_validator.py +90 -55
package/dist/gaia-ops/skills/README.md +1 -1
package/dist/gaia-ops/skills/gaia-patterns/SKILL.md +1 -1
package/dist/gaia-ops/skills/gaia-patterns/reference.md +0 -1
package/dist/gaia-ops/skills/gaia-release/SKILL.md +60 -24
package/dist/gaia-ops/skills/gaia-release/reference.md +35 -11
package/dist/gaia-ops/skills/git-conventions/SKILL.md +6 -2
package/dist/gaia-ops/skills/orchestrator-present-approval/SKILL.md +10 -2
package/dist/gaia-ops/skills/readme-writing/SKILL.md +1 -1
package/dist/gaia-ops/skills/readme-writing/reference.md +0 -1
package/dist/gaia-ops/tools/scan/ui.py +20 -4
package/dist/gaia-ops/tools/scan/verify.py +3 -3
package/dist/gaia-ops/tools/validation/README.md +15 -24
package/dist/gaia-security/.claude-plugin/plugin.json +1 -1
package/dist/gaia-security/hooks/modules/agents/handoff_persister.py +2 -0
package/dist/gaia-security/hooks/modules/security/approval_grants.py +2 -0
package/dist/gaia-security/hooks/modules/tools/bash_validator.py +2 -0
package/dist/gaia-security/hooks/modules/validation/commit_validator.py +90 -55
package/hooks/modules/agents/handoff_persister.py +2 -0
package/hooks/modules/security/approval_grants.py +2 -0
package/hooks/modules/tools/bash_validator.py +2 -0
package/hooks/modules/validation/commit_validator.py +90 -55
package/index.js +2 -12
package/package.json +4 -6
package/pyproject.toml +3 -3
package/scripts/bootstrap_database.sh +88 -439
package/scripts/check_schema_drift.py +208 -0
package/scripts/migrations/README.md +78 -28
package/scripts/migrations/schema.checksum +8 -0
package/scripts/release-prepare.mjs +199 -0
package/skills/README.md +1 -1
package/skills/gaia-patterns/SKILL.md +1 -1
package/skills/gaia-patterns/reference.md +0 -1
package/skills/gaia-release/SKILL.md +60 -24
package/skills/gaia-release/reference.md +35 -11
package/skills/git-conventions/SKILL.md +6 -2
package/skills/orchestrator-present-approval/SKILL.md +10 -2
package/skills/readme-writing/SKILL.md +1 -1
package/skills/readme-writing/reference.md +0 -1
package/tools/scan/ui.py +20 -4
package/tools/scan/verify.py +3 -3
package/tools/validation/README.md +15 -24
package/commands/README.md +0 -64
package/commands/gaia.md +0 -37
package/commands/scan-project.md +0 -74
package/config/crons-schema.md +0 -81
package/config/git_standards.json +0 -72
package/dist/gaia-ops/commands/gaia.md +0 -37
package/dist/gaia-ops/config/crons-schema.md +0 -81
package/dist/gaia-ops/config/git_standards.json +0 -72
package/dist/gaia-ops/tools/agentic-loop/decide-status.py +0 -210
package/dist/gaia-ops/tools/agentic-loop/parse-metric.py +0 -106
package/dist/gaia-ops/tools/agentic-loop/record-iteration.py +0 -223
package/git-hooks/commit-msg +0 -41
package/scripts/migrations/v10_to_v11.sql +0 -170
package/scripts/migrations/v10_to_v11_fresh.sql +0 -18
package/scripts/migrations/v11_to_v12.sql +0 -195
package/scripts/migrations/v11_to_v12_fresh.sql +0 -19
package/scripts/migrations/v12_to_v13.sql +0 -48
package/scripts/migrations/v12_to_v13_fresh.sql +0 -17
package/scripts/migrations/v13_to_v14.sql +0 -44
package/scripts/migrations/v13_to_v14_fresh.sql +0 -17
package/scripts/migrations/v14_to_v15.sql +0 -71
package/scripts/migrations/v14_to_v15_fresh.sql +0 -19
package/scripts/migrations/v15_to_v16.sql +0 -57
package/scripts/migrations/v15_to_v16_fresh.sql +0 -18
package/scripts/migrations/v16_to_v17.sql +0 -51
package/scripts/migrations/v16_to_v17_fresh.sql +0 -18
package/scripts/migrations/v17_to_v18.sql +0 -66
package/scripts/migrations/v17_to_v18_fresh.sql +0 -24
package/scripts/migrations/v1_to_v2.sql +0 -97
package/scripts/migrations/v2_to_v3.sql +0 -68
package/scripts/migrations/v2_to_v3_merge.sql +0 -69
package/scripts/migrations/v3_to_v4.sql +0 -67
package/scripts/migrations/v3_to_v4_fresh.sql +0 -20
package/scripts/migrations/v4_to_v5.sql +0 -55
package/scripts/migrations/v4_to_v5_fresh.sql +0 -20
package/scripts/migrations/v5_to_v6.sql +0 -48
package/scripts/migrations/v5_to_v6_fresh.sql +0 -17
package/scripts/migrations/v6_to_v7.sql +0 -26
package/scripts/migrations/v6_to_v7_fresh.sql +0 -13
package/scripts/migrations/v7_to_v8.sql +0 -44
package/scripts/migrations/v7_to_v8_fresh.sql +0 -14
package/scripts/migrations/v8_to_v9.sql +0 -87
package/scripts/migrations/v8_to_v9_fresh.sql +0 -15
package/scripts/migrations/v9_to_v10.sql +0 -109
package/scripts/migrations/v9_to_v10_episodes_workspace.sql +0 -109
package/scripts/migrations/v9_to_v10_fresh.sql +0 -18
package/templates/README.md +0 -70
package/templates/managed-settings.template.json +0 -43
package/tools/agentic-loop/decide-status.py +0 -210
package/tools/agentic-loop/parse-metric.py +0 -106
package/tools/agentic-loop/record-iteration.py +0 -223

package/scripts/migrations/v9_to_v10_episodes_workspace.sql DELETED Viewed

@@ -1,109 +0,0 @@
--- Migration v9 -> v10 (episodic-workflow-to-db: episodes workspace canonical)
---
--- Background
--- ----------
--- v9 schema has the episodes table with these columns:
---   episode_id, workspace, timestamp, session_id, task_id, agent,
---   type, title, prompt, enriched_prompt, wf_prompt, clarifications,
---   keywords, tags, commands_executed, context_metrics, relevance_score,
---   outcome, duration_seconds, exit_code, plan_status, output_length,
---   output_tokens_approx
---
--- v10 adds:
---   episodes.tier           -- security tier (T0/T1/T2/T3), promoted from context_metrics blob
---   episode_anomalies       -- structured anomaly records extracted from context_metrics blob
---
--- Design decisions
--- ----------------
--- D1: tier -> top-level column (not blob)
---   Rationale: tier is a single short TEXT value (T0/T1/T2/T3) with a clear
---   compliance query pattern: "COUNT(*) WHERE tier='T3' AND outcome='partial'".
---   Keeping it in the context_metrics JSON blob would require a full-table
---   JSON parse for every compliance query. With 10,000+ rows in workspace 'me'
---   alone, this is a significant performance cost. A column + index reduces
---   that to a B-tree lookup. The schema cost is one ALTER TABLE + one index.
---   Alternative considered: keep in blob. Rejected because the query pattern
---   is both real (used by context_injector.py anomaly surfacing) and frequent
---   (every compliance dashboard query). No reason to pay JSON parsing overhead
---   when the data is a four-value enum.
---
--- D2: episode_anomalies -> separate table (not blob)
---   Rationale: anomalies have a stable schema {type, severity, message} per
---   object. The query "all anomalies of type X in the last 7 days" is a real
---   operational need -- context_injector.py currently reads anomalies.jsonl
---   to surface critical anomalies in orchestrator context. That reader must
---   be ported post-migration. A separate table with a type index enables
---   `SELECT * FROM episode_anomalies JOIN episodes ON ... WHERE type=? AND
---   episodes.timestamp > ?` without JSON parsing any rows. With anomalies
---   present in a large fraction of episodes (4 anomalies in the 12 observed
---   sessions), the cardinality justifies a separate table. The anomalies[]
---   array is still preserved inside context_metrics for backward compatibility
---   with any reader that parses the full blob -- the table is an additional
---   queryable index, not a replacement.
---   Alternative considered: keep in context_metrics blob. Rejected because
---   the type-filtered cross-episode query has no efficient implementation
---   without the table. GROUP BY type reports are otherwise O(N) full scans
---   with JSON parsing per row.
---
--- Column notes
--- ------------
--- episodes.workspace: already present in the v9 schema; NO ALTER TABLE needed.
---   Live DB confirmed: workspace column exists and has data ('me', 'bildwiz',
---   'nfi'). The default 'me' for legacy rows is unnecessary -- workspace is
---   already populated. This step is a no-op in terms of DDL.
---
--- Atomicity: bootstrap_database.sh wraps this script in BEGIN/COMMIT.
--- A failure mid-flight rolls back to v9 state; the ledger row is NOT
--- inserted, so the next bootstrap retry sees the same pending migration.
--- Closes AC-2 of brief episodic-workflow-to-db (brief_id=72).
--- Step 1: Add tier column to episodes.
--- SQLite does not support CHECK constraints in ALTER TABLE ADD COLUMN without
--- a DEFAULT, so the CHECK is omitted here; validation is enforced at the
--- application layer (episodic.py / workflow_recorder.py writers).
-ALTER TABLE episodes ADD COLUMN tier TEXT;
--- Step 2: Index tier for compliance queries.
-CREATE INDEX IF NOT EXISTS idx_episodes_tier ON episodes(tier);
--- Step 3: Compound index for the primary compliance query pattern:
--- "T3 operations with non-COMPLETE outcomes in time window".
-CREATE INDEX IF NOT EXISTS idx_episodes_tier_outcome ON episodes(tier, outcome);
--- Step 4: Create episode_anomalies table.
--- Each row is one anomaly record extracted from an episode's context_metrics
--- blob. The payload column holds the full original JSON object for forward
--- compatibility (additional keys in future anomaly schemas are preserved).
-CREATE TABLE IF NOT EXISTS episode_anomalies (
-    id          INTEGER PRIMARY KEY AUTOINCREMENT,
-    episode_id  TEXT NOT NULL,              -- FK -> episodes.episode_id
-    workspace   TEXT NOT NULL,              -- denormalized for partition queries without JOIN
-    timestamp   TEXT NOT NULL,              -- denormalized from parent episode for time-range queries
-    type        TEXT NOT NULL,              -- e.g. "investigation_skip", "no_tool_use"
-    severity    TEXT,                       -- e.g. "warning", "error", "info"
-    message     TEXT,                       -- human-readable description
-    payload     TEXT,                       -- full JSON object (forward-compat for extra keys)
-    FOREIGN KEY (episode_id) REFERENCES episodes(episode_id) ON DELETE CASCADE
-);
--- Step 5: Indexes on episode_anomalies.
--- Primary query patterns:
---   (a) All anomalies of type X: WHERE type = ?
---   (b) Cross-episode anomaly report in time window: WHERE type = ? AND timestamp > ?
---   (c) Anomalies for a specific episode: WHERE episode_id = ?
---   (d) Workspace-scoped anomaly dashboard: WHERE workspace = ? AND timestamp > ?
-CREATE INDEX IF NOT EXISTS idx_episode_anomalies_type      ON episode_anomalies(type);
-CREATE INDEX IF NOT EXISTS idx_episode_anomalies_workspace  ON episode_anomalies(workspace, timestamp DESC);
-CREATE INDEX IF NOT EXISTS idx_episode_anomalies_episode    ON episode_anomalies(episode_id);
--- Step 6: Bump schema_version to 10.
-INSERT OR IGNORE INTO schema_version (version, applied_at, description)
-VALUES (10, strftime('%Y-%m-%dT%H:%M:%SZ', 'now'),
-        'episodes.tier column + idx + episode_anomalies table (brief episodic-workflow-to-db AC-2)');
--- Verification queries (run after applying):
--- SELECT MAX(version) FROM schema_version;           -- expect: 10
--- PRAGMA table_info(episodes);                       -- expect: tier column present (after output_tokens_approx)
--- SELECT * FROM sqlite_master WHERE type='index' AND name LIKE 'idx_episodes_tier%';  -- expect: 2 rows
--- PRAGMA table_info(episode_anomalies);              -- expect: 8 columns
--- SELECT COUNT(*) FROM episode_anomalies;            -- expect: 0 (populated by T3 migration task)

package/scripts/migrations/v9_to_v10_fresh.sql DELETED Viewed

@@ -1,18 +0,0 @@
--- Migration v9 -> v10 fresh-install variant (episodic-workflow-to-db AC-3)
---
--- Used by bootstrap_database.sh when the live DB already has the
--- episode_anomalies table (i.e. schema.sql ran first on a clean install and
--- created the tables in v10 target state, including episodes.tier column).
---
--- On a fresh install, schema.sql creates the episodes table WITH the tier
--- column, so ALTER TABLE is not needed. However, the tier-dependent indexes
--- (idx_episodes_tier, idx_episodes_tier_outcome) cannot be declared in
--- schema.sql because schema.sql runs before migrations on existing DBs where
--- tier does not yet exist. This fresh variant creates those indexes safely,
--- since on a fresh install tier is guaranteed to exist.
---
--- Atomicity: bootstrap_database.sh wraps this script in BEGIN/COMMIT.
--- Create tier indexes that schema.sql cannot safely declare for existing DBs.
-CREATE INDEX IF NOT EXISTS idx_episodes_tier ON episodes(tier);
-CREATE INDEX IF NOT EXISTS idx_episodes_tier_outcome ON episodes(tier, outcome);

package/templates/README.md DELETED Viewed

@@ -1,70 +0,0 @@
-# Templates
-Templates are the reference skeleton files that Gaia ships for per-project configuration and organization-wide policy. They are not consumed by the Claude Code runtime, and no automated Gaia step currently reads them — `gaia install` and `gaia scan` do not interpolate templates — so today they are used only by organization administrators deploying managed policies. Think of this directory as the catalog of files that exist as skeletons, ready to be filled in by future install tooling or deployed verbatim as a policy.
-There are two audiences for this directory, and they do not overlap. The first is the individual developer installing Gaia into their project — `gaia install` (run by the npm postinstall hook) bootstraps the DB and `.claude/` structure. The second is the enterprise administrator — they take `managed-settings.template.json` and deploy it as a managed policy via the Claude.ai Admin Console or by placing it at `/etc/claude-code/managed-settings.json` on managed workstations. `gaia scan` (separate from install) writes project context to `~/.gaia/gaia.db` only; it does not read or interpolate templates.
-Keeping these files here, rather than embedding them in `bin/cli/scan.py`, means policies and skeletons can be audited and customized without touching executable code. An admin can diff `managed-settings.template.json` against a previous version. A developer can read `governance.template.md` (when present) before any future install tooling interpolates it.
-## Cuándo se activa
-This component does not activate in the runtime Claude Code pipeline. Templates are consumed only by install-time tooling and by administrators deploying policies out-of-band.
-**When each template is consumed:**
-```
-Individual developer runs: npm install @jaguilar87/gaia
-        |
-postinstall -> gaia install --postinstall
-        |
-Install bootstraps ~/.gaia/gaia.db, creates .claude/ symlinks, merges settings.
-Templates in this directory are NOT read during install or scan.
-Developer separately runs: gaia scan
-        |
-bin/cli/scan.py detects project stack and writes to ~/.gaia/gaia.db.
-gaia scan does NOT read templates/ or generate any files.
-```
-```
-Enterprise admin deploys managed policy
-        |
-Admin copies templates/managed-settings.template.json
-        |
-Deploys to Claude.ai Admin Console
-   OR writes to /etc/claude-code/managed-settings.json (Linux managed workstations)
-        |
-Managed settings take highest precedence — cannot be overridden by user or project
-```
-## Qué hay aquí
-```
-templates/
-├── managed-settings.template.json   # Enterprise reference — deployed by admin, not gaia-scan
-└── README.md
-```
-Currently only `managed-settings.template.json` ships in this directory. A `governance.template.md` has been referenced in prior docs but is not present in source and is not consumed by any current automated step (`gaia scan` writes to DB only; `gaia install` does not interpolate templates). If a future installer step consumes templates, this note should be updated.
-## Convenciones
-**Audience per file:**
-| File | Audience | Consumed by | Trigger |
-|------|----------|-------------|---------|
-| `managed-settings.template.json` | Enterprise administrator | Claude.ai Admin Console or `/etc/claude-code/managed-settings.json` | Admin action — out of band, not automated |
-| `governance.template.md` (if present) | Individual developer | Future install tooling (not yet implemented) | Not currently consumed by any automated step |
-**Managed settings precedence:** `managed-settings.template.json` contains wildcard deny rules that cannot be overridden by user or project settings. It also sets `disableBypassPermissionsMode: "disable"` to prevent `--dangerously-skip-permissions`. Deploy this only when you want organization-wide enforcement.
-**No CLAUDE.md generated:** Orchestrator identity is no longer generated from a template. It lives in `agents/gaia-orchestrator.md` and is activated via `settings.json: { "agent": "gaia-orchestrator" }`. Surface routing is injected by the `UserPromptSubmit` hook, not by a template.
-**Template naming:** Files intended for interpolation use the `.template.<ext>` suffix (e.g., `governance.template.md`, `managed-settings.template.json`). Files without that suffix should not be here.
-## Ver también
-- [`bin/cli/install.py`](../bin/cli/install.py) — `gaia install` (postinstall) bootstraps the DB and `.claude/` structure; templates are not currently read by any automated step
-- [`bin/cli/update.py`](../bin/cli/update.py) — `gaia update` updates settings.local.json (merges, does not use templates here)
-- [`agents/gaia-orchestrator.md`](../agents/gaia-orchestrator.md) — orchestrator identity (replaces old CLAUDE.md template path)
-- [`build/gaia-ops.manifest.json`](../build/gaia-ops.manifest.json) — plugin-level permission defaults (distinct from managed-settings)

package/templates/managed-settings.template.json DELETED Viewed

@@ -1,43 +0,0 @@
-{
-  "_comment": [
-    "Managed settings template for enterprise/organization deployment.",
-    "Deploy to: /etc/claude-code/managed-settings.json (Linux/WSL)",
-    "           /Library/Application Support/ClaudeCode/managed-settings.json (macOS)",
-    "           Or via Claude.ai Admin > Claude Code > Managed Settings",
-    "",
-    "These rules have the HIGHEST precedence and CANNOT be overridden by",
-    "user, project, or local settings. They are the ultimate security gate."
-  ],
-  "permissions": {
-    "deny": [
-      "Bash(aws * delete-*:*)",
-      "Bash(aws * terminate-*:*)",
-      "Bash(az * delete:*)",
-      "Bash(gcloud * delete:*)",
-      "Bash(gsutil rb:*)",
-      "Bash(gsutil rm:*)",
-      "Bash(gcloud storage rm:*)",
-      "Bash(kubectl delete:*)",
-      "Bash(kubectl drain:*)",
-      "Bash(terraform destroy:*)",
-      "Bash(terragrunt destroy:*)",
-      "Bash(terragrunt run-all destroy:*)",
-      "Bash(helm uninstall:*)",
-      "Bash(helm delete:*)",
-      "Bash(flux uninstall:*)",
-      "Bash(docker system prune:*)",
-      "Bash(docker volume prune:*)",
-      "Bash(git push --force:*)",
-      "Bash(git push -f:*)",
-      "Bash(git reset --hard:*)",
-      "Bash(gh repo delete:*)",
-      "Bash(glab project delete:*)",
-      "Bash(dd:*)",
-      "Bash(fdisk:*)",
-      "Bash(mkfs:*)",
-      "Bash(mkfs.*:*)"
-    ]
-  },
-  "disableBypassPermissionsMode": "disable",
-  "allowManagedHooksOnly": false
-}

package/tools/agentic-loop/decide-status.py DELETED Viewed

@@ -1,210 +0,0 @@
-#!/usr/bin/env python3
-"""
-decide-status.py
-Mechanically decide what to do based on numbers alone.  No LLM judgment.
-Usage:
-    python3 decide-status.py \
-        --current 94.5 \
-        --best 92.0 \
-        --threshold 98 \
-        --direction higher \
-        --consecutive-discards 2 \
-        --pivot-count 1
-Output JSON:
-    {
-      "decision": "keep",
-      "reason": "Metric improved from 92.0 to 94.5",
-      "improved": true,
-      "gap_remaining": 3.5
-    }
-Decision precedence (evaluated top-to-bottom, first match wins):
-  1. pivot_count >= 3                          → stop
-  2. consecutive_discards >= 5                 → pivot   (also a discard)
-  3. consecutive_discards >= 3                 → refine  (also a discard)
-  4. current meets or passes threshold         → threshold_reached
-  5. current improved vs best (per direction)  → keep
-  6. current same or worse                     → discard
-Exit codes:
-  0  success (decision emitted as JSON)
-  1  invalid input
-"""
-import argparse
-import json
-import sys
-Decision = str  # type alias for readability
-def _is_improved(current: float, best: float, direction: str) -> bool:
-    """Return True if *current* is strictly better than *best* per direction."""
-    if direction == "higher":
-        return current > best
-    return current < best  # lower is better
-def _threshold_reached(current: float, threshold: float, direction: str) -> bool:
-    """Return True if *current* has met or surpassed *threshold*."""
-    if direction == "higher":
-        return current >= threshold
-    return current <= threshold
-def _gap_remaining(current: float, threshold: float, direction: str) -> float:
-    """Absolute gap between current value and threshold."""
-    if direction == "higher":
-        return max(0.0, threshold - current)
-    return max(0.0, current - threshold)
-def decide(
-    current: float,
-    best: float,
-    threshold: float,
-    direction: str,
-    consecutive_discards: int,
-    pivot_count: int,
-) -> dict:
-    """Pure function: return decision dict from numeric inputs."""
-    gap = _gap_remaining(current, threshold, direction)
-    improved = _is_improved(current, best, direction)
-    # --- Precedence 1: hard stop on too many pivots ---
-    if pivot_count >= 3:
-        return {
-            "decision": "stop",
-            "reason": f"pivot_count={pivot_count} has reached the maximum of 3; halting loop",
-            "improved": improved,
-            "gap_remaining": gap,
-        }
-    # --- Precedence 2 & 3: discard streak escalations ---
-    # Evaluated before threshold/keep so an ongoing failing streak is flagged
-    # even if the current run happens to reach the threshold.
-    if consecutive_discards >= 5:
-        return {
-            "decision": "pivot",
-            "reason": (
-                f"consecutive_discards={consecutive_discards} >= 5; "
-                "strategy is not working, force a pivot"
-            ),
-            "improved": improved,
-            "gap_remaining": gap,
-        }
-    if consecutive_discards >= 3:
-        return {
-            "decision": "refine",
-            "reason": (
-                f"consecutive_discards={consecutive_discards} >= 3; "
-                "current approach needs refinement before continuing"
-            ),
-            "improved": improved,
-            "gap_remaining": gap,
-        }
-    # --- Precedence 4: threshold reached ---
-    if _threshold_reached(current, threshold, direction):
-        return {
-            "decision": "threshold_reached",
-            "reason": (
-                f"current={current} {'≥' if direction == 'higher' else '≤'} "
-                f"threshold={threshold}; goal achieved"
-            ),
-            "improved": improved,
-            "gap_remaining": 0.0,
-        }
-    # --- Precedence 5 & 6: standard keep/discard ---
-    if improved:
-        return {
-            "decision": "keep",
-            "reason": f"Metric improved from {best} to {current}",
-            "improved": True,
-            "gap_remaining": gap,
-        }
-    return {
-        "decision": "discard",
-        "reason": f"Metric did not improve (current={current}, best={best})",
-        "improved": False,
-        "gap_remaining": gap,
-    }
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Compute the next agentic-loop decision from metric numbers only.",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Decisions:
-  keep               current improved vs best
-  discard            current same or worse
-  refine             3+ consecutive discards (improvement needed in approach)
-  pivot              5+ consecutive discards (strategy change required)
-  stop               3+ pivots already attempted
-  threshold_reached  current meets or surpasses the goal threshold
-Direction values:
-  higher   larger numbers are better  (e.g. accuracy, passing tests)
-  lower    smaller numbers are better (e.g. error rate, latency ms)
-        """,
-    )
-    parser.add_argument("--current", required=True, type=float, help="Metric value for the current run")
-    parser.add_argument("--best", required=True, type=float, help="Best metric seen so far (from state.json)")
-    parser.add_argument("--threshold", required=True, type=float, help="Target threshold to reach")
-    parser.add_argument(
-        "--direction",
-        required=True,
-        choices=["higher", "lower"],
-        help="Whether higher or lower values are better",
-    )
-    parser.add_argument(
-        "--consecutive-discards",
-        required=True,
-        type=int,
-        metavar="N",
-        help="Number of consecutive discard outcomes so far (from state.json)",
-    )
-    parser.add_argument(
-        "--pivot-count",
-        required=True,
-        type=int,
-        metavar="N",
-        help="Number of pivots executed so far (from state.json)",
-    )
-    args = parser.parse_args()
-    # --- Input validation ---
-    errors = []
-    if args.consecutive_discards < 0:
-        errors.append("--consecutive-discards must be >= 0")
-    if args.pivot_count < 0:
-        errors.append("--pivot-count must be >= 0")
-    if errors:
-        for err in errors:
-            print(f"error: {err}", file=sys.stderr)
-        sys.exit(1)
-    result = decide(
-        current=args.current,
-        best=args.best,
-        threshold=args.threshold,
-        direction=args.direction,
-        consecutive_discards=args.consecutive_discards,
-        pivot_count=args.pivot_count,
-    )
-    print(json.dumps(result, indent=2))
-if __name__ == "__main__":
-    main()

package/tools/agentic-loop/parse-metric.py DELETED Viewed

@@ -1,106 +0,0 @@
-#!/usr/bin/env python3
-"""
-parse-metric.py
-Read stdout from eval_command and extract METRIC lines.
-Usage:
-    echo "output" | python3 parse-metric.py --metric accuracy
-    python3 parse-metric.py --metric accuracy --file /tmp/eval-output.txt
-Input lines must match: METRIC {name}={number}
-Output: JSON to stdout with metric name, numeric value, and raw line.
-"""
-import argparse
-import json
-import re
-import sys
-from typing import Optional
-METRIC_PATTERN = re.compile(r"^METRIC\s+(\w+)=([\d.]+)\s*$")
-def parse_lines(lines: list[str]) -> list[dict]:
-    """Extract all METRIC entries from a sequence of lines."""
-    results = []
-    for line in lines:
-        stripped = line.rstrip("\n")
-        match = METRIC_PATTERN.match(stripped)
-        if match:
-            name = match.group(1)
-            raw_value = match.group(2)
-            # Preserve int vs float from the source text.
-            value: int | float
-            if "." in raw_value:
-                value = float(raw_value)
-            else:
-                value = int(raw_value)
-            results.append(
-                {
-                    "metric": name,
-                    "value": value,
-                    "raw_line": stripped,
-                }
-            )
-    return results
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Extract METRIC lines from eval_command output.",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Examples:
-  echo "METRIC accuracy=94.5" | python3 parse-metric.py --metric accuracy
-  python3 parse-metric.py --metric passing_tests --file /tmp/out.txt
-  python3 parse-metric.py --file /tmp/out.txt          # returns all metrics
-        """,
-    )
-    parser.add_argument(
-        "--metric",
-        metavar="NAME",
-        help="Return only this named metric (case-sensitive). Exits 1 if not found.",
-    )
-    parser.add_argument(
-        "--file",
-        metavar="PATH",
-        help="Read from file instead of stdin.",
-    )
-    args = parser.parse_args()
-    # --- Read input ---
-    try:
-        if args.file:
-            with open(args.file, "r") as fh:
-                lines = fh.readlines()
-        else:
-            lines = sys.stdin.readlines()
-    except OSError as exc:
-        print(f"error: cannot read input: {exc}", file=sys.stderr)
-        sys.exit(1)
-    # --- Parse ---
-    all_metrics = parse_lines(lines)
-    if args.metric:
-        # Filter to the requested metric name.
-        matches = [m for m in all_metrics if m["metric"] == args.metric]
-        if not matches:
-            print(
-                f"error: metric '{args.metric}' not found in input",
-                file=sys.stderr,
-            )
-            sys.exit(1)
-        # Return the last occurrence if there are duplicates.
-        result = matches[-1]
-    else:
-        # Return all metrics as a list when no --metric filter is given.
-        result = all_metrics  # type: ignore[assignment]
-    print(json.dumps(result, indent=2))
-if __name__ == "__main__":
-    main()