@jaguilar87/gaia 5.0.4 → 5.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/.claude-plugin/marketplace.json +2 -2
  2. package/.claude-plugin/plugin.json +1 -1
  3. package/CHANGELOG.md +56 -0
  4. package/INSTALL.md +0 -2
  5. package/README.md +1 -6
  6. package/bin/README.md +0 -1
  7. package/bin/cli/_install_helpers.py +1 -1
  8. package/bin/cli/cleanup.py +0 -1
  9. package/bin/cli/doctor.py +1 -1
  10. package/bin/cli/memory.py +2 -0
  11. package/bin/cli/update.py +1 -1
  12. package/bin/pre-publish-validate.js +48 -5
  13. package/config/README.md +22 -44
  14. package/config/surface-routing.json +0 -1
  15. package/dist/gaia-ops/.claude-plugin/plugin.json +1 -1
  16. package/dist/gaia-ops/config/README.md +22 -44
  17. package/dist/gaia-ops/config/surface-routing.json +0 -1
  18. package/dist/gaia-ops/hooks/modules/agents/handoff_persister.py +2 -0
  19. package/dist/gaia-ops/hooks/modules/security/approval_grants.py +2 -0
  20. package/dist/gaia-ops/hooks/modules/tools/bash_validator.py +2 -0
  21. package/dist/gaia-ops/hooks/modules/validation/commit_validator.py +90 -55
  22. package/dist/gaia-ops/skills/README.md +1 -1
  23. package/dist/gaia-ops/skills/gaia-patterns/SKILL.md +1 -1
  24. package/dist/gaia-ops/skills/gaia-patterns/reference.md +0 -1
  25. package/dist/gaia-ops/skills/gaia-release/SKILL.md +60 -24
  26. package/dist/gaia-ops/skills/gaia-release/reference.md +35 -11
  27. package/dist/gaia-ops/skills/git-conventions/SKILL.md +6 -2
  28. package/dist/gaia-ops/skills/orchestrator-present-approval/SKILL.md +10 -2
  29. package/dist/gaia-ops/skills/readme-writing/SKILL.md +1 -1
  30. package/dist/gaia-ops/skills/readme-writing/reference.md +0 -1
  31. package/dist/gaia-ops/tools/scan/ui.py +20 -4
  32. package/dist/gaia-ops/tools/scan/verify.py +3 -3
  33. package/dist/gaia-ops/tools/validation/README.md +15 -24
  34. package/dist/gaia-security/.claude-plugin/plugin.json +1 -1
  35. package/dist/gaia-security/hooks/modules/agents/handoff_persister.py +2 -0
  36. package/dist/gaia-security/hooks/modules/security/approval_grants.py +2 -0
  37. package/dist/gaia-security/hooks/modules/tools/bash_validator.py +2 -0
  38. package/dist/gaia-security/hooks/modules/validation/commit_validator.py +90 -55
  39. package/hooks/modules/agents/handoff_persister.py +2 -0
  40. package/hooks/modules/security/approval_grants.py +2 -0
  41. package/hooks/modules/tools/bash_validator.py +2 -0
  42. package/hooks/modules/validation/commit_validator.py +90 -55
  43. package/index.js +2 -12
  44. package/package.json +4 -6
  45. package/pyproject.toml +3 -3
  46. package/scripts/bootstrap_database.sh +88 -439
  47. package/scripts/check_schema_drift.py +208 -0
  48. package/scripts/migrations/README.md +78 -28
  49. package/scripts/migrations/schema.checksum +8 -0
  50. package/scripts/release-prepare.mjs +199 -0
  51. package/skills/README.md +1 -1
  52. package/skills/gaia-patterns/SKILL.md +1 -1
  53. package/skills/gaia-patterns/reference.md +0 -1
  54. package/skills/gaia-release/SKILL.md +60 -24
  55. package/skills/gaia-release/reference.md +35 -11
  56. package/skills/git-conventions/SKILL.md +6 -2
  57. package/skills/orchestrator-present-approval/SKILL.md +10 -2
  58. package/skills/readme-writing/SKILL.md +1 -1
  59. package/skills/readme-writing/reference.md +0 -1
  60. package/tools/scan/ui.py +20 -4
  61. package/tools/scan/verify.py +3 -3
  62. package/tools/validation/README.md +15 -24
  63. package/commands/README.md +0 -64
  64. package/commands/gaia.md +0 -37
  65. package/commands/scan-project.md +0 -74
  66. package/config/crons-schema.md +0 -81
  67. package/config/git_standards.json +0 -72
  68. package/dist/gaia-ops/commands/gaia.md +0 -37
  69. package/dist/gaia-ops/config/crons-schema.md +0 -81
  70. package/dist/gaia-ops/config/git_standards.json +0 -72
  71. package/dist/gaia-ops/tools/agentic-loop/decide-status.py +0 -210
  72. package/dist/gaia-ops/tools/agentic-loop/parse-metric.py +0 -106
  73. package/dist/gaia-ops/tools/agentic-loop/record-iteration.py +0 -223
  74. package/git-hooks/commit-msg +0 -41
  75. package/scripts/migrations/v10_to_v11.sql +0 -170
  76. package/scripts/migrations/v10_to_v11_fresh.sql +0 -18
  77. package/scripts/migrations/v11_to_v12.sql +0 -195
  78. package/scripts/migrations/v11_to_v12_fresh.sql +0 -19
  79. package/scripts/migrations/v12_to_v13.sql +0 -48
  80. package/scripts/migrations/v12_to_v13_fresh.sql +0 -17
  81. package/scripts/migrations/v13_to_v14.sql +0 -44
  82. package/scripts/migrations/v13_to_v14_fresh.sql +0 -17
  83. package/scripts/migrations/v14_to_v15.sql +0 -71
  84. package/scripts/migrations/v14_to_v15_fresh.sql +0 -19
  85. package/scripts/migrations/v15_to_v16.sql +0 -57
  86. package/scripts/migrations/v15_to_v16_fresh.sql +0 -18
  87. package/scripts/migrations/v16_to_v17.sql +0 -51
  88. package/scripts/migrations/v16_to_v17_fresh.sql +0 -18
  89. package/scripts/migrations/v17_to_v18.sql +0 -66
  90. package/scripts/migrations/v17_to_v18_fresh.sql +0 -24
  91. package/scripts/migrations/v1_to_v2.sql +0 -97
  92. package/scripts/migrations/v2_to_v3.sql +0 -68
  93. package/scripts/migrations/v2_to_v3_merge.sql +0 -69
  94. package/scripts/migrations/v3_to_v4.sql +0 -67
  95. package/scripts/migrations/v3_to_v4_fresh.sql +0 -20
  96. package/scripts/migrations/v4_to_v5.sql +0 -55
  97. package/scripts/migrations/v4_to_v5_fresh.sql +0 -20
  98. package/scripts/migrations/v5_to_v6.sql +0 -48
  99. package/scripts/migrations/v5_to_v6_fresh.sql +0 -17
  100. package/scripts/migrations/v6_to_v7.sql +0 -26
  101. package/scripts/migrations/v6_to_v7_fresh.sql +0 -13
  102. package/scripts/migrations/v7_to_v8.sql +0 -44
  103. package/scripts/migrations/v7_to_v8_fresh.sql +0 -14
  104. package/scripts/migrations/v8_to_v9.sql +0 -87
  105. package/scripts/migrations/v8_to_v9_fresh.sql +0 -15
  106. package/scripts/migrations/v9_to_v10.sql +0 -109
  107. package/scripts/migrations/v9_to_v10_episodes_workspace.sql +0 -109
  108. package/scripts/migrations/v9_to_v10_fresh.sql +0 -18
  109. package/templates/README.md +0 -70
  110. package/templates/managed-settings.template.json +0 -43
  111. package/tools/agentic-loop/decide-status.py +0 -210
  112. package/tools/agentic-loop/parse-metric.py +0 -106
  113. package/tools/agentic-loop/record-iteration.py +0 -223
@@ -1,109 +0,0 @@
1
- -- Migration v9 -> v10 (episodic-workflow-to-db: episodes workspace canonical)
2
- --
3
- -- Background
4
- -- ----------
5
- -- v9 schema has the episodes table with these columns:
6
- -- episode_id, workspace, timestamp, session_id, task_id, agent,
7
- -- type, title, prompt, enriched_prompt, wf_prompt, clarifications,
8
- -- keywords, tags, commands_executed, context_metrics, relevance_score,
9
- -- outcome, duration_seconds, exit_code, plan_status, output_length,
10
- -- output_tokens_approx
11
- --
12
- -- v10 adds:
13
- -- episodes.tier -- security tier (T0/T1/T2/T3), promoted from context_metrics blob
14
- -- episode_anomalies -- structured anomaly records extracted from context_metrics blob
15
- --
16
- -- Design decisions
17
- -- ----------------
18
- -- D1: tier -> top-level column (not blob)
19
- -- Rationale: tier is a single short TEXT value (T0/T1/T2/T3) with a clear
20
- -- compliance query pattern: "COUNT(*) WHERE tier='T3' AND outcome='partial'".
21
- -- Keeping it in the context_metrics JSON blob would require a full-table
22
- -- JSON parse for every compliance query. With 10,000+ rows in workspace 'me'
23
- -- alone, this is a significant performance cost. A column + index reduces
24
- -- that to a B-tree lookup. The schema cost is one ALTER TABLE + one index.
25
- -- Alternative considered: keep in blob. Rejected because the query pattern
26
- -- is both real (used by context_injector.py anomaly surfacing) and frequent
27
- -- (every compliance dashboard query). No reason to pay JSON parsing overhead
28
- -- when the data is a four-value enum.
29
- --
30
- -- D2: episode_anomalies -> separate table (not blob)
31
- -- Rationale: anomalies have a stable schema {type, severity, message} per
32
- -- object. The query "all anomalies of type X in the last 7 days" is a real
33
- -- operational need -- context_injector.py currently reads anomalies.jsonl
34
- -- to surface critical anomalies in orchestrator context. That reader must
35
- -- be ported post-migration. A separate table with a type index enables
36
- -- `SELECT * FROM episode_anomalies JOIN episodes ON ... WHERE type=? AND
37
- -- episodes.timestamp > ?` without JSON parsing any rows. With anomalies
38
- -- present in a large fraction of episodes (4 anomalies in the 12 observed
39
- -- sessions), the cardinality justifies a separate table. The anomalies[]
40
- -- array is still preserved inside context_metrics for backward compatibility
41
- -- with any reader that parses the full blob -- the table is an additional
42
- -- queryable index, not a replacement.
43
- -- Alternative considered: keep in context_metrics blob. Rejected because
44
- -- the type-filtered cross-episode query has no efficient implementation
45
- -- without the table. GROUP BY type reports are otherwise O(N) full scans
46
- -- with JSON parsing per row.
47
- --
48
- -- Column notes
49
- -- ------------
50
- -- episodes.workspace: already present in the v9 schema; NO ALTER TABLE needed.
51
- -- Live DB confirmed: workspace column exists and has data ('me', 'bildwiz',
52
- -- 'nfi'). The default 'me' for legacy rows is unnecessary -- workspace is
53
- -- already populated. This step is a no-op in terms of DDL.
54
- --
55
- -- Atomicity: bootstrap_database.sh wraps this script in BEGIN/COMMIT.
56
- -- A failure mid-flight rolls back to v9 state; the ledger row is NOT
57
- -- inserted, so the next bootstrap retry sees the same pending migration.
58
- -- Closes AC-2 of brief episodic-workflow-to-db (brief_id=72).
59
-
60
- -- Step 1: Add tier column to episodes.
61
- -- No CHECK constraint is attached to the new column. SQLite does accept a CHECK
62
- -- on ALTER TABLE ADD COLUMN (only NOT NULL requires a non-NULL DEFAULT); here the
63
- -- T0/T1/T2/T3 validation is left to the application layer (episodic.py / workflow_recorder.py writers).
64
- ALTER TABLE episodes ADD COLUMN tier TEXT;
65
-
66
- -- Step 2: Index tier for compliance queries.
67
- CREATE INDEX IF NOT EXISTS idx_episodes_tier ON episodes(tier);
68
-
69
- -- Step 3: Compound index for the primary compliance query pattern:
70
- -- "T3 operations with non-COMPLETE outcomes in time window".
71
- CREATE INDEX IF NOT EXISTS idx_episodes_tier_outcome ON episodes(tier, outcome);
72
-
73
- -- Step 4: Create episode_anomalies table.
74
- -- Each row is one anomaly record extracted from an episode's context_metrics
75
- -- blob. The payload column holds the full original JSON object for forward
76
- -- compatibility (additional keys in future anomaly schemas are preserved).
77
- CREATE TABLE IF NOT EXISTS episode_anomalies (
78
- id INTEGER PRIMARY KEY AUTOINCREMENT,
79
- episode_id TEXT NOT NULL, -- FK -> episodes.episode_id
80
- workspace TEXT NOT NULL, -- denormalized for partition queries without JOIN
81
- timestamp TEXT NOT NULL, -- denormalized from parent episode for time-range queries
82
- type TEXT NOT NULL, -- e.g. "investigation_skip", "no_tool_use"
83
- severity TEXT, -- e.g. "warning", "error", "info"
84
- message TEXT, -- human-readable description
85
- payload TEXT, -- full JSON object (forward-compat for extra keys)
86
- FOREIGN KEY (episode_id) REFERENCES episodes(episode_id) ON DELETE CASCADE
87
- );
88
-
89
- -- Step 5: Indexes on episode_anomalies.
90
- -- Primary query patterns:
91
- -- (a) All anomalies of type X: WHERE type = ?
92
- -- (b) Cross-episode anomaly report in time window: WHERE type = ? AND timestamp > ?
93
- -- (c) Anomalies for a specific episode: WHERE episode_id = ?
94
- -- (d) Workspace-scoped anomaly dashboard: WHERE workspace = ? AND timestamp > ?
95
- CREATE INDEX IF NOT EXISTS idx_episode_anomalies_type ON episode_anomalies(type);
96
- CREATE INDEX IF NOT EXISTS idx_episode_anomalies_workspace ON episode_anomalies(workspace, timestamp DESC);
97
- CREATE INDEX IF NOT EXISTS idx_episode_anomalies_episode ON episode_anomalies(episode_id);
98
-
99
- -- Step 6: Bump schema_version to 10.
100
- INSERT OR IGNORE INTO schema_version (version, applied_at, description)
101
- VALUES (10, strftime('%Y-%m-%dT%H:%M:%SZ', 'now'),
102
- 'episodes.tier column + idx + episode_anomalies table (brief episodic-workflow-to-db AC-2)');
103
-
104
- -- Verification queries (run after applying):
105
- -- SELECT MAX(version) FROM schema_version; -- expect: 10
106
- -- PRAGMA table_info(episodes); -- expect: tier column present (after output_tokens_approx)
107
- -- SELECT * FROM sqlite_master WHERE type='index' AND name LIKE 'idx_episodes_tier%'; -- expect: 2 rows
108
- -- PRAGMA table_info(episode_anomalies); -- expect: 8 columns
109
- -- SELECT COUNT(*) FROM episode_anomalies; -- expect: 0 (populated by T3 migration task)
@@ -1,18 +0,0 @@
1
- -- Migration v9 -> v10 fresh-install variant (episodic-workflow-to-db AC-3)
2
- --
3
- -- Used by bootstrap_database.sh when the live DB already has the
4
- -- episode_anomalies table (i.e. schema.sql ran first on a clean install and
5
- -- created the tables in v10 target state, including episodes.tier column).
6
- --
7
- -- On a fresh install, schema.sql creates the episodes table WITH the tier
8
- -- column, so ALTER TABLE is not needed. However, the tier-dependent indexes
9
- -- (idx_episodes_tier, idx_episodes_tier_outcome) cannot be declared in
10
- -- schema.sql because schema.sql runs before migrations on existing DBs where
11
- -- tier does not yet exist. This fresh variant creates those indexes safely,
12
- -- since on a fresh install tier is guaranteed to exist.
13
- --
14
- -- Atomicity: bootstrap_database.sh wraps this script in BEGIN/COMMIT.
15
-
16
- -- Create tier indexes that schema.sql cannot safely declare for existing DBs.
17
- CREATE INDEX IF NOT EXISTS idx_episodes_tier ON episodes(tier);
18
- CREATE INDEX IF NOT EXISTS idx_episodes_tier_outcome ON episodes(tier, outcome);
@@ -1,70 +0,0 @@
1
- # Templates
2
-
3
- Templates are reference files for generating or deploying configuration. They are not consumed by the Claude Code runtime, and no current automated step (`gaia install`, `gaia scan`) reads them — today they are used by organization administrators deploying managed policies. Think of this directory as the catalog of files that exist as skeletons, ready to be deployed verbatim as a policy or filled in by future install tooling.
4
-
5
- There are two audiences for this directory, and they do not overlap. The first is the individual developer installing Gaia into their project — `gaia install` (run by the npm postinstall hook) bootstraps the DB and `.claude/` structure. The second is the enterprise administrator — they take `managed-settings.template.json` and deploy it as a managed policy via the Claude.ai Admin Console or by placing it at `/etc/claude-code/managed-settings.json` on managed workstations. `gaia scan` (separate from install) writes project context to `~/.gaia/gaia.db` only; it does not read or interpolate templates.
6
-
7
- Keeping these files here, rather than embedding them in `bin/cli/scan.py`, means policies and skeletons can be audited and customized without touching executable code. An admin can diff `managed-settings.template.json` against a previous version. A developer can read `governance.template.md` (when present) before any future install step interpolates it.
8
-
9
- ## Cuándo se activa
10
-
11
- This component does not activate in the runtime Claude Code pipeline. Templates are consumed only by install-time tooling and by administrators deploying policies out-of-band.
12
-
13
- **When each template is consumed:**
14
-
15
- ```
16
- Individual developer runs: npm install @jaguilar87/gaia
17
- |
18
- postinstall -> gaia install --postinstall
19
- |
20
- Install bootstraps ~/.gaia/gaia.db, creates .claude/ symlinks, merges settings.
21
- Templates in this directory are NOT read during install or scan.
22
-
23
- Developer separately runs: gaia scan
24
- |
25
- bin/cli/scan.py detects project stack and writes to ~/.gaia/gaia.db.
26
- gaia scan does NOT read templates/ or generate any files.
27
- ```
28
-
29
- ```
30
- Enterprise admin deploys managed policy
31
- |
32
- Admin copies templates/managed-settings.template.json
33
- |
34
- Deploys to Claude.ai Admin Console
35
- OR writes to /etc/claude-code/managed-settings.json (Linux managed workstations)
36
- |
37
- Managed settings take highest precedence — cannot be overridden by user or project
38
- ```
39
-
40
- ## Qué hay aquí
41
-
42
- ```
43
- templates/
44
- ├── managed-settings.template.json # Enterprise reference — deployed by admin, not gaia-scan
45
- └── README.md
46
- ```
47
-
48
- Currently only `managed-settings.template.json` ships in this directory. A `governance.template.md` has been referenced in prior docs but is not present in source and is not consumed by any current automated step (`gaia scan` writes to DB only; `gaia install` does not interpolate templates). If a future installer step consumes templates, this note should be updated.
49
-
50
- ## Convenciones
51
-
52
- **Audience per file:**
53
-
54
- | File | Audience | Consumed by | Trigger |
55
- |------|----------|-------------|---------|
56
- | `managed-settings.template.json` | Enterprise administrator | Claude.ai Admin Console or `/etc/claude-code/managed-settings.json` | Admin action — out of band, not automated |
57
- | `governance.template.md` (if present) | Individual developer | Future install tooling (not yet implemented) | Not currently consumed by any automated step |
58
-
59
- **Managed settings precedence:** `managed-settings.template.json` contains wildcard deny rules that cannot be overridden by user or project settings. It also sets `"disableBypassPermissionsMode": "disable"` to prevent `--dangerously-skip-permissions`. Deploy this only when you want organization-wide enforcement.
60
-
61
- **No CLAUDE.md generated:** Orchestrator identity is no longer generated from a template. It lives in `agents/gaia-orchestrator.md` and is activated via `settings.json: { "agent": "gaia-orchestrator" }`. Surface routing is injected by the `UserPromptSubmit` hook, not by a template.
62
-
63
- **Template naming:** Files intended for interpolation use the `.template.<ext>` suffix (e.g., `governance.template.md`, `managed-settings.template.json`). Files without that suffix should not be here.
64
-
65
- ## Ver también
66
-
67
- - [`bin/cli/install.py`](../bin/cli/install.py) — `gaia install` (postinstall) bootstraps the DB and `.claude/` structure; templates are not currently read by any automated step
68
- - [`bin/cli/update.py`](../bin/cli/update.py) — `gaia update` updates settings.local.json (merges, does not use templates here)
69
- - [`agents/gaia-orchestrator.md`](../agents/gaia-orchestrator.md) — orchestrator identity (replaces old CLAUDE.md template path)
70
- - [`build/gaia-ops.manifest.json`](../build/gaia-ops.manifest.json) — plugin-level permission defaults (distinct from managed-settings)
@@ -1,43 +0,0 @@
1
- {
2
- "_comment": [
3
- "Managed settings template for enterprise/organization deployment.",
4
- "Deploy to: /etc/claude-code/managed-settings.json (Linux/WSL)",
5
- " /Library/Application Support/ClaudeCode/managed-settings.json (macOS)",
6
- " Or via Claude.ai Admin > Claude Code > Managed Settings",
7
- "",
8
- "These rules have the HIGHEST precedence and CANNOT be overridden by",
9
- "user, project, or local settings. They are the ultimate security gate."
10
- ],
11
- "permissions": {
12
- "deny": [
13
- "Bash(aws * delete-*:*)",
14
- "Bash(aws * terminate-*:*)",
15
- "Bash(az * delete:*)",
16
- "Bash(gcloud * delete:*)",
17
- "Bash(gsutil rb:*)",
18
- "Bash(gsutil rm:*)",
19
- "Bash(gcloud storage rm:*)",
20
- "Bash(kubectl delete:*)",
21
- "Bash(kubectl drain:*)",
22
- "Bash(terraform destroy:*)",
23
- "Bash(terragrunt destroy:*)",
24
- "Bash(terragrunt run-all destroy:*)",
25
- "Bash(helm uninstall:*)",
26
- "Bash(helm delete:*)",
27
- "Bash(flux uninstall:*)",
28
- "Bash(docker system prune:*)",
29
- "Bash(docker volume prune:*)",
30
- "Bash(git push --force:*)",
31
- "Bash(git push -f:*)",
32
- "Bash(git reset --hard:*)",
33
- "Bash(gh repo delete:*)",
34
- "Bash(glab project delete:*)",
35
- "Bash(dd:*)",
36
- "Bash(fdisk:*)",
37
- "Bash(mkfs:*)",
38
- "Bash(mkfs.*:*)"
39
- ]
40
- },
41
- "disableBypassPermissionsMode": "disable",
42
- "allowManagedHooksOnly": false
43
- }
@@ -1,210 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- decide-status.py
4
-
5
- Mechanically decide what to do based on numbers alone. No LLM judgment.
6
-
7
- Usage:
8
- python3 decide-status.py \
9
- --current 94.5 \
10
- --best 92.0 \
11
- --threshold 98 \
12
- --direction higher \
13
- --consecutive-discards 2 \
14
- --pivot-count 1
15
-
16
- Output JSON:
17
- {
18
- "decision": "keep",
19
- "reason": "Metric improved from 92.0 to 94.5",
20
- "improved": true,
21
- "gap_remaining": 3.5
22
- }
23
-
24
- Decision precedence (evaluated top-to-bottom, first match wins):
25
- 1. pivot_count >= 3 → stop
26
- 2. consecutive_discards >= 5 → pivot (also a discard)
27
- 3. consecutive_discards >= 3 → refine (also a discard)
28
- 4. current meets or passes threshold → threshold_reached
29
- 5. current improved vs best (per direction) → keep
30
- 6. current same or worse → discard
31
-
32
- Exit codes:
33
- 0 success (decision emitted as JSON)
34
- 1 invalid input
35
- """
36
-
37
- import argparse
38
- import json
39
- import sys
40
-
41
-
42
- Decision = str # type alias for readability
43
-
44
-
45
- def _is_improved(current: float, best: float, direction: str) -> bool:
46
- """Return True if *current* is strictly better than *best* per direction."""
47
- if direction == "higher":
48
- return current > best
49
- return current < best # lower is better
50
-
51
-
52
- def _threshold_reached(current: float, threshold: float, direction: str) -> bool:
53
- """Return True if *current* has met or surpassed *threshold*."""
54
- if direction == "higher":
55
- return current >= threshold
56
- return current <= threshold
57
-
58
-
59
- def _gap_remaining(current: float, threshold: float, direction: str) -> float:
60
- """Absolute gap between current value and threshold."""
61
- if direction == "higher":
62
- return max(0.0, threshold - current)
63
- return max(0.0, current - threshold)
64
-
65
-
66
def decide(
    current: float,
    best: float,
    threshold: float,
    direction: str,
    consecutive_discards: int,
    pivot_count: int,
) -> dict:
    """Map the loop's numeric state to its next action.

    Rules are checked in a fixed order and the first match wins:

    1. ``pivot_count >= 3``          -> ``stop``
    2. ``consecutive_discards >= 5`` -> ``pivot``
    3. ``consecutive_discards >= 3`` -> ``refine``
    4. threshold met or passed       -> ``threshold_reached``
    5. strictly better than *best*   -> ``keep``
    6. anything else                 -> ``discard``

    Returns:
        A dict with the keys ``decision``, ``reason``, ``improved`` and
        ``gap_remaining`` (in that order).
    """
    distance = _gap_remaining(current, threshold, direction)
    is_better = _is_improved(current, best, direction)

    def verdict(decision, reason, improved=is_better, gap=distance):
        # Single place that fixes the shape and key order of the result.
        return {
            "decision": decision,
            "reason": reason,
            "improved": improved,
            "gap_remaining": gap,
        }

    # Rule 1: too many pivots ends the loop outright.
    if pivot_count >= 3:
        return verdict(
            "stop",
            f"pivot_count={pivot_count} has reached the maximum of 3; halting loop",
        )

    # Rules 2 and 3: a discard streak is escalated before the threshold and
    # keep checks, so the streak is reported even when this run hit the goal.
    if consecutive_discards >= 5:
        return verdict(
            "pivot",
            f"consecutive_discards={consecutive_discards} >= 5; "
            "strategy is not working, force a pivot",
        )
    if consecutive_discards >= 3:
        return verdict(
            "refine",
            f"consecutive_discards={consecutive_discards} >= 3; "
            "current approach needs refinement before continuing",
        )

    # Rule 4: goal reached; the remaining gap is reported as exactly 0.0.
    if _threshold_reached(current, threshold, direction):
        comparator = "≥" if direction == "higher" else "≤"
        return verdict(
            "threshold_reached",
            f"current={current} {comparator} threshold={threshold}; goal achieved",
            gap=0.0,
        )

    # Rules 5 and 6: ordinary keep / discard.
    if is_better:
        return verdict(
            "keep",
            f"Metric improved from {best} to {current}",
            improved=True,
        )
    return verdict(
        "discard",
        f"Metric did not improve (current={current}, best={best})",
        improved=False,
    )
140
-
141
-
142
def main() -> None:
    """CLI entry point: parse the numeric flags and print the decision as JSON.

    All six flags are required. Negative ``--consecutive-discards`` or
    ``--pivot-count`` values are reported on stderr and the process exits
    with status 1; otherwise the result of ``decide`` is printed to stdout
    as indented JSON.
    """
    help_epilog = """
Decisions:
keep current improved vs best
discard current same or worse
refine 3+ consecutive discards (improvement needed in approach)
pivot 5+ consecutive discards (strategy change required)
stop 3+ pivots already attempted
threshold_reached current meets or surpasses the goal threshold

Direction values:
higher larger numbers are better (e.g. accuracy, passing tests)
lower smaller numbers are better (e.g. error rate, latency ms)
"""
    parser = argparse.ArgumentParser(
        description="Compute the next agentic-loop decision from metric numbers only.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=help_epilog,
    )

    # The three metric values share the same shape: required floats.
    for flag, description in (
        ("--current", "Metric value for the current run"),
        ("--best", "Best metric seen so far (from state.json)"),
        ("--threshold", "Target threshold to reach"),
    ):
        parser.add_argument(flag, required=True, type=float, help=description)

    parser.add_argument(
        "--direction",
        required=True,
        choices=["higher", "lower"],
        help="Whether higher or lower values are better",
    )

    # The two counters share the same shape: required ints shown as N.
    for flag, description in (
        ("--consecutive-discards", "Number of consecutive discard outcomes so far (from state.json)"),
        ("--pivot-count", "Number of pivots executed so far (from state.json)"),
    ):
        parser.add_argument(flag, required=True, type=int, metavar="N", help=description)

    args = parser.parse_args()

    # Counters cannot be negative; report every violation before exiting.
    problems = [
        f"{flag} must be >= 0"
        for flag, count in (
            ("--consecutive-discards", args.consecutive_discards),
            ("--pivot-count", args.pivot_count),
        )
        if count < 0
    ]
    if problems:
        for problem in problems:
            print(f"error: {problem}", file=sys.stderr)
        sys.exit(1)

    outcome = decide(
        current=args.current,
        best=args.best,
        threshold=args.threshold,
        direction=args.direction,
        consecutive_discards=args.consecutive_discards,
        pivot_count=args.pivot_count,
    )
    print(json.dumps(outcome, indent=2))
207
-
208
-
209
- if __name__ == "__main__":
210
- main()
@@ -1,106 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- parse-metric.py
4
-
5
- Read stdout from eval_command and extract METRIC lines.
6
-
7
- Usage:
8
- echo "output" | python3 parse-metric.py --metric accuracy
9
- python3 parse-metric.py --metric accuracy --file /tmp/eval-output.txt
10
-
11
- Input lines must match: METRIC {name}={number}
12
- Output: JSON to stdout with metric name, numeric value, and raw line.
13
- """
14
-
15
- import argparse
16
- import json
17
- import re
18
- import sys
19
- from typing import Optional
20
-
21
-
22
# One METRIC line: ``METRIC <name>=<number>`` with optional trailing whitespace.
# The number must be a well-formed decimal: optional sign, digits with at most
# one decimal point, optional exponent. The previous ``[\d.]+`` also matched
# strings such as "1.2.3" or ".", which then crashed float(), and it rejected
# negative and scientific-notation values (e.g. "-0.5", "1e-05").
METRIC_PATTERN = re.compile(
    r"^METRIC\s+(\w+)=([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)\s*$"
)


def parse_lines(lines: list[str]) -> list[dict]:
    """Extract every METRIC entry from a sequence of lines.

    Args:
        lines: Raw text lines, with or without a trailing newline.

    Returns:
        One dict per matching line, in input order, with the keys
        ``metric`` (name), ``value`` (``int`` when the source text is a plain
        integer, ``float`` otherwise) and ``raw_line`` (the line without its
        trailing newline). Lines that do not match, including lines whose
        number is malformed, are skipped.
    """
    results = []
    for line in lines:
        stripped = line.rstrip("\n")
        match = METRIC_PATTERN.match(stripped)
        if match is None:
            continue
        name, raw_value = match.groups()
        # Preserve int vs float from the source text; a decimal point or an
        # exponent marks the value as a float.
        value: int | float
        if any(marker in raw_value for marker in ".eE"):
            value = float(raw_value)
        else:
            value = int(raw_value)
        results.append(
            {
                "metric": name,
                "value": value,
                "raw_line": stripped,
            }
        )
    return results
48
-
49
-
50
def main() -> None:
    """CLI entry point: read eval output and print the extracted metric(s) as JSON.

    Input comes from ``--file`` when given, otherwise from stdin. With
    ``--metric NAME`` the last matching entry is printed as a single JSON
    object; without it, every parsed entry is printed as a JSON list.
    Exits with status 1 when the input cannot be read or decoded, or when
    the requested metric is absent.
    """
    parser = argparse.ArgumentParser(
        description="Extract METRIC lines from eval_command output.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
echo "METRIC accuracy=94.5" | python3 parse-metric.py --metric accuracy
python3 parse-metric.py --metric passing_tests --file /tmp/out.txt
python3 parse-metric.py --file /tmp/out.txt # returns all metrics
""",
    )
    parser.add_argument(
        "--metric",
        metavar="NAME",
        help="Return only this named metric (case-sensitive). Exits 1 if not found.",
    )
    parser.add_argument(
        "--file",
        metavar="PATH",
        help="Read from file instead of stdin.",
    )
    args = parser.parse_args()

    # --- Read input ---
    # UnicodeDecodeError is not an OSError; catch it as well so undecodable
    # bytes in the eval output yield the same clean error as an unreadable
    # file instead of a traceback.
    try:
        if args.file:
            with open(args.file, "r") as fh:
                lines = fh.readlines()
        else:
            lines = sys.stdin.readlines()
    except (OSError, UnicodeDecodeError) as exc:
        print(f"error: cannot read input: {exc}", file=sys.stderr)
        sys.exit(1)

    # --- Parse ---
    all_metrics = parse_lines(lines)

    if args.metric:
        # Filter to the requested metric name.
        matches = [m for m in all_metrics if m["metric"] == args.metric]
        if not matches:
            print(
                f"error: metric '{args.metric}' not found in input",
                file=sys.stderr,
            )
            sys.exit(1)
        # Return the last occurrence if there are duplicates.
        result = matches[-1]
    else:
        # Return all metrics as a list when no --metric filter is given.
        result = all_metrics  # type: ignore[assignment]

    print(json.dumps(result, indent=2))
103
-
104
-
105
- if __name__ == "__main__":
106
- main()