@jaguilar87/gaia 5.0.4 → 5.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/.claude-plugin/marketplace.json +2 -2
  2. package/.claude-plugin/plugin.json +1 -1
  3. package/CHANGELOG.md +65 -0
  4. package/INSTALL.md +0 -2
  5. package/README.md +1 -6
  6. package/bin/README.md +0 -1
  7. package/bin/cli/_install_helpers.py +1 -1
  8. package/bin/cli/cleanup.py +0 -1
  9. package/bin/cli/doctor.py +2 -2
  10. package/bin/cli/memory.py +2 -0
  11. package/bin/cli/update.py +1 -1
  12. package/bin/pre-publish-validate.js +48 -5
  13. package/config/README.md +22 -44
  14. package/config/surface-routing.json +0 -1
  15. package/dist/gaia-ops/.claude-plugin/plugin.json +1 -1
  16. package/dist/gaia-ops/config/README.md +22 -44
  17. package/dist/gaia-ops/config/surface-routing.json +0 -1
  18. package/dist/gaia-ops/hooks/modules/agents/handoff_persister.py +2 -0
  19. package/dist/gaia-ops/hooks/modules/security/approval_grants.py +2 -0
  20. package/dist/gaia-ops/hooks/modules/tools/bash_validator.py +2 -0
  21. package/dist/gaia-ops/hooks/modules/validation/commit_validator.py +90 -55
  22. package/dist/gaia-ops/skills/README.md +1 -1
  23. package/dist/gaia-ops/skills/gaia-patterns/SKILL.md +1 -1
  24. package/dist/gaia-ops/skills/gaia-patterns/reference.md +0 -1
  25. package/dist/gaia-ops/skills/gaia-release/SKILL.md +60 -24
  26. package/dist/gaia-ops/skills/gaia-release/reference.md +35 -11
  27. package/dist/gaia-ops/skills/git-conventions/SKILL.md +6 -2
  28. package/dist/gaia-ops/skills/orchestrator-present-approval/SKILL.md +10 -2
  29. package/dist/gaia-ops/skills/readme-writing/SKILL.md +1 -1
  30. package/dist/gaia-ops/skills/readme-writing/reference.md +0 -1
  31. package/dist/gaia-ops/tools/scan/ui.py +20 -4
  32. package/dist/gaia-ops/tools/scan/verify.py +3 -3
  33. package/dist/gaia-ops/tools/validation/README.md +15 -24
  34. package/dist/gaia-security/.claude-plugin/plugin.json +1 -1
  35. package/dist/gaia-security/hooks/modules/agents/handoff_persister.py +2 -0
  36. package/dist/gaia-security/hooks/modules/security/approval_grants.py +2 -0
  37. package/dist/gaia-security/hooks/modules/tools/bash_validator.py +2 -0
  38. package/dist/gaia-security/hooks/modules/validation/commit_validator.py +90 -55
  39. package/hooks/modules/agents/handoff_persister.py +2 -0
  40. package/hooks/modules/security/approval_grants.py +2 -0
  41. package/hooks/modules/tools/bash_validator.py +2 -0
  42. package/hooks/modules/validation/commit_validator.py +90 -55
  43. package/index.js +2 -12
  44. package/package.json +4 -6
  45. package/pyproject.toml +3 -3
  46. package/scripts/bootstrap_database.sh +88 -439
  47. package/scripts/check_schema_drift.py +208 -0
  48. package/scripts/migrations/README.md +78 -28
  49. package/scripts/migrations/schema.checksum +8 -0
  50. package/scripts/release-prepare.mjs +199 -0
  51. package/skills/README.md +1 -1
  52. package/skills/gaia-patterns/SKILL.md +1 -1
  53. package/skills/gaia-patterns/reference.md +0 -1
  54. package/skills/gaia-release/SKILL.md +60 -24
  55. package/skills/gaia-release/reference.md +35 -11
  56. package/skills/git-conventions/SKILL.md +6 -2
  57. package/skills/orchestrator-present-approval/SKILL.md +10 -2
  58. package/skills/readme-writing/SKILL.md +1 -1
  59. package/skills/readme-writing/reference.md +0 -1
  60. package/tools/scan/ui.py +20 -4
  61. package/tools/scan/verify.py +3 -3
  62. package/tools/validation/README.md +15 -24
  63. package/commands/README.md +0 -64
  64. package/commands/gaia.md +0 -37
  65. package/commands/scan-project.md +0 -74
  66. package/config/crons-schema.md +0 -81
  67. package/config/git_standards.json +0 -72
  68. package/dist/gaia-ops/commands/gaia.md +0 -37
  69. package/dist/gaia-ops/config/crons-schema.md +0 -81
  70. package/dist/gaia-ops/config/git_standards.json +0 -72
  71. package/dist/gaia-ops/tools/agentic-loop/decide-status.py +0 -210
  72. package/dist/gaia-ops/tools/agentic-loop/parse-metric.py +0 -106
  73. package/dist/gaia-ops/tools/agentic-loop/record-iteration.py +0 -223
  74. package/git-hooks/commit-msg +0 -41
  75. package/scripts/migrations/v10_to_v11.sql +0 -170
  76. package/scripts/migrations/v10_to_v11_fresh.sql +0 -18
  77. package/scripts/migrations/v11_to_v12.sql +0 -195
  78. package/scripts/migrations/v11_to_v12_fresh.sql +0 -19
  79. package/scripts/migrations/v12_to_v13.sql +0 -48
  80. package/scripts/migrations/v12_to_v13_fresh.sql +0 -17
  81. package/scripts/migrations/v13_to_v14.sql +0 -44
  82. package/scripts/migrations/v13_to_v14_fresh.sql +0 -17
  83. package/scripts/migrations/v14_to_v15.sql +0 -71
  84. package/scripts/migrations/v14_to_v15_fresh.sql +0 -19
  85. package/scripts/migrations/v15_to_v16.sql +0 -57
  86. package/scripts/migrations/v15_to_v16_fresh.sql +0 -18
  87. package/scripts/migrations/v16_to_v17.sql +0 -51
  88. package/scripts/migrations/v16_to_v17_fresh.sql +0 -18
  89. package/scripts/migrations/v17_to_v18.sql +0 -66
  90. package/scripts/migrations/v17_to_v18_fresh.sql +0 -24
  91. package/scripts/migrations/v1_to_v2.sql +0 -97
  92. package/scripts/migrations/v2_to_v3.sql +0 -68
  93. package/scripts/migrations/v2_to_v3_merge.sql +0 -69
  94. package/scripts/migrations/v3_to_v4.sql +0 -67
  95. package/scripts/migrations/v3_to_v4_fresh.sql +0 -20
  96. package/scripts/migrations/v4_to_v5.sql +0 -55
  97. package/scripts/migrations/v4_to_v5_fresh.sql +0 -20
  98. package/scripts/migrations/v5_to_v6.sql +0 -48
  99. package/scripts/migrations/v5_to_v6_fresh.sql +0 -17
  100. package/scripts/migrations/v6_to_v7.sql +0 -26
  101. package/scripts/migrations/v6_to_v7_fresh.sql +0 -13
  102. package/scripts/migrations/v7_to_v8.sql +0 -44
  103. package/scripts/migrations/v7_to_v8_fresh.sql +0 -14
  104. package/scripts/migrations/v8_to_v9.sql +0 -87
  105. package/scripts/migrations/v8_to_v9_fresh.sql +0 -15
  106. package/scripts/migrations/v9_to_v10.sql +0 -109
  107. package/scripts/migrations/v9_to_v10_episodes_workspace.sql +0 -109
  108. package/scripts/migrations/v9_to_v10_fresh.sql +0 -18
  109. package/templates/README.md +0 -70
  110. package/templates/managed-settings.template.json +0 -43
  111. package/tools/agentic-loop/decide-status.py +0 -210
  112. package/tools/agentic-loop/parse-metric.py +0 -106
  113. package/tools/agentic-loop/record-iteration.py +0 -223
@@ -1,81 +0,0 @@
1
- # Crons Persistence Schema
2
-
3
- **Version:** 1
4
- **File location:** `.claude/crons.json`
5
- **Owner:** Gaia cron persistence system
6
-
7
- ---
8
-
9
- ## Schema
10
-
11
- ```json
12
- {
13
- "crons": [
14
- {
15
- "name": "check-email",
16
- "interval_minutes": 180,
17
- "prompt": "Revisa el correo y haz triage según gmail-triage skill",
18
- "enabled": true,
19
- "created": "2026-04-13T20:00:00Z",
20
- "last_run": "2026-04-13T23:00:00Z"
21
- }
22
- ],
23
- "version": 1
24
- }
25
- ```
26
-
27
- ## Field Definitions
28
-
29
- | Field | Type | Required | Description |
30
- |-------|------|----------|-------------|
31
- | `name` | string | yes | Unique identifier for the cron. Used as the dedup key during restore. Must be URL-safe (alphanumeric, hyphens). |
32
- | `interval_minutes` | integer | yes | How often the cron fires, in minutes. Mirrors CronCreate interval. |
33
- | `prompt` | string | yes | The exact prompt sent to the orchestrator on each tick. |
34
- | `enabled` | boolean | yes | If false, the cron is skipped during restore. Allows pausing without deletion. |
35
- | `created` | string (ISO 8601 UTC) | yes | Timestamp when the cron was first created. Set once, never updated. |
36
- | `last_run` | string (ISO 8601 UTC) or null | yes | Timestamp of the most recent execution. Null if the cron has never run. |
37
-
38
- ## Top-level Fields
39
-
40
- | Field | Type | Description |
41
- |-------|------|-------------|
42
- | `crons` | array | The list of persisted cron entries. May be empty. |
43
- | `version` | integer | Schema version. Currently 1. Increment when field semantics change. |
44
-
45
- ## Constraints
46
-
47
- - `name` must be unique within the `crons` array. Duplicate names are invalid.
48
- - `interval_minutes` must be a positive integer greater than 0.
49
- - `last_run` is `null` when the cron has been created but has not yet fired.
50
-
51
- ## Example: Multiple Crons
52
-
53
- ```json
54
- {
55
- "crons": [
56
- {
57
- "name": "check-email",
58
- "interval_minutes": 180,
59
- "prompt": "Revisa el correo y haz triage según gmail-triage skill",
60
- "enabled": true,
61
- "created": "2026-04-13T20:00:00Z",
62
- "last_run": "2026-04-13T23:00:00Z"
63
- },
64
- {
65
- "name": "drift-monitor",
66
- "interval_minutes": 60,
67
- "prompt": "Check for infrastructure drift in the current project",
68
- "enabled": false,
69
- "created": "2026-04-10T10:00:00Z",
70
- "last_run": null
71
- }
72
- ],
73
- "version": 1
74
- }
75
- ```
76
-
77
- ## File Location
78
-
79
- The file lives at `.claude/crons.json`, resolved relative to the active project root (same directory where `.claude/` is found). The path module `find_claude_dir()` from `hooks/modules/core/paths.py` provides the canonical `.claude/` path.
80
-
81
- For projects that use `CLAUDE_PLUGIN_DATA`, the file lives under that data directory instead, consistent with how other persisted data (logs, sessions, grants) is stored.
@@ -1,72 +0,0 @@
1
- {
2
- "commit_message": {
3
- "format": "conventional_commits",
4
- "description": "Commit messages must follow Conventional Commits specification",
5
-
6
- "type_allowed": [
7
- "feat",
8
- "fix",
9
- "refactor",
10
- "docs",
11
- "test",
12
- "chore",
13
- "ci",
14
- "perf",
15
- "style",
16
- "build"
17
- ],
18
-
19
- "scope_required": false,
20
- "scope_examples": ["helmrelease", "terraform", "pg-non-prod", "infrastructure"],
21
-
22
- "subject_max_length": 72,
23
- "subject_rules": {
24
- "capitalize_first_letter": false,
25
- "no_period_at_end": true,
26
- "imperative_mood": true,
27
- "no_emoji": true
28
- },
29
-
30
- "body_max_line_length": 72,
31
- "body_required": false,
32
-
33
- "footer_forbidden": [
34
- "Generated with Claude Code",
35
- "Co-Authored-By: Claude",
36
- "🤖 Generated with"
37
- ],
38
-
39
- "footer_allowed": [
40
- "BREAKING CHANGE:",
41
- "Refs:",
42
- "Closes:",
43
- "Fixes:",
44
- "Implements:",
45
- "See:"
46
- ],
47
-
48
- "examples_valid": [
49
- "feat(helmrelease): add Phase 3.3 services",
50
- "fix(pg-non-prod): correct API key environment variable mappings",
51
- "refactor: simplify context provider logic",
52
- "docs: update README with new workflow",
53
- "chore(deps): update terraform to v1.6.0"
54
- ],
55
-
56
- "examples_invalid": [
57
- "Added new feature",
58
- "Fixed bugs",
59
- "Updates",
60
- "feat: add feature\n\n🤖 Generated with Claude Code",
61
- "feat: add new feature 🚀",
62
- "fix: 🐛 correct bug"
63
- ]
64
- },
65
-
66
- "enforcement": {
67
- "enabled": true,
68
- "block_on_failure": true,
69
- "log_violations": true,
70
- "log_path": ".claude/logs/commit-violations.jsonl"
71
- }
72
- }
@@ -1,37 +0,0 @@
1
- ---
2
- name: gaia
3
- description: Invoke the Gaia meta-agent for system architecture analysis, agent design, skill creation, and orchestration debugging
4
- allowed-tools:
5
- - Bash(*)
6
- - Read
7
- - Edit
8
- - Write
9
- - Glob
10
- - Grep
11
- - WebSearch
12
- - WebFetch
13
- - Task
14
- - Agent
15
- - Skill
16
- ---
17
-
18
- Invoke the Gaia meta-agent (`gaia-system`) to work on the gaia-ops orchestration
19
- system itself. This is the entry point for tasks that modify or analyze agents,
20
- skills, hooks, or system architecture.
21
-
22
- ## When to use
23
-
24
- - Analyze or improve the gaia-ops architecture
25
- - Create or update agent definitions (`.md` files)
26
- - Create or update skills (`SKILL.md` files)
27
- - Write or debug Python hooks and tools
28
- - Update `CLAUDE.md` or system configuration
29
- - Research best practices for agent orchestration
30
-
31
- ## How it works
32
-
33
- This command delegates to the `gaia-system` agent, which is the meta-agent
34
- specialized in the orchestration system. It follows the standard agent protocol
35
- and returns an `agent_contract_handoff` block with findings and status.
36
-
37
- $ARGUMENTS
@@ -1,81 +0,0 @@
1
- # Crons Persistence Schema
2
-
3
- **Version:** 1
4
- **File location:** `.claude/crons.json`
5
- **Owner:** Gaia cron persistence system
6
-
7
- ---
8
-
9
- ## Schema
10
-
11
- ```json
12
- {
13
- "crons": [
14
- {
15
- "name": "check-email",
16
- "interval_minutes": 180,
17
- "prompt": "Revisa el correo y haz triage según gmail-triage skill",
18
- "enabled": true,
19
- "created": "2026-04-13T20:00:00Z",
20
- "last_run": "2026-04-13T23:00:00Z"
21
- }
22
- ],
23
- "version": 1
24
- }
25
- ```
26
-
27
- ## Field Definitions
28
-
29
- | Field | Type | Required | Description |
30
- |-------|------|----------|-------------|
31
- | `name` | string | yes | Unique identifier for the cron. Used as the dedup key during restore. Must be URL-safe (alphanumeric, hyphens). |
32
- | `interval_minutes` | integer | yes | How often the cron fires, in minutes. Mirrors CronCreate interval. |
33
- | `prompt` | string | yes | The exact prompt sent to the orchestrator on each tick. |
34
- | `enabled` | boolean | yes | If false, the cron is skipped during restore. Allows pausing without deletion. |
35
- | `created` | string (ISO 8601 UTC) | yes | Timestamp when the cron was first created. Set once, never updated. |
36
- | `last_run` | string (ISO 8601 UTC) or null | yes | Timestamp of the most recent execution. Null if the cron has never run. |
37
-
38
- ## Top-level Fields
39
-
40
- | Field | Type | Description |
41
- |-------|------|-------------|
42
- | `crons` | array | The list of persisted cron entries. May be empty. |
43
- | `version` | integer | Schema version. Currently 1. Increment when field semantics change. |
44
-
45
- ## Constraints
46
-
47
- - `name` must be unique within the `crons` array. Duplicate names are invalid.
48
- - `interval_minutes` must be a positive integer greater than 0.
49
- - `last_run` is `null` when the cron has been created but has not yet fired.
50
-
51
- ## Example: Multiple Crons
52
-
53
- ```json
54
- {
55
- "crons": [
56
- {
57
- "name": "check-email",
58
- "interval_minutes": 180,
59
- "prompt": "Revisa el correo y haz triage según gmail-triage skill",
60
- "enabled": true,
61
- "created": "2026-04-13T20:00:00Z",
62
- "last_run": "2026-04-13T23:00:00Z"
63
- },
64
- {
65
- "name": "drift-monitor",
66
- "interval_minutes": 60,
67
- "prompt": "Check for infrastructure drift in the current project",
68
- "enabled": false,
69
- "created": "2026-04-10T10:00:00Z",
70
- "last_run": null
71
- }
72
- ],
73
- "version": 1
74
- }
75
- ```
76
-
77
- ## File Location
78
-
79
- The file lives at `.claude/crons.json`, resolved relative to the active project root (same directory where `.claude/` is found). The path module `find_claude_dir()` from `hooks/modules/core/paths.py` provides the canonical `.claude/` path.
80
-
81
- For projects that use `CLAUDE_PLUGIN_DATA`, the file lives under that data directory instead, consistent with how other persisted data (logs, sessions, grants) is stored.
@@ -1,72 +0,0 @@
1
- {
2
- "commit_message": {
3
- "format": "conventional_commits",
4
- "description": "Commit messages must follow Conventional Commits specification",
5
-
6
- "type_allowed": [
7
- "feat",
8
- "fix",
9
- "refactor",
10
- "docs",
11
- "test",
12
- "chore",
13
- "ci",
14
- "perf",
15
- "style",
16
- "build"
17
- ],
18
-
19
- "scope_required": false,
20
- "scope_examples": ["helmrelease", "terraform", "pg-non-prod", "infrastructure"],
21
-
22
- "subject_max_length": 72,
23
- "subject_rules": {
24
- "capitalize_first_letter": false,
25
- "no_period_at_end": true,
26
- "imperative_mood": true,
27
- "no_emoji": true
28
- },
29
-
30
- "body_max_line_length": 72,
31
- "body_required": false,
32
-
33
- "footer_forbidden": [
34
- "Generated with Claude Code",
35
- "Co-Authored-By: Claude",
36
- "🤖 Generated with"
37
- ],
38
-
39
- "footer_allowed": [
40
- "BREAKING CHANGE:",
41
- "Refs:",
42
- "Closes:",
43
- "Fixes:",
44
- "Implements:",
45
- "See:"
46
- ],
47
-
48
- "examples_valid": [
49
- "feat(helmrelease): add Phase 3.3 services",
50
- "fix(pg-non-prod): correct API key environment variable mappings",
51
- "refactor: simplify context provider logic",
52
- "docs: update README with new workflow",
53
- "chore(deps): update terraform to v1.6.0"
54
- ],
55
-
56
- "examples_invalid": [
57
- "Added new feature",
58
- "Fixed bugs",
59
- "Updates",
60
- "feat: add feature\n\n🤖 Generated with Claude Code",
61
- "feat: add new feature 🚀",
62
- "fix: 🐛 correct bug"
63
- ]
64
- },
65
-
66
- "enforcement": {
67
- "enabled": true,
68
- "block_on_failure": true,
69
- "log_violations": true,
70
- "log_path": ".claude/logs/commit-violations.jsonl"
71
- }
72
- }
@@ -1,210 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- decide-status.py
4
-
5
- Mechanically decide what to do based on numbers alone. No LLM judgment.
6
-
7
- Usage:
8
- python3 decide-status.py \
9
- --current 94.5 \
10
- --best 92.0 \
11
- --threshold 98 \
12
- --direction higher \
13
- --consecutive-discards 2 \
14
- --pivot-count 1
15
-
16
- Output JSON:
17
- {
18
- "decision": "keep",
19
- "reason": "Metric improved from 92.0 to 94.5",
20
- "improved": true,
21
- "gap_remaining": 3.5
22
- }
23
-
24
- Decision precedence (evaluated top-to-bottom, first match wins):
25
- 1. pivot_count >= 3 → stop
26
- 2. consecutive_discards >= 5 → pivot (also a discard)
27
- 3. consecutive_discards >= 3 → refine (also a discard)
28
- 4. current meets or passes threshold → threshold_reached
29
- 5. current improved vs best (per direction) → keep
30
- 6. current same or worse → discard
31
-
32
- Exit codes:
33
- 0 success (decision emitted as JSON)
34
- 1 invalid input
35
- """
36
-
37
- import argparse
38
- import json
39
- import sys
40
-
41
-
42
- Decision = str # type alias for readability
43
-
44
-
45
- def _is_improved(current: float, best: float, direction: str) -> bool:
46
- """Return True if *current* is strictly better than *best* per direction."""
47
- if direction == "higher":
48
- return current > best
49
- return current < best # lower is better
50
-
51
-
52
- def _threshold_reached(current: float, threshold: float, direction: str) -> bool:
53
- """Return True if *current* has met or surpassed *threshold*."""
54
- if direction == "higher":
55
- return current >= threshold
56
- return current <= threshold
57
-
58
-
59
- def _gap_remaining(current: float, threshold: float, direction: str) -> float:
60
- """Absolute gap between current value and threshold."""
61
- if direction == "higher":
62
- return max(0.0, threshold - current)
63
- return max(0.0, current - threshold)
64
-
65
-
66
def decide(
    current: float,
    best: float,
    threshold: float,
    direction: str,
    consecutive_discards: int,
    pivot_count: int,
) -> dict:
    """Translate the loop's numeric state into the next action.

    Rules are tried in a fixed order and the first one that matches wins:

    1. ``pivot_count >= 3``            -> ``stop``
    2. ``consecutive_discards >= 5``   -> ``pivot``
    3. ``consecutive_discards >= 3``   -> ``refine``
    4. threshold met or passed         -> ``threshold_reached``
    5. current strictly beats best     -> ``keep``
    6. anything else                   -> ``discard``

    Returns:
        A dict with the keys ``decision``, ``reason``, ``improved`` and
        ``gap_remaining``, in that order.
    """
    remaining = _gap_remaining(current, threshold, direction)
    got_better = _is_improved(current, best, direction)

    def verdict(decision, reason, *, improved=got_better, gap=remaining):
        # Single place that fixes the shape and key order of the result.
        return {
            "decision": decision,
            "reason": reason,
            "improved": improved,
            "gap_remaining": gap,
        }

    # Rule 1: too many pivots already, give up.
    if pivot_count >= 3:
        return verdict(
            "stop",
            f"pivot_count={pivot_count} has reached the maximum of 3; halting loop",
        )

    # Rules 2 and 3: streak escalations, strongest first. They are checked
    # before the threshold so an ongoing failing streak is still flagged even
    # when this particular run happens to reach the goal.
    escalations = (
        (5, "pivot", "strategy is not working, force a pivot"),
        (3, "refine", "current approach needs refinement before continuing"),
    )
    for limit, action, advice in escalations:
        if consecutive_discards >= limit:
            return verdict(
                action,
                f"consecutive_discards={consecutive_discards} >= {limit}; {advice}",
            )

    # Rule 4: goal achieved.
    if _threshold_reached(current, threshold, direction):
        symbol = "≥" if direction == "higher" else "≤"
        return verdict(
            "threshold_reached",
            f"current={current} {symbol} threshold={threshold}; goal achieved",
            gap=0.0,
        )

    # Rules 5 and 6: plain keep / discard.
    if got_better:
        return verdict(
            "keep",
            f"Metric improved from {best} to {current}",
            improved=True,
        )
    return verdict(
        "discard",
        f"Metric did not improve (current={current}, best={best})",
        improved=False,
    )
140
-
141
-
142
def main() -> None:
    """Command-line entry point for the decision helper.

    Parses the six required options, rejects negative counters with exit
    status 1 (messages go to stderr), and otherwise prints the result of
    ``decide`` as indented JSON on stdout.
    """
    cli = argparse.ArgumentParser(
        description="Compute the next agentic-loop decision from metric numbers only.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Decisions:
keep current improved vs best
discard current same or worse
refine 3+ consecutive discards (improvement needed in approach)
pivot 5+ consecutive discards (strategy change required)
stop 3+ pivots already attempted
threshold_reached current meets or surpasses the goal threshold

Direction values:
higher larger numbers are better (e.g. accuracy, passing tests)
lower smaller numbers are better (e.g. error rate, latency ms)
""",
    )

    # Floating-point metric inputs, registered in the order shown by --help.
    float_options = (
        ("--current", "Metric value for the current run"),
        ("--best", "Best metric seen so far (from state.json)"),
        ("--threshold", "Target threshold to reach"),
    )
    for flag, text in float_options:
        cli.add_argument(flag, required=True, type=float, help=text)

    cli.add_argument(
        "--direction",
        required=True,
        choices=["higher", "lower"],
        help="Whether higher or lower values are better",
    )

    # Integer counters carried over from the loop state.
    counter_options = (
        (
            "--consecutive-discards",
            "Number of consecutive discard outcomes so far (from state.json)",
        ),
        ("--pivot-count", "Number of pivots executed so far (from state.json)"),
    )
    for flag, text in counter_options:
        cli.add_argument(flag, required=True, type=int, metavar="N", help=text)

    opts = cli.parse_args()

    # Counters cannot be negative; report every violation before exiting.
    checks = (
        (opts.consecutive_discards, "--consecutive-discards must be >= 0"),
        (opts.pivot_count, "--pivot-count must be >= 0"),
    )
    problems = [message for value, message in checks if value < 0]
    if problems:
        for message in problems:
            print(f"error: {message}", file=sys.stderr)
        sys.exit(1)

    outcome = decide(
        current=opts.current,
        best=opts.best,
        threshold=opts.threshold,
        direction=opts.direction,
        consecutive_discards=opts.consecutive_discards,
        pivot_count=opts.pivot_count,
    )
    print(json.dumps(outcome, indent=2))


if __name__ == "__main__":
    main()
@@ -1,106 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- parse-metric.py
4
-
5
- Read stdout from eval_command and extract METRIC lines.
6
-
7
- Usage:
8
- echo "output" | python3 parse-metric.py --metric accuracy
9
- python3 parse-metric.py --metric accuracy --file /tmp/eval-output.txt
10
-
11
- Input lines must match: METRIC {name}={number}
12
- Output: JSON to stdout with metric name, numeric value, and raw line.
13
- """
14
-
15
- import argparse
16
- import json
17
- import re
18
- import sys
19
- from typing import Optional
20
-
21
-
22
# One metric per line: ``METRIC <name>=<number>``.
# The number must be a well-formed decimal: optional sign, digits with an
# optional fractional part (or a leading-dot fraction), and an optional
# exponent. The earlier ``[\d.]+`` form let through strings such as "1.2.3"
# or ".", which then crashed in float(), and rejected signed or exponent
# values such as "-0.25" and "1e-3".
METRIC_PATTERN = re.compile(
    r"^METRIC\s+(\w+)=([+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)\s*$"
)


def parse_lines(lines: list[str]) -> list[dict]:
    """Extract every METRIC entry from a sequence of text lines.

    Args:
        lines: Lines of evaluator output, with or without a trailing newline.

    Returns:
        One dict per matching line, in input order, with the keys
        ``metric`` (name), ``value`` (``int`` or ``float``) and ``raw_line``
        (the line without its trailing newline). Lines that do not match
        the METRIC format, including ones with a malformed number, are
        skipped rather than raising.
    """
    results = []
    for line in lines:
        stripped = line.rstrip("\n")
        match = METRIC_PATTERN.match(stripped)
        if match is None:
            continue
        name, raw_value = match.groups()
        # Preserve int vs float from the source text: anything written with a
        # decimal point or an exponent is a float, everything else an int.
        is_float = any(marker in raw_value for marker in ".eE")
        value = float(raw_value) if is_float else int(raw_value)
        results.append(
            {
                "metric": name,
                "value": value,
                "raw_line": stripped,
            }
        )
    return results
48
-
49
-
50
def main() -> None:
    """Command-line entry point for the metric extractor.

    Reads lines from ``--file`` when given, otherwise from stdin, and prints
    JSON to stdout: the last entry for ``--metric`` when a name is supplied,
    or the list of all parsed metrics when it is not. Exits with status 1
    when the input cannot be read or the requested metric is absent.
    """
    cli = argparse.ArgumentParser(
        description="Extract METRIC lines from eval_command output.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
echo "METRIC accuracy=94.5" | python3 parse-metric.py --metric accuracy
python3 parse-metric.py --metric passing_tests --file /tmp/out.txt
python3 parse-metric.py --file /tmp/out.txt # returns all metrics
""",
    )
    cli.add_argument(
        "--metric",
        metavar="NAME",
        help="Return only this named metric (case-sensitive). Exits 1 if not found.",
    )
    cli.add_argument(
        "--file",
        metavar="PATH",
        help="Read from file instead of stdin.",
    )
    opts = cli.parse_args()

    # Gather the raw input lines from the chosen source.
    try:
        if opts.file:
            with open(opts.file, "r") as handle:
                raw_lines = handle.readlines()
        else:
            raw_lines = sys.stdin.readlines()
    except OSError as exc:
        print(f"error: cannot read input: {exc}", file=sys.stderr)
        sys.exit(1)

    found = parse_lines(raw_lines)

    if not opts.metric:
        # No filter requested: emit every metric as a list.
        payload = found
    else:
        wanted = [entry for entry in found if entry["metric"] == opts.metric]
        if not wanted:
            print(
                f"error: metric '{opts.metric}' not found in input",
                file=sys.stderr,
            )
            sys.exit(1)
        # When a metric is reported more than once, the final report wins.
        payload = wanted[-1]

    print(json.dumps(payload, indent=2))


if __name__ == "__main__":
    main()