@jaguilar87/gaia 5.0.2 → 5.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/ARCHITECTURE.md +0 -1
- package/CHANGELOG.md +110 -0
- package/INSTALL.md +0 -2
- package/README.md +1 -6
- package/bin/README.md +0 -1
- package/bin/cli/_install_helpers.py +1 -1
- package/bin/cli/approvals.py +23 -21
- package/bin/cli/cleanup.py +0 -1
- package/bin/cli/doctor.py +1 -1
- package/bin/cli/memory.py +2 -0
- package/bin/cli/update.py +1 -1
- package/bin/pre-publish-validate.js +48 -5
- package/config/README.md +22 -44
- package/config/surface-routing.json +0 -2
- package/dist/gaia-ops/.claude-plugin/plugin.json +1 -1
- package/dist/gaia-ops/config/README.md +22 -44
- package/dist/gaia-ops/config/surface-routing.json +0 -2
- package/dist/gaia-ops/hooks/modules/agents/contract_validator.py +18 -0
- package/dist/gaia-ops/hooks/modules/agents/handoff_persister.py +214 -2
- package/dist/gaia-ops/hooks/modules/agents/response_contract.py +26 -0
- package/dist/gaia-ops/hooks/modules/agents/transcript_reader.py +15 -0
- package/dist/gaia-ops/hooks/modules/security/__init__.py +0 -5
- package/dist/gaia-ops/hooks/modules/security/approval_grants.py +124 -19
- package/dist/gaia-ops/hooks/modules/security/mutative_verbs.py +99 -7
- package/dist/gaia-ops/hooks/modules/tools/bash_validator.py +127 -24
- package/dist/gaia-ops/hooks/modules/validation/commit_validator.py +90 -55
- package/dist/gaia-ops/skills/README.md +1 -1
- package/dist/gaia-ops/skills/agent-contract-handoff/SKILL.md +3 -0
- package/dist/gaia-ops/skills/agent-response/SKILL.md +4 -2
- package/dist/gaia-ops/skills/gaia-patterns/SKILL.md +1 -1
- package/dist/gaia-ops/skills/gaia-patterns/reference.md +2 -3
- package/dist/gaia-ops/skills/gaia-release/SKILL.md +60 -24
- package/dist/gaia-ops/skills/gaia-release/reference.md +35 -11
- package/dist/gaia-ops/skills/git-conventions/SKILL.md +6 -2
- package/dist/gaia-ops/skills/orchestrator-present-approval/SKILL.md +30 -7
- package/dist/gaia-ops/skills/orchestrator-present-approval/reference.md +32 -15
- package/dist/gaia-ops/skills/readme-writing/SKILL.md +1 -1
- package/dist/gaia-ops/skills/readme-writing/reference.md +0 -1
- package/dist/gaia-ops/skills/security-tiers/SKILL.md +5 -1
- package/dist/gaia-ops/skills/security-tiers/reference.md +3 -1
- package/dist/gaia-ops/skills/subagent-request-approval/SKILL.md +43 -6
- package/dist/gaia-ops/skills/subagent-request-approval/reference.md +66 -16
- package/dist/gaia-ops/tools/context/README.md +1 -1
- package/dist/gaia-ops/tools/gaia_simulator/extractor.py +0 -1
- package/dist/gaia-ops/tools/scan/ui.py +20 -4
- package/dist/gaia-ops/tools/scan/verify.py +3 -3
- package/dist/gaia-ops/tools/validation/README.md +15 -24
- package/dist/gaia-security/.claude-plugin/plugin.json +1 -1
- package/dist/gaia-security/hooks/modules/agents/contract_validator.py +18 -0
- package/dist/gaia-security/hooks/modules/agents/handoff_persister.py +214 -2
- package/dist/gaia-security/hooks/modules/agents/response_contract.py +26 -0
- package/dist/gaia-security/hooks/modules/agents/transcript_reader.py +15 -0
- package/dist/gaia-security/hooks/modules/security/__init__.py +0 -5
- package/dist/gaia-security/hooks/modules/security/approval_grants.py +124 -19
- package/dist/gaia-security/hooks/modules/security/mutative_verbs.py +99 -7
- package/dist/gaia-security/hooks/modules/tools/bash_validator.py +127 -24
- package/dist/gaia-security/hooks/modules/validation/commit_validator.py +90 -55
- package/gaia/state/transitions.py +4 -4
- package/gaia/store/writer.py +56 -0
- package/hooks/modules/README.md +2 -4
- package/hooks/modules/agents/contract_validator.py +18 -0
- package/hooks/modules/agents/handoff_persister.py +214 -2
- package/hooks/modules/agents/response_contract.py +26 -0
- package/hooks/modules/agents/transcript_reader.py +15 -0
- package/hooks/modules/security/__init__.py +0 -5
- package/hooks/modules/security/approval_grants.py +124 -19
- package/hooks/modules/security/mutative_verbs.py +99 -7
- package/hooks/modules/tools/bash_validator.py +127 -24
- package/hooks/modules/validation/commit_validator.py +90 -55
- package/index.js +2 -12
- package/package.json +4 -6
- package/pyproject.toml +3 -3
- package/scripts/bootstrap_database.sh +88 -439
- package/scripts/check_schema_drift.py +208 -0
- package/scripts/migrations/README.md +78 -28
- package/scripts/migrations/schema.checksum +8 -0
- package/scripts/release-prepare.mjs +199 -0
- package/skills/README.md +1 -1
- package/skills/agent-contract-handoff/SKILL.md +3 -0
- package/skills/agent-response/SKILL.md +4 -2
- package/skills/gaia-patterns/SKILL.md +1 -1
- package/skills/gaia-patterns/reference.md +2 -3
- package/skills/gaia-release/SKILL.md +60 -24
- package/skills/gaia-release/reference.md +35 -11
- package/skills/git-conventions/SKILL.md +6 -2
- package/skills/orchestrator-present-approval/SKILL.md +30 -7
- package/skills/orchestrator-present-approval/reference.md +32 -15
- package/skills/readme-writing/SKILL.md +1 -1
- package/skills/readme-writing/reference.md +0 -1
- package/skills/security-tiers/SKILL.md +5 -1
- package/skills/security-tiers/reference.md +3 -1
- package/skills/subagent-request-approval/SKILL.md +43 -6
- package/skills/subagent-request-approval/reference.md +66 -16
- package/tools/context/README.md +1 -1
- package/tools/gaia_simulator/extractor.py +0 -1
- package/tools/scan/ui.py +20 -4
- package/tools/scan/verify.py +3 -3
- package/tools/validation/README.md +15 -24
- package/commands/README.md +0 -64
- package/commands/gaia.md +0 -37
- package/commands/scan-project.md +0 -74
- package/config/crons-schema.md +0 -81
- package/config/git_standards.json +0 -72
- package/dist/gaia-ops/commands/gaia.md +0 -37
- package/dist/gaia-ops/config/crons-schema.md +0 -81
- package/dist/gaia-ops/config/git_standards.json +0 -72
- package/dist/gaia-ops/hooks/modules/security/gitops_validator.py +0 -179
- package/dist/gaia-ops/tools/agentic-loop/decide-status.py +0 -210
- package/dist/gaia-ops/tools/agentic-loop/parse-metric.py +0 -106
- package/dist/gaia-ops/tools/agentic-loop/record-iteration.py +0 -223
- package/dist/gaia-security/hooks/modules/security/gitops_validator.py +0 -179
- package/git-hooks/commit-msg +0 -41
- package/hooks/modules/security/gitops_validator.py +0 -179
- package/scripts/migrations/v10_to_v11.sql +0 -170
- package/scripts/migrations/v10_to_v11_fresh.sql +0 -18
- package/scripts/migrations/v11_to_v12.sql +0 -195
- package/scripts/migrations/v11_to_v12_fresh.sql +0 -19
- package/scripts/migrations/v12_to_v13.sql +0 -48
- package/scripts/migrations/v12_to_v13_fresh.sql +0 -17
- package/scripts/migrations/v13_to_v14.sql +0 -44
- package/scripts/migrations/v13_to_v14_fresh.sql +0 -17
- package/scripts/migrations/v14_to_v15.sql +0 -71
- package/scripts/migrations/v14_to_v15_fresh.sql +0 -19
- package/scripts/migrations/v15_to_v16.sql +0 -57
- package/scripts/migrations/v15_to_v16_fresh.sql +0 -18
- package/scripts/migrations/v16_to_v17.sql +0 -51
- package/scripts/migrations/v16_to_v17_fresh.sql +0 -18
- package/scripts/migrations/v17_to_v18.sql +0 -66
- package/scripts/migrations/v17_to_v18_fresh.sql +0 -24
- package/scripts/migrations/v1_to_v2.sql +0 -97
- package/scripts/migrations/v2_to_v3.sql +0 -68
- package/scripts/migrations/v2_to_v3_merge.sql +0 -69
- package/scripts/migrations/v3_to_v4.sql +0 -67
- package/scripts/migrations/v3_to_v4_fresh.sql +0 -20
- package/scripts/migrations/v4_to_v5.sql +0 -55
- package/scripts/migrations/v4_to_v5_fresh.sql +0 -20
- package/scripts/migrations/v5_to_v6.sql +0 -48
- package/scripts/migrations/v5_to_v6_fresh.sql +0 -17
- package/scripts/migrations/v6_to_v7.sql +0 -26
- package/scripts/migrations/v6_to_v7_fresh.sql +0 -13
- package/scripts/migrations/v7_to_v8.sql +0 -44
- package/scripts/migrations/v7_to_v8_fresh.sql +0 -14
- package/scripts/migrations/v8_to_v9.sql +0 -87
- package/scripts/migrations/v8_to_v9_fresh.sql +0 -15
- package/scripts/migrations/v9_to_v10.sql +0 -109
- package/scripts/migrations/v9_to_v10_episodes_workspace.sql +0 -109
- package/scripts/migrations/v9_to_v10_fresh.sql +0 -18
- package/templates/README.md +0 -70
- package/templates/managed-settings.template.json +0 -43
- package/tools/agentic-loop/decide-status.py +0 -210
- package/tools/agentic-loop/parse-metric.py +0 -106
- package/tools/agentic-loop/record-iteration.py +0 -223
|
@@ -1,210 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
decide-status.py
|
|
4
|
-
|
|
5
|
-
Mechanically decide what to do based on numbers alone. No LLM judgment.
|
|
6
|
-
|
|
7
|
-
Usage:
|
|
8
|
-
python3 decide-status.py \
|
|
9
|
-
--current 94.5 \
|
|
10
|
-
--best 92.0 \
|
|
11
|
-
--threshold 98 \
|
|
12
|
-
--direction higher \
|
|
13
|
-
--consecutive-discards 2 \
|
|
14
|
-
--pivot-count 1
|
|
15
|
-
|
|
16
|
-
Output JSON:
|
|
17
|
-
{
|
|
18
|
-
"decision": "keep",
|
|
19
|
-
"reason": "Metric improved from 92.0 to 94.5",
|
|
20
|
-
"improved": true,
|
|
21
|
-
"gap_remaining": 3.5
|
|
22
|
-
}
|
|
23
|
-
|
|
24
|
-
Decision precedence (evaluated top-to-bottom, first match wins):
|
|
25
|
-
1. pivot_count >= 3 → stop
|
|
26
|
-
2. consecutive_discards >= 5 → pivot (also a discard)
|
|
27
|
-
3. consecutive_discards >= 3 → refine (also a discard)
|
|
28
|
-
4. current meets or passes threshold → threshold_reached
|
|
29
|
-
5. current improved vs best (per direction) → keep
|
|
30
|
-
6. current same or worse → discard
|
|
31
|
-
|
|
32
|
-
Exit codes:
|
|
33
|
-
0 success (decision emitted as JSON)
|
|
34
|
-
1 invalid input
|
|
35
|
-
"""
|
|
36
|
-
|
|
37
|
-
import argparse
|
|
38
|
-
import json
|
|
39
|
-
import sys
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
Decision = str # type alias for readability
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
def _is_improved(current: float, best: float, direction: str) -> bool:
    """Tell whether *current* strictly beats *best*.

    With ``direction == "higher"`` a larger value wins; any other direction
    is treated as lower-is-better. A tie is never an improvement.
    """
    higher_is_better = direction == "higher"
    return current > best if higher_is_better else current < best
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def _threshold_reached(current: float, threshold: float, direction: str) -> bool:
    """Tell whether *current* is at or beyond *threshold*.

    "Beyond" means ``>=`` when ``direction == "higher"`` and ``<=`` for any
    other direction (lower-is-better).
    """
    if direction != "higher":
        return current <= threshold
    return current >= threshold
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
def _gap_remaining(current: float, threshold: float, direction: str) -> float:
    """Return how far *current* still is from *threshold*, never below zero.

    The distance is ``threshold - current`` when ``direction == "higher"``
    and ``current - threshold`` otherwise; a value that has already passed
    the threshold yields ``0.0``.
    """
    shortfall = threshold - current if direction == "higher" else current - threshold
    return max(0.0, shortfall)
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def decide(
    current: float,
    best: float,
    threshold: float,
    direction: str,
    consecutive_discards: int,
    pivot_count: int,
) -> dict:
    """Map the numeric loop state to a decision dict (pure, no I/O).

    Rules are checked in order and the first match wins:
    pivot limit -> "stop"; 5+ discards -> "pivot"; 3+ discards -> "refine";
    threshold met -> "threshold_reached"; improvement -> "keep";
    otherwise "discard".

    The returned dict always has the keys ``decision``, ``reason``,
    ``improved`` and ``gap_remaining``, in that order.
    """
    improved = _is_improved(current, best, direction)
    gap = _gap_remaining(current, threshold, direction)

    def verdict(decision, reason, improved_flag=improved, gap_value=gap):
        # Single place that fixes the key order of every result.
        return {
            "decision": decision,
            "reason": reason,
            "improved": improved_flag,
            "gap_remaining": gap_value,
        }

    # Hard stop once the pivot budget is exhausted.
    if pivot_count >= 3:
        return verdict(
            "stop",
            f"pivot_count={pivot_count} has reached the maximum of 3; halting loop",
        )

    # Discard-streak escalations deliberately outrank threshold/keep, so a
    # failing streak is reported even when this run reaches the threshold.
    if consecutive_discards >= 5:
        return verdict(
            "pivot",
            f"consecutive_discards={consecutive_discards} >= 5; "
            "strategy is not working, force a pivot",
        )
    if consecutive_discards >= 3:
        return verdict(
            "refine",
            f"consecutive_discards={consecutive_discards} >= 3; "
            "current approach needs refinement before continuing",
        )

    # Goal achieved: the gap is reported as exactly 0.0.
    if _threshold_reached(current, threshold, direction):
        comparator = "≥" if direction == "higher" else "≤"
        return verdict(
            "threshold_reached",
            f"current={current} {comparator} threshold={threshold}; goal achieved",
            gap_value=0.0,
        )

    # Ordinary outcome: keep on improvement, discard otherwise.
    if not improved:
        return verdict(
            "discard",
            f"Metric did not improve (current={current}, best={best})",
            improved_flag=False,
        )
    return verdict(
        "keep",
        f"Metric improved from {best} to {current}",
        improved_flag=True,
    )
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
def main() -> None:
    """Parse CLI arguments, validate them, and print the decision as JSON.

    Exits with status 1 (one ``error: ...`` line per problem on stderr) when
    a metric argument is NaN or infinite, or when a counter is negative.
    Malformed or missing arguments are rejected by argparse itself.
    """
    import math  # local import: only needed for the finiteness check below

    parser = argparse.ArgumentParser(
        description="Compute the next agentic-loop decision from metric numbers only.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Decisions:
keep current improved vs best
discard current same or worse
refine 3+ consecutive discards (improvement needed in approach)
pivot 5+ consecutive discards (strategy change required)
stop 3+ pivots already attempted
threshold_reached current meets or surpasses the goal threshold

Direction values:
higher larger numbers are better (e.g. accuracy, passing tests)
lower smaller numbers are better (e.g. error rate, latency ms)
""",
    )
    parser.add_argument("--current", required=True, type=float, help="Metric value for the current run")
    parser.add_argument("--best", required=True, type=float, help="Best metric seen so far (from state.json)")
    parser.add_argument("--threshold", required=True, type=float, help="Target threshold to reach")
    parser.add_argument(
        "--direction",
        required=True,
        choices=["higher", "lower"],
        help="Whether higher or lower values are better",
    )
    parser.add_argument(
        "--consecutive-discards",
        required=True,
        type=int,
        metavar="N",
        help="Number of consecutive discard outcomes so far (from state.json)",
    )
    parser.add_argument(
        "--pivot-count",
        required=True,
        type=int,
        metavar="N",
        help="Number of pivots executed so far (from state.json)",
    )
    args = parser.parse_args()

    # --- Input validation ---
    errors = []
    # float() happily parses "nan" and "inf". NaN makes every comparison
    # False, and an infinite gap is printed by json.dumps as `Infinity`,
    # which is not valid JSON — so reject non-finite metrics up front.
    for flag, value in (
        ("--current", args.current),
        ("--best", args.best),
        ("--threshold", args.threshold),
    ):
        if not math.isfinite(value):
            errors.append(f"{flag} must be a finite number")
    if args.consecutive_discards < 0:
        errors.append("--consecutive-discards must be >= 0")
    if args.pivot_count < 0:
        errors.append("--pivot-count must be >= 0")

    if errors:
        for err in errors:
            print(f"error: {err}", file=sys.stderr)
        sys.exit(1)

    result = decide(
        current=args.current,
        best=args.best,
        threshold=args.threshold,
        direction=args.direction,
        consecutive_discards=args.consecutive_discards,
        pivot_count=args.pivot_count,
    )

    print(json.dumps(result, indent=2))


if __name__ == "__main__":
    main()
|
|
@@ -1,106 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
parse-metric.py
|
|
4
|
-
|
|
5
|
-
Read stdout from eval_command and extract METRIC lines.
|
|
6
|
-
|
|
7
|
-
Usage:
|
|
8
|
-
echo "output" | python3 parse-metric.py --metric accuracy
|
|
9
|
-
python3 parse-metric.py --metric accuracy --file /tmp/eval-output.txt
|
|
10
|
-
|
|
11
|
-
Input lines must match: METRIC {name}={number}
|
|
12
|
-
Output: JSON to stdout with metric name, numeric value, and raw line.
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
import argparse
|
|
16
|
-
import json
|
|
17
|
-
import re
|
|
18
|
-
import sys
|
|
19
|
-
from typing import Optional
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
# One METRIC line: `METRIC <name>=<number>` with optional trailing whitespace.
# The number must be a well-formed decimal literal (optional sign, digits with
# at most one decimal point, optional exponent) so that int()/float() below
# can never raise on a matched value such as "1.2.3" or ".".
METRIC_PATTERN = re.compile(
    r"^METRIC\s+(\w+)=([+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)\s*$"
)


def parse_lines(lines: list[str]) -> list[dict]:
    """Extract all METRIC entries from a sequence of lines.

    Each line is stripped of trailing newlines and matched against
    ``METRIC_PATTERN``; non-matching lines (including ones whose value is not
    a well-formed number) are skipped silently.

    Returns one dict per matching line, in input order, with the keys
    ``metric`` (name), ``value`` (``int`` when the source text is a plain
    integer, otherwise ``float``) and ``raw_line`` (the line without its
    trailing newline).
    """
    results = []
    for line in lines:
        stripped = line.rstrip("\n")
        match = METRIC_PATTERN.match(stripped)
        if match is None:
            continue
        name, raw_value = match.groups()
        # Preserve int vs float from the source text: a decimal point or an
        # exponent marker means the author wrote a float.
        value: int | float
        if any(marker in raw_value for marker in ".eE"):
            value = float(raw_value)
        else:
            value = int(raw_value)
        results.append(
            {
                "metric": name,
                "value": value,
                "raw_line": stripped,
            }
        )
    return results
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
def main() -> None:
    """Read eval output from --file or stdin and print parsed metrics as JSON.

    With ``--metric NAME`` a single object is printed (the last occurrence of
    that metric); the process exits 1 if the metric is absent. Without
    ``--metric`` a JSON list of every parsed metric is printed. Unreadable
    input also exits 1 with a message on stderr.
    """
    parser = argparse.ArgumentParser(
        description="Extract METRIC lines from eval_command output.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
echo "METRIC accuracy=94.5" | python3 parse-metric.py --metric accuracy
python3 parse-metric.py --metric passing_tests --file /tmp/out.txt
python3 parse-metric.py --file /tmp/out.txt # returns all metrics
""",
    )
    parser.add_argument(
        "--metric",
        metavar="NAME",
        help="Return only this named metric (case-sensitive). Exits 1 if not found.",
    )
    parser.add_argument(
        "--file",
        metavar="PATH",
        help="Read from file instead of stdin.",
    )
    args = parser.parse_args()

    # --- Read input ---
    try:
        if args.file:
            # Eval output may contain arbitrary bytes; decode leniently so a
            # stray non-UTF-8 byte cannot hide otherwise valid METRIC lines.
            with open(args.file, "r", encoding="utf-8", errors="replace") as fh:
                lines = fh.readlines()
        else:
            lines = sys.stdin.readlines()
    except (OSError, UnicodeError) as exc:
        # UnicodeError covers undecodable stdin, which is not an OSError.
        print(f"error: cannot read input: {exc}", file=sys.stderr)
        sys.exit(1)

    # --- Parse ---
    all_metrics = parse_lines(lines)

    if not args.metric:
        # Return all metrics as a list when no --metric filter is given.
        print(json.dumps(all_metrics, indent=2))
        return

    # Filter to the requested metric name.
    matches = [m for m in all_metrics if m["metric"] == args.metric]
    if not matches:
        print(
            f"error: metric '{args.metric}' not found in input",
            file=sys.stderr,
        )
        sys.exit(1)

    # Return the last occurrence if there are duplicates.
    print(json.dumps(matches[-1], indent=2))


if __name__ == "__main__":
    main()
|
|
@@ -1,223 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
record-iteration.py
|
|
4
|
-
|
|
5
|
-
Atomically update state.json and append to worklog.md after each iteration.
|
|
6
|
-
The LLM never writes state.json directly — this script is the only writer.
|
|
7
|
-
|
|
8
|
-
Usage:
|
|
9
|
-
python3 record-iteration.py \
|
|
10
|
-
--state-file state.json \
|
|
11
|
-
--worklog worklog.md \
|
|
12
|
-
--iteration 5 \
|
|
13
|
-
--metric-value 94.5 \
|
|
14
|
-
--status keep \
|
|
15
|
-
--description "Handle hyphenated verbs" \
|
|
16
|
-
--insight "delete-objects splits correctly" \
|
|
17
|
-
--next "Check camelCase+hyphen combined"
|
|
18
|
-
|
|
19
|
-
Optional flags:
|
|
20
|
-
--changed TEXT What was modified (default: same as description)
|
|
21
|
-
--metric-name TEXT Name of the metric recorded (default: "metric")
|
|
22
|
-
|
|
23
|
-
Atomic write guarantee: state.json is written to a .tmp sibling, fsynced,
|
|
24
|
-
then renamed over the original. Either the full write lands or the original
|
|
25
|
-
is untouched.
|
|
26
|
-
"""
|
|
27
|
-
|
|
28
|
-
from __future__ import annotations
|
|
29
|
-
|
|
30
|
-
import argparse
|
|
31
|
-
import json
|
|
32
|
-
import os
|
|
33
|
-
import sys
|
|
34
|
-
import tempfile
|
|
35
|
-
from datetime import datetime, timezone
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
def load_state(path: str) -> dict:
    """Load existing state.json or return an empty skeleton.

    Returns the skeleton (zeroed counters, ``None`` metrics) when *path* does
    not exist. Any other read failure, undecodable or invalid JSON, or a
    top-level JSON value that is not an object prints an error to stderr and
    exits the process with status 1.
    """
    try:
        # Open directly instead of checking existence first: avoids a race
        # between the check and the read.
        with open(path, "r", encoding="utf-8") as fh:
            data = json.load(fh)
    except FileNotFoundError:
        return {
            "iteration": 0,
            "current_metric": None,
            "best_metric": None,
            "consecutive_discards": 0,
            "pivot_count": 0,
            "timestamp": None,
            "status": None,
        }
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"error: cannot read state file '{path}': {exc}", file=sys.stderr)
        sys.exit(1)

    if not isinstance(data, dict):
        # Callers use dict methods on the result; a JSON list/number/string
        # would otherwise fail later with an unrelated AttributeError.
        print(
            f"error: cannot read state file '{path}': "
            f"expected a JSON object, got {type(data).__name__}",
            file=sys.stderr,
        )
        sys.exit(1)
    return data
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
def atomic_write_json(path: str, data: dict) -> None:
    """Replace *path* with the JSON form of *data* via a temp file.

    The JSON (2-space indent, trailing newline) is written to a ``.tmp`` file
    created next to *path*, flushed and fsynced, then moved over *path* with
    ``os.replace``. If writing or replacing fails, the temp file is removed
    and the error re-raised; an ``OSError`` at any stage is reported on
    stderr and ends the process with status 1.
    """
    target_dir = os.path.dirname(os.path.abspath(path))
    try:
        # Same directory as the target, so the final replace stays on one
        # filesystem.
        handle, scratch_path = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
        try:
            with os.fdopen(handle, "w") as out:
                out.write(json.dumps(data, indent=2))
                out.write("\n")
                out.flush()
                os.fsync(out.fileno())
            os.replace(scratch_path, path)
        except Exception:
            # Best-effort removal of the leftover temp file, then propagate.
            try:
                os.unlink(scratch_path)
            except OSError:
                pass
            raise
    except OSError as failure:
        print(f"error: atomic write to '{path}' failed: {failure}", file=sys.stderr)
        sys.exit(1)
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def append_worklog(
    path: str,
    iteration: int,
    description: str,
    metric_name: str,
    metric_value: float,
    status: str,
    changed: str,
    insight: str,
    next_step: str,
    best_metric: float | None,
    direction: str = "higher",
) -> None:
    """Append a structured run entry to worklog.md.

    The entry is a ``### Run N`` heading followed by Changed / Result /
    Insight / Next bullet lines. The Result line compares *metric_value*
    with *best_metric* (the best value before this run, or ``None`` on the
    first run).

    *direction* controls the wording of that comparison: ``"lower"`` means a
    smaller value is an improvement; any other value (the default
    ``"higher"``) means a larger value is an improvement.

    The file is written as UTF-8 because every entry contains an em dash.
    An ``OSError`` while writing prints an error to stderr and exits the
    process with status 1.
    """
    status_upper = status.upper()

    # Build result sentence
    if best_metric is None:
        result_text = f"{metric_name}={metric_value} (first run, no prior best)"
    else:
        if metric_value == best_metric:
            comparison = f"unchanged from {best_metric}"
        else:
            if direction == "lower":
                better = metric_value < best_metric
            else:
                better = metric_value > best_metric
            verb = "improved" if better else "regressed"
            comparison = f"{verb} from {best_metric}"
        result_text = f"{metric_name}={metric_value} ({comparison})"

    entry = (
        f"\n### Run {iteration}: {description} — {metric_name}={metric_value} ({status_upper})\n"
        f"- **Changed:** {changed}\n"
        f"- **Result:** {result_text}\n"
        f"- **Insight:** {insight}\n"
        f"- **Next:** {next_step}\n"
    )

    try:
        # Explicit UTF-8: the locale default may be unable to encode "—".
        with open(path, "a", encoding="utf-8") as fh:
            fh.write(entry)
    except OSError as exc:
        print(f"error: cannot append to worklog '{path}': {exc}", file=sys.stderr)
        sys.exit(1)
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
def main() -> None:
    """Record one loop iteration: update state.json, then extend worklog.md.

    The state file is loaded, stamped with this run's iteration, metric,
    status and a UTC timestamp, and its counters are adjusted by status:
    ``keep`` promotes the metric to best and clears the discard streak,
    ``discard`` and ``pivot`` extend the streak, ``pivot`` also bumps the
    pivot count, and ``stop`` changes no counters. The new state is written
    atomically, a worklog entry is appended, and the state is echoed to
    stdout as JSON.
    """
    cli = argparse.ArgumentParser(
        description="Atomically record an agentic-loop iteration into state.json and worklog.md.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Status values:
keep — metric improved; best is updated, consecutive_discards reset to 0
discard — metric did not improve; consecutive_discards incremented
pivot — forced strategy change (also increments pivot_count)
stop — terminal state; loop should halt

Exit codes:
0 success
1 error (message on stderr)
""",
    )
    cli.add_argument("--state-file", required=True, metavar="PATH", help="Path to state.json")
    cli.add_argument("--worklog", required=True, metavar="PATH", help="Path to worklog.md (append-only)")
    cli.add_argument("--iteration", required=True, type=int, help="Current iteration number (1-based)")
    cli.add_argument("--metric-value", required=True, type=float, metavar="NUM", help="Numeric metric value this run")
    cli.add_argument(
        "--status",
        required=True,
        choices=["keep", "discard", "pivot", "stop"],
        help="Outcome classification for this iteration",
    )
    cli.add_argument("--description", required=True, help="Short description of what changed this run")
    cli.add_argument("--insight", required=True, help="What was learned from this run")
    cli.add_argument("--next", required=True, dest="next_step", help="What to try in the next iteration")
    cli.add_argument(
        "--changed",
        default=None,
        metavar="TEXT",
        help="What was specifically modified (defaults to --description)",
    )
    cli.add_argument(
        "--metric-name",
        default="metric",
        metavar="NAME",
        help="Name label for the metric (default: metric)",
    )
    args = cli.parse_args()

    # --changed falls back to the description when omitted.
    changed = args.description if args.changed is None else args.changed

    state = load_state(args.state_file)

    # Remember the best value from before this run for the worklog wording.
    prev_best = state.get("best_metric")

    # Stamp the run; keyword order fixes the order of any newly added keys.
    state.update(
        iteration=args.iteration,
        current_metric=args.metric_value,
        status=args.status,
        timestamp=datetime.now(tz=timezone.utc).isoformat(),
    )

    outcome = args.status
    if outcome == "keep":
        # This run becomes the new best and the discard streak ends.
        state["best_metric"] = args.metric_value
        state["consecutive_discards"] = 0
    elif outcome in ("discard", "pivot"):
        # Both extend the discard streak; best is left alone.
        state["consecutive_discards"] = int(state.get("consecutive_discards") or 0) + 1
        if outcome == "pivot":
            state["pivot_count"] = int(state.get("pivot_count") or 0) + 1
    # "stop" is terminal: nothing beyond the stamp above is changed.

    atomic_write_json(args.state_file, state)

    append_worklog(
        path=args.worklog,
        iteration=args.iteration,
        description=args.description,
        metric_name=args.metric_name,
        metric_value=args.metric_value,
        status=args.status,
        changed=changed,
        insight=args.insight,
        next_step=args.next_step,
        best_metric=prev_best,
    )

    # Echo the updated state for easy inspection.
    print(json.dumps(state, indent=2))


if __name__ == "__main__":
    main()
|