claudecode-omc 5.4.0 → 5.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.local/guidelines/CLAUDE.md +31 -0
- package/README.md +57 -1
- package/bundled/manifest.json +2 -2
- package/bundled/upstream/oh-my-claudecode/agents/analyst.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/architect.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/code-reviewer.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/code-simplifier.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/critic.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/debugger.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/designer.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/document-specialist.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/executor.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/explore.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/git-master.md +3 -3
- package/bundled/upstream/oh-my-claudecode/agents/planner.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/qa-tester.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/scientist.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/security-reviewer.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/test-engineer.md +1 -75
- package/bundled/upstream/oh-my-claudecode/agents/tracer.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/verifier.md +1 -1
- package/bundled/upstream/oh-my-claudecode/agents/writer.md +1 -1
- package/bundled/upstream/oh-my-claudecode/hooks/hooks.json +21 -1
- package/bundled/upstream/oh-my-claudecode/skills/AGENTS.md +200 -0
- package/bundled/upstream/oh-my-claudecode/skills/autopilot/SKILL.md +17 -10
- package/bundled/upstream/oh-my-claudecode/skills/autoresearch/SKILL.md +90 -0
- package/bundled/upstream/oh-my-claudecode/skills/cancel/SKILL.md +15 -6
- package/bundled/upstream/oh-my-claudecode/skills/configure-notifications/SKILL.md +12 -12
- package/bundled/upstream/oh-my-claudecode/skills/debug/SKILL.md +35 -0
- package/bundled/upstream/oh-my-claudecode/skills/deep-dive/SKILL.md +4 -0
- package/bundled/upstream/oh-my-claudecode/skills/deep-interview/SKILL.md +23 -18
- package/bundled/upstream/oh-my-claudecode/skills/hud/SKILL.md +23 -101
- package/bundled/upstream/oh-my-claudecode/skills/learner/SKILL.md +27 -2
- package/bundled/upstream/oh-my-claudecode/skills/mcp-setup/SKILL.md +67 -8
- package/bundled/upstream/oh-my-claudecode/skills/omc-doctor/SKILL.md +32 -47
- package/bundled/upstream/oh-my-claudecode/skills/omc-setup/SKILL.md +4 -2
- package/bundled/upstream/oh-my-claudecode/skills/omc-setup/phases/01-install-claude-md.md +15 -4
- package/bundled/upstream/oh-my-claudecode/skills/omc-setup/phases/02-configure.md +9 -9
- package/bundled/upstream/oh-my-claudecode/skills/omc-setup/phases/03-integrations.md +13 -13
- package/bundled/upstream/oh-my-claudecode/skills/omc-setup/phases/04-welcome.md +3 -3
- package/bundled/upstream/oh-my-claudecode/skills/omc-teams/SKILL.md +28 -0
- package/bundled/upstream/oh-my-claudecode/skills/plan/SKILL.md +1 -0
- package/bundled/upstream/oh-my-claudecode/skills/project-session-manager/SKILL.md +25 -5
- package/bundled/upstream/oh-my-claudecode/skills/project-session-manager/lib/config.sh +2 -15
- package/bundled/upstream/oh-my-claudecode/skills/project-session-manager/lib/providers/github.sh +1 -1
- package/bundled/upstream/oh-my-claudecode/skills/project-session-manager/lib/session.sh +2 -2
- package/bundled/upstream/oh-my-claudecode/skills/project-session-manager/lib/tmux.sh +109 -4
- package/bundled/upstream/oh-my-claudecode/skills/project-session-manager/lib/worktree.sh +26 -0
- package/bundled/upstream/oh-my-claudecode/skills/project-session-manager/psm.sh +46 -5
- package/bundled/upstream/oh-my-claudecode/skills/project-session-manager/templates/pr-review.md +5 -2
- package/bundled/upstream/oh-my-claudecode/skills/project-session-manager/templates/projects.json +1 -1
- package/bundled/upstream/oh-my-claudecode/skills/project-session-manager/tests/test-psm-prompt-injection.sh +336 -0
- package/bundled/upstream/oh-my-claudecode/skills/ralph/SKILL.md +18 -9
- package/bundled/upstream/oh-my-claudecode/skills/ralplan/SKILL.md +2 -0
- package/bundled/upstream/oh-my-claudecode/skills/release/SKILL.md +167 -57
- package/bundled/upstream/oh-my-claudecode/skills/remember/SKILL.md +41 -0
- package/bundled/upstream/oh-my-claudecode/skills/self-improve/SKILL.md +391 -0
- package/bundled/upstream/oh-my-claudecode/skills/self-improve/data_contracts.md +274 -0
- package/bundled/upstream/oh-my-claudecode/skills/self-improve/scripts/plot_progress.py +128 -0
- package/bundled/upstream/oh-my-claudecode/skills/self-improve/scripts/resolve-paths.mjs +192 -0
- package/bundled/upstream/oh-my-claudecode/skills/self-improve/scripts/validate.sh +404 -0
- package/bundled/upstream/oh-my-claudecode/skills/self-improve/si-benchmark-builder.md +79 -0
- package/bundled/upstream/oh-my-claudecode/skills/self-improve/si-goal-clarifier.md +94 -0
- package/bundled/upstream/oh-my-claudecode/skills/self-improve/si-researcher.md +73 -0
- package/bundled/upstream/oh-my-claudecode/skills/self-improve/templates/agent-settings.json +14 -0
- package/bundled/upstream/oh-my-claudecode/skills/self-improve/templates/goal.md +22 -0
- package/bundled/upstream/oh-my-claudecode/skills/self-improve/templates/harness.md +18 -0
- package/bundled/upstream/oh-my-claudecode/skills/self-improve/templates/idea.md +5 -0
- package/bundled/upstream/oh-my-claudecode/skills/self-improve/templates/settings.json +23 -0
- package/bundled/upstream/oh-my-claudecode/skills/skill/SKILL.md +46 -77
- package/bundled/upstream/oh-my-claudecode/skills/skillify/SKILL.md +53 -0
- package/bundled/upstream/oh-my-claudecode/skills/team/SKILL.md +83 -11
- package/bundled/upstream/oh-my-claudecode/skills/trace/SKILL.md +1 -0
- package/bundled/upstream/oh-my-claudecode/skills/ultraqa/SKILL.md +1 -0
- package/bundled/upstream/oh-my-claudecode/skills/ultrawork/SKILL.md +1 -0
- package/bundled/upstream/oh-my-claudecode/skills/verify/SKILL.md +37 -0
- package/bundled/upstream/oh-my-claudecode/skills/wiki/SKILL.md +67 -0
- package/package.json +3 -1
- package/src/cli/artifact.js +47 -0
- package/src/cli/doctor.js +6 -1
- package/src/cli/guidelines.js +83 -0
- package/src/cli/index.js +13 -1
- package/src/cli/setup.js +68 -19
- package/src/cli/source.js +35 -1
- package/src/config/artifact-types.js +12 -2
- package/src/config/paths.js +95 -4
- package/src/config/sources.js +29 -5
- package/src/guidelines/apply.js +152 -0
- package/src/guidelines/optimizer.js +325 -0
- package/src/merge/claude-md-merger.js +35 -12
- package/bundled/upstream/oh-my-claudecode/skills/omc-doctor/skill-debugger.md +0 -101
|
@@ -0,0 +1,404 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# validate.sh — Sealed file enforcement + plan schema validation for self-improvement loop.
# Usage:
#   ./validate.sh --worktree /path --settings /path/to/settings.json plan.json
#   ./validate.sh --project-root /path/to/omc/project --topic "Improve tests" plan.json
#   ./validate.sh plan.json
#   ./validate.sh plan.json result.json
#   ./validate.sh
#
# Options (each one takes exactly one value):
#   --worktree PATH      directory the git commands run against; also the
#                        starting point when searching upward for settings
#   --settings PATH      explicit settings.json, skips all discovery
#   --project-root PATH  project root used to resolve or discover settings
#   --topic NAME         topic name handed to resolve-paths.mjs
#   --slug SLUG          topic slug handed to resolve-paths.mjs (wins over --topic)
#
# Positional arguments:
#   1st  plan file, validated by check_plan_schema
#   2nd  result file, validated by check_result_schema
#
# Environment:
#   SELF_IMPROVE_SETTINGS_PATH  settings.json used when --settings is not given

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SKILL_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Settings path may be provided directly or resolved from the scoped self-improve root.
SETTINGS=""
PROJECT_ROOT=""
TOPIC_NAME=""
TOPIC_SLUG=""

# Parse arguments
WORKTREE_PATH=""
POSITIONAL_ARGS=()
while [[ $# -gt 0 ]]; do
  case "$1" in
    --worktree|--settings|--project-root|--topic|--slug)
      # Under `set -u` a trailing option would otherwise abort with bash's raw
      # "unbound variable" diagnostic; report the actual problem instead.
      # (err() is defined further down, so the message is written directly.)
      if [[ $# -lt 2 ]]; then
        echo "ERROR: $1 requires a value" >&2
        exit 1
      fi
      case "$1" in
        --worktree)     WORKTREE_PATH="$2" ;;
        --settings)     SETTINGS="$2" ;;
        --project-root) PROJECT_ROOT="$2" ;;
        --topic)        TOPIC_NAME="$2" ;;
        --slug)         TOPIC_SLUG="$2" ;;
      esac
      shift 2
      ;;
    *)
      # Anything that is not a known option is collected as a positional argument.
      POSITIONAL_ARGS+=("$1")
      shift
      ;;
  esac
done
# The nested ${arr[@]+...} form keeps an empty array from tripping `set -u`
# on older bash versions.
set -- "${POSITIONAL_ARGS[@]+"${POSITIONAL_ARGS[@]}"}"

# Directory every `git -C` call below operates on.
GIT_DIR="${WORKTREE_PATH:-$(pwd)}"
|
|
54
|
+
|
|
55
|
+
# err MESSAGE... — write the arguments to stderr behind an "ERROR: " prefix.
err() {
  printf 'ERROR: %s\n' "$*" >&2
}

# ok MESSAGE... — write the arguments to stdout behind an "OK: " prefix.
ok() {
  printf 'OK: %s\n' "$*"
}
|
|
57
|
+
|
|
58
|
+
require_jq() {
  # Terminate the script with status 1 unless a `jq` command can be found.
  command -v jq &>/dev/null && return 0

  err "jq is not installed. Install with: brew install jq (macOS) or apt-get install jq (Linux)"
  exit 1
}
|
|
64
|
+
|
|
65
|
+
resolve_settings_from_project_root() {
  # Ask resolve-paths.mjs (run with node) where the settings file for a
  # project root lives, and store the answer in the global SETTINGS.
  #
  # Arguments: $1 — project root passed to the resolver as --project-root.
  # Globals:   reads SCRIPT_DIR, TOPIC_SLUG, TOPIC_NAME; writes SETTINGS.
  # Returns:   0 when the resolver output has a non-empty .settings_path,
  #            1 otherwise (resolver and jq failures are swallowed).
  local project_root="$1"
  local -a node_argv=( "${SCRIPT_DIR}/resolve-paths.mjs" --project-root "${project_root}" )

  # A slug takes precedence over a topic name; at most one of them is forwarded.
  if [[ -n "${TOPIC_SLUG}" ]]; then
    node_argv+=( --slug "${TOPIC_SLUG}" )
  elif [[ -n "${TOPIC_NAME}" ]]; then
    node_argv+=( --topic "${TOPIC_NAME}" )
  fi

  local resolver_json
  resolver_json=$(node "${node_argv[@]}" 2>/dev/null || true)
  [[ -n "${resolver_json}" ]] || return 1

  local settings_candidate
  settings_candidate=$(jq -r '.settings_path // ""' <<< "${resolver_json}" 2>/dev/null || true)
  [[ -n "${settings_candidate}" ]] || return 1

  SETTINGS="${settings_candidate}"
  return 0
}
|
|
91
|
+
|
|
92
|
+
discover_settings_from_search_root() {
  # Walk upward from a directory looking for a self-improve settings.json and
  # store the first hit in the global SETTINGS.
  #
  # At each level the unscoped file
  #   <dir>/.omc/self-improve/config/settings.json
  # is preferred; otherwise a topic-scoped file
  #   <dir>/.omc/self-improve/topics/<topic>/config/settings.json
  # is accepted only when exactly one topic provides it.
  #
  # Arguments: $1 — directory to start from (absolute or relative).
  # Globals:   writes SETTINGS.
  # Returns:   0 when a settings file was found, 1 when the walk reached "/"
  #            without finding one.
  # Exits:     1 when several topic-scoped settings files exist at one level,
  #            because the choice would be ambiguous.
  local search_dir="$1"

  # Anchor relative paths first: `dirname` of a relative path bottoms out at
  # "." and never becomes "/", which would keep the loop below running forever.
  if [[ "${search_dir}" != /* ]]; then
    local absolute_dir
    if absolute_dir="$(cd "./${search_dir}" 2>/dev/null && pwd)"; then
      search_dir="${absolute_dir}"
    else
      # Directory does not exist; fall back to a purely textual anchor.
      search_dir="$(pwd)/${search_dir}"
    fi
  fi

  local parent_dir candidate
  local scoped_candidates
  while [[ "${search_dir}" != "/" ]]; do
    if [[ -f "${search_dir}/.omc/self-improve/config/settings.json" ]]; then
      SETTINGS="${search_dir}/.omc/self-improve/config/settings.json"
      return 0
    fi

    # Collect matches without toggling `nullglob`, so the caller's shell
    # options are left alone; an unmatched pattern stays literal and is
    # filtered out by the existence test.
    scoped_candidates=()
    for candidate in "${search_dir}"/.omc/self-improve/topics/*/config/settings.json; do
      if [[ -e "${candidate}" ]]; then
        scoped_candidates+=( "${candidate}" )
      fi
    done

    if [[ "${#scoped_candidates[@]}" -eq 1 ]]; then
      SETTINGS="${scoped_candidates[0]}"
      return 0
    fi
    if [[ "${#scoped_candidates[@]}" -gt 1 ]]; then
      err "Multiple self-improve topics exist under ${search_dir}/.omc/self-improve/topics/. Pass --settings, --project-root with --topic/--slug, or set SELF_IMPROVE_SETTINGS_PATH."
      exit 1
    fi

    parent_dir="$(dirname "${search_dir}")"
    # Defensive stop for paths whose parent is themselves (e.g. "//").
    if [[ "${parent_dir}" == "${search_dir}" ]]; then
      break
    fi
    search_dir="${parent_dir}"
  done
  return 1
}
|
|
116
|
+
|
|
117
|
+
resolve_settings_path() {
  # Populate the global SETTINGS, trying the sources in this order:
  #   1. a value that is already set (e.g. from --settings)
  #   2. the SELF_IMPROVE_SETTINGS_PATH environment variable
  #   3. resolve-paths.mjs, when --project-root comes with --topic or --slug
  #   4. an upward search starting at --project-root
  #   5. an upward search starting at --worktree, or the current directory
  #
  # Returns 0 as soon as one source yields a path, 1 when none does.
  # jq is required (the script exits without it) from step 3 onward.
  if [[ -n "${SETTINGS}" ]]; then
    return 0
  fi

  local env_settings="${SELF_IMPROVE_SETTINGS_PATH:-}"
  if [[ -n "${env_settings}" ]]; then
    SETTINGS="${env_settings}"
    return 0
  fi

  require_jq

  if [[ -n "${PROJECT_ROOT}" ]]; then
    # The resolver is only consulted when a topic was named somehow.
    if [[ -n "${TOPIC_SLUG}" || -n "${TOPIC_NAME}" ]]; then
      resolve_settings_from_project_root "${PROJECT_ROOT}" && return 0
    fi

    discover_settings_from_search_root "${PROJECT_ROOT}" && return 0
  fi

  local fallback_root="${WORKTREE_PATH:-$(pwd)}"
  discover_settings_from_search_root "${fallback_root}" && return 0

  return 1
}
|
|
146
|
+
|
|
147
|
+
check_sealed_files() {
  # Fail the run (exit 1) when any path listed in `.sealed_files` of the
  # settings file appears among the files git reports as changed.
  #
  # A sealed entry ending in "/" is a prefix match (everything beneath that
  # directory); any other entry must equal the changed path exactly.
  #
  # With --worktree the changed set is the diff against the merge-base of
  # HEAD and the first improve/* branch (falling back to origin's default
  # branch, then HEAD~1) plus unstaged changes. Without it the changed set
  # is the diff against HEAD plus staged changes.
  #
  # The check is skipped (return 0) when no settings file is found, when
  # `.sealed_files` is absent or empty, or when GIT_DIR is not a git repo.
  #
  # Globals: reads SETTINGS, GIT_DIR, WORKTREE_PATH.
  resolve_settings_path || true

  if [[ -z "${SETTINGS}" || ! -f "${SETTINGS}" ]]; then
    ok "No settings file found — skipping sealed file check."
    return 0
  fi

  # resolve_settings_path returns before its own jq check when the path comes
  # from --settings or SELF_IMPROVE_SETTINGS_PATH. Without jq every query
  # below fails and the sealed list would silently be treated as empty.
  require_jq

  local has_sealed
  has_sealed=$(jq -r 'if (.sealed_files | type) == "array" and (.sealed_files | length) > 0 then "yes" else "no" end' "${SETTINGS}" 2>/dev/null || echo "no")

  if [[ "${has_sealed}" != "yes" ]]; then
    ok "No sealed files configured — skipping."
    return 0
  fi

  if ! git -C "${GIT_DIR}" rev-parse --git-dir &>/dev/null; then
    ok "Not a git repository — skipping sealed file check."
    return 0
  fi

  local modified_files_str=""
  if [[ -n "${WORKTREE_PATH}" ]]; then
    # Find the correct baseline: the improvement branch this experiment branched from.
    # Try improve/* branches first, then fall back to main/master.
    local base_commit
    local improve_branch
    # for-each-ref prints bare branch names. `git branch` output marks the
    # current branch with "*" and branches checked out in another worktree
    # with "+", and a leftover "+" makes the merge-base lookup below fail.
    improve_branch=$(git -C "${GIT_DIR}" for-each-ref --format='%(refname:short)' refs/heads/improve/ 2>/dev/null | head -1 || true)
    if [[ -z "${improve_branch}" ]]; then
      improve_branch=$(git -C "${GIT_DIR}" symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's|refs/remotes/origin/||' || echo "main")
    fi
    base_commit=$(git -C "${GIT_DIR}" merge-base HEAD "${improve_branch}" 2>/dev/null || echo "HEAD~1")
    modified_files_str=$(git -C "${GIT_DIR}" diff --name-only "${base_commit}" 2>/dev/null || true)
    local uncommitted
    uncommitted=$(git -C "${GIT_DIR}" diff --name-only 2>/dev/null || true)
    if [[ -n "${uncommitted}" ]]; then
      modified_files_str="${modified_files_str}"$'\n'"${uncommitted}"
    fi
  else
    modified_files_str=$(git -C "${GIT_DIR}" diff --name-only HEAD 2>/dev/null || true)
    local staged
    staged=$(git -C "${GIT_DIR}" diff --name-only --cached 2>/dev/null || true)
    if [[ -n "${staged}" ]]; then
      modified_files_str="${modified_files_str}"$'\n'"${staged}"
    fi
  fi

  # The two listings can overlap; collapse duplicates before comparing.
  if [[ -n "${modified_files_str}" ]]; then
    modified_files_str=$(echo "${modified_files_str}" | sort -u)
  fi

  if [[ -z "${modified_files_str}" ]]; then
    ok "No modified files detected."
    return 0
  fi

  local violations=""
  local sealed modified
  while IFS= read -r sealed; do
    [[ -z "${sealed}" ]] && continue
    while IFS= read -r modified; do
      [[ -z "${modified}" ]] && continue
      if [[ "${sealed}" == */ ]]; then
        # Directory entry: flag everything underneath it.
        [[ "${modified}" == "${sealed}"* ]] && violations="${violations} ${modified}"
      else
        [[ "${modified}" == "${sealed}" ]] && violations="${violations} ${modified}"
      fi
    done <<< "${modified_files_str}"
  done < <(jq -r '.sealed_files[]' "${SETTINGS}" 2>/dev/null)

  if [[ -n "${violations}" ]]; then
    err "Sealed file(s) were modified:${violations}"
    exit 1
  fi

  local modified_count
  modified_count=$(echo "${modified_files_str}" | wc -l | tr -d ' ')
  ok "Sealed file check passed (${modified_count} modified, none sealed)."
}
|
|
224
|
+
|
|
225
|
+
check_plan_schema() {
  # Validate the structure of a plan JSON file; exits 1 on the first failure.
  #
  # Checks, in order:
  #   - the file exists and parses as a single JSON object
  #   - every required field is present and neither null nor an empty string
  #   - `hypothesis` is a string
  #   - `steps` is an array with at least one element
  #
  # Arguments: $1 — path to the plan file.
  local plan_file="$1"
  require_jq

  if [[ ! -f "${plan_file}" ]]; then
    err "Plan file not found: ${plan_file}"
    exit 1
  fi

  # Reject unparsable or non-object input up front. Otherwise the first jq
  # call below fails with its stderr suppressed and `set -e` ends the script
  # without any explanation.
  local root_type
  root_type=$(jq -r 'type' "${plan_file}" 2>/dev/null || echo "invalid")
  if [[ "${root_type}" != "object" ]]; then
    err "Plan file is not a valid JSON object: ${plan_file}"
    exit 1
  fi

  local required_fields="plan_id planner_id round hypothesis approach_family critic_approved target_files steps expected_outcome history_reference"
  local missing=""
  local field val

  for field in ${required_fields}; do
    val=$(jq -r --arg f "${field}" '.[$f]' "${plan_file}" 2>/dev/null)
    if [[ "${val}" == "null" || -z "${val}" ]]; then
      missing="${missing} ${field}"
    fi
  done

  if [[ -n "${missing}" ]]; then
    err "Plan is missing required fields:${missing}"
    exit 1
  fi
  ok "Plan contains all required fields."

  # approach_family validation is handled by the critic (supports custom families
  # from harness.md). validate.sh only checks structural schema, not taxonomy.

  # Validate hypothesis is a single string
  local hypothesis_type
  hypothesis_type=$(jq -r '.hypothesis | type' "${plan_file}" 2>/dev/null)
  if [[ "${hypothesis_type}" != "string" ]]; then
    err "hypothesis must be a string (got ${hypothesis_type})"
    exit 1
  fi
  ok "One-hypothesis check passed."

  # Validate steps is a non-empty array. jq's `length` is also defined for
  # strings, objects and numbers, so the type has to be checked explicitly;
  # anything that is not an array counts as zero steps.
  local steps_len
  steps_len=$(jq -r 'if (.steps | type) == "array" then (.steps | length) else 0 end' "${plan_file}" 2>/dev/null || echo "0")
  if [[ "${steps_len}" -eq 0 ]]; then
    err "steps must be a non-empty array"
    exit 1
  fi
  ok "Steps validated (${steps_len} step(s))."
}
|
|
269
|
+
|
|
270
|
+
check_result_schema() {
  # Validate the structure of a result JSON file; exits 1 on the first failure.
  #
  # Checks, in order:
  #   - the file exists and parses as a single JSON object
  #   - required fields are present; benchmark_score and benchmark_raw may be
  #     null or empty as long as the key exists
  #   - `status` is one of success, regression, error, timeout
  #   - for any non-success status, `failure_analysis` is an object with
  #     what/why/category/lesson and a category from the allowed list
  #   - `sub_scores`, when present, is null or an object whose values are
  #     numbers or null
  #
  # Arguments: $1 — path to the result file.
  local result_file="$1"
  require_jq

  if [[ ! -f "${result_file}" ]]; then
    err "Result file not found: ${result_file}"
    exit 1
  fi

  # Reject unparsable or non-object input up front. Otherwise the first jq
  # call below fails with its stderr suppressed and `set -e` ends the script
  # without any explanation.
  local root_type
  root_type=$(jq -r 'type' "${result_file}" 2>/dev/null || echo "invalid")
  if [[ "${root_type}" != "object" ]]; then
    err "Result file is not a valid JSON object: ${result_file}"
    exit 1
  fi

  local required_fields="executor_id plan_id benchmark_score status timestamp benchmark_raw"
  local missing=""
  local field val exists

  for field in ${required_fields}; do
    val=$(jq -r --arg f "${field}" '.[$f]' "${result_file}" 2>/dev/null)
    if [[ "${val}" == "null" || -z "${val}" ]]; then
      case "${field}" in
        benchmark_raw|benchmark_score)
          # These two may legitimately be null/empty; only a missing key is an error.
          exists=$(jq --arg f "${field}" 'has($f)' "${result_file}" 2>/dev/null || echo "false")
          if [[ "${exists}" != "true" ]]; then
            missing="${missing} ${field}"
          fi
          ;;
        *)
          missing="${missing} ${field}"
          ;;
      esac
    fi
  done

  if [[ -n "${missing}" ]]; then
    err "Result is missing required fields:${missing}"
    exit 1
  fi
  ok "Result contains all required fields."

  # Validate status enum
  local status
  status=$(jq -r '.status' "${result_file}" 2>/dev/null)
  case "${status}" in
    success|regression|error|timeout) ;;
    *)
      err "Invalid status '${status}'. Must be one of: success, regression, error, timeout"
      exit 1
      ;;
  esac
  ok "Status '${status}' is valid."

  # Check failure_analysis on non-success status
  if [[ "${status}" != "success" ]]; then
    local fa_type
    fa_type=$(jq -r '.failure_analysis | type' "${result_file}" 2>/dev/null)
    if [[ "${fa_type}" != "object" ]]; then
      err "failure_analysis must be a non-null object when status is '${status}' (got ${fa_type})"
      exit 1
    fi

    local fa_fields="what why category lesson"
    local fa_missing=""
    for field in ${fa_fields}; do
      val=$(jq -r --arg f "${field}" '.failure_analysis[$f]' "${result_file}" 2>/dev/null)
      if [[ "${val}" == "null" || -z "${val}" ]]; then
        fa_missing="${fa_missing} ${field}"
      fi
    done

    if [[ -n "${fa_missing}" ]]; then
      err "failure_analysis is missing required fields:${fa_missing}"
      exit 1
    fi
    ok "failure_analysis is complete for non-success status."

    # Validate failure category enum
    local fa_category
    fa_category=$(jq -r '.failure_analysis.category' "${result_file}" 2>/dev/null)
    local valid_categories="oom timeout regression logic_error scope_error infrastructure benchmark_parse_error sealed_file_violation"
    local cat_valid=0
    local category
    # Compared word by word so a value containing spaces can never match.
    for category in ${valid_categories}; do
      if [[ "${fa_category}" == "${category}" ]]; then
        cat_valid=1
        break
      fi
    done

    if [[ ${cat_valid} -eq 0 ]]; then
      err "failure_analysis.category '${fa_category}' is not valid. Must be one of: ${valid_categories}"
      exit 1
    fi
    ok "failure_analysis.category '${fa_category}' is valid."
  fi

  # Validate sub_scores if present
  local has_sub_scores
  has_sub_scores=$(jq 'has("sub_scores")' "${result_file}" 2>/dev/null || echo "false")
  if [[ "${has_sub_scores}" == "true" ]]; then
    local sub_scores_type
    sub_scores_type=$(jq -r '.sub_scores | type' "${result_file}" 2>/dev/null)
    if [[ "${sub_scores_type}" == "object" ]]; then
      # Null entries are tolerated; every other value must be a number.
      local invalid_values
      invalid_values=$(jq -r '.sub_scores | to_entries[] | select(.value != null and (.value | type) != "number") | .key' "${result_file}" 2>/dev/null)
      if [[ -n "${invalid_values}" ]]; then
        err "sub_scores contains non-numeric values for keys: ${invalid_values}"
        exit 1
      fi
      local sub_scores_count
      sub_scores_count=$(jq '.sub_scores | length' "${result_file}" 2>/dev/null)
      ok "sub_scores is a valid object (${sub_scores_count} dimension(s))."
    elif [[ "${sub_scores_type}" != "null" ]]; then
      err "sub_scores must be an object or null (got ${sub_scores_type})"
      exit 1
    fi
  fi
}
|
|
384
|
+
|
|
385
|
+
main() {
  # Entry point: resolve settings, enforce sealed files, then validate the
  # plan file (1st positional argument) and the result file (2nd positional
  # argument) when they were supplied. Any failing check exits the script,
  # so reaching the final banner means everything passed.
  resolve_settings_path || true

  echo "=== self-improve validate.sh ==="
  [[ -z "${SETTINGS}" ]] || echo "Settings: ${SETTINGS}"

  check_sealed_files

  local positional_count="${#POSITIONAL_ARGS[@]}"
  if (( positional_count >= 1 )); then
    check_plan_schema "${POSITIONAL_ARGS[0]}"
  fi
  if (( positional_count >= 2 )); then
    check_result_schema "${POSITIONAL_ARGS[1]}"
  fi

  echo "=== All checks passed ==="
}

main
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Self-Improvement Benchmark Builder
|
|
2
|
+
|
|
3
|
+
## Input Contract
|
|
4
|
+
|
|
5
|
+
Arguments passed via prompt context:
|
|
6
|
+
- `repo_path`: Absolute path to the target repository
|
|
7
|
+
- `goal_path`: Path to goal.md with defined objective and metric
|
|
8
|
+
- `settings_path`: Path to settings.json
|
|
9
|
+
- `agent_settings_path`: Path to agent-settings.json
|
|
10
|
+
- `tracking_path`: Path to tracking/ directory
|
|
11
|
+
|
|
12
|
+
## Role
|
|
13
|
+
|
|
14
|
+
You build a benchmark for the self-improvement loop. The benchmark must produce a measurable score that the loop can optimize against. Prefer adapting existing evaluation over building from scratch.
|
|
15
|
+
|
|
16
|
+
## Prerequisites
|
|
17
|
+
|
|
18
|
+
- Target repo exists and is cloned
|
|
19
|
+
- Goal is defined (`si_setting_goal` is true in agent-settings.json)
|
|
20
|
+
- goal.md has a defined objective and metric
|
|
21
|
+
|
|
22
|
+
## Workflow
|
|
23
|
+
|
|
24
|
+
### Phase 1 — Understand the Goal
|
|
25
|
+
Read goal.md. Extract metric name, direction, target value, scope.
|
|
26
|
+
|
|
27
|
+
### Phase 2 — Repo Survey
|
|
28
|
+
Explore the target repo for existing evaluation:
|
|
29
|
+
- Test suites (pytest, jest, go test, cargo test)
|
|
30
|
+
- Benchmark scripts (benchmark.*, eval.*, score.*)
|
|
31
|
+
- CI evaluation (.github/workflows/)
|
|
32
|
+
- Performance tests, metrics in code
|
|
33
|
+
|
|
34
|
+
Classify: Ready to use | Partially usable | Nothing exists
|
|
35
|
+
|
|
36
|
+
### Phase 3 — Interview (only if needed)
|
|
37
|
+
If approach is unclear, ask up to 3 questions. Hard cap.
|
|
38
|
+
|
|
39
|
+
### Phase 4 — Design
|
|
40
|
+
Requirements:
|
|
41
|
+
- **JSON output preferred**: Last line of stdout as `{"primary": 85.2, "sub_scores": {"dim_a": 0.92}}`
|
|
42
|
+
- **Deterministic**: Same code → same score (fixed seeds)
|
|
43
|
+
- **Fast**: Under 5 minutes ideally
|
|
44
|
+
- **Self-contained**: No external services
|
|
45
|
+
- **Honest**: Measures actual quality
|
|
46
|
+
|
|
47
|
+
### Phase 5 — Implement
|
|
48
|
+
Build the benchmark. Place it in the target repo (scripts/benchmark.py or benchmark.py).
|
|
49
|
+
Must exit 0 on success, non-zero on error. Print score as last stdout line.
|
|
50
|
+
|
|
51
|
+
### Phase 6 — Validate
|
|
52
|
+
Run the benchmark 3 times:
|
|
53
|
+
```
|
|
54
|
+
Run 1: {x}
|
|
55
|
+
Run 2: {y}
|
|
56
|
+
Run 3: {z}
|
|
57
|
+
Variance: {(max-min)/mean * 100}%
|
|
58
|
+
```
|
|
59
|
+
All 3 must complete. Variance must be < 5%.
|
|
60
|
+
|
|
61
|
+
### Phase 7 — Record and Configure
|
|
62
|
+
Update settings.json:
|
|
63
|
+
- `benchmark_command`: the shell command
|
|
64
|
+
- `benchmark_format`: "json", "number", or "pass_fail"
|
|
65
|
+
- `primary_metric`: key name in JSON output (default: "primary")
|
|
66
|
+
|
|
67
|
+
**Add benchmark script to `sealed_files`** — prevents the loop from modifying it.
|
|
68
|
+
|
|
69
|
+
Record baseline to tracking/baseline.json:
|
|
70
|
+
```json
|
|
71
|
+
{ "baseline_score": <mean_score>, "recorded_at": "<ISO 8601>" }
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Update agent-settings.json:
|
|
75
|
+
- `si_setting_benchmark` → true
|
|
76
|
+
- `best_score` → mean_score
|
|
77
|
+
|
|
78
|
+
### Phase 8 — Handoff
|
|
79
|
+
Report: benchmark command, score, variance, and next step.
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Self-Improvement Goal Clarifier
|
|
2
|
+
|
|
3
|
+
## Input Contract
|
|
4
|
+
|
|
5
|
+
Arguments passed via context:
|
|
6
|
+
- `repo_path`: Absolute path to the target repository
|
|
7
|
+
- `config_path`: Path to `<self-improve-root>/config/`
|
|
8
|
+
- `agent_settings_path`: Path to agent-settings.json
|
|
9
|
+
- `topic_slug`: Resolved self-improve topic slug
|
|
10
|
+
|
|
11
|
+
## Role
|
|
12
|
+
|
|
13
|
+
You are an interviewer. Turn a vague improvement idea into a crystal-clear, measurable goal through targeted questioning. One question per round, always targeting the weakest dimension.
|
|
14
|
+
|
|
15
|
+
## Prerequisites
|
|
16
|
+
|
|
17
|
+
- Target repo exists and is cloned
|
|
18
|
+
- If goal.md already has a complete goal, ask: "A goal is already defined. Refine or start fresh?"
|
|
19
|
+
|
|
20
|
+
## Clarity Dimensions
|
|
21
|
+
|
|
22
|
+
Score each 0-100 after every round:
|
|
23
|
+
|
|
24
|
+
| Dimension | What it measures |
|
|
25
|
+
|-----------|-----------------|
|
|
26
|
+
| **Objective** | What exactly should improve? Specific enough to act on? |
|
|
27
|
+
| **Metric** | How do we measure it? Well-defined and automatable? |
|
|
28
|
+
| **Target** | What score are we aiming for? Realistic? |
|
|
29
|
+
| **Scope** | Which files/modules in/out of bounds? |
|
|
30
|
+
|
|
31
|
+
**Ambiguity score** = 100 - average(all dimensions)
|
|
32
|
+
|
|
33
|
+
## Workflow
|
|
34
|
+
|
|
35
|
+
### Phase 1 — Repo Scan (silent)
|
|
36
|
+
Explore the target repo: README, main source, tests, configs. Identify what it does, existing metrics, improvement opportunities. Use this to inform questions.
|
|
37
|
+
|
|
38
|
+
### Phase 2 — Fast-Path Check
|
|
39
|
+
If the user provides a fully formed goal (objective, metric, target, and scope all clear), skip the interview and go to Phase 4.
|
|
40
|
+
|
|
41
|
+
### Phase 3 — Interview Rounds
|
|
42
|
+
Each round:
|
|
43
|
+
1. Score all 4 dimensions
|
|
44
|
+
2. Display scoreboard:
|
|
45
|
+
```
|
|
46
|
+
=== Round {n} ===
|
|
47
|
+
Objective: {score}/100
|
|
48
|
+
Metric: {score}/100
|
|
49
|
+
Target: {score}/100
|
|
50
|
+
Scope: {score}/100
|
|
51
|
+
Ambiguity: {score}%
|
|
52
|
+
```
|
|
53
|
+
3. Ask ONE question targeting the lowest-scoring dimension. Use repo context.
|
|
54
|
+
4. Wait for response. Update scores. Repeat.
|
|
55
|
+
|
|
56
|
+
**Exit when ambiguity <= 20%** (i.e., the average of the four dimension scores is >= 80).
|
|
57
|
+
**Soft cap: 8 rounds**. **Hard cap: 12 rounds**.
|
|
58
|
+
|
|
59
|
+
### Phase 4 — Write Goal
|
|
60
|
+
Write `<self-improve-root>/config/goal.md`:
|
|
61
|
+
```markdown
|
|
62
|
+
# Improvement Goal
|
|
63
|
+
|
|
64
|
+
## Objective
|
|
65
|
+
{specific objective}
|
|
66
|
+
|
|
67
|
+
## Target Metric
|
|
68
|
+
- **Metric name**: {name}
|
|
69
|
+
- **Target value**: {value}
|
|
70
|
+
- **Direction**: higher_is_better | lower_is_better
|
|
71
|
+
|
|
72
|
+
## Scope
|
|
73
|
+
- **In scope**: {files, modules}
|
|
74
|
+
- **Out of scope**: {exclusions}
|
|
75
|
+
|
|
76
|
+
## Milestones (optional)
|
|
77
|
+
| Milestone | Target | Strategy Focus |
|
|
78
|
+
|-----------|--------|----------------|
|
|
79
|
+
|
|
80
|
+
## Experiment Ideas (optional)
|
|
81
|
+
{ideas from interview}
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Update settings.json: `benchmark_direction`, `target_value`
|
|
85
|
+
Set `si_setting_goal` → true in agent-settings.json
|
|
86
|
+
|
|
87
|
+
### Phase 5 — Handoff
|
|
88
|
+
Print summary and suggest next step (benchmark builder if needed).
|
|
89
|
+
|
|
90
|
+
## Constraints
|
|
91
|
+
- ONE question per round
|
|
92
|
+
- Never assume — ask
|
|
93
|
+
- Use repo evidence in questions
|
|
94
|
+
- Partial updates only when writing settings JSON
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Self-Improvement Researcher
|
|
2
|
+
|
|
3
|
+
## Input Contract
|
|
4
|
+
|
|
5
|
+
Arguments passed via prompt context:
|
|
6
|
+
- `iteration`: Current iteration number (1-indexed)
|
|
7
|
+
- `repo_path`: Absolute path to the target repository
|
|
8
|
+
- `goal_path`: Path to goal.md
|
|
9
|
+
- `history_path`: Path to iteration_history/ directory
|
|
10
|
+
- `briefs_path`: Path to research_briefs/ directory
|
|
11
|
+
|
|
12
|
+
## Role
|
|
13
|
+
|
|
14
|
+
You are the **knowledge gatherer** for the self-improvement loop. Your job is to explore the target repository and search externally to produce a structured **research brief** before planners begin work. You run once per iteration, first.
|
|
15
|
+
|
|
16
|
+
Your output — a research brief JSON — is the foundation all N planners read before generating hypotheses.
|
|
17
|
+
|
|
18
|
+
## Inputs
|
|
19
|
+
|
|
20
|
+
Read all of the following before producing output:
|
|
21
|
+
|
|
22
|
+
- Goal file — improvement objective, target metric, scope constraints, experiment ideas
|
|
23
|
+
- Iteration history — ALL prior records (winners, losers, lessons)
|
|
24
|
+
- Prior research briefs — avoid redundant research
|
|
25
|
+
- Target repository — source files, tests, configs, documentation
|
|
26
|
+
|
|
27
|
+
## Workflow
|
|
28
|
+
|
|
29
|
+
1. **Read the goal**: Extract primary metric, target score, scope constraints, user ideas
|
|
30
|
+
2. **Read all iteration history**: Build a map of what has been tried, what worked, what failed
|
|
31
|
+
3. **Check for user ideas**: Treat as highest-priority input
|
|
32
|
+
4. **Deep-dive the target repository**:
|
|
33
|
+
- README, main source, tests, configs, dependencies
|
|
34
|
+
- Known bottlenecks (TODO/FIXME comments, profile outputs)
|
|
35
|
+
- Test coverage gaps, configuration defaults, outdated dependencies
|
|
36
|
+
5. **Determine research strategy** based on iteration state:
|
|
37
|
+
- First iteration → broad exploration across all approach families
|
|
38
|
+
- After failures → avoid repeating documented failures
|
|
39
|
+
- Strategy exhaustion (same family 3+ wins) → shift to unexplored families
|
|
40
|
+
- Near target (within 5%) → fine-grained, low-risk changes
|
|
41
|
+
6. **Search externally** when needed: papers, benchmarks, similar projects, official docs
|
|
42
|
+
7. **Rank ideas**: high confidence first, then medium, then low. 3-10 ideas.
|
|
43
|
+
8. **Write the research brief** as JSON
|
|
44
|
+
|
|
45
|
+
## Output
|
|
46
|
+
|
|
47
|
+
Write to the path specified by the orchestrator. JSON format:
|
|
48
|
+
|
|
49
|
+
```json
|
|
50
|
+
{
|
|
51
|
+
"iteration": 1,
|
|
52
|
+
"researcher_id": "researcher",
|
|
53
|
+
"repo_analysis_summary": "What the codebase does, current metric state, what has been tried, biggest gap",
|
|
54
|
+
"ideas": [
|
|
55
|
+
{
|
|
56
|
+
"title": "Short action-oriented name",
|
|
57
|
+
"source": "Specific origin — file names, issue numbers, paper titles",
|
|
58
|
+
"evidence": "Concrete evidence — line numbers, config values, benchmark numbers",
|
|
59
|
+
"approach_family": "architecture|training_config|data|infrastructure|optimization|testing|documentation|other",
|
|
60
|
+
"confidence": "high|medium|low",
|
|
61
|
+
"estimated_impact": "3-5% or unknown"
|
|
62
|
+
}
|
|
63
|
+
]
|
|
64
|
+
}
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Quality Standards
|
|
68
|
+
|
|
69
|
+
- Every idea has specific, citable evidence
|
|
70
|
+
- No idea repeats a documented failure without explaining the difference
|
|
71
|
+
- Ideas span at least 2 different approach families
|
|
72
|
+
- Ideas sorted: high confidence first
|
|
73
|
+
- Valid JSON matching the schema
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
{
|
|
2
|
+
"trust_confirmed": false,
|
|
3
|
+
"si_setting_goal": false,
|
|
4
|
+
"si_setting_benchmark": false,
|
|
5
|
+
"si_setting_harness": false,
|
|
6
|
+
"iterations": 0,
|
|
7
|
+
"best_score": null,
|
|
8
|
+
"current_milestone": null,
|
|
9
|
+
"current_phase": null,
|
|
10
|
+
"plateau_consecutive_count": 0,
|
|
11
|
+
"circuit_breaker_count": 0,
|
|
12
|
+
"status": "idle",
|
|
13
|
+
"goal_slug": null
|
|
14
|
+
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Improvement Goal
|
|
2
|
+
|
|
3
|
+
## Objective
|
|
4
|
+
<!-- Define what exactly should improve -->
|
|
5
|
+
|
|
6
|
+
## Target Metric
|
|
7
|
+
- **Metric name**:
|
|
8
|
+
- **Target value**:
|
|
9
|
+
- **Direction**: higher_is_better | lower_is_better
|
|
10
|
+
|
|
11
|
+
## Scope
|
|
12
|
+
- **In scope**:
|
|
13
|
+
- **Out of scope**:
|
|
14
|
+
|
|
15
|
+
## Milestones (optional)
|
|
16
|
+
| Milestone | Target | Strategy Focus |
|
|
17
|
+
|-----------|--------|----------------|
|
|
18
|
+
| M1 | | Quick wins, low-hanging fruit |
|
|
19
|
+
| M2 | | Moderate improvements |
|
|
20
|
+
|
|
21
|
+
## Experiment Ideas (optional)
|
|
22
|
+
<!-- Add specific ideas for the improvement loop to try -->
|