@nbardy/oompa 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/test-models +74 -22
- package/package.json +1 -1
package/bin/test-models
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
#!/usr/bin/env bash
|
|
2
|
-
# test-models —
|
|
2
|
+
# test-models — end-to-end validation of models in oompa.json
|
|
3
3
|
#
|
|
4
4
|
# Usage: test-models [path/to/oompa.json]
|
|
5
5
|
#
|
|
6
|
-
#
|
|
7
|
-
#
|
|
6
|
+
# For each unique model, launches the agent and asks it to write a result
|
|
7
|
+
# file. Then checks all expected files exist. This validates the full
|
|
8
|
+
# pipeline: harness CLI → model access → code execution → file I/O.
|
|
8
9
|
|
|
9
10
|
set -euo pipefail
|
|
10
11
|
|
|
@@ -16,9 +17,9 @@ if [ ! -f "$CONFIG" ]; then
|
|
|
16
17
|
exit 1
|
|
17
18
|
fi
|
|
18
19
|
|
|
19
|
-
# Extract unique model strings
|
|
20
|
+
# Extract unique model strings from workers[] and review_model
|
|
20
21
|
MODELS=$(python3 -c "
|
|
21
|
-
import json
|
|
22
|
+
import json
|
|
22
23
|
with open('$CONFIG') as f:
|
|
23
24
|
cfg = json.load(f)
|
|
24
25
|
models = set()
|
|
@@ -36,43 +37,94 @@ if [ -z "$MODELS" ]; then
|
|
|
36
37
|
exit 1
|
|
37
38
|
fi
|
|
38
39
|
|
|
39
|
-
|
|
40
|
+
# Create results directory
|
|
41
|
+
RUN_ID=$(python3 -c "import uuid; print(str(uuid.uuid4())[:8])")
|
|
42
|
+
RESULTS_DIR="tst_results_${RUN_ID}"
|
|
43
|
+
mkdir -p "$RESULTS_DIR"
|
|
44
|
+
|
|
45
|
+
MODEL_COUNT=$(echo "$MODELS" | wc -l | tr -d ' ')
|
|
46
|
+
echo "Testing $MODEL_COUNT models from $CONFIG"
|
|
47
|
+
echo "Results dir: $RESULTS_DIR"
|
|
40
48
|
echo ""
|
|
41
49
|
|
|
42
|
-
|
|
43
|
-
|
|
50
|
+
# Launch all models in parallel
|
|
51
|
+
PIDS=()
|
|
52
|
+
MODEL_NAMES=()
|
|
44
53
|
|
|
45
54
|
while IFS= read -r model; do
|
|
46
55
|
HARNESS="${model%%:*}"
|
|
47
|
-
|
|
56
|
+
# Strip reasoning suffix for the model name passed to CLI
|
|
57
|
+
REST="${model#*:}"
|
|
58
|
+
MODEL_NAME="${REST%%:*}"
|
|
59
|
+
# Safe filename: replace slashes, colons, and dots with underscores
|
|
60
|
+
SAFE_NAME=$(echo "$model" | tr '/:.' '_')
|
|
61
|
+
|
|
62
|
+
MODEL_NAMES+=("$SAFE_NAME")
|
|
63
|
+
PROMPT="Write a file called ${RESULTS_DIR}/${SAFE_NAME}_DONE with exactly the text DONE. Nothing else. Just create that one file."
|
|
48
64
|
|
|
49
|
-
|
|
65
|
+
echo " launching $model ..."
|
|
50
66
|
|
|
51
67
|
case "$HARNESS" in
|
|
52
68
|
claude)
|
|
53
|
-
|
|
69
|
+
claude --model "$MODEL_NAME" -p "$PROMPT" --dangerously-skip-permissions --max-turns 3 \
|
|
70
|
+
> "${RESULTS_DIR}/${SAFE_NAME}.log" 2>&1 &
|
|
71
|
+
PIDS+=($!)
|
|
54
72
|
;;
|
|
55
73
|
codex)
|
|
56
|
-
|
|
74
|
+
codex exec --model "$MODEL_NAME" \
|
|
75
|
+
--dangerously-bypass-approvals-and-sandbox \
|
|
76
|
+
--skip-git-repo-check \
|
|
77
|
+
-- "$PROMPT" \
|
|
78
|
+
> "${RESULTS_DIR}/${SAFE_NAME}.log" 2>&1 &
|
|
79
|
+
PIDS+=($!)
|
|
57
80
|
;;
|
|
58
81
|
*)
|
|
59
|
-
echo "SKIP (unknown harness)"
|
|
60
|
-
|
|
82
|
+
echo " SKIP (unknown harness: $HARNESS)"
|
|
83
|
+
# Drop the name just appended to MODEL_NAMES so no result file is expected for a skipped model
|
|
84
|
+
unset 'MODEL_NAMES[${#MODEL_NAMES[@]}-1]'
|
|
61
85
|
;;
|
|
62
86
|
esac
|
|
87
|
+
done <<< "$MODELS"
|
|
88
|
+
|
|
89
|
+
# Wait for all
|
|
90
|
+
echo ""
|
|
91
|
+
echo "Waiting for all models to complete..."
|
|
92
|
+
for pid in "${PIDS[@]}"; do
|
|
93
|
+
wait "$pid" 2>/dev/null || true
|
|
94
|
+
done
|
|
63
95
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
96
|
+
# Check results
|
|
97
|
+
echo ""
|
|
98
|
+
echo "Results:"
|
|
99
|
+
echo ""
|
|
100
|
+
|
|
101
|
+
PASS=0
|
|
102
|
+
FAIL=0
|
|
103
|
+
|
|
104
|
+
for safe_name in "${MODEL_NAMES[@]}"; do
|
|
105
|
+
RESULT_FILE="${RESULTS_DIR}/${safe_name}_DONE"
|
|
106
|
+
printf " %-40s " "$safe_name"
|
|
107
|
+
|
|
108
|
+
if [ -f "$RESULT_FILE" ]; then
|
|
109
|
+
CONTENT=$(cat "$RESULT_FILE" | tr -d '[:space:]')
|
|
110
|
+
if [ "$CONTENT" = "DONE" ]; then
|
|
111
|
+
echo "PASS"
|
|
112
|
+
PASS=$((PASS + 1))
|
|
113
|
+
else
|
|
114
|
+
echo "FAIL (file exists but content: '$(head -1 "$RESULT_FILE")')"
|
|
115
|
+
FAIL=$((FAIL + 1))
|
|
116
|
+
fi
|
|
67
117
|
else
|
|
68
|
-
echo "FAIL"
|
|
69
|
-
#
|
|
70
|
-
|
|
118
|
+
echo "FAIL (no result file)"
|
|
119
|
+
# Show first few lines of log for debugging
|
|
120
|
+
if [ -f "${RESULTS_DIR}/${safe_name}.log" ]; then
|
|
121
|
+
head -5 "${RESULTS_DIR}/${safe_name}.log" | sed 's/^/ /'
|
|
122
|
+
fi
|
|
71
123
|
FAIL=$((FAIL + 1))
|
|
72
124
|
fi
|
|
73
|
-
done
|
|
125
|
+
done
|
|
74
126
|
|
|
75
127
|
echo ""
|
|
76
|
-
echo "$PASS passed, $FAIL failed"
|
|
128
|
+
echo "$PASS passed, $FAIL failed (results in $RESULTS_DIR/)"
|
|
77
129
|
|
|
78
130
|
[ "$FAIL" -eq 0 ]
|