@kontourai/flow-agents 0.1.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/dependabot.yml +23 -0
- package/.github/workflows/release-please.yml +31 -0
- package/.github/workflows/runtime-compat.yml +118 -0
- package/CHANGELOG.md +46 -0
- package/CONTRIBUTING.md +4 -0
- package/README.md +80 -18
- package/build/src/cli/flow-kit.js +9 -4
- package/build/src/cli/init.js +215 -5
- package/build/src/cli/runtime-adapter.js +9 -5
- package/build/src/cli/telemetry-doctor.js +4 -1
- package/build/src/cli/utterance-check.js +65 -1
- package/build/src/runtime-adapters.js +34 -0
- package/build/src/tools/build-universal-bundles.js +285 -0
- package/build/src/tools/filter-installed-packs.js +3 -0
- package/build/src/tools/validate-source-tree.js +5 -1
- package/console.telemetry.json +115 -20
- package/context/scripts/telemetry/lib/config.sh +5 -1
- package/context/settings/flow-agents-settings.json +7 -0
- package/docs/_layouts/default.html +2 -0
- package/docs/context-map.md +1 -0
- package/docs/index.md +53 -4
- package/docs/integrations/conformance.md +246 -0
- package/docs/integrations/framework-adapter.md +275 -0
- package/docs/integrations/harness-install.md +213 -0
- package/docs/integrations/index.md +58 -0
- package/docs/integrations/knowledge-kit-live.md +211 -0
- package/docs/kit-authoring-guide.md +169 -0
- package/docs/north-star.md +2 -2
- package/docs/spec/runtime-hook-surface.md +525 -0
- package/docs/survey-utterance-check.md +211 -94
- package/docs/vision.md +45 -0
- package/evals/acceptance/run.sh +13 -2
- package/evals/acceptance/test_knowledge_kit_live.sh +221 -0
- package/evals/acceptance/test_opencode_harness.sh +121 -0
- package/evals/acceptance/test_pi_harness.sh +113 -0
- package/evals/integration/test_bundle_install.sh +226 -1
- package/evals/integration/test_bundle_lifecycle.sh +641 -0
- package/evals/integration/test_runtime_adapter_activation.sh +113 -1
- package/evals/integration/test_utterance_check.sh +291 -44
- package/evals/run.sh +2 -0
- package/evals/static/test_universal_bundles.sh +137 -2
- package/integrations/strands/README.md +256 -0
- package/integrations/strands/example.py +74 -0
- package/integrations/strands/examples/knowledge_kit_live.py +461 -0
- package/integrations/strands/flow_agents_strands/__init__.py +27 -0
- package/integrations/strands/flow_agents_strands/hooks.py +194 -0
- package/integrations/strands/flow_agents_strands/policy.py +348 -0
- package/integrations/strands/flow_agents_strands/steering.py +225 -0
- package/integrations/strands/flow_agents_strands/telemetry.py +238 -0
- package/integrations/strands/pyproject.toml +38 -0
- package/integrations/strands/tests/__init__.py +0 -0
- package/integrations/strands/tests/test_hooks.py +392 -0
- package/integrations/strands/tests/test_policy.py +315 -0
- package/integrations/strands/tests/test_telemetry.py +184 -0
- package/integrations/strands-ts/README.md +224 -0
- package/integrations/strands-ts/bin/conformance-shim.mjs +257 -0
- package/integrations/strands-ts/package.json +53 -0
- package/integrations/strands-ts/src/hooks.ts +312 -0
- package/integrations/strands-ts/src/index.ts +22 -0
- package/integrations/strands-ts/src/policy.ts +345 -0
- package/integrations/strands-ts/src/telemetry.ts +251 -0
- package/integrations/strands-ts/test/test-policy.ts +322 -0
- package/integrations/strands-ts/test/test-steering.ts +159 -0
- package/integrations/strands-ts/test/test-telemetry.ts +226 -0
- package/integrations/strands-ts/tsconfig.json +20 -0
- package/kits/catalog.json +6 -0
- package/kits/knowledge/adapters/default-store/index.js +821 -0
- package/kits/knowledge/adapters/flow-runner/index.js +1179 -0
- package/kits/knowledge/adapters/flow-runner/telemetry.js +174 -0
- package/kits/knowledge/docs/README.md +135 -0
- package/kits/knowledge/docs/store-contract.md +526 -0
- package/kits/knowledge/evals/consolidation/suite.test.js +1234 -0
- package/kits/knowledge/evals/contract-suite/suite.test.js +670 -0
- package/kits/knowledge/evals/ingest-compile/suite.test.js +574 -0
- package/kits/knowledge/evals/synthesis/suite.test.js +909 -0
- package/kits/knowledge/flows/compile.flow.json +60 -0
- package/kits/knowledge/flows/consolidate.flow.json +77 -0
- package/kits/knowledge/flows/ingest.flow.json +60 -0
- package/kits/knowledge/flows/store-contract.flow.json +48 -0
- package/kits/knowledge/flows/synthesize.flow.json +77 -0
- package/kits/knowledge/kit.json +78 -0
- package/package.json +7 -2
- package/packaging/conformance/README.md +142 -0
- package/packaging/conformance/fixtures/config-protection--allow-no-path.json +18 -0
- package/packaging/conformance/fixtures/config-protection--allow-safe-file.json +20 -0
- package/packaging/conformance/fixtures/config-protection--block-biome.json +20 -0
- package/packaging/conformance/fixtures/config-protection--block-eslintrc.json +20 -0
- package/packaging/conformance/fixtures/quality-gate--allow-no-path.json +17 -0
- package/packaging/conformance/fixtures/quality-gate--allow-nonexistent-file.json +19 -0
- package/packaging/conformance/fixtures/stop-goal-fit--allow-clean-cwd.json +17 -0
- package/packaging/conformance/fixtures/stop-goal-fit--block-strict-mode.json +23 -0
- package/packaging/conformance/fixtures/stop-goal-fit--warn-active-delivery.json +21 -0
- package/packaging/conformance/fixtures/workflow-steering--allow-no-state.json +16 -0
- package/packaging/conformance/fixtures/workflow-steering--inject-active-state.json +29 -0
- package/packaging/conformance/fixtures/workflow-steering--inject-subagent-steering.json +25 -0
- package/packaging/conformance/package.json +4 -0
- package/packaging/conformance/run-conformance.js +322 -0
- package/packaging/manifest.json +59 -0
- package/schemas/flow-agents-settings.schema.json +48 -0
- package/scripts/README.md +4 -0
- package/scripts/dogfood.js +16 -0
- package/scripts/hooks/opencode-hook-adapter.js +123 -0
- package/scripts/hooks/opencode-telemetry-hook.js +101 -0
- package/scripts/hooks/pi-hook-adapter.js +123 -0
- package/scripts/hooks/pi-telemetry-hook.js +105 -0
- package/scripts/hooks/run-hook.js +8 -0
- package/scripts/hooks/utterance-check.js +124 -22
- package/scripts/telemetry/lib/config.sh +5 -1
- package/src/cli/flow-kit.ts +10 -4
- package/src/cli/init.ts +219 -6
- package/src/cli/runtime-adapter.ts +10 -5
- package/src/cli/telemetry-doctor.ts +4 -1
- package/src/cli/utterance-check.ts +71 -1
- package/src/runtime-adapters.ts +35 -0
- package/src/tools/build-universal-bundles.ts +283 -0
- package/src/tools/filter-installed-packs.ts +3 -0
- package/src/tools/validate-source-tree.ts +5 -1
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# test_knowledge_kit_live.sh — Acceptance: Knowledge Kit S5 live example
|
|
3
|
+
#
|
|
4
|
+
# Gated on:
|
|
5
|
+
# 1. ollama binary at /run/current-system/sw/bin/ollama
|
|
6
|
+
# 2. qwen3:1.7b model pulled (checked via ollama list)
|
|
7
|
+
# 3. Python venv with strands-agents[ollama] at /tmp/strands-py-live/venv
|
|
8
|
+
#
|
|
9
|
+
# Skips cleanly if any gate is absent (matching other harness conventions).
|
|
10
|
+
# Starts ollama serve, runs the live example, asserts evidence, stops ollama.
|
|
11
|
+
#
|
|
12
|
+
# Assertions:
|
|
13
|
+
# A1. Script exits 0 (overall PASS printed)
|
|
14
|
+
# A2. <workspace>/.telemetry/full.jsonl exists and contains tool.invoke + tool.result
|
|
15
|
+
# A3. <workspace>/.flow-agents/.telemetry/full.jsonl exists and contains
|
|
16
|
+
# session.start, tool.invoke, tool.result (FlowAgentsHooks events)
|
|
17
|
+
# A4. No new .telemetry directory created in the workspace's parent directory
|
|
18
|
+
# by this script (pre-existing parent-dir .telemetry is not counted)
|
|
19
|
+
# A5. At least 1 compiled record in <workspace>/.knowledge-store/records/
|
|
20
|
+
# A6. Compiled record has provenance source_ids referencing raw records
|
|
21
|
+
#
|
|
22
|
+
set -uo pipefail
|
|
23
|
+
|
|
24
|
+
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
25
|
+
|
|
26
|
+
OLLAMA_BIN="/run/current-system/sw/bin/ollama"
|
|
27
|
+
VENV_PYTHON="/tmp/strands-py-live/venv/bin/python3"
|
|
28
|
+
EXAMPLE_SCRIPT="$ROOT_DIR/integrations/strands/examples/knowledge_kit_live.py"
|
|
29
|
+
|
|
30
|
+
pass=0
|
|
31
|
+
fail=0
|
|
32
|
+
skip=0
|
|
33
|
+
OLLAMA_STARTED=0
|
|
34
|
+
|
|
35
|
+
# Result reporters. Each prints one marked line for the given message ($1)
# and increments the matching global counter (pass / fail / skip).
_pass() {
  echo " ✓ $1"
  pass=$(( pass + 1 ))
}

_fail() {
  echo " ✗ $1"
  fail=$(( fail + 1 ))
}

_skip() {
  echo " ○ $1"
  skip=$(( skip + 1 ))
}
|
|
38
|
+
|
|
39
|
+
# EXIT handler: stop the ollama server launched by this script, if any.
#
# Reads the global OLLAMA_STARTED flag (1 once the script has spawned
# "ollama serve" in the background). Only this shell's own background jobs
# are signalled, so an ollama server that was already running before the
# test started is left untouched (a blanket `pkill -f "ollama serve"` would
# terminate it as well).
cleanup() {
  if [[ "$OLLAMA_STARTED" -eq 1 ]]; then
    local job_pids
    job_pids="$(jobs -p)"
    if [[ -n "$job_pids" ]]; then
      # Intentional word splitting: one PID per background job.
      # Best effort — the job may already have exited.
      # shellcheck disable=SC2086
      kill $job_pids 2>/dev/null || true
    fi
  fi
}
|
|
44
|
+
trap cleanup EXIT
|
|
45
|
+
|
|
46
|
+
echo "=== Acceptance: Knowledge Kit S5 Live Example ==="
|
|
47
|
+
echo ""
|
|
48
|
+
|
|
49
|
+
# ── Gate checks ─────────────────────────────────────────────────────────────
|
|
50
|
+
if [[ ! -x "$OLLAMA_BIN" ]]; then
|
|
51
|
+
_skip "ollama binary not found at $OLLAMA_BIN"
|
|
52
|
+
echo ""
|
|
53
|
+
echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
|
|
54
|
+
exit 0
|
|
55
|
+
fi
|
|
56
|
+
|
|
57
|
+
if [[ ! -x "$VENV_PYTHON" ]]; then
|
|
58
|
+
_skip "Python venv not found at $VENV_PYTHON — run: python3 -m venv /tmp/strands-py-live/venv && /tmp/strands-py-live/venv/bin/pip install 'strands-agents[ollama]'"
|
|
59
|
+
echo ""
|
|
60
|
+
echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
|
|
61
|
+
exit 0
|
|
62
|
+
fi
|
|
63
|
+
|
|
64
|
+
_pass "Gate: ollama binary present"
|
|
65
|
+
_pass "Gate: Python venv with strands-agents present"
|
|
66
|
+
echo ""
|
|
67
|
+
|
|
68
|
+
# ── Start ollama serve ───────────────────────────────────────────────────────
|
|
69
|
+
echo "--- Starting ollama serve ---"
|
|
70
|
+
"$OLLAMA_BIN" serve > /tmp/ollama-knowledge-kit-live.log 2>&1 &
|
|
71
|
+
OLLAMA_STARTED=1
|
|
72
|
+
|
|
73
|
+
# Wait for server to be ready (up to 15 seconds)
|
|
74
|
+
for i in {1..15}; do
|
|
75
|
+
if curl -s localhost:11434/v1/models >/dev/null 2>&1; then
|
|
76
|
+
_pass "ollama serve ready (${i}s)"
|
|
77
|
+
break
|
|
78
|
+
fi
|
|
79
|
+
if [[ "$i" -eq 15 ]]; then
|
|
80
|
+
_fail "ollama serve did not start within 15 seconds"
|
|
81
|
+
echo ""
|
|
82
|
+
echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
|
|
83
|
+
exit 1
|
|
84
|
+
fi
|
|
85
|
+
sleep 1
|
|
86
|
+
done
|
|
87
|
+
|
|
88
|
+
# Model gate AFTER server start: ollama list errors when no server is running,
|
|
89
|
+
# which previously misreported a pulled model as missing (skip-path bug).
|
|
90
|
+
if ! "$OLLAMA_BIN" list 2>/dev/null | grep -q "qwen3:1.7b"; then
|
|
91
|
+
_skip "qwen3:1.7b model not pulled — run: ollama pull qwen3:1.7b"
|
|
92
|
+
echo ""
|
|
93
|
+
echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
|
|
94
|
+
exit 0
|
|
95
|
+
fi
|
|
96
|
+
_pass "Gate: qwen3:1.7b model pulled"
|
|
97
|
+
echo ""
|
|
98
|
+
|
|
99
|
+
# ── Run the example ──────────────────────────────────────────────────────────
|
|
100
|
+
echo "--- Running knowledge_kit_live.py ---"
|
|
101
|
+
EXAMPLE_OUTPUT="$(mktemp /tmp/knowledge-kit-live-output.XXXXXX)"
|
|
102
|
+
|
|
103
|
+
FLOW_AGENTS_ROOT="$ROOT_DIR" \
|
|
104
|
+
"$VENV_PYTHON" "$EXAMPLE_SCRIPT" 2>&1 | tee "$EXAMPLE_OUTPUT"
|
|
105
|
+
EXAMPLE_EXIT="${PIPESTATUS[0]}"
|
|
106
|
+
|
|
107
|
+
echo ""
|
|
108
|
+
|
|
109
|
+
# ── Assert A1: script exits 0 ─────────────────────────────────────────────
|
|
110
|
+
if [[ "$EXAMPLE_EXIT" -eq 0 ]]; then
|
|
111
|
+
_pass "A1: example script exits 0"
|
|
112
|
+
else
|
|
113
|
+
_fail "A1: example script exited $EXAMPLE_EXIT"
|
|
114
|
+
fi
|
|
115
|
+
|
|
116
|
+
# Extract workspace path from script output
|
|
117
|
+
WORKSPACE="$(grep "^Workspace: " "$EXAMPLE_OUTPUT" | head -1 | sed 's/^Workspace: //')"
|
|
118
|
+
if [[ -z "$WORKSPACE" ]]; then
|
|
119
|
+
_fail "Could not extract workspace path from script output"
|
|
120
|
+
echo ""
|
|
121
|
+
echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
|
|
122
|
+
exit 1
|
|
123
|
+
fi
|
|
124
|
+
|
|
125
|
+
echo " Workspace: $WORKSPACE"
|
|
126
|
+
KIT_TELEMETRY="$WORKSPACE/.telemetry/full.jsonl"
|
|
127
|
+
SESSION_TELEMETRY="$WORKSPACE/.flow-agents/.telemetry/full.jsonl"
|
|
128
|
+
STORE_RECORDS="$WORKSPACE/.knowledge-store/records"
|
|
129
|
+
|
|
130
|
+
# ── Assert A2: kit telemetry contains tool.invoke + tool.result ───────────
|
|
131
|
+
# A2: the kit telemetry file must exist and contain at least one tool.invoke
# and one tool.result event. Unparseable lines are ignored.
# The path is handed to node through the environment rather than pasted into
# the JS source, so a workspace path containing quotes or backslashes cannot
# break (or alter) the inline script.
if [[ -f "$KIT_TELEMETRY" ]] && \
   TELEMETRY_FILE="$KIT_TELEMETRY" node -e '
const fs = require("fs");
const lines = fs.readFileSync(process.env.TELEMETRY_FILE, "utf8").trim().split("\n").filter(Boolean);
const types = lines.map(l => { try { return JSON.parse(l).event_type; } catch(e) { return ""; } });
const required = ["tool.invoke", "tool.result"];
const missing = required.filter(t => !types.includes(t));
if (missing.length > 0) { process.stderr.write("missing: " + missing.join(", ") + "\n"); process.exit(1); }
' 2>/dev/null; then
  _pass "A2: kit telemetry contains tool.invoke + tool.result gate events"
else
  _fail "A2: kit telemetry missing or lacks required event types (tool.invoke, tool.result)"
fi
|
|
144
|
+
|
|
145
|
+
# ── Assert A3: session telemetry contains session.start, tool.invoke, tool.result ─
|
|
146
|
+
# A3: the session telemetry file must exist and contain session.start,
# tool.invoke and tool.result events. Unparseable lines are ignored.
# The path reaches node via the environment instead of being interpolated
# into the JS source, so unusual characters in the workspace path cannot
# corrupt the inline script.
if [[ -f "$SESSION_TELEMETRY" ]] && \
   TELEMETRY_FILE="$SESSION_TELEMETRY" node -e '
const fs = require("fs");
const lines = fs.readFileSync(process.env.TELEMETRY_FILE, "utf8").trim().split("\n").filter(Boolean);
const types = lines.map(l => { try { return JSON.parse(l).event_type; } catch(e) { return ""; } });
const required = ["session.start", "tool.invoke", "tool.result"];
const missing = required.filter(t => !types.includes(t));
if (missing.length > 0) { process.stderr.write("missing: " + missing.join(", ") + "\n"); process.exit(1); }
' 2>/dev/null; then
  _pass "A3: session telemetry contains session.start, tool.invoke, tool.result"
else
  _fail "A3: session telemetry missing or lacks required FlowAgentsHooks events"
fi
|
|
159
|
+
|
|
160
|
+
# ── Assert A4: workspace telemetry does not leak to parent ────────────────
|
|
161
|
+
# This assertion checks that telemetry written during this test run does not
|
|
162
|
+
# appear in the parent directory. We verify that the workspace telemetry is
|
|
163
|
+
# contained within WORKSPACE, not in its parent.
|
|
164
|
+
# (Pre-existing .telemetry in the system temp dir is not counted as a leak.)
|
|
165
|
+
PARENT_TELEMETRY="$(dirname "$WORKSPACE")/.telemetry"
|
|
166
|
+
if [[ -d "$PARENT_TELEMETRY" ]]; then
|
|
167
|
+
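# Only fail if it holds .jsonl files modified more recently than $EXAMPLE_OUTPUT (the captured output file, which tee writes to during the example run) — there is no fixed 60s window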
|
|
168
|
+
PARENT_MTIME="$(find "$PARENT_TELEMETRY" -newer "$EXAMPLE_OUTPUT" -name "*.jsonl" 2>/dev/null | wc -l | tr -d ' ')"
|
|
169
|
+
if [[ "$PARENT_MTIME" -gt 0 ]]; then
|
|
170
|
+
_fail "A4: telemetry leaked — new .jsonl files written to workspace parent directory during this test"
|
|
171
|
+
else
|
|
172
|
+
_pass "A4: workspace telemetry contained within workspace (pre-existing parent .telemetry not modified by this test)"
|
|
173
|
+
fi
|
|
174
|
+
else
|
|
175
|
+
_pass "A4: no .telemetry in workspace parent directory"
|
|
176
|
+
fi
|
|
177
|
+
|
|
178
|
+
# ── Assert A5: at least 1 compiled record exists ─────────────────────────
|
|
179
|
+
COMPILED_COUNT=0
|
|
180
|
+
if [[ -d "$STORE_RECORDS" ]]; then
|
|
181
|
+
COMPILED_COUNT=$(grep -rl "type: compiled" "$STORE_RECORDS"/*.md 2>/dev/null | wc -l | tr -d ' ')
|
|
182
|
+
fi
|
|
183
|
+
if [[ "$COMPILED_COUNT" -ge 1 ]]; then
|
|
184
|
+
_pass "A5: compiled record found in store ($COMPILED_COUNT)"
|
|
185
|
+
else
|
|
186
|
+
_fail "A5: no compiled records found in $STORE_RECORDS"
|
|
187
|
+
fi
|
|
188
|
+
|
|
189
|
+
# ── Assert A6: compiled record has provenance source_ids ─────────────────
|
|
190
|
+
# A6: look for a compiled record that carries provenance. A record qualifies
# when it contains "type: compiled" and "source_ids:" and has at least two
# lines matching the list-item pattern below; the first match wins.
PROVENANCE_OK=0
if [[ -d "$STORE_RECORDS" ]]; then
  for compiled_md in "$STORE_RECORDS"/*.md; do
    # An unmatched glob stays literal; skip anything that is not a real file.
    [[ -f "$compiled_md" ]] || continue
    if grep -q "type: compiled" "$compiled_md" && grep -q "source_ids:" "$compiled_md"; then
      # Verify at least 2 raw ids are referenced.
      # grep -c always prints the count (0 included) but exits 1 when nothing
      # matched, so "|| echo 0" produced the two-line value "0\n0" and an
      # arithmetic error in the comparison. Swallow the status instead and
      # default an empty result (unreadable file) to 0.
      SOURCE_COUNT=$(grep -c "^ - " "$compiled_md" 2>/dev/null || true)
      if [[ "${SOURCE_COUNT:-0}" -ge 2 ]]; then
        PROVENANCE_OK=1
        break
      fi
    fi
  done
fi
if [[ "$PROVENANCE_OK" -eq 1 ]]; then
  _pass "A6: compiled record has provenance source_ids with resolving raw refs"
else
  _fail "A6: compiled record missing source_ids or insufficient provenance refs"
fi
|
|
209
|
+
|
|
210
|
+
# ── Cleanup temp files ───────────────────────────────────────────────────
|
|
211
|
+
rm -f "$EXAMPLE_OUTPUT"
|
|
212
|
+
if [[ -d "$WORKSPACE" ]]; then
|
|
213
|
+
rm -rf "$WORKSPACE"
|
|
214
|
+
fi
|
|
215
|
+
|
|
216
|
+
echo ""
|
|
217
|
+
echo "==========================="
|
|
218
|
+
total=$((pass + fail))
|
|
219
|
+
echo "Results: ${pass}/${total} passed, ${fail} failed, ${skip} skipped"
|
|
220
|
+
[[ "$fail" -gt 0 ]] && exit 1
|
|
221
|
+
exit 0
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
set -euo pipefail
|
|
3
|
+
|
|
4
|
+
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
5
|
+
source "$ROOT_DIR/evals/lib/node.sh"
|
|
6
|
+
TMP_WORK=""
|
|
7
|
+
pass=0
|
|
8
|
+
fail=0
|
|
9
|
+
skip=0
|
|
10
|
+
|
|
11
|
+
# EXIT handler: remove the temporary workspace, if one was created.
#
# Written as an explicit `if` so the function returns 0 when TMP_WORK is
# still empty (e.g. on the early "CLI not installed" skip path). The
# `[[ ... ]] && rm` form returns 1 in that case, and with `set -e` active a
# failing trap handler can replace the script's intended exit status.
cleanup() {
  if [[ -n "$TMP_WORK" ]]; then
    rm -rf "$TMP_WORK"
  fi
}
|
|
14
|
+
trap cleanup EXIT
|
|
15
|
+
|
|
16
|
+
# Result reporters: print one marked line for the message in $1 and bump
# the corresponding global tally (pass / fail / skip).
_pass() {
  echo " ✓ $1"
  pass=$(( pass + 1 ))
}

_fail() {
  echo " ✗ $1"
  fail=$(( fail + 1 ))
}

_skip() {
  echo " ○ $1"
  skip=$(( skip + 1 ))
}
|
|
19
|
+
|
|
20
|
+
# Poll until the file named by $1 exists and is non-empty.
# Makes up to 150 checks, sleeping 0.1s after each unsuccessful one.
# Returns 0 as soon as the file has content, 1 once the attempts run out.
wait_for_telemetry() {
  local target="$1"
  local attempt
  for (( attempt = 0; attempt < 150; attempt++ )); do
    if [[ -s "$target" ]]; then
      return 0
    fi
    sleep 0.1
  done
  return 1
}
|
|
30
|
+
|
|
31
|
+
echo "=== Harness Acceptance: opencode ==="
|
|
32
|
+
echo ""
|
|
33
|
+
|
|
34
|
+
if ! command -v opencode >/dev/null 2>&1; then
|
|
35
|
+
_skip "opencode CLI not installed"
|
|
36
|
+
echo ""
|
|
37
|
+
echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
|
|
38
|
+
exit 0
|
|
39
|
+
fi
|
|
40
|
+
|
|
41
|
+
cd "$ROOT_DIR"
|
|
42
|
+
flow_agents_node scripts/build-universal-bundles.js >/dev/null
|
|
43
|
+
|
|
44
|
+
TMP_WORK="$(mktemp -d /tmp/opencode-acceptance-work.XXXXXX)"
|
|
45
|
+
(cd dist/opencode && bash install.sh "$TMP_WORK") >/dev/null
|
|
46
|
+
|
|
47
|
+
echo "--- Plugin Load + Telemetry ---"
|
|
48
|
+
cd "$TMP_WORK"
|
|
49
|
+
rm -rf .telemetry
|
|
50
|
+
|
|
51
|
+
MODEL_ARGS=()
|
|
52
|
+
if [[ -n "${FLOW_AGENTS_ACCEPT_OPENCODE_MODEL:-}" ]]; then
|
|
53
|
+
MODEL_ARGS=(-m "$FLOW_AGENTS_ACCEPT_OPENCODE_MODEL")
|
|
54
|
+
fi
|
|
55
|
+
|
|
56
|
+
# Models sometimes answer without calling the tool (nondeterminism), which
|
|
57
|
+
# would void the tool.invoke/tool.result assertions — force the tool call
|
|
58
|
+
# and retry once if no tool events landed.
|
|
59
|
+
ACCEPT_PROMPT="You MUST call the read tool before replying — answering from memory is a failure. Read the first 5 lines of README.md with the read tool, then reply: done"
|
|
60
|
+
# Run the prompt through opencode, at most twice. The second attempt happens
# only when the first produced no "tool.invoke" event in the telemetry log.
# Any output containing "error" (case-insensitive) is treated as a
# provider/auth problem: provider_error is set and no retry is made.
run_output=""
provider_error=0
for _attempt in 1 2; do
  # ${MODEL_ARGS[@]+"..."} expands to nothing when the array is empty; a bare
  # "${MODEL_ARGS[@]}" is an "unbound variable" error under `set -u` on
  # bash releases older than 4.4.
  run_output="$(opencode run ${MODEL_ARGS[@]+"${MODEL_ARGS[@]}"} "$ACCEPT_PROMPT" 2>&1 || true)"
  if echo "$run_output" | grep -qi "error"; then
    provider_error=1
    break
  fi
  provider_error=0
  # Give the telemetry writer up to 50 polls (0.3s apart) to record the tool call.
  for _i in $(seq 1 50); do
    [[ -s "$TMP_WORK/.telemetry/full.jsonl" ]] && grep -q '"tool.invoke"' "$TMP_WORK/.telemetry/full.jsonl" 2>/dev/null && break
    sleep 0.3
  done
  # Tool event present: no retry needed.
  grep -q '"tool.invoke"' "$TMP_WORK/.telemetry/full.jsonl" 2>/dev/null && break
done
|
|
75
|
+
|
|
76
|
+
LATEST_LOG="$(ls -t ~/.local/share/opencode/log/*.log 2>/dev/null | head -1 || true)"
|
|
77
|
+
if [[ -n "$LATEST_LOG" ]] && grep -q "plugins/flow-agents.js loading plugin" "$LATEST_LOG" 2>/dev/null; then
|
|
78
|
+
_pass "opencode log confirms flow-agents plugin loaded"
|
|
79
|
+
else
|
|
80
|
+
_fail "opencode log did not confirm flow-agents plugin loaded"
|
|
81
|
+
fi
|
|
82
|
+
|
|
83
|
+
telemetry_file="$TMP_WORK/.telemetry/full.jsonl"
|
|
84
|
+
if [[ "$provider_error" -eq 1 ]]; then
|
|
85
|
+
_skip "opencode telemetry assertions skipped (provider/auth error)"
|
|
86
|
+
_skip "opencode telemetry tool events skipped (provider/auth error)"
|
|
87
|
+
else
|
|
88
|
+
if wait_for_telemetry "$telemetry_file"; then
|
|
89
|
+
_pass "opencode telemetry log was written"
|
|
90
|
+
else
|
|
91
|
+
_fail "opencode telemetry log was not written"
|
|
92
|
+
fi
|
|
93
|
+
|
|
94
|
+
if [[ -f "$telemetry_file" ]] && \
|
|
95
|
+
node -e "
|
|
96
|
+
const fs = require('fs');
|
|
97
|
+
const lines = fs.readFileSync('$telemetry_file', 'utf8').trim().split('\n');
|
|
98
|
+
const types = lines.map(l => { try { return JSON.parse(l).event_type; } catch(e) { return ''; } });
|
|
99
|
+
const hasInvoke = types.some(t => t === 'tool.invoke');
|
|
100
|
+
const hasResult = types.some(t => t === 'tool.result');
|
|
101
|
+
process.exit(hasInvoke && hasResult ? 0 : 1);
|
|
102
|
+
" 2>/dev/null; then
|
|
103
|
+
_pass "opencode telemetry contains tool.invoke and tool.result events"
|
|
104
|
+
else
|
|
105
|
+
_fail "opencode telemetry missing tool.invoke or tool.result events"
|
|
106
|
+
fi
|
|
107
|
+
fi
|
|
108
|
+
|
|
109
|
+
PARENT_TELEMETRY="$(dirname "$TMP_WORK")/.telemetry"
|
|
110
|
+
if [[ -d "$PARENT_TELEMETRY" ]]; then
|
|
111
|
+
_fail "opencode wrote .telemetry to workspace parent directory"
|
|
112
|
+
else
|
|
113
|
+
_pass "no .telemetry leak to workspace parent directory"
|
|
114
|
+
fi
|
|
115
|
+
|
|
116
|
+
echo ""
|
|
117
|
+
echo "==========================="
|
|
118
|
+
total=$((pass + fail))
|
|
119
|
+
echo "Results: ${pass}/${total} passed, ${fail} failed, ${skip} skipped"
|
|
120
|
+
[[ "$fail" -gt 0 ]] && exit 1
|
|
121
|
+
exit 0
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
set -euo pipefail
|
|
3
|
+
|
|
4
|
+
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
|
5
|
+
source "$ROOT_DIR/evals/lib/node.sh"
|
|
6
|
+
TMP_WORK=""
|
|
7
|
+
pass=0
|
|
8
|
+
fail=0
|
|
9
|
+
skip=0
|
|
10
|
+
|
|
11
|
+
# EXIT handler: delete the temporary workspace when one exists.
#
# Uses an `if` rather than `[[ ... ]] && rm` so the handler returns 0 while
# TMP_WORK is still empty (the early skip path exits before it is set);
# under `set -e` a non-zero trap handler can override the script's intended
# exit status.
cleanup() {
  if [[ -n "$TMP_WORK" ]]; then
    rm -rf "$TMP_WORK"
  fi
}
|
|
14
|
+
trap cleanup EXIT
|
|
15
|
+
|
|
16
|
+
# Result reporters: emit one marked line for the message passed as $1 and
# advance the matching global counter (pass / fail / skip).
_pass() {
  echo " ✓ $1"
  pass=$(( pass + 1 ))
}

_fail() {
  echo " ✗ $1"
  fail=$(( fail + 1 ))
}

_skip() {
  echo " ○ $1"
  skip=$(( skip + 1 ))
}
|
|
19
|
+
|
|
20
|
+
# Wait for the file given as $1 to exist with non-zero size.
# Counts down 150 attempts, pausing 0.1s after every miss; returns 0 on the
# first successful check and 1 when the attempts are exhausted.
wait_for_telemetry() {
  local path="$1"
  local remaining=150
  while [[ $remaining -gt 0 ]]; do
    if [[ -s "$path" ]]; then
      return 0
    fi
    sleep 0.1
    remaining=$(( remaining - 1 ))
  done
  return 1
}
|
|
30
|
+
|
|
31
|
+
echo "=== Harness Acceptance: pi ==="
|
|
32
|
+
echo ""
|
|
33
|
+
|
|
34
|
+
if ! command -v pi >/dev/null 2>&1; then
|
|
35
|
+
_skip "pi CLI not installed"
|
|
36
|
+
echo ""
|
|
37
|
+
echo "Results: ${pass}/$((pass + fail)) passed, ${fail} failed, ${skip} skipped"
|
|
38
|
+
exit 0
|
|
39
|
+
fi
|
|
40
|
+
|
|
41
|
+
cd "$ROOT_DIR"
|
|
42
|
+
flow_agents_node scripts/build-universal-bundles.js >/dev/null
|
|
43
|
+
|
|
44
|
+
TMP_WORK="$(mktemp -d /tmp/pi-acceptance-work.XXXXXX)"
|
|
45
|
+
(cd dist/pi && bash install.sh "$TMP_WORK") >/dev/null
|
|
46
|
+
|
|
47
|
+
echo "--- Telemetry ---"
|
|
48
|
+
cd "$TMP_WORK"
|
|
49
|
+
rm -rf .telemetry
|
|
50
|
+
|
|
51
|
+
run_output="$(pi --approve -p \
|
|
52
|
+
"Use your read tool to read the first 5 lines of README.md, then reply: done" 2>&1 || true)"
|
|
53
|
+
provider_error=0
|
|
54
|
+
if echo "$run_output" | grep -qi "error"; then
|
|
55
|
+
provider_error=1
|
|
56
|
+
fi
|
|
57
|
+
|
|
58
|
+
telemetry_file="$TMP_WORK/.telemetry/full.jsonl"
|
|
59
|
+
if [[ "$provider_error" -eq 1 ]]; then
|
|
60
|
+
_skip "pi telemetry assertions skipped (provider/auth error)"
|
|
61
|
+
_skip "pi telemetry event types skipped (provider/auth error)"
|
|
62
|
+
_skip "pi telemetry session events skipped (provider/auth error)"
|
|
63
|
+
else
|
|
64
|
+
if wait_for_telemetry "$telemetry_file"; then
|
|
65
|
+
_pass "pi telemetry log was written"
|
|
66
|
+
else
|
|
67
|
+
_fail "pi telemetry log was not written"
|
|
68
|
+
fi
|
|
69
|
+
|
|
70
|
+
if [[ -f "$telemetry_file" ]] && \
|
|
71
|
+
node -e "
|
|
72
|
+
const fs = require('fs');
|
|
73
|
+
const lines = fs.readFileSync('$telemetry_file', 'utf8').trim().split('\n');
|
|
74
|
+
const types = lines.map(l => { try { return JSON.parse(l).event_type; } catch(e) { return ''; } });
|
|
75
|
+
const required = ['session.start', 'tool.invoke', 'tool.result', 'session.end'];
|
|
76
|
+
const missing = required.filter(t => !types.includes(t));
|
|
77
|
+
if (missing.length > 0) { process.stderr.write('missing: ' + missing.join(', ') + '\n'); process.exit(1); }
|
|
78
|
+
process.exit(0);
|
|
79
|
+
" 2>/dev/null; then
|
|
80
|
+
_pass "pi telemetry contains session.start, tool.invoke, tool.result, session.end"
|
|
81
|
+
else
|
|
82
|
+
_fail "pi telemetry missing one or more required event types (session.start, tool.invoke, tool.result, session.end)"
|
|
83
|
+
fi
|
|
84
|
+
|
|
85
|
+
# Assert session.start appears exactly once (guards against before_agent_start double-emit).
|
|
86
|
+
if [[ -f "$telemetry_file" ]] && \
|
|
87
|
+
node -e "
|
|
88
|
+
const fs = require('fs');
|
|
89
|
+
const lines = fs.readFileSync('$telemetry_file', 'utf8').trim().split('\n');
|
|
90
|
+
const types = lines.map(l => { try { return JSON.parse(l).event_type; } catch(e) { return ''; } });
|
|
91
|
+
const count = types.filter(t => t === 'session.start').length;
|
|
92
|
+
if (count !== 1) { process.stderr.write('session.start count=' + count + ' (expected exactly 1)\n'); process.exit(1); }
|
|
93
|
+
process.exit(0);
|
|
94
|
+
" 2>/dev/null; then
|
|
95
|
+
_pass "pi telemetry: session.start appears exactly once (no double-emit)"
|
|
96
|
+
else
|
|
97
|
+
_fail "pi telemetry: session.start count is not 1 (double-emit or missing)"
|
|
98
|
+
fi
|
|
99
|
+
fi
|
|
100
|
+
|
|
101
|
+
PARENT_TELEMETRY="$(dirname "$TMP_WORK")/.telemetry"
|
|
102
|
+
if [[ -d "$PARENT_TELEMETRY" ]]; then
|
|
103
|
+
_fail "pi wrote .telemetry to workspace parent directory"
|
|
104
|
+
else
|
|
105
|
+
_pass "no .telemetry leak to workspace parent directory"
|
|
106
|
+
fi
|
|
107
|
+
|
|
108
|
+
echo ""
|
|
109
|
+
echo "==========================="
|
|
110
|
+
total=$((pass + fail))
|
|
111
|
+
echo "Results: ${pass}/${total} passed, ${fail} failed, ${skip} skipped"
|
|
112
|
+
[[ "$fail" -gt 0 ]] && exit 1
|
|
113
|
+
exit 0
|