ultimate-pi 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/.pi/extensions/lib/harness-paths.ts +8 -0
  2. package/.pi/extensions/sentrux-rules-sync.ts +2 -8
  3. package/.pi/harness/browser.json +5 -1
  4. package/.pi/harness/debates/README.md +9 -0
  5. package/.pi/harness/docs/adrs/0006-sentrux-dual-layer.md +1 -1
  6. package/.pi/harness/docs/adrs/0009-sentrux-rules-lifecycle.md +2 -2
  7. package/.pi/harness/incidents/README.md +6 -0
  8. package/.pi/harness/release-readiness-report.md +128 -0
  9. package/.pi/harness/router/proposals/canary-proposal.json +96 -0
  10. package/.pi/harness/runs/019e272f-3eef-7107-9712-ce281de55707-1778773891854/events.jsonl +2 -0
  11. package/.pi/harness/runs/019e272f-3eef-7107-9712-ce281de55707-1778773891854/trace.json +17 -0
  12. package/.pi/harness/runs/019e272f-3eef-7107-9712-ce281de55707-1778773912057/events.jsonl +2 -0
  13. package/.pi/harness/runs/019e272f-3eef-7107-9712-ce281de55707-1778773912057/trace.json +17 -0
  14. package/.pi/harness/runs/019e2732-8651-74e5-9f5d-4d06c3105f25-1778774086096/events.jsonl +6 -0
  15. package/.pi/harness/runs/019e2732-8651-74e5-9f5d-4d06c3105f25-1778774086096/trace.json +42 -0
  16. package/.pi/harness/runs/019e2732-8651-74e5-9f5d-4d06c3105f25-1778774136101/events.jsonl +1 -0
  17. package/.pi/harness/runs/019e2758-b332-771b-ad6f-54d0d8478768-1778776600591/events.jsonl +2 -0
  18. package/.pi/harness/runs/019e2758-b332-771b-ad6f-54d0d8478768-1778776600591/trace.json +17 -0
  19. package/.pi/harness/runs/README.md +6 -0
  20. package/.pi/harness/runs/budget-events.jsonl +4 -0
  21. package/.pi/harness/runs/canary-candidate-router.json +72 -0
  22. package/.pi/harness/runs/canary-evidence.json +9 -0
  23. package/.pi/harness/runs/index.jsonl +4 -0
  24. package/.pi/harness/sentrux/architecture.manifest.json +3 -3
  25. package/.pi/model-router.json +95 -0
  26. package/.pi/prompts/harness-setup.md +13 -14
  27. package/.pi/prompts/release.md +225 -0
  28. package/.pi/scripts/README.md +17 -0
  29. package/{scripts → .pi/scripts}/harness-verify.mjs +3 -3
  30. package/{scripts → .pi/scripts}/sentrux-rules-sync.mjs +2 -2
  31. package/.sentrux/.harness-rules-meta.json +2 -2
  32. package/.sentrux/rules.toml +3 -3
  33. package/CHANGELOG.md +8 -0
  34. package/firecrawl/.env +53 -0
  35. package/package.json +15 -5
  36. package/.ckignore +0 -41
  37. package/.codex/hooks.json +0 -15
  38. package/.env.example +0 -21
  39. package/.gitattributes +0 -1
  40. package/.github/banner-v2.png +0 -0
  41. package/.github/workflows/lint.yml +0 -33
  42. package/.github/workflows/publish-github-packages.yml +0 -35
  43. package/.github/workflows/publish-npm.yml +0 -32
  44. package/CONTRIBUTING.md +0 -166
  45. package/lefthook.yml +0 -9
  46. package/scripts/__pycache__/merge_graphify_corpora.cpython-314.pyc +0 -0
  47. package/scripts/index_youtube_urls.py +0 -376
  48. package/scripts/merge_graphify_corpora.py +0 -398
  49. package/scripts/regen_graphify_html.py +0 -46
  50. package/test/harness-verify.test.mjs +0 -33
  51. /package/{scripts → .pi/scripts}/harness-cli-verify.sh +0 -0
  52. /package/{scripts → .pi/scripts}/harness-graphify-bootstrap.sh +0 -0
@@ -45,3 +45,11 @@ export function resolveHarnessAsset(
45
45
  ): string {
46
46
  return join(getHarnessPackageRoot(moduleUrl), ...segments);
47
47
  }
48
+
49
+ /** Harness CLI scripts shipped under `.pi/scripts/` in the npm package. */
50
+ export function resolveHarnessScript(
51
+ moduleUrl: string,
52
+ scriptName: string,
53
+ ): string {
54
+ return resolveHarnessAsset(moduleUrl, ".pi", "scripts", scriptName);
55
+ }
@@ -4,21 +4,15 @@
4
4
 
5
5
  import { spawn } from "node:child_process";
6
6
  import { existsSync } from "node:fs";
7
- import { join } from "node:path";
8
7
  import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
9
- import { resolveHarnessAsset } from "./lib/harness-paths.js";
8
+ import { resolveHarnessScript } from "./lib/harness-paths.js";
10
9
 
11
10
  function resolveSyncScript(): string {
12
- const packaged = resolveHarnessAsset(
11
+ return resolveHarnessScript(
13
12
  // @ts-expect-error pi extensions run as ESM
14
13
  import.meta.url,
15
- "scripts",
16
14
  "sentrux-rules-sync.mjs",
17
15
  );
18
- if (existsSync(packaged)) {
19
- return packaged;
20
- }
21
- return join(process.cwd(), "scripts", "sentrux-rules-sync.mjs");
22
16
  }
23
17
 
24
18
  function runSync(args: string[]): Promise<{ code: number; output: string }> {
@@ -1 +1,5 @@
1
- {"headless": true, "timeout": 30000, "viewport": {"width": 1280, "height": 720}}
1
+ {
2
+ "headless": true,
3
+ "timeout": 30000,
4
+ "viewport": { "width": 1280, "height": 720 }
5
+ }
@@ -0,0 +1,9 @@
1
+ # Harness Debates
2
+
3
+ Store debate artifacts (`RoundResult`, `ConsensusPacket`, budget events) here.
4
+
5
+ Locked defaults for aggressive budget profile:
6
+
7
+ - `max_rounds=6`
8
+ - `round_token_cap=2500`
9
+ - `debate_global_cap=35000`
@@ -28,4 +28,4 @@ Evaluator trust requires both programmatic gates (policy, budget, integrity) and
28
28
  ## References
29
29
 
30
30
  - `.pi/harness/specs/observation.schema.json`
31
- - `scripts/harness-verify.mjs`
31
+ - `.pi/scripts/harness-verify.mjs`
@@ -11,7 +11,7 @@ Sentrux enforces architecture via [`.sentrux/rules.toml`](https://sentrux.dev/do
11
11
 
12
12
  1. **Canonical source:** [`.pi/harness/sentrux/architecture.manifest.json`](../../sentrux/architecture.manifest.json) — layers, boundaries, global constraints.
13
13
  2. **Generated artifact:** `.sentrux/rules.toml` — committed to git; managed block between `harness:managed:start/end` markers.
14
- 3. **Sync command:** `npm run harness:sentrux-sync` (`scripts/sentrux-rules-sync.mjs`).
14
+ 3. **Sync command:** `npm run harness:sentrux-sync` (`.pi/scripts/sentrux-rules-sync.mjs`).
15
15
  4. **Pi command:** `/harness-sentrux-sync` via `sentrux-rules-sync.ts` extension.
16
16
  5. **When to sync:**
17
17
  - `/harness-setup` Step 2.8 (after sentrux install)
@@ -34,5 +34,5 @@ Sentrux enforces architecture via [`.sentrux/rules.toml`](https://sentrux.dev/do
34
34
  ## References
35
35
 
36
36
  - ADR 0006 (Sentrux dual layer)
37
- - `scripts/sentrux-rules-sync.mjs`
37
+ - `.pi/scripts/sentrux-rules-sync.mjs`
38
38
  - `.pi/extensions/sentrux-rules-sync.ts`
@@ -0,0 +1,6 @@
1
+ # Harness Incidents
2
+
3
+ Store `IncidentRecord` artifacts and any policy override justifications here.
4
+
5
+ - Override policy: one human approver only.
6
+ - Justification is mandatory for every override record.
@@ -0,0 +1,128 @@
1
+ # Release Readiness Report
2
+
3
+ Date: 2026-05-14
4
+ Repo root used: `/home/aryaniyaps/ai-projects/ultimate-pi` (active workspace root, treated as canonical)
5
+
6
+ ## Requested remaining work
7
+
8
+ - `run-adversarial-canary-and-release`
9
+ - `final-prompt-expert-feature-sweep`
10
+
11
+ Plan file was not modified.
12
+
13
+ ## Final integration checks
14
+
15
+ ### 1) TypeScript compile check
16
+
17
+ - Command: `npm run check:ts`
18
+ - Result: PASS
19
+
20
+ ### 2) Full lint/format/test gate
21
+
22
+ - Command: `npm run check:ts && npm run lint && npm run format:check && npm test`
23
+ - Result: FAIL (expected in current tree state)
24
+ - Notes:
25
+ - `biome check` reports existing lint/format issues (including `.pi/extensions/custom-footer.ts` and multiple `.pi/harness/specs/*.json` files).
26
+ - `npm test` fails before test execution due to a Node runtime flag incompatibility:
27
+ - `node: bad option: --experimental-strip-types`
28
+
29
+ ### 3) Release preflight checks
30
+
31
+ - Command: `git rev-parse --is-inside-work-tree && git remote -v && git symbolic-ref -q HEAD && (git diff --quiet && git diff --cached --quiet && echo CLEAN || echo DIRTY)`
32
+ - Result:
33
+ - inside git repo: yes
34
+ - branch: `refs/heads/main`
35
+ - remote `origin`: configured
36
+ - tree cleanliness: `DIRTY` (release/tag push should stay blocked until clean)
37
+
38
+ ## Targeted canary validations
39
+
40
+ ### 1) Prompt and policy canary assertions
41
+
42
+ - Static canary suite executed against:
43
+ - harness prompt templates
44
+ - `policy-gate`
45
+ - `test-diff-integrity`
46
+ - `debate-orchestrator`
47
+ - Result: PASS after prompt sweep updates
48
+ - locked clauses in `harness-auto` preserved
49
+ - prompt argument parsing + usage surfaces present across harness prompts
50
+ - completion behavior sections present for operator-facing harness prompts
51
+ - policy/test/debate lock signals present in extension code
52
+
53
+ ### 2) Router tuning canary (proposal-only)
54
+
55
+ - Created synthetic canary evidence:
56
+ - `.pi/harness/runs/canary-evidence.json`
57
+ - Candidate router for dry proposal:
58
+ - `.pi/harness/runs/canary-candidate-router.json`
59
+ - Command:
60
+ - `node .pi/harness/router/propose-router-tuning.mjs --evidence ... --candidate ... --proposal-out .pi/harness/router/proposals/canary-proposal.json`
61
+ - Result: PASS (proposal created, no live router write)
62
+
63
+ ### 3) Harness schema parse check
64
+
65
+ - Command: Node JSON parse validation across `.pi/harness/specs/*.json`
66
+ - Result: PASS (all 9 schema files parse successfully)
67
+
68
+ ## Lightweight adversarial drills
69
+
70
+ ### 1) Negative apply drill (guardrail validation)
71
+
72
+ - Command:
73
+ - `node .pi/harness/router/apply-router-proposal.mjs --proposal ... --approve-by ... --justification ...`
74
+ - intentionally omitted `--write`
75
+ - Result: PASS (guard correctly blocked apply)
76
+ - Expected error:
77
+ - `missing --write (blind writes and implicit applies are disallowed)`
78
+
79
+ ### 2) Adversarial lock retention
80
+
81
+ - Verified locked governance semantics remain stated in `harness-auto`:
82
+ - adversarial review always required
83
+ - severity-policy-engine remains merge-block authority
84
+ - strict pre-PR gates mandatory
85
+ - never auto-merge
86
+
87
+ ## Prompt expert feature sweep
88
+
89
+ Using guidance from `.pi/agents/pi-pi/prompt-expert.md`, harness prompt templates were refined for:
90
+
91
+ 1. Argument handling:
92
+ - explicit `$ARGUMENTS` parse sections
93
+ - required/optional argument normalization
94
+ - deterministic usage fallback lines
95
+ 2. Completion behavior:
96
+ - explicit terminal output contracts for predictable downstream handoff
97
+ 3. UX consistency:
98
+ - harmonized command usage patterns and closure blocks across harness prompts
99
+ 4. Policy integrity:
100
+ - locked policy constraints intentionally kept intact
101
+
102
+ ## Files updated in this sweep
103
+
104
+ - `.pi/prompts/harness-auto.md`
105
+ - `.pi/prompts/harness-plan.md`
106
+ - `.pi/prompts/harness-run.md`
107
+ - `.pi/prompts/harness-review.md`
108
+ - `.pi/prompts/harness-critic.md`
109
+ - `.pi/prompts/harness-eval.md`
110
+ - `.pi/prompts/harness-trace.md`
111
+ - `.pi/prompts/harness-incident.md`
112
+ - `.pi/prompts/harness-router-tune.md`
113
+ - `.pi/prompts/harness-setup.md`
114
+ - `.pi/harness/release-readiness-report.md` (this report)
115
+
116
+ ## New canary artifacts
117
+
118
+ - `.pi/harness/runs/canary-evidence.json`
119
+ - `.pi/harness/runs/canary-candidate-router.json`
120
+ - `.pi/harness/router/proposals/canary-proposal.json`
121
+
122
+ ## Residual risks
123
+
124
+ 1. Full repo lint/format gate currently fails due to pre-existing issues unrelated to this sweep.
125
+ 2. `npm test` is currently not runnable in this environment because the configured Node flag is unsupported.
126
+ 3. Release flow should remain blocked until working tree is clean and CI-equivalent checks pass.
127
+ 4. Router apply path was intentionally not executed with `--write` during this run (safety-preserving drill).
128
+
@@ -0,0 +1,96 @@
1
+ {
2
+ "schema_version": "1.0.0",
3
+ "proposal_id": "router-tune-2026-05-14T15-44-44-399Z",
4
+ "created_at": "2026-05-14T15:44:44.399Z",
5
+ "router_path": ".pi/model-router.json",
6
+ "base_router_sha256": "2a96fba517cc5b5147f37428d7ed62961b1968c0e83c0e69f02524265449856b",
7
+ "candidate_router_sha256": "2a96fba517cc5b5147f37428d7ed62961b1968c0e83c0e69f02524265449856b",
8
+ "evidence": {
9
+ "sample_count": 24,
10
+ "min_sample_count": 12,
11
+ "success_rate_delta": 0.08,
12
+ "cost_per_task_delta": -0.04,
13
+ "regression_guard_passed": true,
14
+ "trace_refs": ["run-canary-001", "run-canary-002"],
15
+ "notes": "canary validation synthetic evidence"
16
+ },
17
+ "status": "proposed",
18
+ "approval": {
19
+ "required": true,
20
+ "approved_by": null,
21
+ "approved_at": null,
22
+ "justification": null
23
+ },
24
+ "candidate_router": {
25
+ "defaultProfile": "auto",
26
+ "debug": false,
27
+ "classifierModel": "opencode-go/qwen3.6-plus",
28
+ "phaseBias": 0.5,
29
+ "maxSessionBudget": 1,
30
+ "largeContextThreshold": 100000,
31
+ "rules": [
32
+ {
33
+ "matches": ["deploy", "production", "release"],
34
+ "tier": "high",
35
+ "reason": "Safety check for production tasks"
36
+ },
37
+ {
38
+ "matches": "changelog",
39
+ "tier": "low"
40
+ }
41
+ ],
42
+ "profiles": {
43
+ "auto": {
44
+ "high": {
45
+ "model": "opencode-go/deepseek-v4-pro",
46
+ "thinking": "high",
47
+ "fallbacks": ["opencode-go/qwen3.6-plus", "opencode-go/kimi-k2.6"]
48
+ },
49
+ "medium": {
50
+ "model": "opencode-go/qwen3.6-plus",
51
+ "thinking": "medium",
52
+ "fallbacks": ["opencode-go/deepseek-v4-pro"]
53
+ },
54
+ "low": {
55
+ "model": "opencode-go/deepseek-v4-flash",
56
+ "thinking": "low",
57
+ "fallbacks": ["opencode-go/qwen3.5-plus"]
58
+ }
59
+ },
60
+ "cheap": {
61
+ "high": {
62
+ "model": "opencode-go/qwen3.6-plus",
63
+ "thinking": "low",
64
+ "fallbacks": ["opencode-go/qwen3.5-plus"]
65
+ },
66
+ "medium": {
67
+ "model": "opencode-go/qwen3.5-plus",
68
+ "thinking": "off",
69
+ "fallbacks": ["opencode-go/deepseek-v4-flash"]
70
+ },
71
+ "low": {
72
+ "model": "opencode-go/deepseek-v4-flash",
73
+ "thinking": "off",
74
+ "fallbacks": ["opencode-go/qwen3.5-plus"]
75
+ }
76
+ },
77
+ "deep": {
78
+ "high": {
79
+ "model": "opencode-go/deepseek-v4-pro",
80
+ "thinking": "xhigh",
81
+ "fallbacks": ["opencode-go/kimi-k2.6"]
82
+ },
83
+ "medium": {
84
+ "model": "opencode-go/kimi-k2.6",
85
+ "thinking": "medium",
86
+ "fallbacks": ["opencode-go/deepseek-v4-pro"]
87
+ },
88
+ "low": {
89
+ "model": "opencode-go/qwen3.6-plus",
90
+ "thinking": "low",
91
+ "fallbacks": ["opencode-go/deepseek-v4-flash"]
92
+ }
93
+ }
94
+ }
95
+ }
96
+ }
@@ -0,0 +1,2 @@
1
+ {"timestamp":"2026-05-14T15:51:31.965Z","type":"run_start","run_id":"019e272f-3eef-7107-9712-ce281de55707-1778773891854","plan_id":"plan-unknown","phase":"plan"}
2
+ {"timestamp":"2026-05-14T15:51:38.346Z","type":"run_end","run_id":"019e272f-3eef-7107-9712-ce281de55707-1778773891854","phase":"plan","tool_span_count":0,"artifact_ref_count":0}
@@ -0,0 +1,17 @@
1
+ {
2
+ "schema_version": "1.0.0",
3
+ "contract_version": "1.0.0",
4
+ "run_id": "019e272f-3eef-7107-9712-ce281de55707-1778773891854",
5
+ "plan_id": "plan-unknown",
6
+ "agent_id": "019e272f-3eef-7107-9712-ce281de55707",
7
+ "phase": "plan",
8
+ "model": "auto",
9
+ "thinking_level": "off",
10
+ "tool_spans": [],
11
+ "artifact_refs": [],
12
+ "cost": {
13
+ "input_tokens": 15381,
14
+ "output_tokens": 33,
15
+ "total_tokens": 15414
16
+ }
17
+ }
@@ -0,0 +1,2 @@
1
+ {"timestamp":"2026-05-14T15:51:52.062Z","type":"run_start","run_id":"019e272f-3eef-7107-9712-ce281de55707-1778773912057","plan_id":"plan-unknown","phase":"plan"}
2
+ {"timestamp":"2026-05-14T15:52:14.313Z","type":"run_end","run_id":"019e272f-3eef-7107-9712-ce281de55707-1778773912057","phase":"plan","tool_span_count":0,"artifact_ref_count":0}
@@ -0,0 +1,17 @@
1
+ {
2
+ "schema_version": "1.0.0",
3
+ "contract_version": "1.0.0",
4
+ "run_id": "019e272f-3eef-7107-9712-ce281de55707-1778773912057",
5
+ "plan_id": "plan-unknown",
6
+ "agent_id": "019e272f-3eef-7107-9712-ce281de55707",
7
+ "phase": "plan",
8
+ "model": "auto",
9
+ "thinking_level": "off",
10
+ "tool_spans": [],
11
+ "artifact_refs": [],
12
+ "cost": {
13
+ "input_tokens": 31337,
14
+ "output_tokens": 528,
15
+ "total_tokens": 31865
16
+ }
17
+ }
@@ -0,0 +1,6 @@
1
+ {"timestamp":"2026-05-14T15:54:46.136Z","type":"run_start","run_id":"019e2732-8651-74e5-9f5d-4d06c3105f25-1778774086096","plan_id":"plan-unknown","phase":"plan"}
2
+ {"timestamp":"2026-05-14T15:54:59.110Z","type":"tool_start","run_id":"019e2732-8651-74e5-9f5d-4d06c3105f25-1778774086096","tool_call_id":"call_00_7UHDcydTHJHVR2dT5xpb0903","tool_name":"bash"}
3
+ {"timestamp":"2026-05-14T15:54:59.137Z","type":"tool_start","run_id":"019e2732-8651-74e5-9f5d-4d06c3105f25-1778774086096","tool_call_id":"call_01_aNsry1whTl5hRf5Ew91t3142","tool_name":"bash"}
4
+ {"timestamp":"2026-05-14T15:54:59.139Z","type":"tool_start","run_id":"019e2732-8651-74e5-9f5d-4d06c3105f25-1778774086096","tool_call_id":"call_02_N2e56Q6vKr6cAYzd4Z9q7953","tool_name":"bash"}
5
+ {"timestamp":"2026-05-14T15:55:11.546Z","type":"tool_start","run_id":"019e2732-8651-74e5-9f5d-4d06c3105f25-1778774086096","tool_call_id":"call_00_wG71Rv3SKrf6R9K03EeS0264","tool_name":"ctx_batch_execute"}
6
+ {"timestamp":"2026-05-14T15:55:25.167Z","type":"run_end","run_id":"019e2732-8651-74e5-9f5d-4d06c3105f25-1778774086096","phase":"plan","tool_span_count":4,"artifact_ref_count":0}
@@ -0,0 +1,42 @@
1
+ {
2
+ "schema_version": "1.0.0",
3
+ "contract_version": "1.0.0",
4
+ "run_id": "019e2732-8651-74e5-9f5d-4d06c3105f25-1778774086096",
5
+ "plan_id": "plan-unknown",
6
+ "agent_id": "019e2732-8651-74e5-9f5d-4d06c3105f25",
7
+ "phase": "plan",
8
+ "model": "auto",
9
+ "thinking_level": "off",
10
+ "tool_spans": [
11
+ {
12
+ "tool_call_id": "call_00_7UHDcydTHJHVR2dT5xpb0903",
13
+ "tool_name": "bash",
14
+ "started_at": "2026-05-14T15:54:59.108Z",
15
+ "ended_at": "2026-05-14T15:54:59.108Z"
16
+ },
17
+ {
18
+ "tool_call_id": "call_01_aNsry1whTl5hRf5Ew91t3142",
19
+ "tool_name": "bash",
20
+ "started_at": "2026-05-14T15:54:59.136Z",
21
+ "ended_at": "2026-05-14T15:54:59.136Z"
22
+ },
23
+ {
24
+ "tool_call_id": "call_02_N2e56Q6vKr6cAYzd4Z9q7953",
25
+ "tool_name": "bash",
26
+ "started_at": "2026-05-14T15:54:59.139Z",
27
+ "ended_at": "2026-05-14T15:54:59.139Z"
28
+ },
29
+ {
30
+ "tool_call_id": "call_00_wG71Rv3SKrf6R9K03EeS0264",
31
+ "tool_name": "ctx_batch_execute",
32
+ "started_at": "2026-05-14T15:55:11.541Z",
33
+ "ended_at": "2026-05-14T15:55:11.541Z"
34
+ }
35
+ ],
36
+ "artifact_refs": [],
37
+ "cost": {
38
+ "input_tokens": 16951,
39
+ "output_tokens": 1020,
40
+ "total_tokens": 17971
41
+ }
42
+ }
@@ -0,0 +1 @@
1
+ {"timestamp":"2026-05-14T15:55:36.107Z","type":"run_start","run_id":"019e2732-8651-74e5-9f5d-4d06c3105f25-1778774136101","plan_id":"plan-unknown","phase":"plan"}
@@ -0,0 +1,2 @@
1
+ {"timestamp":"2026-05-14T16:36:40.660Z","type":"run_start","run_id":"019e2758-b332-771b-ad6f-54d0d8478768-1778776600591","plan_id":"plan-unknown","phase":"plan"}
2
+ {"timestamp":"2026-05-14T16:36:47.570Z","type":"run_end","run_id":"019e2758-b332-771b-ad6f-54d0d8478768-1778776600591","phase":"plan","tool_span_count":0,"artifact_ref_count":0}
@@ -0,0 +1,17 @@
1
+ {
2
+ "schema_version": "1.0.0",
3
+ "contract_version": "1.0.0",
4
+ "run_id": "019e2758-b332-771b-ad6f-54d0d8478768-1778776600591",
5
+ "plan_id": "plan-unknown",
6
+ "agent_id": "019e2758-b332-771b-ad6f-54d0d8478768",
7
+ "phase": "plan",
8
+ "model": "auto",
9
+ "thinking_level": "off",
10
+ "tool_spans": [],
11
+ "artifact_refs": [],
12
+ "cost": {
13
+ "input_tokens": 21,
14
+ "output_tokens": 32,
15
+ "total_tokens": 53
16
+ }
17
+ }
@@ -0,0 +1,6 @@
1
+ # Harness Runs
2
+
3
+ Store lightweight run metadata and trace indexes here.
4
+
5
+ - Primary source of truth for full trace payloads remains external telemetry.
6
+ - Local files should contain run IDs, pointers, and replay metadata only.
@@ -0,0 +1,4 @@
1
+ {"timestamp":"2026-05-14T15:54:59.134Z","schema_version":"1.0.0","contract_version":"1.0.0","event_type":"budget_exhausted","run_id":"019e2732-8651-74e5-9f5d-4d06c3105f25","debate_id":"plan-budget-guard","round_count":1,"budget_used":16593,"exhaustion_reason":"debate_global_cap_exceeded","caps":{"max_rounds":6,"round_token_cap":2500,"debate_global_cap":35000},"minimum_evidence_confidence":0.6,"default_policy_outcome":"block","human_override_allowed":true}
2
+ {"timestamp":"2026-05-14T15:54:59.138Z","schema_version":"1.0.0","contract_version":"1.0.0","event_type":"budget_exhausted","run_id":"019e2732-8651-74e5-9f5d-4d06c3105f25","debate_id":"plan-budget-guard","round_count":1,"budget_used":16593,"exhaustion_reason":"debate_global_cap_exceeded","caps":{"max_rounds":6,"round_token_cap":2500,"debate_global_cap":35000},"minimum_evidence_confidence":0.6,"default_policy_outcome":"block","human_override_allowed":true}
3
+ {"timestamp":"2026-05-14T15:54:59.140Z","schema_version":"1.0.0","contract_version":"1.0.0","event_type":"budget_exhausted","run_id":"019e2732-8651-74e5-9f5d-4d06c3105f25","debate_id":"plan-budget-guard","round_count":1,"budget_used":16593,"exhaustion_reason":"debate_global_cap_exceeded","caps":{"max_rounds":6,"round_token_cap":2500,"debate_global_cap":35000},"minimum_evidence_confidence":0.6,"default_policy_outcome":"block","human_override_allowed":true}
4
+ {"timestamp":"2026-05-14T15:55:11.581Z","schema_version":"1.0.0","contract_version":"1.0.0","event_type":"budget_exhausted","run_id":"019e2732-8651-74e5-9f5d-4d06c3105f25","debate_id":"plan-budget-guard","round_count":1,"budget_used":17161,"exhaustion_reason":"debate_global_cap_exceeded","caps":{"max_rounds":6,"round_token_cap":2500,"debate_global_cap":35000},"minimum_evidence_confidence":0.6,"default_policy_outcome":"block","human_override_allowed":true}
@@ -0,0 +1,72 @@
1
+ {
2
+ "defaultProfile": "auto",
3
+ "debug": false,
4
+ "classifierModel": "opencode-go/qwen3.6-plus",
5
+ "phaseBias": 0.5,
6
+ "maxSessionBudget": 1.0,
7
+ "largeContextThreshold": 100000,
8
+ "rules": [
9
+ {
10
+ "matches": ["deploy", "production", "release"],
11
+ "tier": "high",
12
+ "reason": "Safety check for production tasks"
13
+ },
14
+ {
15
+ "matches": "changelog",
16
+ "tier": "low"
17
+ }
18
+ ],
19
+ "profiles": {
20
+ "auto": {
21
+ "high": {
22
+ "model": "opencode-go/deepseek-v4-pro",
23
+ "thinking": "high",
24
+ "fallbacks": ["opencode-go/qwen3.6-plus", "opencode-go/kimi-k2.6"]
25
+ },
26
+ "medium": {
27
+ "model": "opencode-go/qwen3.6-plus",
28
+ "thinking": "medium",
29
+ "fallbacks": ["opencode-go/deepseek-v4-pro"]
30
+ },
31
+ "low": {
32
+ "model": "opencode-go/deepseek-v4-flash",
33
+ "thinking": "low",
34
+ "fallbacks": ["opencode-go/qwen3.5-plus"]
35
+ }
36
+ },
37
+ "cheap": {
38
+ "high": {
39
+ "model": "opencode-go/qwen3.6-plus",
40
+ "thinking": "low",
41
+ "fallbacks": ["opencode-go/qwen3.5-plus"]
42
+ },
43
+ "medium": {
44
+ "model": "opencode-go/qwen3.5-plus",
45
+ "thinking": "off",
46
+ "fallbacks": ["opencode-go/deepseek-v4-flash"]
47
+ },
48
+ "low": {
49
+ "model": "opencode-go/deepseek-v4-flash",
50
+ "thinking": "off",
51
+ "fallbacks": ["opencode-go/qwen3.5-plus"]
52
+ }
53
+ },
54
+ "deep": {
55
+ "high": {
56
+ "model": "opencode-go/deepseek-v4-pro",
57
+ "thinking": "xhigh",
58
+ "fallbacks": ["opencode-go/kimi-k2.6"]
59
+ },
60
+ "medium": {
61
+ "model": "opencode-go/kimi-k2.6",
62
+ "thinking": "medium",
63
+ "fallbacks": ["opencode-go/deepseek-v4-pro"]
64
+ },
65
+ "low": {
66
+ "model": "opencode-go/qwen3.6-plus",
67
+ "thinking": "low",
68
+ "fallbacks": ["opencode-go/deepseek-v4-flash"]
69
+ }
70
+ }
71
+ }
72
+ }
@@ -0,0 +1,9 @@
1
+ {
2
+ "sample_count": 24,
3
+ "min_sample_count": 12,
4
+ "success_rate_delta": 0.08,
5
+ "cost_per_task_delta": -0.04,
6
+ "regression_guard_passed": true,
7
+ "trace_refs": ["run-canary-001", "run-canary-002"],
8
+ "notes": "canary validation synthetic evidence"
9
+ }
@@ -0,0 +1,4 @@
1
+ {"timestamp":"2026-05-14T15:51:38.345Z","run_id":"019e272f-3eef-7107-9712-ce281de55707-1778773891854","plan_id":"plan-unknown","phase":"plan","trace_file":"/home/aryaniyaps/ai-projects/ultimate-pi/.pi/harness/runs/019e272f-3eef-7107-9712-ce281de55707-1778773891854/trace.json"}
2
+ {"timestamp":"2026-05-14T15:52:14.312Z","run_id":"019e272f-3eef-7107-9712-ce281de55707-1778773912057","plan_id":"plan-unknown","phase":"plan","trace_file":"/home/aryaniyaps/ai-projects/ultimate-pi/.pi/harness/runs/019e272f-3eef-7107-9712-ce281de55707-1778773912057/trace.json"}
3
+ {"timestamp":"2026-05-14T15:55:25.166Z","run_id":"019e2732-8651-74e5-9f5d-4d06c3105f25-1778774086096","plan_id":"plan-unknown","phase":"plan","trace_file":"/home/aryaniyaps/ai-projects/ultimate-pi/.pi/harness/runs/019e2732-8651-74e5-9f5d-4d06c3105f25-1778774086096/trace.json"}
4
+ {"timestamp":"2026-05-14T16:36:47.569Z","run_id":"019e2758-b332-771b-ad6f-54d0d8478768-1778776600591","plan_id":"plan-unknown","phase":"plan","trace_file":"/home/aryaniyaps/ai-projects/ultimate-pi/.pi/harness/runs/019e2758-b332-771b-ad6f-54d0d8478768-1778776600591/trace.json"}
@@ -34,9 +34,9 @@
34
34
  },
35
35
  {
36
36
  "name": "tooling",
37
- "paths": ["scripts/*", "test/*"],
37
+ "paths": [".pi/scripts/*", "test/*"],
38
38
  "order": 4,
39
- "description": "Deterministic scripts and tests"
39
+ "description": "Harness CLI scripts and tests"
40
40
  }
41
41
  ],
42
42
  "boundaries": [
@@ -61,7 +61,7 @@
61
61
  "reason": "Contracts are data-only JSON schemas; extensions implement behavior"
62
62
  },
63
63
  {
64
- "from": "scripts/*",
64
+ "from": ".pi/scripts/*",
65
65
  "to": ".agents/skills/*",
66
66
  "reason": "CLI scripts stay independent of skill markdown"
67
67
  }
@@ -0,0 +1,95 @@
1
+ {
2
+ "defaultProfile": "auto",
3
+ "debug": false,
4
+ "classifierModel": "opencode-go/qwen3.6-plus",
5
+ "phaseBias": 0.5,
6
+ "maxSessionBudget": 1.0,
7
+ "largeContextThreshold": 100000,
8
+ "rules": [
9
+ {
10
+ "matches": [
11
+ "deploy",
12
+ "production",
13
+ "release"
14
+ ],
15
+ "tier": "high",
16
+ "reason": "Safety check for production tasks"
17
+ },
18
+ {
19
+ "matches": "changelog",
20
+ "tier": "low"
21
+ }
22
+ ],
23
+ "profiles": {
24
+ "auto": {
25
+ "high": {
26
+ "model": "opencode-go/deepseek-v4-pro",
27
+ "thinking": "high",
28
+ "fallbacks": [
29
+ "opencode-go/qwen3.6-plus",
30
+ "opencode-go/kimi-k2.6"
31
+ ]
32
+ },
33
+ "medium": {
34
+ "model": "opencode-go/qwen3.6-plus",
35
+ "thinking": "medium",
36
+ "fallbacks": [
37
+ "opencode-go/deepseek-v4-pro"
38
+ ]
39
+ },
40
+ "low": {
41
+ "model": "opencode-go/deepseek-v4-flash",
42
+ "thinking": "low",
43
+ "fallbacks": [
44
+ "opencode-go/qwen3.5-plus"
45
+ ]
46
+ }
47
+ },
48
+ "cheap": {
49
+ "high": {
50
+ "model": "opencode-go/qwen3.6-plus",
51
+ "thinking": "low",
52
+ "fallbacks": [
53
+ "opencode-go/qwen3.5-plus"
54
+ ]
55
+ },
56
+ "medium": {
57
+ "model": "opencode-go/qwen3.5-plus",
58
+ "thinking": "off",
59
+ "fallbacks": [
60
+ "opencode-go/deepseek-v4-flash"
61
+ ]
62
+ },
63
+ "low": {
64
+ "model": "opencode-go/deepseek-v4-flash",
65
+ "thinking": "off",
66
+ "fallbacks": [
67
+ "opencode-go/qwen3.5-plus"
68
+ ]
69
+ }
70
+ },
71
+ "deep": {
72
+ "high": {
73
+ "model": "opencode-go/deepseek-v4-pro",
74
+ "thinking": "xhigh",
75
+ "fallbacks": [
76
+ "opencode-go/kimi-k2.6"
77
+ ]
78
+ },
79
+ "medium": {
80
+ "model": "opencode-go/kimi-k2.6",
81
+ "thinking": "medium",
82
+ "fallbacks": [
83
+ "opencode-go/deepseek-v4-pro"
84
+ ]
85
+ },
86
+ "low": {
87
+ "model": "opencode-go/qwen3.6-plus",
88
+ "thinking": "low",
89
+ "fallbacks": [
90
+ "opencode-go/deepseek-v4-flash"
91
+ ]
92
+ }
93
+ }
94
+ }
95
+ }