instar 1.3.1 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/server.d.ts.map +1 -1
- package/dist/commands/server.js +20 -0
- package/dist/commands/server.js.map +1 -1
- package/dist/core/CodexCliIntelligenceProvider.d.ts +5 -4
- package/dist/core/CodexCliIntelligenceProvider.d.ts.map +1 -1
- package/dist/core/CodexCliIntelligenceProvider.js +55 -3
- package/dist/core/CodexCliIntelligenceProvider.js.map +1 -1
- package/dist/core/FeatureRolloutReconciler.d.ts +76 -0
- package/dist/core/FeatureRolloutReconciler.d.ts.map +1 -0
- package/dist/core/FeatureRolloutReconciler.js +151 -0
- package/dist/core/FeatureRolloutReconciler.js.map +1 -0
- package/dist/core/InitiativeTracker.d.ts +28 -0
- package/dist/core/InitiativeTracker.d.ts.map +1 -1
- package/dist/core/InitiativeTracker.js +5 -0
- package/dist/core/InitiativeTracker.js.map +1 -1
- package/dist/core/PostUpdateMigrator.d.ts.map +1 -1
- package/dist/core/PostUpdateMigrator.js +9 -0
- package/dist/core/PostUpdateMigrator.js.map +1 -1
- package/dist/core/featureRollout.d.ts +52 -0
- package/dist/core/featureRollout.d.ts.map +1 -0
- package/dist/core/featureRollout.js +65 -0
- package/dist/core/featureRollout.js.map +1 -0
- package/dist/core/featureRolloutScan.d.ts +24 -0
- package/dist/core/featureRolloutScan.d.ts.map +1 -0
- package/dist/core/featureRolloutScan.js +134 -0
- package/dist/core/featureRolloutScan.js.map +1 -0
- package/dist/scaffold/templates.d.ts.map +1 -1
- package/dist/scaffold/templates.js +1 -0
- package/dist/scaffold/templates.js.map +1 -1
- package/dist/server/CapabilityIndex.d.ts.map +1 -1
- package/dist/server/CapabilityIndex.js +26 -1
- package/dist/server/CapabilityIndex.js.map +1 -1
- package/package.json +1 -1
- package/src/data/builtin-manifest.json +18 -18
- package/src/scaffold/templates/jobs/instar/initiative-digest-review.md +38 -0
- package/src/scaffold/templates.ts +1 -0
- package/upgrades/1.3.2.md +49 -0
- package/upgrades/1.3.3.md +21 -0
- package/upgrades/side-effects/fix-codex-intel-clean-call.md +108 -0
- package/upgrades/side-effects/graduated-feature-rollout.md +43 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"$schema": "./builtin-manifest.schema.json",
|
|
3
3
|
"schemaVersion": 1,
|
|
4
|
-
"generatedAt": "2026-05-
|
|
5
|
-
"instarVersion": "1.3.
|
|
4
|
+
"generatedAt": "2026-05-26T22:24:54.964Z",
|
|
5
|
+
"instarVersion": "1.3.3",
|
|
6
6
|
"entryCount": 192,
|
|
7
7
|
"entries": {
|
|
8
8
|
"hook:session-start": {
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
"domain": "identity",
|
|
12
12
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
13
13
|
"installedPath": ".instar/hooks/instar/session-start.sh",
|
|
14
|
-
"contentHash": "
|
|
14
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
15
15
|
"since": "2025-01-01"
|
|
16
16
|
},
|
|
17
17
|
"hook:dangerous-command-guard": {
|
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
"domain": "safety",
|
|
21
21
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
22
22
|
"installedPath": ".instar/hooks/instar/dangerous-command-guard.sh",
|
|
23
|
-
"contentHash": "
|
|
23
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
24
24
|
"since": "2025-01-01"
|
|
25
25
|
},
|
|
26
26
|
"hook:grounding-before-messaging": {
|
|
@@ -29,7 +29,7 @@
|
|
|
29
29
|
"domain": "safety",
|
|
30
30
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
31
31
|
"installedPath": ".instar/hooks/instar/grounding-before-messaging.sh",
|
|
32
|
-
"contentHash": "
|
|
32
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
33
33
|
"since": "2025-01-01"
|
|
34
34
|
},
|
|
35
35
|
"hook:compaction-recovery": {
|
|
@@ -38,7 +38,7 @@
|
|
|
38
38
|
"domain": "identity",
|
|
39
39
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
40
40
|
"installedPath": ".instar/hooks/instar/compaction-recovery.sh",
|
|
41
|
-
"contentHash": "
|
|
41
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
42
42
|
"since": "2025-01-01"
|
|
43
43
|
},
|
|
44
44
|
"hook:external-operation-gate": {
|
|
@@ -47,7 +47,7 @@
|
|
|
47
47
|
"domain": "safety",
|
|
48
48
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
49
49
|
"installedPath": ".instar/hooks/instar/external-operation-gate.js",
|
|
50
|
-
"contentHash": "
|
|
50
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
51
51
|
"since": "2025-01-01"
|
|
52
52
|
},
|
|
53
53
|
"hook:deferral-detector": {
|
|
@@ -56,7 +56,7 @@
|
|
|
56
56
|
"domain": "safety",
|
|
57
57
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
58
58
|
"installedPath": ".instar/hooks/instar/deferral-detector.js",
|
|
59
|
-
"contentHash": "
|
|
59
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
60
60
|
"since": "2025-01-01"
|
|
61
61
|
},
|
|
62
62
|
"hook:post-action-reflection": {
|
|
@@ -65,7 +65,7 @@
|
|
|
65
65
|
"domain": "evolution",
|
|
66
66
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
67
67
|
"installedPath": ".instar/hooks/instar/post-action-reflection.js",
|
|
68
|
-
"contentHash": "
|
|
68
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
69
69
|
"since": "2025-01-01"
|
|
70
70
|
},
|
|
71
71
|
"hook:external-communication-guard": {
|
|
@@ -74,7 +74,7 @@
|
|
|
74
74
|
"domain": "safety",
|
|
75
75
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
76
76
|
"installedPath": ".instar/hooks/instar/external-communication-guard.js",
|
|
77
|
-
"contentHash": "
|
|
77
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
78
78
|
"since": "2025-01-01"
|
|
79
79
|
},
|
|
80
80
|
"hook:scope-coherence-collector": {
|
|
@@ -83,7 +83,7 @@
|
|
|
83
83
|
"domain": "coherence",
|
|
84
84
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
85
85
|
"installedPath": ".instar/hooks/instar/scope-coherence-collector.js",
|
|
86
|
-
"contentHash": "
|
|
86
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
87
87
|
"since": "2025-01-01"
|
|
88
88
|
},
|
|
89
89
|
"hook:scope-coherence-checkpoint": {
|
|
@@ -92,7 +92,7 @@
|
|
|
92
92
|
"domain": "coherence",
|
|
93
93
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
94
94
|
"installedPath": ".instar/hooks/instar/scope-coherence-checkpoint.js",
|
|
95
|
-
"contentHash": "
|
|
95
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
96
96
|
"since": "2025-01-01"
|
|
97
97
|
},
|
|
98
98
|
"hook:free-text-guard": {
|
|
@@ -101,7 +101,7 @@
|
|
|
101
101
|
"domain": "safety",
|
|
102
102
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
103
103
|
"installedPath": ".instar/hooks/instar/free-text-guard.sh",
|
|
104
|
-
"contentHash": "
|
|
104
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
105
105
|
"since": "2025-01-01"
|
|
106
106
|
},
|
|
107
107
|
"hook:claim-intercept": {
|
|
@@ -110,7 +110,7 @@
|
|
|
110
110
|
"domain": "coherence",
|
|
111
111
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
112
112
|
"installedPath": ".instar/hooks/instar/claim-intercept.js",
|
|
113
|
-
"contentHash": "
|
|
113
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
114
114
|
"since": "2025-01-01"
|
|
115
115
|
},
|
|
116
116
|
"hook:claim-intercept-response": {
|
|
@@ -119,7 +119,7 @@
|
|
|
119
119
|
"domain": "coherence",
|
|
120
120
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
121
121
|
"installedPath": ".instar/hooks/instar/claim-intercept-response.js",
|
|
122
|
-
"contentHash": "
|
|
122
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
123
123
|
"since": "2025-01-01"
|
|
124
124
|
},
|
|
125
125
|
"hook:stop-gate-router": {
|
|
@@ -128,7 +128,7 @@
|
|
|
128
128
|
"domain": "safety",
|
|
129
129
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
130
130
|
"installedPath": ".instar/hooks/instar/stop-gate-router.js",
|
|
131
|
-
"contentHash": "
|
|
131
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
132
132
|
"since": "2025-01-01"
|
|
133
133
|
},
|
|
134
134
|
"hook:auto-approve-permissions": {
|
|
@@ -137,7 +137,7 @@
|
|
|
137
137
|
"domain": "safety",
|
|
138
138
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
139
139
|
"installedPath": ".instar/hooks/instar/auto-approve-permissions.js",
|
|
140
|
-
"contentHash": "
|
|
140
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
141
141
|
"since": "2025-01-01"
|
|
142
142
|
},
|
|
143
143
|
"job:health-check": {
|
|
@@ -1481,7 +1481,7 @@
|
|
|
1481
1481
|
"type": "subsystem",
|
|
1482
1482
|
"domain": "updates",
|
|
1483
1483
|
"sourcePath": "src/core/PostUpdateMigrator.ts",
|
|
1484
|
-
"contentHash": "
|
|
1484
|
+
"contentHash": "9756cf164786964fc342e90fe21961dd99eaea24ecb173643d5417fcd1abb3eb",
|
|
1485
1485
|
"since": "2025-01-01"
|
|
1486
1486
|
},
|
|
1487
1487
|
"subsystem:scheduler": {
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Initiative Digest Review
|
|
3
|
+
description: Twice-weekly review of the initiative board. The self-driving half of the InitiativeTracker — surfaces initiatives that need a decision, and for ships-staged features in rollout (dry-run to live to default-on) gathers promotion evidence and posts an explicit, evidence-gated recommendation. Near-silent — posts ONLY when a genuinely-new decision is waiting. Operator-gated and flag-derived; it recommends, it never flips a config flag. See GRADUATED-FEATURE-ROLLOUT-SPEC section 4.2.
|
|
4
|
+
schedule: "0 11 * * 1,4"
|
|
5
|
+
priority: medium
|
|
6
|
+
expectedDurationMinutes: 3
|
|
7
|
+
model: sonnet
|
|
8
|
+
enabled: true
|
|
9
|
+
tags:
|
|
10
|
+
- cat:learning
|
|
11
|
+
- initiative
|
|
12
|
+
- rollout
|
|
13
|
+
toolAllowlist: "*"
|
|
14
|
+
unrestrictedTools: true
|
|
15
|
+
---
|
|
16
|
+
You are running the twice-weekly initiative digest review (Mondays and Thursdays). This is the self-driving half of the InitiativeTracker: it makes sure nothing in flight stalls or is forgotten, and it drives ships-staged features toward default-on — without the human operator ever having to remember. Be concise; post AT MOST one consolidated, conversational Telegram message, and only when there is genuinely something to decide.
|
|
17
|
+
|
|
18
|
+
Context: the FeatureRolloutReconciler auto-populates the board from approved specs + merges. A ships-staged feature carries a rollout track whose stage (dry-run → live → default-on) is DERIVED from observing its config flag — you must NEVER flip the flag yourself; you recommend, the human flips `.instar/config.json`, and the next reconcile observes it and advances the stage.
|
|
19
|
+
|
|
20
|
+
Steps:
|
|
21
|
+
|
|
22
|
+
1. **Pull the digest:** `curl -s -H "Authorization: Bearer $AUTH" http://localhost:$PORT/initiatives/digest`. It returns items flagged `needs-user`, `ready-to-advance`, `stale`, or `next-check-due`.
|
|
23
|
+
|
|
24
|
+
2. **Near-silent edge filter.** For each `needs-user` item, only surface it if it is NEWLY needs-user since the last surface (compare against the initiative's `rollout.lastDigestNotifiedAt`, or its `updatedAt`). Do NOT re-surface a decision you already raised and the user hasn't acted on — that is the noise the near-silent standard forbids. `stale` / counts stay on the pull surface (the digest endpoint + dashboard); do not push them.
|
|
25
|
+
|
|
26
|
+
3. **For each ships-staged rollout track that is genuinely ready for a decision:** read its `rollout.evidenceSource`, gather the evidence (e.g. read the named log filter / hit the endpoint), and sanity-check it against `rollout.promotionCriteria`. If the criteria are met, recommend the next stage explicitly: "X has been clean in dry-run for 2 weeks (N events, all genuinely as-expected) — ready to flip to live? That's `flagPath` → dryRun:false in config." If something looks WRONG (e.g. evidence shows the feature misbehaved), lead with that and recommend holding/investigating, regardless of the clock.
|
|
27
|
+
|
|
28
|
+
4. **Stall nag-decay (§4.7).** If a track has been recommended for advancement K times (≈3 cycles) with no action, STOP re-recommending it and note once that it's parked pending an explicit "resume" — never nag forever.
|
|
29
|
+
|
|
30
|
+
5. **Compose ONE message** to the appropriate topic, plain English, under ~700 chars, no raw JSON. Lead with the single most important decision. If nothing is newly actionable, **post nothing and exit** — most runs should be silent.
|
|
31
|
+
|
|
32
|
+
6. After surfacing, stamp `rollout.lastDigestNotifiedAt` (via PATCH /initiatives/:id) on the tracks you surfaced, so the next run's edge filter doesn't repeat them.
|
|
33
|
+
|
|
34
|
+
GUARDRAILS — do NOT cross these:
|
|
35
|
+
- You RECOMMEND; you never advance a rollout stage or flip a config flag yourself. Advancement happens when the human edits config and the reconciler observes it.
|
|
36
|
+
- You never mark a default-on track complete (the reconciler parks it as `paused`, so it stays reopenable).
|
|
37
|
+
- This is the InitiativeTracker's driver. It is DISJOINT from the Evolution Action Queue (`evolution-overdue-check`) and from user Commitments — do not re-surface items those systems own.
|
|
38
|
+
- Stay near-silent. A digest that pings every run becomes the thing the user dismisses unread. Silence is the default; a message is the exception that means "a real decision is waiting."
|
|
@@ -670,6 +670,7 @@ I maintain registries that are the source of truth for specific categories. Thes
|
|
|
670
670
|
| Question | Check First |
|
|
671
671
|
|----------|-------------|
|
|
672
672
|
| What can I do? | \`curl -H "Authorization: Bearer $AUTH" http://localhost:${port}/capabilities\` |
|
|
673
|
+
| What are we working on? / status of a project or initiative? | \`curl -H "Authorization: Bearer $AUTH" http://localhost:${port}/initiatives\` + \`/projects\` (and \`/initiatives/digest\` for what needs a decision) — NEVER answer this from memory |
|
|
673
674
|
| Who do I work with? | \`.instar/USER.md\` |
|
|
674
675
|
| What have I learned? | \`.instar/MEMORY.md\` |
|
|
675
676
|
| What jobs do I have? | \`.instar/jobs.json\` or \`curl -H "Authorization: Bearer $AUTH" http://localhost:${port}/jobs\` |
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Upgrade Guide — v1.3.2
|
|
2
|
+
|
|
3
|
+
<!-- bump: patch -->
|
|
4
|
+
<!-- Valid values: patch, minor, major -->
|
|
5
|
+
<!-- patch = bug fixes, refactors, test additions, doc updates -->
|
|
6
|
+
<!-- minor = new features, new APIs, new capabilities (backwards-compatible) -->
|
|
7
|
+
<!-- major = breaking changes to existing APIs or behavior -->
|
|
8
|
+
|
|
9
|
+
## What Changed
|
|
10
|
+
|
|
11
|
+
**Codex-powered agents stop reloading their full identity on every background "judgment" call.** Instar makes ~1,500+ tiny internal LLM calls per agent per day — classify this message, did that turn finish, summarize this chunk, extract the intent. On a Codex-powered agent, each of those ran `codex exec` *inside the agent's project directory*, which made Codex load the agent's entire ~26 KB `AGENTS.md` identity AND fire the project's `.codex/hooks.json` (session_start / user_prompt_submit / stop) **every single time** — just to answer one word like "normal."
|
|
12
|
+
|
|
13
|
+
This was the dominant cause of two visible problems on Codex agents: the flood of "actively working / message delivered / still working" notifications (the session_start hook firing on ~1,550 spawns/day, so the monitoring layer thought a real session was constantly starting), and intermittent "couldn't deliver — please resend" failures (a dozen of these heavyweight spawns landing in one minute saturated the machine so a real inbound message couldn't get a process slot).
|
|
14
|
+
|
|
15
|
+
The fix gives those calls a clean notepad — the Codex analog of what `ClaudeCliIntelligenceProvider` already does with `--setting-sources user`. `CodexCliIntelligenceProvider` now runs judgment calls in an empty, private (0700, unguessable-name via `mkdtempSync`) scratch directory instead of the project dir, plus `-c project_doc_max_bytes=0`. No identity load, no project hooks. Claude-powered agents are unaffected (they were already clean).
|
|
16
|
+
|
|
17
|
+
## Evidence
|
|
18
|
+
|
|
19
|
+
Reproduced live on this machine's Codex install (codex-cli 0.133.0), before/after.
|
|
20
|
+
|
|
21
|
+
**Before (production incident, 2026-05-25 rollout logs in `~/.codex/sessions/`):** 1,601
|
|
22
|
+
`codex exec` spawns in one day, ~1,550 of them internal judgment calls. A sampled
|
|
23
|
+
message-classifier rollout (21:52) re-injected the full ~26 KB `AGENTS.md` identity AND a
|
|
24
|
+
`SESSION START` block — firing session_start — just to output the single word `normal`.
|
|
25
|
+
Those rollouts ran with `cwd` = the agent's project dir and were 63–110 KB each.
|
|
26
|
+
|
|
27
|
+
**After (controlled run of the built fixed provider against the real codex binary,
|
|
28
|
+
2026-05-26 13:49):** called `CodexCliIntelligenceProvider.evaluate()` with a unique marker
|
|
29
|
+
prompt; located the exact rollout it produced
|
|
30
|
+
(`rollout-2026-05-26T13-49-18-…019e660c….jsonl`). Observed:
|
|
31
|
+
- `cwd` = `/var/folders/…/T/instar-codex-intel-scratch-AOYJWS` (the mkdtemp scratch dir) ✓
|
|
32
|
+
- `AGENTS.md instructions` blocks: **0** (was ≥1) ✓
|
|
33
|
+
- `SESSION START` blocks: **0** (was 1) ✓
|
|
34
|
+
- `CURRENT TIME` hook markers (user_prompt_submit): **0** ✓
|
|
35
|
+
- rollout size 29.6 KB (was 63–110 KB) — the residue is codex's own base prompt, not instar identity.
|
|
36
|
+
|
|
37
|
+
The identity load and the session_start/user_prompt_submit hook firing are gone for
|
|
38
|
+
judgment calls. The agent still returned the correct answer.
|
|
39
|
+
|
|
40
|
+
## What to Tell Your User
|
|
41
|
+
|
|
42
|
+
- **If you run a Codex-powered agent, it should get noticeably quieter and more reliable — no action needed.** The "still working" notification spam and the occasional dropped/"please resend" messages were mostly this one plumbing bug; the agent was effectively re-reading its whole identity ~1,500 times a day. Claude-powered agents won't notice anything (they were never affected).
|
|
43
|
+
|
|
44
|
+
## Summary of New Capabilities
|
|
45
|
+
|
|
46
|
+
| Capability | How to Use |
|
|
47
|
+
|-----------|-----------|
|
|
48
|
+
| Codex judgment calls run identity-free + hook-free | Automatic. `CodexCliIntelligenceProvider` runs `codex exec` in an empty `mkdtempSync` scratch dir + `-c project_doc_max_bytes=0` instead of the project dir. |
|
|
49
|
+
| Hardened scratch dir | Automatic. Unguessable random name, 0700 perms, recreated if a tmp-reaper deletes it — nothing can be planted in the cwd these calls run from. |
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Upgrade Guide — v1.3.3
|
|
2
|
+
|
|
3
|
+
<!-- bump: minor -->
|
|
4
|
+
|
|
5
|
+
## What Changed
|
|
6
|
+
|
|
7
|
+
**Graduated Feature Rollout — the InitiativeTracker now populates and drives itself.** Features that ship behind a dry-run/off flag are auto-registered as tracker initiatives (from their approved spec + merge — no one has to remember), and a single twice-weekly driver surfaces an evidence-based promotion recommendation (dry-run → live → default-on) until a human advances it. A feature can never silently reach default-on: the stage is derived from observing the config flag, and the driver never flips it. The tracker is also wired into discoverability so "what are we working on?" is answered from the live board, not memory.
|
|
8
|
+
|
|
9
|
+
## What to Tell Your User
|
|
10
|
+
|
|
11
|
+
- Ask "what are we working on?" and I'll answer from the live initiative board, not from memory.
|
|
12
|
+
- Features that need time to mature won't stall or be forgotten — there's a standing twice-weekly check that nudges each toward fully-on, with you approving each step.
|
|
13
|
+
|
|
14
|
+
## Summary of New Capabilities
|
|
15
|
+
|
|
16
|
+
| Capability | How to Use |
|
|
17
|
+
|-----------|-----------|
|
|
18
|
+
| Self-populating initiative tracker | Automatic — approved+merged specs register themselves |
|
|
19
|
+
| Twice-weekly rollout driver | Builtin job; recommends promotion, never auto-advances |
|
|
20
|
+
| `ships-staged` spec frontmatter | Declares a feature ships dark → gets a rollout track |
|
|
21
|
+
| Initiative discoverability | `GET /initiatives` surfaced in /capabilities + Registry-First |
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# Side-Effects Review — Codex Intelligence-Provider Clean-Call Fix
|
|
2
|
+
|
|
3
|
+
**Version / slug:** `fix-codex-intel-clean-call`
|
|
4
|
+
**Date:** 2026-05-26
|
|
5
|
+
**Author:** Echo
|
|
6
|
+
**Spec:** `docs/specs/CODEX-INTELLIGENCE-PROVIDER-CLEAN-CALL-SPEC.md` (converged + approved)
|
|
7
|
+
|
|
8
|
+
## Summary of the change
|
|
9
|
+
|
|
10
|
+
`CodexCliIntelligenceProvider.evaluate()` ran `codex exec --cd <agent project dir>` for
|
|
11
|
+
every internal LLM "judgment" call (message classification, terminal-output analysis,
|
|
12
|
+
arc extraction, usher, coherence, etc.). Running in the project dir made Codex load the
|
|
13
|
+
full ~26 KB `AGENTS.md` identity AND fire the project's `.codex/hooks.json`
|
|
14
|
+
(session_start / user_prompt_submit / stop) on **every** call — ~1,550 such calls/day,
|
|
15
|
+
causing notification spam (session_start firing constantly) and spawn-storm delivery
|
|
16
|
+
failures (12 heavyweight spawns/minute saturating the machine).
|
|
17
|
+
|
|
18
|
+
The fix runs these calls in an empty, owner-only scratch dir instead — the Codex analog
|
|
19
|
+
of `ClaudeCliIntelligenceProvider`'s `--setting-sources user`. No identity, no project
|
|
20
|
+
hooks.
|
|
21
|
+
|
|
22
|
+
**Files changed (source):**
|
|
23
|
+
- `src/core/CodexCliIntelligenceProvider.ts` — `evaluate()` now uses an `mkdtempSync`
|
|
24
|
+
scratch dir for `--cd` (not the project dir) + `-c project_doc_max_bytes=0`; added the
|
|
25
|
+
`resolveIntelligenceScratchDir()` helper; removed the now-dead `workingDirectory` field
|
|
26
|
+
(the `workingDirectory` option itself is kept on the options type for API compat, but is no longer used as the exec cwd).
|
|
27
|
+
|
|
28
|
+
**Files changed (tests):**
|
|
29
|
+
- `tests/unit/CodexCliIntelligenceProvider.test.ts` — updated the `--cd` assertion (it
|
|
30
|
+
previously asserted the buggy project-dir behavior) + added 7 cases covering the
|
|
31
|
+
scratch-dir contract, 0700 perms, unguessable name, and tmp-reaper recovery (12 total).
|
|
32
|
+
|
|
33
|
+
**Files changed (spec / report / release notes):**
|
|
34
|
+
- `docs/specs/CODEX-INTELLIGENCE-PROVIDER-CLEAN-CALL-SPEC.md` (+ `.eli16.md`)
|
|
35
|
+
- `docs/specs/reports/codex-intelligence-provider-clean-call-convergence.md`
|
|
36
|
+
- `upgrades/NEXT.md`
|
|
37
|
+
|
|
38
|
+
## Decision-point inventory
|
|
39
|
+
|
|
40
|
+
- **Scratch dir, not the project dir** — the core fix. Judgment calls are cwd-independent
|
|
41
|
+
(per the existing code comment), so an empty cwd is correct.
|
|
42
|
+
- **`mkdtempSync` (random suffix, 0700), not a fixed name** — convergence security finding:
|
|
43
|
+
a fixed `/tmp` name on Linux is plantable (`.codex/hooks.json` squatting; not gated by
|
|
44
|
+
`project_doc_max_bytes`). The unguessable, owner-only dir closes that vector.
|
|
45
|
+
- **Re-verify-before-use** — recreate the dir if a tmp-reaper deleted it during a
|
|
46
|
+
long-lived process.
|
|
47
|
+
- **`-c project_doc_max_bytes=0`** — belt-and-suspenders for an `AGENTS.md` on the cwd
|
|
48
|
+
walk-up; real key, already used in `contextScopeControl.ts`.
|
|
49
|
+
- **Drop `workingDirectory` as exec cwd** — verified only `route.ts` passes it, and only
|
|
50
|
+
for its own PreferenceStore DB path, never the codex cwd.
|
|
51
|
+
|
|
52
|
+
## Over-block / under-block analysis
|
|
53
|
+
|
|
54
|
+
- **Over-block:** none. The provider gates nothing; it only changes the cwd of a spawn.
|
|
55
|
+
Judgment calls that worked before continue to work (the fake-codex unit tests confirm
|
|
56
|
+
the full arg contract).
|
|
57
|
+
- **Under-block:** the *intended* behavioral subtraction is "stop loading identity + firing
|
|
58
|
+
hooks for judgment calls." There is no path where a judgment call legitimately needed the
|
|
59
|
+
identity or hooks — they are stateless classifications/extractions. If a future caller
|
|
60
|
+
did need project context, it must pass it in the prompt (as all current callers do), not
|
|
61
|
+
rely on cwd.
|
|
62
|
+
|
|
63
|
+
## Level-of-abstraction fit
|
|
64
|
+
|
|
65
|
+
The fix lives in the single provider that owns the `codex exec` invocation — the same layer
|
|
66
|
+
where the Claude sibling already solves the identical problem with `--setting-sources user`.
|
|
67
|
+
No higher-level orchestration or config knob is introduced; the concern is local to the
|
|
68
|
+
spawn, so the fix is local to the spawn. Correct altitude.
|
|
69
|
+
|
|
70
|
+
## Signal-vs-authority compliance
|
|
71
|
+
|
|
72
|
+
N/A in the gate sense — this change neither detects nor blocks anything. It is a pure
|
|
73
|
+
invocation-hygiene fix. It does not touch any sentinel/gate authority boundary.
|
|
74
|
+
|
|
75
|
+
## Interactions
|
|
76
|
+
|
|
77
|
+
- **Claude provider:** untouched; asymmetry (flag vs scratch-cwd) is intentional and
|
|
78
|
+
documented — Codex has no single equivalent flag.
|
|
79
|
+
- **Callers (`reflect.ts`, `route.ts`, `server.ts`):** none depend on the codex cwd
|
|
80
|
+
content; verified during integration review. No behavior change for them beyond the
|
|
81
|
+
intended one.
|
|
82
|
+
- **Concurrency:** `mkdtempSync` once + cached + `existsSync` re-check; no race under the
|
|
83
|
+
high call volume (creation is idempotent, and the judgment calls never write into the dir).
|
|
84
|
+
- **Monitoring layer:** positive interaction — the session_start hook no longer fires on
|
|
85
|
+
judgment spawns, so PresenceProxy/standby stops mistaking them for real sessions
|
|
86
|
+
(the notification-spam root cause).
|
|
87
|
+
|
|
88
|
+
## Rollback cost
|
|
89
|
+
|
|
90
|
+
Trivial and isolated. Revert the single source file (and its test). No persisted state, no
|
|
91
|
+
schema, no config/hook/template/migration to unwind — the only on-disk footprint is an
|
|
92
|
+
empty 0700 tmp dir that the OS reaps on its own. Reverting restores the prior (buggy but
|
|
93
|
+
functional) behavior with zero data implications.
|
|
94
|
+
|
|
95
|
+
## Migration parity
|
|
96
|
+
|
|
97
|
+
Code-only change inside the compiled provider. No agent-installed file
|
|
98
|
+
(settings/hooks/config/templates/skills) references the old behavior, so **no
|
|
99
|
+
`PostUpdateMigrator` entry is required** — existing Codex agents receive the fix via the
|
|
100
|
+
normal package update path. Verified by grep during integration review.
|
|
101
|
+
|
|
102
|
+
## Testing evidence
|
|
103
|
+
|
|
104
|
+
- Unit: 12 tests in `CodexCliIntelligenceProvider.test.ts` pass; sibling env-allowlist (4)
|
|
105
|
+
+ factory (10) tests unaffected; clean `tsc` build.
|
|
106
|
+
- Live / bug-fix evidence bar: the before/after rollout reproduction on a real Codex agent
|
|
107
|
+
(identity-loaded before, bare after) is run as the post-merge test-as-self gate and
|
|
108
|
+
recorded before the fix is declared shipped.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Side-Effects Review — Graduated Feature Rollout
|
|
2
|
+
|
|
3
|
+
Spec: `docs/specs/GRADUATED-FEATURE-ROLLOUT-SPEC.md` (v2 CONVERGED + ratified; driver twice-weekly). Branch `build/graduated-feature-rollout` off JKHeadley/main @ v1.3.0.
|
|
4
|
+
|
|
5
|
+
## What changes for a deployed agent
|
|
6
|
+
|
|
7
|
+
Makes the existing InitiativeTracker self-populating + self-driving — no parallel system, no new persistence namespace. Additively:
|
|
8
|
+
- **Schema:** the `Initiative` type gains an optional typed `rollout` block (`flagPath`, `stage`, `evidenceSource`, `promotionCriteria`, `lastDigestNotifiedAt`) + a `RolloutStage` type. Purely additive; pre-rollout records leave it undefined; create/update plumb it through (whitelisted, like the other project-scope fields), so TaskFlow serialization is unchanged for records that don't use it.
|
|
9
|
+
- **`FeatureRolloutReconciler`** (new, server-wired, in-process): auto-registers/advances a `kind:'task'` initiative from spec frontmatter + trace + git state. Bounded since-last-run scan; OCC `ifMatch` on every write; id normalize/truncate/hash; rename-by-specPath; bounded backfill (historical specs terminal, only recent/ships-staged active). In-process because `POST /initiatives` deliberately drops the needed fields.
|
|
10
|
+
- **One twice-weekly builtin driver job** (`initiative-digest-review`, Mon+Thu): reads `/initiatives/digest`, gathers evidence, sets `needsUser` recommendations. **Read-only w.r.t. config flags** — it never flips `flagPath`. Retires/replaces the bespoke `session-reaper-promotion-review`.
|
|
11
|
+
- **Discoverability (Layer D):** `/initiatives` un-suppressed in the capability matrix; a Registry-First "what are we working on" row in the CLAUDE.md template; a session-start line that fires ONLY on a *new* needs-user edge (deduped) — near-silent.
|
|
12
|
+
|
|
13
|
+
## The safety invariant (over/under-block)
|
|
14
|
+
|
|
15
|
+
The danger is a feature silently reaching `default-on`. Structurally impossible here: `deriveRolloutStage` computes the stage from **observation only** (live flag + shipped ConfigDefaults default); `default-on` requires the shipped default to be enabled (a human code change). The driver has **no write path** to flags. And `default-on` *parks* the track as `paused` (non-terminal, so reopenable) rather than archiving it or marking all phases `done` (either of which would seal the record against a future regression via the immutable TaskFlow terminal). Under-block (a stalled rollout) is handled by nag-decay (§4.7), not by forcing advancement.
|
|
16
|
+
|
|
17
|
+
## Level-of-abstraction / signal-vs-authority
|
|
18
|
+
|
|
19
|
+
The reconciler + driver compute signals (verdicts, recommendations); authority to advance stays with the human flipping the config flag. The reconciler only *observes* and reflects.
|
|
20
|
+
|
|
21
|
+
## Interactions
|
|
22
|
+
|
|
23
|
+
- Built ON the InitiativeTracker; disjoint from the Evolution Action Queue (`evolution-overdue-check`) and Commitments (§4.8) — no double-nag.
|
|
24
|
+
- Auto-registered tasks are top-level (retroactive `parentProjectId` attach is validation-rejected); project membership stays a deliberate `/instar-project` act.
|
|
25
|
+
- The driver replaces the bespoke per-feature job (which the builtin-job reconciler retired on restart) — a single builtin, restart-durable.
|
|
26
|
+
|
|
27
|
+
## Rollback
|
|
28
|
+
|
|
29
|
+
The `rollout` schema field is additive/optional. The reconciler + driver ship off/observational; disabling the driver job (`enabled:false`) and not wiring the reconciler reverts to today's passive tracker. No data migration; archived tracks remain readable.
|
|
30
|
+
|
|
31
|
+
## Tests
|
|
32
|
+
|
|
33
|
+
3-tier incl. the dogfood backfill e2e (SessionReaper retroactive), near-silent edge dedupe, flag-never-flipped, wiring-integrity. Live test-as-self before merge.
|
|
34
|
+
|
|
35
|
+
## Post-build review fixes (multi-agent code review)
|
|
36
|
+
|
|
37
|
+
Independent review (correctness + wiring/regression passes) confirmed the wiring is clean and the no-silent-default-on invariant holds, and caught one BLOCKER + minors, all fixed:
|
|
38
|
+
- **BLOCKER — default-on seal:** `status:'archived'` maps to TaskFlow's TERMINAL `cancelled`, which would seal a default-on rollout track against a later regression (tests missed it by not enabling TaskFlow). Fixed: default-on now parks the track as **`paused`** (non-terminal → reopenable, and off the active/stale list); historical non-rollout backfill keeps `archived` (genuinely terminal). A new TaskFlow-enabled regression test proves default-on→live reopens (would fail pre-fix).
|
|
39
|
+
- **MINOR:** `makeFlagObserver` now handles bare-boolean flags (not just `{enabled,dryRun}` objects). Scanner reads `createdAt ?? timestamp` (trace timestamp field drift). Test uses `SafeFsExecutor` (destructive-tool lint).
|
|
40
|
+
|
|
41
|
+
## CI fix — job-template frontmatter YAML
|
|
42
|
+
|
|
43
|
+
CI caught a real bug the local pre-push smoke missed: the driver job's `description` contained `Near-silent: posts` — the `: ` made the real YAML loader read it as a nested mapping ("bad indentation of a mapping entry"), so `installBuiltinJobs` errored during migration and several migration/parity unit shards failed. Fixed by removing colon-space / arrow / `§` chars from the description (matching the unquoted-no-colon convention of the other job templates). Verified: js-yaml parses it, and default-jobs-valid + migration-guarantee + parity-primitives-lifecycle are green.
|