voidforge-build 23.10.0 → 23.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.claude/agents/bashir-field-medic.md +1 -0
- package/dist/.claude/agents/coulson-release.md +3 -0
- package/dist/.claude/agents/irulan-historian.md +3 -0
- package/dist/.claude/agents/loki-chaos.md +1 -0
- package/dist/.claude/agents/picard-architecture.md +3 -0
- package/dist/.claude/agents/silver-surfer-herald.md +3 -0
- package/dist/.claude/agents/sisko-campaign.md +3 -0
- package/dist/.claude/commands/architect.md +38 -0
- package/dist/.claude/commands/campaign.md +2 -0
- package/dist/.claude/commands/gauntlet.md +11 -0
- package/dist/.claude/commands/git.md +49 -6
- package/dist/CHANGELOG.md +84 -0
- package/dist/CLAUDE.md +13 -4
- package/dist/VERSION.md +3 -1
- package/dist/docs/methods/AI_INTELLIGENCE.md +15 -0
- package/dist/docs/methods/BACKEND_ENGINEER.md +48 -0
- package/dist/docs/methods/CAMPAIGN.md +196 -1
- package/dist/docs/methods/DEVOPS_ENGINEER.md +16 -0
- package/dist/docs/methods/FORGE_KEEPER.md +18 -0
- package/dist/docs/methods/GAUNTLET.md +2 -0
- package/dist/docs/methods/QA_ENGINEER.md +46 -0
- package/dist/docs/methods/RELEASE_MANAGER.md +85 -0
- package/dist/docs/methods/SECURITY_AUDITOR.md +53 -0
- package/dist/docs/methods/SUB_AGENTS.md +90 -0
- package/dist/docs/methods/SYSTEMS_ARCHITECT.md +42 -2
- package/dist/docs/methods/TESTING.md +17 -0
- package/dist/docs/methods/TIME_VAULT.md +17 -0
- package/dist/docs/patterns/adr-verification-gate.md +80 -0
- package/dist/docs/patterns/ai-eval.ts +87 -0
- package/dist/docs/patterns/ai-prompt-safety.ts +242 -0
- package/dist/docs/patterns/audit-log.ts +132 -0
- package/dist/docs/patterns/llm-state-dedup.ts +246 -0
- package/dist/docs/patterns/middleware.ts +83 -0
- package/dist/docs/patterns/multi-tenant-pool-bypass.ts +134 -0
- package/dist/docs/patterns/multi-tenant-property-test.ts +127 -0
- package/dist/docs/patterns/refactor-extraction.md +96 -0
- package/dist/wizard/lib/project-init.js +57 -0
- package/package.json +1 -1
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pattern: Audit Log (system-event NULL trap + integrity)
|
|
3
|
+
*
|
|
4
|
+
* Source: Field report #319 §6. `audit_log.org_id INTEGER NOT NULL DEFAULT 1`
|
|
5
|
+
* rejects explicit NULL inserts. Spec called for `org_id=NULL` for system
|
|
6
|
+
* events; code wrote `None`; PG raised IntegrityError; an `except Exception:
|
|
7
|
+
* pass` swallowed it; the audit row was silently lost on every system event.
|
|
8
|
+
*
|
|
9
|
+
* The audit table cannot be a system of record AND a tenant-scoped table at
|
|
10
|
+
* the same time. This pattern documents the two valid resolutions and the
|
|
11
|
+
* integrity properties any audit pipeline must hold.
|
|
12
|
+
*
|
|
13
|
+
* Pairs with /docs/patterns/financial-transaction.ts (hash-chained append)
|
|
14
|
+
* for higher-stakes audit trails.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
// ── The Two Valid Patterns ────────────────────────────────────────────────
|
|
18
|
+
|
|
19
|
+
// Pattern 1: Schema relaxation — make org_id nullable, write NULL for system
|
|
20
|
+
// events. Most explicit. Visible in `\d audit_log`. Migration cost.
|
|
21
|
+
//
|
|
22
|
+
// ALTER TABLE audit_log ALTER COLUMN org_id DROP NOT NULL;
|
|
23
|
+
// -- Operators query: WHERE org_id IS NULL
|
|
24
|
+
//
|
|
25
|
+
// Pattern 2: Sentinel + tag — write the placeholder DEFAULT (e.g., 1) plus a
|
|
26
|
+
// `decisions.system_event = true` JSONB flag. Cheaper, reversible. Operators
|
|
27
|
+
// query: WHERE (decisions->>'system_event')::boolean = true.
|
|
28
|
+
|
|
29
|
+
// ── TypeScript implementation (Pattern 2 — sentinel + tag) ───────────────
|
|
30
|
+
|
|
31
|
+
/** Shape of one `audit_log` row as handled by the writers in this file. */
export type AuditEntry = {
  /** Real org id for tenant events; the schema DEFAULT for system events. */
  org_id: number;
  /** Acting user; null for system events. */
  user_id: string | null;
  action: string;
  resource_type: string;
  resource_id: string | null;
  decisions: AuditDecisions;
  occurred_at: Date;
};
|
|
40
|
+
|
|
41
|
+
/** JSONB payload stored in the `decisions` column. */
export type AuditDecisions = {
  /** Tag for system-scope writes (Pattern 2). Only ever present as `true`. */
  system_event?: true;
  reason?: string;
  actor_role?: string;
  /** Free-form context — keep keys stable so operator queries don't drift. */
  [key: string]: unknown;
};
|
|
48
|
+
|
|
49
|
+
const SYSTEM_ORG_ID_PLACEHOLDER = 1; // Must match the schema DEFAULT
|
|
50
|
+
|
|
51
|
+
/**
 * Insert one row into `audit_log`.
 *
 * `occurred_at` is not taken from the caller; the statement uses `NOW()`.
 * `decisions` is serialized with `JSON.stringify` before being bound.
 *
 * @param db    Anything exposing `execute(sql, params)`.
 * @param entry The row to write, minus `occurred_at`.
 * @throws Error when `decisions.system_event` is set but `org_id` is not
 *         SYSTEM_ORG_ID_PLACEHOLDER (Pattern 2 invariant). Errors from
 *         `db.execute` propagate to the caller unchanged.
 */
export async function writeAudit(
  db: { execute: (sql: string, params: unknown[]) => Promise<void> },
  entry: Omit<AuditEntry, 'occurred_at'>,
): Promise<void> {
  const { org_id, user_id, action, resource_type, resource_id, decisions } = entry;

  // Pattern 2 invariant: a row tagged system_event must sit on the placeholder org.
  const taggedAsSystem = Boolean(decisions.system_event);
  if (taggedAsSystem && org_id !== SYSTEM_ORG_ID_PLACEHOLDER) {
    throw new Error(
      `audit-log invariant: system_event=true requires org_id=${SYSTEM_ORG_ID_PLACEHOLDER}, got ${org_id}`,
    );
  }

  const sql = `INSERT INTO audit_log (org_id, user_id, action, resource_type, resource_id, decisions, occurred_at)
     VALUES ($1, $2, $3, $4, $5, $6, NOW())`;
  const params: unknown[] = [
    org_id,
    user_id,
    action,
    resource_type,
    resource_id,
    JSON.stringify(decisions),
  ];

  await db.execute(sql, params);
}
|
|
76
|
+
|
|
77
|
+
// Convenience wrappers — make system vs tenant calls obvious at the call site.
|
|
78
|
+
|
|
79
|
+
/**
 * Write a system-scope audit row.
 *
 * The caller supplies only action/resource fields and free-form decisions;
 * this wrapper fills in the placeholder org, a null user, and the
 * `system_event` tag (overriding any value of that key the caller passed).
 */
export const writeSystemAudit = (
  db: Parameters<typeof writeAudit>[0],
  entry: Omit<AuditEntry, 'org_id' | 'occurred_at' | 'user_id' | 'decisions'> & {
    decisions: Omit<AuditDecisions, 'system_event'>;
  },
) => {
  const decisions: AuditDecisions = { ...entry.decisions, system_event: true };
  return writeAudit(db, {
    ...entry,
    org_id: SYSTEM_ORG_ID_PLACEHOLDER,
    user_id: null,
    decisions,
  });
};
|
|
91
|
+
|
|
92
|
+
/**
 * Write a tenant-scope audit row.
 *
 * Rejects entries that carry the `system_event` tag. Without this guard a
 * tenant whose real org id equals the system placeholder could write a row
 * that passes `writeAudit`'s invariant and then appears in the operator's
 * system-event query as a false positive.
 *
 * @param db    Same handle `writeAudit` accepts.
 * @param entry Row to write; `user_id` must be a real user id.
 * @returns A promise that rejects if the entry is tagged `system_event`,
 *          otherwise the result of `writeAudit`.
 */
export const writeTenantAudit = async (
  db: Parameters<typeof writeAudit>[0],
  entry: Omit<AuditEntry, 'occurred_at'> & { org_id: number; user_id: string },
): Promise<void> => {
  if (entry.decisions.system_event) {
    throw new Error(
      'audit-log invariant: tenant audit rows must not set decisions.system_event; use writeSystemAudit',
    );
  }
  return writeAudit(db, entry);
};
|
|
96
|
+
|
|
97
|
+
// ── Integrity properties (assert in tests) ────────────────────────────────
|
|
98
|
+
//
|
|
99
|
+
// 1. NEVER `try { ... } catch { /* ignore */ }` around audit writes.
|
|
100
|
+
// Audit-write failures are themselves the most important class of audit
|
|
101
|
+
// event. If the audit pipeline can fail silently, you have no audit.
|
|
102
|
+
//
|
|
103
|
+
// 2. Audit writes inside the same transaction as the action they describe.
|
|
104
|
+
// A separate transaction risks the action committing while the audit
|
|
105
|
+
// rolls back (or vice versa).
|
|
106
|
+
//
|
|
107
|
+
// 3. Append-only at the application layer (no UPDATE/DELETE on audit_log).
|
|
108
|
+
// Enforce via revoked grants on the runtime role:
|
|
109
|
+
// REVOKE UPDATE, DELETE ON audit_log FROM <runtime_role>;
|
|
110
|
+
//
|
|
111
|
+
// 4. Tests assert: writeSystemAudit + writeTenantAudit produce
|
|
112
|
+
// distinguishable rows. Operator query against `decisions->>'system_event'`
|
|
113
|
+
// must surface system events without false positives from real org=1.
|
|
114
|
+
|
|
115
|
+
// ── Anti-patterns ─────────────────────────────────────────────────────────
|
|
116
|
+
//
|
|
117
|
+
// - `org_id INTEGER NOT NULL DEFAULT N` + `INSERT ... VALUES (NULL, ...)`
|
|
118
|
+
// → IntegrityError. Pick Pattern 1 (drop NOT NULL) or Pattern 2 (write N
|
|
119
|
+
// + tag). Don't try to do both halfway.
|
|
120
|
+
//
|
|
121
|
+
// - System events written with a real user's `org_id` "for convenience."
|
|
122
|
+
// The audit trail conflates platform actions with tenant actions; legal
|
|
123
|
+
// discovery cannot separate them.
|
|
124
|
+
//
|
|
125
|
+
// - JSONB tag without a stable key. `decisions.systemEvent` vs
|
|
126
|
+
// `decisions.system_event` vs `decisions.is_system` — operator queries
|
|
127
|
+
// break across versions. Lock the key in this file and keep it.
|
|
128
|
+
//
|
|
129
|
+
// - Wave 3 convergence (field report #319): Riker, Kenobi, Hawkgirl, Loki
|
|
130
|
+
// each independently flagged the NULL trap. When 3+ reviewers agree on
|
|
131
|
+
// the same finding, it is real, not stylistic — promote to a pattern,
|
|
132
|
+
// not a one-off fix.
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pattern: LLM State Dedup — IDs are NOT keys
|
|
3
|
+
*
|
|
4
|
+
* Rule: LLM-emitted identifiers are display labels, not primary keys.
|
|
5
|
+
*
|
|
6
|
+
* Why: each LLM invocation is stateless from the model's perspective. Two
|
|
7
|
+
* cycles that propose the same fix will produce DIFFERENT id strings, even
|
|
8
|
+
* for substantively identical commands. The model has no memory of prior
|
|
9
|
+
* ids; it generates a fresh string from current context, drifts every cycle.
|
|
10
|
+
*
|
|
11
|
+
* Field report #330 (threadplex-ops): an hourly run asked Claude to emit
|
|
12
|
+
* `approval_needed[]` entries with an `id` field. The runtime keyed dedup
|
|
13
|
+
* on `id`. Over 5 hours of identical context, Claude emitted ids:
|
|
14
|
+
*
|
|
15
|
+
* `a3f9c2` (cycle 1)
|
|
16
|
+
* `a3f7c2` (cycle 2)
|
|
17
|
+
* `a3f7b2` (cycle 3)
|
|
18
|
+
* `a3f9c1` (cycle 4)
|
|
19
|
+
*
|
|
20
|
+
* Four proposals to stop the same container. Four Telegram approval cards.
|
|
21
|
+
* Zero collapse. The dedup key was wrong by construction.
|
|
22
|
+
*
|
|
23
|
+
* This pattern applies to ANY VoidForge project using an LLM as a decision
|
|
24
|
+
* engine that emits actionable items (approvals, tickets, tasks, queued ops).
|
|
25
|
+
*
|
|
26
|
+
* Agents: Hari Seldon (AI architecture), Bayta Darell (eval), Stark (backend)
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
import { createHash } from 'node:crypto'
|
|
30
|
+
|
|
31
|
+
// --- The rule ---
|
|
32
|
+
|
|
33
|
+
/**
|
|
34
|
+
* Dedup keys must be derived from the OPERATIVE CONTENT, not from the LLM's
|
|
35
|
+
* id field. The operative content is the part of the proposal that, if
|
|
36
|
+
* executed, would produce the same observable outcome.
|
|
37
|
+
*
|
|
38
|
+
* For shell commands: the canonical command string.
|
|
39
|
+
* For HTTP requests: (method, path, normalized body).
|
|
40
|
+
* For database operations: (table, primary key, op-type).
|
|
41
|
+
* For user notifications: (recipient, channel, message-hash).
|
|
42
|
+
*/
|
|
43
|
+
|
|
44
|
+
/** The identifiers attached to one proposal, ordered by how much they can be trusted. */
export interface ProposalDedupKey {
  /** The real dedup key: a content hash of the operative payload. */
  contentHash: string

  /**
   * Optional, looser (verb, target) key used to group proposals whose command
   * strings drifted — e.g. `docker stop X` vs `docker compose stop X` — even
   * though their contentHash values differ.
   */
  logicalKey?: string

  /** Whatever id the LLM emitted. Display label only; NEVER a primary key. */
  displayId?: string
}
|
|
58
|
+
|
|
59
|
+
// --- Hash the operative content ---
|
|
60
|
+
|
|
61
|
+
/**
 * For shell commands: hash the canonical command string.
 *
 * Canonicalization is whitespace-only: the command is trimmed and every run
 * of whitespace is collapsed to a single space. Quoting is NOT normalized —
 * `stop "x"` and `stop x` hash differently. (A former quote-adjacent replace
 * step ran after the whitespace collapse and could never change the string,
 * so it was removed; hashes are unchanged.)
 *
 * @param command Raw command string as proposed.
 * @returns First 12 hex characters of the SHA-256 of the canonical string.
 */
export function shellCommandHash(command: string): string {
  const canonical = command.trim().replace(/\s+/g, ' ')

  return createHash('sha256').update(canonical).digest('hex').slice(0, 12)
}
|
|
74
|
+
|
|
75
|
+
/**
 * For HTTP request proposals: hash (method, path, key-sorted body).
 *
 * Keys are sorted at EVERY nesting level so `{a: 1, b: {c: 2, d: 3}}` and
 * `{b: {d: 3, c: 2}, a: 1}` hash identically.
 *
 * The previous implementation passed only the top-level keys as the
 * `JSON.stringify` array replacer. That array is an allowlist applied to
 * every nested object too, so nested properties were silently dropped and
 * bodies differing only in nested values produced the same hash. The
 * allowlist is now the sorted union of keys found anywhere in the body.
 * Flat bodies serialize exactly as before.
 *
 * @param req Method (case-insensitive), path, and optional JSON-like body.
 * @returns First 12 hex characters of the SHA-256 of the canonical string.
 */
export function httpRequestHash(req: {
  method: string
  path: string
  body?: Record<string, unknown>
}): string {
  let sortedBody = ''
  if (req.body) {
    const keys = new Set<string>()
    // Guards the walk against cyclic input; JSON.stringify still rejects cycles itself.
    const visited = new WeakSet<object>()
    const collectKeys = (value: unknown): void => {
      if (typeof value !== 'object' || value === null || visited.has(value)) return
      visited.add(value)
      if (Array.isArray(value)) {
        value.forEach((item) => collectKeys(item))
        return
      }
      for (const [key, child] of Object.entries(value)) {
        keys.add(key)
        collectKeys(child)
      }
    }
    collectKeys(req.body)
    // An array replacer both filters and orders object keys at every depth.
    sortedBody = JSON.stringify(req.body, [...keys].sort())
  }
  const canonical = `${req.method.toUpperCase()} ${req.path} ${sortedBody}`
  return createHash('sha256').update(canonical).digest('hex').slice(0, 12)
}
|
|
90
|
+
|
|
91
|
+
// --- Logical-key fallback for command-string drift ---
|
|
92
|
+
|
|
93
|
+
/**
 * Some commands have multiple syntactic forms that produce the same outcome.
 * Extract a `verb:target` key so those forms collapse to one logical key.
 *
 * Examples:
 *   docker stop kometa-run            → 'stop:kometa-run'
 *   docker compose stop kometa-run    → 'stop:kometa-run'
 *   docker rm -f kometa-run           → 'rm:kometa-run'
 *     (different verb, same target — yields a different key; cluster it
 *      separately if the project wants cross-verb grouping)
 *
 * Verbs are tried in the listed order, so a command containing several
 * docker invocations resolves to the first verb in that list that matches.
 * Flags immediately after the verb are skipped. The target must not begin
 * with `-`, so a command with flags but no target (`docker stop -f`) yields
 * null instead of reporting the flag as the target.
 *
 * NOTE(review): a flag that takes a separate value (`-t 10 name`) will still
 * have its value read as the target — flag arity is not modelled here.
 *
 * @returns `verb:target`, or null when no supported docker form is found.
 */
const DOCKER_LOGICAL_VERBS = ['stop', 'start', 'restart', 'rm', 'kill', 'pause'] as const

// Compiled once at module load rather than on every call.
const DOCKER_VERB_MATCHERS = DOCKER_LOGICAL_VERBS.map((verb) => ({
  verb,
  pattern: new RegExp(
    `\\bdocker\\s+(?:compose\\s+)?${verb}\\b\\s+(?:-\\S+\\s+)*([\\w.][\\w.-]*)`,
    'i',
  ),
}))

export function dockerLogicalKey(command: string): string | null {
  for (const { verb, pattern } of DOCKER_VERB_MATCHERS) {
    const match = pattern.exec(command)
    if (match) return `${verb}:${match[1]}`
  }
  return null
}
|
|
111
|
+
|
|
112
|
+
// --- Lifecycle states must enumerate every in-flight status ---
|
|
113
|
+
|
|
114
|
+
/**
|
|
115
|
+
* Even with correct dedup keys, the snapshot used for dedup-comparison must
|
|
116
|
+
* cover ALL operator-visible in-flight states — not just `pending`.
|
|
117
|
+
*
|
|
118
|
+
* Field report #330: the threadplex-ops snapshot filtered only
|
|
119
|
+
* `status == "pending"`, missing `executing` and `interrupted` rows that
|
|
120
|
+
* were also operator-visible. The dedup key was correct but the snapshot
|
|
121
|
+
* was incomplete, producing the same duplication symptom.
|
|
122
|
+
*
|
|
123
|
+
* The lifecycle table below is the reference. Extend per-project.
|
|
124
|
+
*/
|
|
125
|
+
export const PROPOSAL_LIFECYCLE_STATES = [
  // Awaiting operator approval
  'pending',
  // Operator approved; runtime executing the action
  'executing',
  // Execution paused (operator pause, system pause, retry-backoff)
  'interrupted',
  // Execution succeeded
  'completed',
  // Execution failed (terminal — operator must re-issue)
  'failed',
  // Operator cancelled before execution
  'cancelled',
  // Approval window timed out
  'expired',
] as const

/** Union of the state names listed in PROPOSAL_LIFECYCLE_STATES. */
export type LifecycleState = (typeof PROPOSAL_LIFECYCLE_STATES)[number]

/** The not-yet-finished states; a dedup snapshot has to cover all of them. */
export const IN_FLIGHT_STATES: readonly LifecycleState[] = ['pending', 'executing', 'interrupted']
|
|
143
|
+
|
|
144
|
+
// --- AUTHORITY-style contract: tell the LLM the key shape ---
|
|
145
|
+
|
|
146
|
+
/**
|
|
147
|
+
* The LLM cannot enforce a dedup contract it doesn't know about. Document
|
|
148
|
+
* the contract in the agent's authority/instruction document so the LLM
|
|
149
|
+
* understands what "same target" means.
|
|
150
|
+
*
|
|
151
|
+
* Example AUTHORITY.md fragment:
|
|
152
|
+
*
|
|
153
|
+
* ## Approval Identifier Contract
|
|
154
|
+
*
|
|
155
|
+
* Each proposal you emit MUST include both:
|
|
156
|
+
*
|
|
157
|
+
* id — a human-readable display label. NOT a key. You may
|
|
158
|
+
* emit any short label that helps the operator scan.
|
|
159
|
+
*
|
|
160
|
+
* cmd_hash — sha256(command)[:12]. The runtime keys dedup on this.
|
|
161
|
+
* Two proposals with the same cmd_hash collapse into one
|
|
162
|
+
* approval card.
|
|
163
|
+
*
|
|
164
|
+
* The runtime also computes a logical_key from the command verb + target
|
|
165
|
+
* name. Proposals with the same logical_key are surfaced as a cluster
|
|
166
|
+
* even if cmd_hash differs (e.g., `docker stop X` and `docker rm -f X`
|
|
167
|
+
* both target X with different verbs — operator sees both, decides once).
|
|
168
|
+
*/
|
|
169
|
+
|
|
170
|
+
export const AUTHORITY_FRAGMENT_TEMPLATE = `
|
|
171
|
+
## Approval Identifier Contract
|
|
172
|
+
|
|
173
|
+
Each proposal MUST include:
|
|
174
|
+
|
|
175
|
+
id — display label. Not a key. You may emit any short label.
|
|
176
|
+
cmd_hash — sha256(command)[:12]. The runtime keys dedup on this.
|
|
177
|
+
|
|
178
|
+
The runtime also computes a logical_key from (verb, target). Proposals
|
|
179
|
+
sharing logical_key are surfaced as a cluster even with different
|
|
180
|
+
cmd_hash values.
|
|
181
|
+
`.trim()
|
|
182
|
+
|
|
183
|
+
// --- Putting it together ---
|
|
184
|
+
|
|
185
|
+
/** A stored proposal as consulted by `dedupProposals`. */
export interface ApprovalProposal {
  /** Display only — DO NOT USE AS KEY. */
  id: string
  /** Primary dedup key. */
  cmdHash: string
  /** Secondary cluster key; null when none could be derived. */
  logicalKey: string | null
  command: string
  /** ISO timestamp. */
  proposedAt: string
  state: LifecycleState
}
|
|
193
|
+
|
|
194
|
+
/**
 * Decide whether a freshly proposed command duplicates one already in flight.
 *
 * Only `existing` entries whose state is in IN_FLIGHT_STATES are considered.
 * The LLM-supplied `id` is never consulted.
 *
 * @returns
 *   - `{ duplicate: true, collapsedInto }` when an in-flight entry has the
 *     same command hash;
 *   - `{ duplicate: false, logicalCluster }` when none has, but one or more
 *     in-flight entries share the logical key;
 *   - `{ duplicate: false }` otherwise.
 */
export function dedupProposals(
  newProposal: { id: string; command: string },
  existing: ApprovalProposal[]
): { duplicate: boolean; collapsedInto?: ApprovalProposal; logicalCluster?: ApprovalProposal[] } {
  const incomingHash = shellCommandHash(newProposal.command)
  const incomingLogicalKey = dockerLogicalKey(newProposal.command)

  // Compare against every in-flight state, not just pending.
  const candidates = existing.filter(({ state }) => IN_FLIGHT_STATES.includes(state))

  // Hard duplicate: first in-flight entry with the same command hash wins.
  for (const candidate of candidates) {
    if (candidate.cmdHash === incomingHash) {
      return { duplicate: true, collapsedInto: candidate }
    }
  }

  // No logical key could be derived — nothing to cluster on.
  if (!incomingLogicalKey) {
    return { duplicate: false }
  }

  // Soft cluster: same logical key, different command form.
  const related = candidates.filter((candidate) => candidate.logicalKey === incomingLogicalKey)
  return related.length > 0
    ? { duplicate: false, logicalCluster: related }
    : { duplicate: false }
}
|
|
220
|
+
|
|
221
|
+
// --- Anti-patterns ---
|
|
222
|
+
|
|
223
|
+
/* ANTI-PATTERN 1: LLM ids as primary keys
|
|
224
|
+
* `INSERT INTO approvals (id, ...) VALUES (?, ...)` where `id` is the
|
|
225
|
+
* LLM-emitted string. Two LLM calls with substantively identical input
|
|
226
|
+
* will produce different ids; the database rows do NOT collapse.
|
|
227
|
+
*
|
|
228
|
+
* Fix: store `cmd_hash` as the PK and `display_id` as a label column.
|
|
229
|
+
*/
|
|
230
|
+
|
|
231
|
+
/* ANTI-PATTERN 2: Dedup snapshot filtered to a single state
|
|
232
|
+
* `SELECT * FROM approvals WHERE state = 'pending'` for dedup comparison.
|
|
233
|
+
* Misses `executing` and `interrupted` rows that are operator-visible.
|
|
234
|
+
*
|
|
235
|
+
* Fix: use IN_FLIGHT_STATES list. Document which states are excluded
|
|
236
|
+
* from dedup (typically `completed`, `failed`, `cancelled`, `expired`).
|
|
237
|
+
*/
|
|
238
|
+
|
|
239
|
+
/* ANTI-PATTERN 3: Hash the LLM's whole emitted JSON
|
|
240
|
+
* `sha256(JSON.stringify(proposal))` includes display_id, timestamps,
|
|
241
|
+
* reasoning prose — all of which drift per cycle even when the action
|
|
242
|
+
* is identical. Hash explodes; collapse never happens.
|
|
243
|
+
*
|
|
244
|
+
* Fix: hash only the operative payload (the command, the request body,
|
|
245
|
+
* the target identifier — never the LLM's free-text fields).
|
|
246
|
+
*/
|
|
@@ -154,6 +154,89 @@ export function withRequestLogging(
|
|
|
154
154
|
}
|
|
155
155
|
}
|
|
156
156
|
|
|
157
|
+
// --- Hot-path logging gate (fire-once / rate-limited) ---
|
|
158
|
+
//
|
|
159
|
+
// Source: Field report #319 §5. Stark's RlsDeadlineMiddleware originally
|
|
160
|
+
// emitted `logger.critical(...)` on every 503 — at 100 rps × 24h = 8.6M
|
|
161
|
+
// critical-level lines/day. No rate-limit, no fire-once. Would crater the
|
|
162
|
+
// log aggregator and Sentry quota.
|
|
163
|
+
//
|
|
164
|
+
// ANY middleware that emits log lines on a hot path (every request, every
|
|
165
|
+
// connection) MUST gate the emission. Two acceptable patterns:
|
|
166
|
+
//
|
|
167
|
+
// 1. Fire-once flag (preferred for state transitions): emit once when
|
|
168
|
+
// state changes, then suppress until reset. Pair with an audit row
|
|
169
|
+
// + Sentry capture inside the same fire-once branch.
|
|
170
|
+
// 2. Rate-limit window (sample-based): emit at most N per window via a
|
|
171
|
+
// token-bucket or last-emit-timestamp gate.
|
|
172
|
+
//
|
|
173
|
+
// Naked `logger.critical(...)` per-request is a denial-of-service vector
|
|
174
|
+
// against your own observability pipeline.
|
|
175
|
+
|
|
176
|
+
type FireOnceState = { fired: boolean; firedAt: number | null };
const fireOnceStates = new Map<string, FireOnceState>();

/**
 * Fire-once gate keyed by an arbitrary string.
 *
 * The first call for a key returns true and records the time; every later
 * call for that key returns false until `resetFireOnce(key)` is called.
 *
 * Intended for state-transition events (deadline tripped, circuit opened,
 * degraded mode entered) where only the first occurrence should be emitted.
 */
export function fireOnce(key: string): boolean {
  if (fireOnceStates.get(key)?.fired) {
    return false;
  }
  fireOnceStates.set(key, { fired: true, firedAt: Date.now() });
  return true;
}

/** Forget that `key` has fired, so the next `fireOnce(key)` returns true again. */
export function resetFireOnce(key: string): void {
  fireOnceStates.delete(key);
}
|
|
198
|
+
|
|
199
|
+
/**
 * Rate limiter for hot-path logs. Returns true if the caller should emit;
 * false if the allowance for the current window is used up.
 *
 * Each key gets `maxPerWindow` tokens. The allowance is restored in full —
 * not gradually — once at least `windowMs` has elapsed since the last refill
 * for that key.
 *
 * Use for sampled logging where you want N emissions per window
 * (e.g., 1 per minute, 10 per hour).
 */
const tokenBuckets = new Map<string, { tokens: number; lastRefill: number }>();

export function shouldEmit(
  key: string,
  maxPerWindow: number,
  windowMs: number,
): boolean {
  const now = Date.now();

  // First sighting of a key starts a full bucket.
  let bucket = tokenBuckets.get(key);
  if (bucket === undefined) {
    bucket = { tokens: maxPerWindow, lastRefill: now };
    tokenBuckets.set(key, bucket);
  }

  // Window elapsed: restore the whole allowance.
  if (now - bucket.lastRefill >= windowMs) {
    bucket.tokens = maxPerWindow;
    bucket.lastRefill = now;
  }

  const allowed = bucket.tokens > 0;
  if (allowed) {
    bucket.tokens -= 1;
  }
  return allowed;
}
|
|
228
|
+
|
|
229
|
+
// Usage example: 503 deadline middleware
|
|
230
|
+
//
|
|
231
|
+
// if (deadlinePassed) {
|
|
232
|
+
// if (fireOnce('rls-deadline-tripped')) {
|
|
233
|
+
// logger.fatal({ deadline_iso, evidence }, 'RLS migration deadline tripped');
|
|
234
|
+
// writeAuditRow({ action: 'rls_deadline_tripped', decisions: { ... } });
|
|
235
|
+
// Sentry.captureMessage('rls_deadline_tripped', 'fatal');
|
|
236
|
+
// }
|
|
237
|
+
// return new Response('Service Unavailable', { status: 503 });
|
|
238
|
+
// }
|
|
239
|
+
|
|
157
240
|
// --- Rate limiting middleware ---
|
|
158
241
|
// Simple in-memory rate limiter. Replace with Redis for multi-instance.
|
|
159
242
|
const rateLimitMap = new Map<string, { count: number; resetAt: number }>()
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pattern: Multi-Tenant Pool Bypass (pre-org-resolution scope)
|
|
3
|
+
*
|
|
4
|
+
* Source: Field report #316 §8 (Union Station, M-04c W2). FORCE RLS with a
|
|
5
|
+
* non-owner runtime role means every connection acquired from the tenant
|
|
6
|
+
* pool MUST have `app.current_org_id` set before the first query. But some
|
|
7
|
+
* code paths legitimately need cross-tenant access:
|
|
8
|
+
*
|
|
9
|
+
* - Auth pre-resolution (looking up which org a session belongs to)
|
|
10
|
+
* - System daemons (queue cleanup, retention sweeps, leader-elected work)
|
|
11
|
+
* - Admin endpoints (cross-tenant reports, ops tooling)
|
|
12
|
+
*
|
|
13
|
+
* These can't set org_id (they don't have one), so they need to bypass the
|
|
14
|
+
* tenant pool entirely and acquire from the admin pool. The
|
|
15
|
+
* `pre_org_resolution_scope` ContextVar wrapper makes this explicit and
|
|
16
|
+
* mechanically enforceable.
|
|
17
|
+
*
|
|
18
|
+
* The TS version below is illustrative; the canonical implementation in
|
|
19
|
+
* Union Station is Python (asyncpg). Same shape ports cleanly.
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import { AsyncLocalStorage } from 'node:async_hooks';
|
|
23
|
+
|
|
24
|
+
// ── ContextVar / AsyncLocalStorage ────────────────────────────────────────
|
|
25
|
+
|
|
26
|
+
type TenantContext = {
  /** null when in pre-resolution scope. */
  org_id: number | null;
  /** true ⇒ acquire from admin pool, not tenant pool. */
  pre_resolution: boolean;
};

const tenantContext = new AsyncLocalStorage<TenantContext>();

// ── Tenant scope (per-request, normal path) ──────────────────────────────

/**
 * Run `fn` with a tenant context bound to `org_id`.
 *
 * @returns Whatever `fn` resolves to.
 */
export async function withTenant<T>(
  org_id: number,
  fn: () => Promise<T>,
): Promise<T> {
  const scope: TenantContext = { org_id, pre_resolution: false };
  return tenantContext.run(scope, fn);
}

// ── Pre-org-resolution scope (cross-tenant or auth lookup) ───────────────

/**
 * Run `fn` with a context that has no org and is flagged as pre-resolution.
 *
 * @returns Whatever `fn` resolves to.
 */
export async function preOrgResolutionScope<T>(fn: () => Promise<T>): Promise<T> {
  const scope: TenantContext = { org_id: null, pre_resolution: true };
  return tenantContext.run(scope, fn);
}
|
|
47
|
+
|
|
48
|
+
// ── Pool acquisition routes by ContextVar ─────────────────────────────────
|
|
49
|
+
|
|
50
|
+
import type { Pool, PoolClient } from 'pg'; // illustrative — real types vary
|
|
51
|
+
|
|
52
|
+
declare const tenantPool: Pool; // BYPASSRLS=f, RLS enforced
|
|
53
|
+
declare const adminPool: Pool; // BYPASSRLS=t, cross-tenant work
|
|
54
|
+
|
|
55
|
+
/**
 * Hand out a pooled connection chosen by the current async context.
 *
 * - pre-resolution context → `adminPool`
 * - tenant context with an org id → `tenantPool`
 *
 * This function does not itself set `app.current_org_id` on the connection;
 * the tenant pool's own callback is expected to do that.
 *
 * @throws Error when called outside any context, or when a non-pre-resolution
 *         context carries a null org id.
 */
export async function acquireConnection(): Promise<PoolClient> {
  const store = tenantContext.getStore();
  if (store === undefined) {
    throw new Error(
      'acquireConnection called outside any tenant context. ' +
        'Wrap caller with withTenant(orgId, ...) or preOrgResolutionScope(...).',
    );
  }

  const { org_id, pre_resolution } = store;

  // Cross-tenant work goes to the admin pool.
  if (pre_resolution) {
    return adminPool.connect();
  }

  // Normal request path: an org id is mandatory before touching the tenant pool.
  if (org_id === null) {
    throw new Error(
      'Tenant context missing org_id outside pre_resolution scope. ' +
        'This indicates a callsite that should have called preOrgResolutionScope().',
    );
  }

  return tenantPool.connect();
}
|
|
80
|
+
|
|
81
|
+
// ── Usage examples ────────────────────────────────────────────────────────
|
|
82
|
+
|
|
83
|
+
// 1. HTTP middleware (per-request)
|
|
84
|
+
//
|
|
85
|
+
// app.use(async (req, res, next) => {
|
|
86
|
+
// await withTenant(req.user.org_id, () => next());
|
|
87
|
+
// });
|
|
88
|
+
//
|
|
89
|
+
// 2. Daemon (cross-tenant queue cleanup)
|
|
90
|
+
//
|
|
91
|
+
// cron.schedule('*/5 * * * *', async () => {
|
|
92
|
+
// await preOrgResolutionScope(async () => {
|
|
93
|
+
// const conn = await acquireConnection(); // → admin pool
|
|
94
|
+
// await conn.query('DELETE FROM job_queue WHERE completed_at < NOW() - INTERVAL \'30 days\'');
|
|
95
|
+
// conn.release();
|
|
96
|
+
// });
|
|
97
|
+
// });
|
|
98
|
+
//
|
|
99
|
+
// 3. Auth lookup (caller doesn't yet know org_id)
|
|
100
|
+
//
|
|
101
|
+
// async function resolveSession(sessionToken: string): Promise<{ org_id: number; user_id: string }> {
|
|
102
|
+
// return preOrgResolutionScope(async () => {
|
|
103
|
+
// const conn = await acquireConnection(); // → admin pool
|
|
104
|
+
// try {
|
|
105
|
+
// const row = await conn.query(
|
|
106
|
+
// 'SELECT org_id, user_id FROM sessions WHERE token = $1 AND expires_at > NOW()',
|
|
107
|
+
// [sessionToken],
|
|
108
|
+
// );
|
|
109
|
+
// return row.rows[0];
|
|
110
|
+
// } finally {
|
|
111
|
+
// conn.release();
|
|
112
|
+
// }
|
|
113
|
+
// });
|
|
114
|
+
// }
|
|
115
|
+
|
|
116
|
+
// ── Anti-patterns ─────────────────────────────────────────────────────────
|
|
117
|
+
//
|
|
118
|
+
// 1. Acquiring from the tenant pool in a daemon. Without org_id set, the RLS
|
|
119
|
+
// policy denies every query → daemon crashes on first tick. Or worse:
|
|
120
|
+
// the policy uses a fail-open arm and the daemon silently sees zero rows.
|
|
121
|
+
//
|
|
122
|
+
// 2. Bypassing FORCE RLS by hard-coding the connection string with the
|
|
123
|
+
// runtime role's password. The whole point of the admin pool is the
|
|
124
|
+
// BYPASSRLS=t identity — preserve that boundary.
|
|
125
|
+
//
|
|
126
|
+
// 3. preOrgResolutionScope wrapping per-request handlers. The middleware
|
|
127
|
+
// already set the tenant context; switching to admin pool there is a
|
|
128
|
+
// privilege escalation. preOrgResolutionScope is for code paths that
|
|
129
|
+
// legitimately don't have an org_id yet (or never will).
|
|
130
|
+
//
|
|
131
|
+
// 4. Forgetting to wrap lifespan startup. Field report #319 §2: 4 lifespan
|
|
132
|
+
// paths in Union Station's M-05 cutover failed fast because
|
|
133
|
+
// the RLS-strict role rejected unscoped queries. See BACKEND_ENGINEER.md
|
|
134
|
+
// "Lifespan & Daemon ContextVar Coverage" for the sweep checklist.
|