kanban-system 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +76 -0
- package/CLAUDE.md +108 -0
- package/README.md +272 -0
- package/agents/_TEMPLATE.md +42 -0
- package/agents/backend-agent.md +81 -0
- package/agents/deploy-gate-agent.md +73 -0
- package/agents/frontend-agent.md +73 -0
- package/agents/monitor-agent.md +65 -0
- package/agents/orchestrator.md +91 -0
- package/agents/reviewer-codex.md +51 -0
- package/bin/cli.js +171 -0
- package/config.example.js +99 -0
- package/docs/adapting-to-your-project.md +155 -0
- package/docs/example-apex.md +86 -0
- package/docs/the-pattern.md +92 -0
- package/hooks/launchd.plist.template +66 -0
- package/hooks/pre-push.sample +61 -0
- package/lib/config.cjs +138 -0
- package/lib/detect/_template.cjs +63 -0
- package/lib/detect/rules.json +28 -0
- package/lib/detect/sentry.cjs +86 -0
- package/lib/detect/vercel.cjs +62 -0
- package/lib/gate/index.cjs +182 -0
- package/lib/runner/adapters/both.cjs +33 -0
- package/lib/runner/adapters/claude.cjs +119 -0
- package/lib/runner/adapters/codex.cjs +43 -0
- package/lib/runner/adapters/reviewer.cjs +91 -0
- package/lib/runner/budget.cjs +75 -0
- package/lib/runner/index.cjs +93 -0
- package/lib/runner/result-merger.cjs +58 -0
- package/lib/runner/worktree-manager.cjs +64 -0
- package/lib/watch/scheduler.cjs +164 -0
- package/package.json +59 -0
- package/playbooks/_TEMPLATE.html +54 -0
- package/playbooks/build-fail.html +57 -0
- package/playbooks/deploy-rollback.html +53 -0
- package/playbooks/e2e-regression.html +58 -0
- package/playbooks/playbook.css +26 -0
- package/playbooks/sentry-spike.html +53 -0
- package/server/kanban.cjs +1152 -0
- package/skills/archive.md +18 -0
- package/skills/gate.md +22 -0
- package/skills/standup.md +24 -0
- package/skills/triage.md +24 -0
- package/ui/kanban.html +628 -0
- package/ui/styles/kanban.css +436 -0
- package/ui/styles/progress.css +315 -0
- package/ui/styles/tokens.css +291 -0
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# Adapting kanban-system to your project
|
|
2
|
+
|
|
3
|
+
This harness ships generic. To point it at *your* front-end / back-end repo, you
|
|
4
|
+
edit a handful of files. Nothing here knows about a particular framework — examples
|
|
5
|
+
below cover common stacks, but the shape is the same everywhere.
|
|
6
|
+
|
|
7
|
+
Prerequisites: Node ≥ 20, a git repo for your application, and (optionally) the
|
|
8
|
+
`claude` and/or `codex` CLIs on PATH if you want the runner to actually execute work
|
|
9
|
+
(without them the runner falls back to deterministic stub verdicts, which is fine for
|
|
10
|
+
trying things out).
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## Step 1 — `config.js`: point at your repo and your stack
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
cp config.example.js config.js # config.js is gitignored
|
|
18
|
+
cp .env.example .env # fill in tokens; .env is gitignored
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
In `config.js`, set:
|
|
22
|
+
- `projectName` — what shows in the board UI and Slack.
|
|
23
|
+
- `repoPath` — **absolute** path to your application repo. The gate runs commands
|
|
24
|
+
here; the runner creates git worktrees here.
|
|
25
|
+
- `kanbanPort` — default 8080 (env `PORT` overrides).
|
|
26
|
+
- `deployCommands` — the build/test chain the gate runs, in order, fail-fast:
|
|
27
|
+
```
|
|
28
|
+
Node / Vite: [{ name:"01-typecheck", cmd:"npx", args:["tsc","--noEmit"] },
|
|
29
|
+
{ name:"02-build", cmd:"npm", args:["run","build"] }]
|
|
30
|
+
Rust: [{ name:"build", cmd:"cargo", args:["build","--release"] },
|
|
31
|
+
{ name:"test", cmd:"cargo", args:["test"] }]
|
|
32
|
+
Go: [{ name:"vet", cmd:"go", args:["vet","./..."] },
|
|
33
|
+
{ name:"test", cmd:"go", args:["test","./..."] }]
|
|
34
|
+
Python: [{ name:"lint", cmd:"ruff", args:["check","."] },
|
|
35
|
+
{ name:"test", cmd:"pytest", args:["-q"] }]
|
|
36
|
+
```
|
|
37
|
+
Add an E2E stage too if you have one (e.g. a Playwright golden-path spec).
|
|
38
|
+
- `buildOutputDir` — the built-output directory the gate inspects for bundle-size
|
|
39
|
+
regressions (`dist`, `build`, `.next`, …). Set to `null` to skip.
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Step 2 — `agents/`: make ownership match your directory layout
|
|
44
|
+
|
|
45
|
+
Open `agents/frontend-agent.md` and `agents/backend-agent.md` and edit the `owns:`
|
|
46
|
+
globs in the frontmatter so they match where your code actually lives. The
|
|
47
|
+
orchestrator uses these to route "a task touches this file" → "this agent owns it".
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
# frontend-agent owns (examples)
|
|
51
|
+
Next.js / CRA: app/**, src/app/**, components/**, pages/**, styles/**, public/**
|
|
52
|
+
Vite + React: src/**, src/components/**, src/pages/**, src/styles/**, src/locales/**
|
|
53
|
+
SvelteKit: src/routes/**, src/lib/components/**, static/**
|
|
54
|
+
|
|
55
|
+
# backend-agent owns (examples)
|
|
56
|
+
Node API: server/**, api/**, src/server/**, src/api/**
|
|
57
|
+
Rails: app/controllers/**, app/models/**, db/migrate/**
|
|
58
|
+
Go services: internal/**, cmd/**
|
|
59
|
+
Supabase: supabase/functions/**, supabase/migrations/**
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Keep them **non-overlapping**. Need more roles than two? Copy `agents/_TEMPLATE.md`
|
|
63
|
+
to `agents/<name>.md`, fill in the frontmatter (especially `name`, `mission`, `runner`,
|
|
64
|
+
`owns`), and add it to `config.js → agents`. Leave `orchestrator.md`, `deploy-gate-agent.md`,
|
|
65
|
+
`monitor-agent.md`, and `reviewer-codex.md` mostly as-is — they're already generic;
|
|
66
|
+
just trim anything that doesn't apply.
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## Step 3 — `lib/detect/`: wire up your monitoring (or skip it)
|
|
71
|
+
|
|
72
|
+
`config.js → detectors` lists which detectors run on the 24h watch loop. Built-in:
|
|
73
|
+
- **`sentry`** — error groups + error-rate spikes. Needs `SENTRY_AUTH_TOKEN`,
|
|
74
|
+
`SENTRY_ORG_SLUG`, `SENTRY_PROJECT_SLUG` in `.env`.
|
|
75
|
+
- **`vercel`** — deploy state. Needs `VERCEL_TOKEN`, `VERCEL_PROJECT_ID` (+ `VERCEL_TEAM_ID`
|
|
76
|
+
for team accounts).
|
|
77
|
+
|
|
78
|
+
Both degrade gracefully — if their env vars are blank they post a low-severity
|
|
79
|
+
"config missing" task instead of crashing, so you can leave them enabled while you
|
|
80
|
+
get around to it.
|
|
81
|
+
|
|
82
|
+
Using something else (Datadog, CloudWatch, Prometheus, a `/healthz` endpoint, a
|
|
83
|
+
custom metrics API)? Copy `lib/detect/_template.cjs` → `lib/detect/<name>.cjs`,
|
|
84
|
+
implement `run(ruleSet, state)`, register it in the `detectors` map at the top of
|
|
85
|
+
`lib/watch/scheduler.cjs`, add a `{ "detector": "<name>", ... }` block to
|
|
86
|
+
`lib/detect/rules.json`, and enable it in `config.js → detectors`. No monitoring at
|
|
87
|
+
all? Leave `detectors` empty — everything else still works.
|
|
88
|
+
|
|
89
|
+
Tune thresholds in `lib/detect/rules.json` (the scheduler re-reads it every sweep —
|
|
90
|
+
no restart).
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Step 4 — `agents/deploy-gate-agent.md` + `lib/gate/`: set your build/test commands
|
|
95
|
+
|
|
96
|
+
The gate doesn't need code changes — it runs whatever you put in `config.js →
|
|
97
|
+
deployCommands` (step 1). Just review `agents/deploy-gate-agent.md` so the prose
|
|
98
|
+
matches what you configured (it's the playbook reviewers will read when the gate
|
|
99
|
+
fails). The gate stages run serially, fail-fast; the failing stage's log lands in
|
|
100
|
+
`data/runs/gate-<ts>/<stage>.log` and a "needs human" task is auto-created.
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## Step 5 — `playbooks/`: one per incident you care about
|
|
105
|
+
|
|
106
|
+
Start from `playbooks/_TEMPLATE.html`. Adapt the three example playbooks
|
|
107
|
+
(`build-fail`, `e2e-regression`, `sentry-spike`) and `deploy-rollback`, and add your
|
|
108
|
+
own — a payment-webhook failure, a queue backlog, a third-party outage, whatever your
|
|
109
|
+
system actually has. Keep each on one page; they're read under pressure. The monitor
|
|
110
|
+
agent routes anomalies into tasks; link the relevant playbook from those tasks (and
|
|
111
|
+
from `agents/*.md`).
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## Step 6 — `hooks/`: install the pre-push gate + schedule the watch
|
|
116
|
+
|
|
117
|
+
**Pre-push gate** — install into *your application repo* (the one `config.js →
|
|
118
|
+
repoPath` points at):
|
|
119
|
+
```bash
|
|
120
|
+
ln -sf /abs/path/to/kanban-system/hooks/pre-push.sample /abs/path/to/your-app/.git/hooks/pre-push
|
|
121
|
+
chmod +x /abs/path/to/kanban-system/hooks/pre-push.sample
|
|
122
|
+
export KANBAN_SYSTEM_DIR=/abs/path/to/kanban-system # so the hook can find the harness
|
|
123
|
+
```
|
|
124
|
+
Now `git push` runs the gate; a human can override with `KANBAN_GATE_BYPASS=1 git push`
|
|
125
|
+
(audit-logged to `data/runs/overrides.jsonl`).
|
|
126
|
+
|
|
127
|
+
**24h watch** —
|
|
128
|
+
- macOS: edit the three `__PLACEHOLDER__`s in `hooks/launchd.plist.template`, copy it
|
|
129
|
+
to `~/Library/LaunchAgents/`, `launchctl load` it.
|
|
130
|
+
- Linux / cron: add the cron line from the comment at the top of that template
|
|
131
|
+
(`*/5 * * * * … scheduler.cjs --once`).
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## Step 7 (optional) — Slack reporting
|
|
136
|
+
|
|
137
|
+
Create a Slack app (bot + app token, Socket Mode), put `SLACK_BOT_TOKEN`,
|
|
138
|
+
`SLACK_APP_TOKEN`, `SLACK_CHANNEL_ID` in `.env`, set `SLACK_COMMAND` (default
|
|
139
|
+
`/kanban`). With that, the board posts start / progress / done updates, and the slash
|
|
140
|
+
command works (`/kanban board`, `/kanban list`, `/kanban add`, `/kanban ask`,
|
|
141
|
+
`/kanban exec`, `/kanban stop`). Without it, everything else runs fine — Slack is off.
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## Run it
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
npm install
|
|
149
|
+
npm start # → http://localhost:8080
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Then exercise it: open the board, create a task (UI or `POST /api/tasks`), watch it
|
|
153
|
+
move. Run the gate (`npm run gate`). Run one watch sweep (`npm run watch:once`). See
|
|
154
|
+
`docs/the-pattern.md` for *why* it's built this way, and `docs/example-apex.md` for a
|
|
155
|
+
worked example from a real project.
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# Example: how APEX used this harness
|
|
2
|
+
|
|
3
|
+
> This is the one place in the repo where project-specific domain content lives —
|
|
4
|
+
> as a worked example, so the generic pieces have something concrete to point at.
|
|
5
|
+
> Everything here is illustrative; nothing in `agents/`, `playbooks/`, or `lib/`
|
|
6
|
+
> depends on it.
|
|
7
|
+
|
|
8
|
+
**APEX** is an AI-skills certification exam platform — a React + TypeScript front end
|
|
9
|
+
(Vite, React Router, i18n in three languages), a Supabase back end (Edge Functions in
|
|
10
|
+
Deno, Postgres with row-level security, Supabase Auth + social OAuth), payments via a
|
|
11
|
+
third-party SDK, Sentry for error tracking, deployed on Vercel + Supabase. The harness
|
|
12
|
+
ran inside that repo under the codename "Sentinel". Here's how the generic patterns
|
|
13
|
+
mapped onto a real, high-stakes system.
|
|
14
|
+
|
|
15
|
+
## Config (`config.js`)
|
|
16
|
+
- `repoPath` → the `apex-platform` checkout.
|
|
17
|
+
- `deployCommands` → `npx tsc --noEmit` → `npm run build` (the Vite build, which is
|
|
18
|
+
exactly what Vercel runs) → `npx playwright test e2e/golden-path.spec.ts`.
|
|
19
|
+
- `buildOutputDir` → `dist`.
|
|
20
|
+
- The gate's golden paths (the E2E stage): sign in (email + Google + Kakao + Naver),
|
|
21
|
+
start exam → submit Part 1/2/3, result page → credential issuance, cart → payment →
|
|
22
|
+
order confirmation, admin cohort/voucher flows.
|
|
23
|
+
|
|
24
|
+
## Agents
|
|
25
|
+
APEX ran 14 agents split into two groups:
|
|
26
|
+
|
|
27
|
+
- **OpsGuard (8)** — the generic dev/ops layer this template ships:
|
|
28
|
+
`orchestrator`, `route-agent` (React Router tree + i18n route slugs + lazy/Suspense),
|
|
29
|
+
`feature-agent` (front-end feature work — split here into `frontend-agent`),
|
|
30
|
+
`data-agent` (Supabase Edge Functions + migrations + RLS — split here into
|
|
31
|
+
`backend-agent`, always `runner: both`), `content-agent` (exam-question / rubric
|
|
32
|
+
validation — APEX-specific, not in the template), `e2e-agent` (Playwright golden
|
|
33
|
+
paths), `deploy-gate-agent`, `monitor-agent` (Sentry / Vercel / Supabase polling).
|
|
34
|
+
- **ExamGuard (6)** — the *domain* layer, entirely APEX-specific and **not** part of
|
|
35
|
+
the template: `signup-agent` (sign-up funnel), `session-agent` (live exam sessions),
|
|
36
|
+
`notify-agent` (exam notifications), `proctor-agent` (cheating-signal detection —
|
|
37
|
+
see the human-approval note below), `grading-agent` (the Part-3 free-response
|
|
38
|
+
grading queue, `runner: both`), `credential-agent` (independent score recomputation
|
|
39
|
+
before a credential is issued, `runner: both`).
|
|
40
|
+
|
|
41
|
+
The lesson for *your* project: keep the generic agents, add a domain group of your own
|
|
42
|
+
the way APEX added ExamGuard.
|
|
43
|
+
|
|
44
|
+
## A selvedge boundary that mattered: the exam engine
|
|
45
|
+
APEX had an `exam-engine` selvedge — a small set of paths (the scoring/grading Edge
|
|
46
|
+
Functions, the shared exam-data directory, the scoring/grading services on the front
|
|
47
|
+
end) that *only* the exam-engine agent could modify; no other agent touched them. The
|
|
48
|
+
shared `_shared/` helpers were a cross-check zone (exam-engine + backend jointly).
|
|
49
|
+
This is the "non-overlapping ownership, with shared surfaces requiring a cross-check"
|
|
50
|
+
idea from `docs/the-pattern.md` applied to the part of the system where a bug is most
|
|
51
|
+
expensive — a wrong score on a certification exam.
|
|
52
|
+
|
|
53
|
+
## A human-approval rule that's worth copying: the proctor agent
|
|
54
|
+
APEX's `proctor-agent` detected possible cheating signals during a live exam — and
|
|
55
|
+
its `runner` policy was deliberately conservative, and it had **zero auto-actions**.
|
|
56
|
+
It never blocked, flagged, or failed a candidate on its own. It *recorded* the signal
|
|
57
|
+
and routed a task to the "needs human" column; a person decided. The generalization:
|
|
58
|
+
when an automated decision is high-consequence *and* about a person (a moderation
|
|
59
|
+
action, an account suspension, a fraud flag), the agent's job is detection and
|
|
60
|
+
escalation, never enforcement. Build the gate so a human is always the last signature.
|
|
61
|
+
|
|
62
|
+
## Cross-validation, applied
|
|
63
|
+
APEX used `runner: both` (independent re-implementation, diffed) for exactly the
|
|
64
|
+
places you'd expect: Supabase migrations and RLS-policy changes (a bad one corrupts or
|
|
65
|
+
leaks user data), Part-3 grading prompts (validated by a content reviewer, never edited
|
|
66
|
+
solo), and credential score computation (recomputed independently before issuance).
|
|
67
|
+
Everything else — front-end features, routing, the deploy gate — used `reviewer:codex`
|
|
68
|
+
(Claude implements, Codex reviews). Deterministic work (Playwright runs, log polling,
|
|
69
|
+
state transitions) ran single-model. A daily cap on the second model with a Claude
|
|
70
|
+
fallback chain kept the cost bounded.
|
|
71
|
+
|
|
72
|
+
## Governance / commit hygiene
|
|
73
|
+
APEX wired the harness into a stricter governance system ("LOOM" — strict workflow
|
|
74
|
+
"shuttles" for dev, content, partnerships, plus an always-on ops shuttle), enforced the
|
|
75
|
+
kanban-first protocol from `agents/orchestrator.md` verbatim, required the pre-deploy
|
|
76
|
+
gate to pass before any push, and used a `[APEX-XXX] type: description` commit
|
|
77
|
+
convention referencing task IDs. None of that machinery is in the template — but the
|
|
78
|
+
two load-bearing rules (task-first, gate-before-push) are.
|
|
79
|
+
|
|
80
|
+
## Playbooks
|
|
81
|
+
APEX shipped 12. The 6 ops ones generalize and ship here (build-fail, e2e-regression,
|
|
82
|
+
sentry-spike, plus payment-webhook-fail, RLS-violation, content-flag — the first three
|
|
83
|
+
adapted into `playbooks/`). The 6 exam ones were domain-specific (cheating-detected,
|
|
84
|
+
grading-SLA-breach, credential-issuance-fail, session-stuck, signup-fail-spike,
|
|
85
|
+
notification-bounce) and are not included — they're the ExamGuard equivalent of the
|
|
86
|
+
playbooks you'll write for *your* domain.
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# The pattern
|
|
2
|
+
|
|
3
|
+
Why this harness is shaped the way it is. Five ideas, each one a constraint that's
|
|
4
|
+
saved someone a bad day.
|
|
5
|
+
|
|
6
|
+
## 1. Kanban-first: every instruction becomes a task before work starts
|
|
7
|
+
|
|
8
|
+
When a person asks for something — "fix the login bug", "add a coupon field", "the
|
|
9
|
+
build is failing" — the orchestrator's *first* action is to write a kanban task:
|
|
10
|
+
capture the instruction verbatim in the description, derive a short subject, route
|
|
11
|
+
it to an agent, set a runner, then transition it to `in_progress` and only then
|
|
12
|
+
start the work.
|
|
13
|
+
|
|
14
|
+
Why bother:
|
|
15
|
+
- **Nothing is invisible.** Two agents (or an agent and a human) can't both quietly
|
|
16
|
+
be "working on the login thing" — there's one card, one owner, one state.
|
|
17
|
+
- **Routing is explicit.** The agent and the runner (`claude` / `codex` / `both` /
|
|
18
|
+
`reviewer:*`) are written down, not implied.
|
|
19
|
+
- **The trail exists.** `createdAt` / `startedAt` / `completedAt`, the report path,
|
|
20
|
+
the cross-validation verdict — all on the card.
|
|
21
|
+
- **Standups write themselves** from the activity log.
|
|
22
|
+
|
|
23
|
+
The one exception: **incident response.** If production is impacted (or about to be)
|
|
24
|
+
and the fix is a small, obviously-reversible change, do it *now* — but register a
|
|
25
|
+
post-hoc task within an hour, tagged `metadata.source = "incident-response"`, with
|
|
26
|
+
what you did and any follow-up. Nothing else qualifies. Refactors, docs, features,
|
|
27
|
+
ordinary bugs — task first.
|
|
28
|
+
|
|
29
|
+
(Implemented by `agents/orchestrator.md`; the board is `server/kanban.cjs`.)
|
|
30
|
+
|
|
31
|
+
## 2. Cross-validation: pick the verification level deliberately
|
|
32
|
+
|
|
33
|
+
Every task has a `runner`. Three modes, in increasing cost and rigor:
|
|
34
|
+
|
|
35
|
+
- **single-model** (`claude` or `codex`) — for deterministic / mechanical work where
|
|
36
|
+
a second opinion only adds latency: running a test suite, polling an API, applying
|
|
37
|
+
a state transition.
|
|
38
|
+
- **`reviewer:codex`** (or `reviewer:claude`) — the executor model does the work in
|
|
39
|
+
an isolated git worktree and produces a verdict; the *other* model is handed that
|
|
40
|
+
report and asked to flag what was missed. Cheap, catches most mistakes. Default for
|
|
41
|
+
implementation work — front-end features, routing, the deploy gate, refactors. A
|
|
42
|
+
blocking flag downgrades the verdict to `needs_human`.
|
|
43
|
+
- **`both`** — Claude *and* Codex independently do the work from the same spec, in
|
|
44
|
+
separate worktrees; the orchestrator diffs the two. Agreement → auto-merge.
|
|
45
|
+
Disagreement → the task moves to the "needs human" column. Use this for the
|
|
46
|
+
high-stakes stuff: schema migrations, access-control / RLS-style policies, anything
|
|
47
|
+
that can corrupt or leak data, money paths. **The disagreement is the safety
|
|
48
|
+
feature** — it's the system refusing to ship something two independent reads
|
|
49
|
+
couldn't converge on.
|
|
50
|
+
|
|
51
|
+
The orchestrator can also *auto-promote*: a single-model task with severity ≥ a
|
|
52
|
+
threshold (`CROSS_VALIDATION_THRESHOLD`) gets bumped to `both`, budget permitting.
|
|
53
|
+
And there's a daily cap on the second model (`DAILY_CODEX_BUDGET`) — when it's spent,
|
|
54
|
+
`codex` / `both` / `reviewer:codex` fall back to Claude alone, and Claude steps down
|
|
55
|
+
through `MODEL_FALLBACK_CHAIN` under load. (Implemented by `lib/runner/`.)
|
|
56
|
+
|
|
57
|
+
## 3. Selvedge boundaries: agents own non-overlapping territory
|
|
58
|
+
|
|
59
|
+
Each agent declares `owns:` globs (relative to your repo). The orchestrator uses them
|
|
60
|
+
to answer "which agent owns this file?" — and agents stay on their side of the line.
|
|
61
|
+
`frontend-agent` doesn't touch the server; `backend-agent` doesn't touch components;
|
|
62
|
+
the `deploy-gate-agent` never edits application code at all, it only runs commands.
|
|
63
|
+
Shared surfaces — shared type definitions, dependency manifests, migrations — are the
|
|
64
|
+
places that *require* a cross-check, which is exactly where `runner: both` earns its
|
|
65
|
+
keep. Clean boundaries make routing automatic and make "who broke this?" answerable.
|
|
66
|
+
|
|
67
|
+
## 4. Human-approval gates: some things never auto-merge
|
|
68
|
+
|
|
69
|
+
A hard gate is one that *cannot* be bypassed by an agent — only by a human, auditably.
|
|
70
|
+
Two in this harness:
|
|
71
|
+
|
|
72
|
+
- **The pre-deploy gate** (`lib/gate/index.cjs`, run by `hooks/pre-push.sample` on
|
|
73
|
+
`git push`): runs your build/test commands fail-fast. Fail → push blocked, a
|
|
74
|
+
"needs human" task auto-created with the logs linked. The only override is `git push
|
|
75
|
+
--no-verify` or `KANBAN_GATE_BYPASS=1 git push`, the latter logged to
|
|
76
|
+
`data/runs/overrides.jsonl` and reviewed at standup.
|
|
77
|
+
- **Cross-validation disagreement** (`runner: both`): when the two models disagree,
|
|
78
|
+
the task goes to the "needs human" column with a diff. No auto-merge. A person picks.
|
|
79
|
+
|
|
80
|
+
The general principle: anything irreversible or externally visible (a deploy, a
|
|
81
|
+
destructive migration, a money movement, in some projects a moderation action) gets a
|
|
82
|
+
gate where a human is the last signature. Agents do the work; humans own the commit
|
|
83
|
+
that ships it.
|
|
84
|
+
|
|
85
|
+
## 5. Incident playbooks: scannable runbooks, not prose
|
|
86
|
+
|
|
87
|
+
A playbook (`playbooks/*.html`) is a one-page runbook for one incident type: what
|
|
88
|
+
fires it, how to diagnose, a decision tree, when to escalate, what to do afterwards.
|
|
89
|
+
They're short and skimmable because they get read under pressure. Tasks link to them;
|
|
90
|
+
the monitor agent routes anomalies into tasks that reference them. Start from
|
|
91
|
+
`playbooks/_TEMPLATE.html`, write one per incident you actually care about, and keep
|
|
92
|
+
each on a single page — if it grows past a screen, it's documentation, not a playbook.
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
|
3
|
+
<!--
|
|
4
|
+
kanban-system — macOS launchd agent for the 24h watch daemon.
|
|
5
|
+
|
|
6
|
+
1. Replace the THREE __PLACEHOLDERS__ below:
|
|
7
|
+
__NODE_PATH__ e.g. /opt/homebrew/bin/node (run `which node`)
|
|
8
|
+
__KANBAN_SYSTEM_DIR__ absolute path to your kanban-system checkout
|
|
9
|
+
__USER_HOME__ e.g. /Users/you (run `echo $HOME`)
|
|
10
|
+
2. Rename the file to something stable, e.g. com.you.kanban-system.watch.plist,
|
|
11
|
+
and make the <key>Label</key> match the file name (without .plist).
|
|
12
|
+
3. Install:
|
|
13
|
+
cp com.you.kanban-system.watch.plist ~/Library/LaunchAgents/
|
|
14
|
+
launchctl load ~/Library/LaunchAgents/com.you.kanban-system.watch.plist
|
|
15
|
+
Stop:
|
|
16
|
+
launchctl unload ~/Library/LaunchAgents/com.you.kanban-system.watch.plist
|
|
17
|
+
Logs:
|
|
18
|
+
tail -f ~/Library/Logs/kanban-system/watch.{out,err}.log
|
|
19
|
+
|
|
20
|
+
CRON ALTERNATIVE (Linux, or if you'd rather use cron):
|
|
21
|
+
# run a single sweep every 5 minutes — the scheduler reads .env via lib/config.cjs
|
|
22
|
+
*/5 * * * * cd __KANBAN_SYSTEM_DIR__ && __NODE_PATH__ lib/watch/scheduler.cjs --once >> __KANBAN_SYSTEM_DIR__/data/logs/watch-cron.log 2>&1
|
|
23
|
+
-->
|
|
24
|
+
<plist version="1.0">
|
|
25
|
+
<dict>
|
|
26
|
+
<key>Label</key>
|
|
27
|
+
<string>com.you.kanban-system.watch</string>
|
|
28
|
+
|
|
29
|
+
<key>ProgramArguments</key>
|
|
30
|
+
<array>
|
|
31
|
+
<string>__NODE_PATH__</string>
|
|
32
|
+
<string>__KANBAN_SYSTEM_DIR__/lib/watch/scheduler.cjs</string>
|
|
33
|
+
</array>
|
|
34
|
+
|
|
35
|
+
<key>WorkingDirectory</key>
|
|
36
|
+
<string>__KANBAN_SYSTEM_DIR__</string>
|
|
37
|
+
|
|
38
|
+
<key>EnvironmentVariables</key>
|
|
39
|
+
<dict>
|
|
40
|
+
<key>WATCH_INTERVAL_MS</key>
|
|
41
|
+
<string>300000</string>
|
|
42
|
+
<key>NODE_ENV</key>
|
|
43
|
+
<string>production</string>
|
|
44
|
+
</dict>
|
|
45
|
+
|
|
46
|
+
<key>RunAtLoad</key>
|
|
47
|
+
<true/>
|
|
48
|
+
|
|
49
|
+
<key>KeepAlive</key>
|
|
50
|
+
<dict>
|
|
51
|
+
<key>SuccessfulExit</key>
|
|
52
|
+
<false/>
|
|
53
|
+
<key>Crashed</key>
|
|
54
|
+
<true/>
|
|
55
|
+
</dict>
|
|
56
|
+
|
|
57
|
+
<key>ThrottleInterval</key>
|
|
58
|
+
<integer>30</integer>
|
|
59
|
+
|
|
60
|
+
<key>StandardOutPath</key>
|
|
61
|
+
<string>__USER_HOME__/Library/Logs/kanban-system/watch.out.log</string>
|
|
62
|
+
|
|
63
|
+
<key>StandardErrorPath</key>
|
|
64
|
+
<string>__USER_HOME__/Library/Logs/kanban-system/watch.err.log</string>
|
|
65
|
+
</dict>
|
|
66
|
+
</plist>
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# kanban-system pre-push hook — runs the deploy gate before allowing a push.
#
# Install into YOUR APPLICATION REPO (the one config.js → repoPath points at):
#   ln -sf /absolute/path/to/kanban-system/hooks/pre-push.sample .git/hooks/pre-push
#   chmod +x /absolute/path/to/kanban-system/hooks/pre-push.sample
#   (Or copy it, if you prefer not to symlink.)
#
# Tell it where the harness lives — either edit KANBAN_SYSTEM_DIR below, or export it:
#   export KANBAN_SYSTEM_DIR=/absolute/path/to/kanban-system
#
# Bypass (audited — logs to kanban-system/data/runs/overrides.jsonl):
#   KANBAN_GATE_BYPASS=1 git push
# Bypass entirely (not logged):
#   git push --no-verify

set -e

# --- where is the harness? -----------------------------------------------------
KANBAN_SYSTEM_DIR="${KANBAN_SYSTEM_DIR:-}"
if [ -z "$KANBAN_SYSTEM_DIR" ]; then
  # Best effort: if this hook is a symlink, follow it to find the harness root.
  # (`readlink -f` is GNU; on BSD/macOS it can fail, hence the `$0` fallback.)
  HOOK_PATH="$(readlink -f "$0" 2>/dev/null || echo "$0")"
  KANBAN_SYSTEM_DIR="$(cd "$(dirname "$HOOK_PATH")/.." 2>/dev/null && pwd || true)"
fi
GATE_RUNNER="$KANBAN_SYSTEM_DIR/lib/gate/index.cjs"

# Missing harness is a soft failure: never brick `git push` over a bad install.
if [ -z "$KANBAN_SYSTEM_DIR" ] || [ ! -f "$GATE_RUNNER" ]; then
  echo "[kanban-system] gate runner not found (set KANBAN_SYSTEM_DIR) — skipping gate"
  exit 0
fi

# --- audited bypass ------------------------------------------------------------
if [ "$KANBAN_GATE_BYPASS" = "1" ]; then
  echo "[kanban-system] BYPASS — gate skipped (audit logged)"
  OVERRIDE_LOG="$KANBAN_SYSTEM_DIR/data/runs/overrides.jsonl"
  mkdir -p "$(dirname "$OVERRIDE_LOG")"
  printf '{"timestamp":"%s","branch":"%s","user":"%s","reason":"KANBAN_GATE_BYPASS=1"}\n' \
    "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
    "$(git rev-parse --abbrev-ref HEAD)" \
    "$(git config user.email || whoami)" \
    >> "$OVERRIDE_LOG"
  exit 0
fi

# --- run the gate --------------------------------------------------------------
BRANCH="$(git rev-parse --abbrev-ref HEAD)"
echo "[kanban-system] running pre-deploy gate for branch '$BRANCH'..."
# BUG FIX: under `set -e`, a bare `node "$GATE_RUNNER"` followed by
# `GATE_EXIT=$?` aborted the whole hook on gate failure BEFORE the status was
# captured, so the "gate FAILED" / report-path / bypass-hint messages below
# never printed. Capturing the status in an `||` list suppresses errexit for
# this one command while still recording the real exit code.
GATE_EXIT=0
node "$GATE_RUNNER" || GATE_EXIT=$?

if [ $GATE_EXIT -ne 0 ]; then
  echo ""
  echo "[kanban-system] ✗ gate FAILED (exit $GATE_EXIT). Push BLOCKED."
  echo "[kanban-system] reports: $KANBAN_SYSTEM_DIR/data/runs/gate-*/"
  echo "[kanban-system] to bypass (logged): KANBAN_GATE_BYPASS=1 git push"
  exit $GATE_EXIT
fi

echo "[kanban-system] ✓ gate passed. Push allowed."
exit 0
|
package/lib/config.cjs
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* kanban-system config loader.
|
|
4
|
+
*
|
|
5
|
+
* Resolution order:
|
|
6
|
+
* 1. <repo-root>/config.js (your edited copy — gitignored)
|
|
7
|
+
* 2. <repo-root>/config.example.js (template fallback, so the server boots)
|
|
8
|
+
* 3. env-var overrides on top of whatever loaded
|
|
9
|
+
*
|
|
10
|
+
* Also loads <repo-root>/.env (without a dependency) so launchd / cron — which
|
|
11
|
+
* don't source your shell — still see SLACK_*, SENTRY_*, etc.
|
|
12
|
+
*
|
|
13
|
+
* `require("../lib/config.cjs")` from anywhere in the repo returns the merged
|
|
14
|
+
* config object. The repo root is the parent of lib/.
|
|
15
|
+
*/
|
|
16
|
+
const fs = require("fs");
|
|
17
|
+
const path = require("path");
|
|
18
|
+
|
|
19
|
+
const REPO_ROOT = path.resolve(__dirname, "..");
|
|
20
|
+
|
|
21
|
+
// ── Load <repo-root>/.env without external deps ──────────────────────────────
|
|
22
|
+
(function loadDotEnv() {
  // Minimal .env parser (no dependency): `KEY=VALUE` lines, `#` comments,
  // optional single or double quotes around the value. Existing NON-EMPTY
  // process.env values win, so a shell-exported variable always overrides
  // the file — important for launchd/cron, which don't source your shell.
  const envPath = path.join(REPO_ROOT, ".env");
  if (!fs.existsSync(envPath)) return;
  for (let line of fs.readFileSync(envPath, "utf-8").split(/\r?\n/)) {
    line = line.trim();
    if (!line || line.startsWith("#")) continue;
    const eq = line.indexOf("=");
    if (eq < 0) continue;
    const key = line.slice(0, eq).trim();
    // BUG FIX: trim the raw value BEFORE checking for surrounding quotes.
    // Previously `KEY = "value"` (spaces around '=') kept the leading space,
    // which defeated the quote-stripping below and also left stray trailing
    // whitespace on unquoted values.
    let val = line.slice(eq + 1).trim();
    if ((val.startsWith('"') && val.endsWith('"')) || (val.startsWith("'") && val.endsWith("'"))) {
      val = val.slice(1, -1);
    }
    if (process.env[key] === undefined || process.env[key] === "") process.env[key] = val;
  }
})();
|
|
38
|
+
|
|
39
|
+
// ── Load config.js (or fall back to config.example.js) ───────────────────────
|
|
40
|
+
/**
 * Locate and load the project configuration.
 *
 * Prefers `<repo-root>/config.js` (the user's gitignored copy). Falls back to
 * `<repo-root>/config.example.js` so the server can still boot out of the box,
 * emitting a one-line warning unless KANBAN_QUIET_CONFIG is set. If neither
 * file exists, returns an empty config.
 *
 * @returns {{cfg: object, source: string|null}} the loaded config object
 *   (never null) and the absolute path it was loaded from, or null.
 */
function loadProjectConfig() {
  const candidates = [
    { file: path.join(REPO_ROOT, "config.js"), isFallback: false },
    { file: path.join(REPO_ROOT, "config.example.js"), isFallback: true },
  ];

  for (const { file, isFallback } of candidates) {
    if (!fs.existsSync(file)) continue;
    const loaded = require(file);
    if (isFallback && !process.env.KANBAN_QUIET_CONFIG) {
      console.warn("[config] config.js not found — using config.example.js. Copy it: cp config.example.js config.js");
    }
    // Guard against a config module that exports a falsy value.
    return { cfg: loaded || {}, source: file };
  }

  return { cfg: {}, source: null };
}
|
|
57
|
+
|
|
58
|
+
const { cfg, source } = loadProjectConfig();
|
|
59
|
+
|
|
60
|
+
// ── Env-var overrides ────────────────────────────────────────────────────────
|
|
61
|
+
const port =
|
|
62
|
+
(process.env.PORT && parseInt(process.env.PORT, 10)) ||
|
|
63
|
+
cfg.kanbanPort ||
|
|
64
|
+
8080;
|
|
65
|
+
|
|
66
|
+
// Slack wiring: hard defaults, overlaid by config.js → slack, overlaid by env.
const slack = Object.assign(
  {
    botToken: "",
    appToken: "",
    channelId: "",
    webhookUrl: "",
    adminUsers: [],
    command: "/kanban",
  },
  cfg.slack || {},
);
if (process.env.SLACK_BOT_TOKEN) slack.botToken = process.env.SLACK_BOT_TOKEN;
if (process.env.SLACK_APP_TOKEN) slack.appToken = process.env.SLACK_APP_TOKEN;
if (process.env.SLACK_CHANNEL_ID) slack.channelId = process.env.SLACK_CHANNEL_ID;
if (process.env.SLACK_AGENT_WEBHOOK) slack.webhookUrl = process.env.SLACK_AGENT_WEBHOOK;
// Trim each entry so "U1, U2" parses cleanly — consistent with how
// TELEGRAM_ALLOWED_CHAT_IDS is parsed below.
if (process.env.SLACK_ADMIN_USERS) slack.adminUsers = process.env.SLACK_ADMIN_USERS.split(",").map((s) => s.trim()).filter(Boolean);
if (process.env.SLACK_COMMAND) slack.command = process.env.SLACK_COMMAND;
|
84
|
+
// ── Telegram (optional — Ops Thread mirror panel) ────────────────────────────
// Drop in TELEGRAM_BOT_TOKEN + TELEGRAM_CHAT_ID and the kanban server starts
// mirroring the Ops Thread panel to that Telegram chat (and pulls operator
// replies back via getUpdates polling). Both blank ⇒ panel still works locally
// as a kanban-only chat; it just doesn't sync with Telegram.
const telegram = Object.assign(
  {
    botToken: "", // BotFather → token
    chatId: "", // DM chat id (send any DM to your bot, then GET /api/telegram/whoami)
    allowedChatIds: [], // optional allowlist; empty ⇒ chatId only
    pollEnabled: true, // start the long-poll worker on boot when token+chatId present
    pollIntervalMs: 1500, // gap between long-poll cycles (long-poll itself blocks 25s)
  },
  cfg.telegram || {},
);
if (process.env.TELEGRAM_BOT_TOKEN) telegram.botToken = process.env.TELEGRAM_BOT_TOKEN;
if (process.env.TELEGRAM_CHAT_ID) telegram.chatId = process.env.TELEGRAM_CHAT_ID;
if (process.env.TELEGRAM_ALLOWED_CHAT_IDS) telegram.allowedChatIds = process.env.TELEGRAM_ALLOWED_CHAT_IDS.split(",").map((s) => s.trim()).filter(Boolean);
if (process.env.TELEGRAM_POLL_ENABLED) telegram.pollEnabled = process.env.TELEGRAM_POLL_ENABLED !== "0";
if (process.env.TELEGRAM_POLL_INTERVAL_MS) {
  // Guard: ignore an unparseable or non-positive override instead of storing NaN.
  const ms = parseInt(process.env.TELEGRAM_POLL_INTERVAL_MS, 10);
  if (Number.isFinite(ms) && ms > 0) telegram.pollIntervalMs = ms;
}
|
105
|
+
// Final resolved config: file values + env overrides, exported to the harness.
// Guard: GATE_TIMEOUT_MS set to garbage previously produced gateTimeoutMs: NaN;
// fall back to the 10-minute default instead.
const gateTimeoutMsRaw = parseInt(process.env.GATE_TIMEOUT_MS || "600000", 10);
const config = {
  repoRoot: REPO_ROOT,
  configSource: source,
  projectName: cfg.projectName || path.basename(cfg.repoPath || REPO_ROOT),
  // Absolute path to the application repo this harness drives. Defaults to the
  // harness repo itself if unset (so demos work), but you should set it.
  repoPath: cfg.repoPath ? path.resolve(cfg.repoPath) : REPO_ROOT,
  port,
  deployCommands: cfg.deployCommands || [],
  buildOutputDir: cfg.buildOutputDir || null,
  agents: cfg.agents || [],
  detectors: cfg.detectors || [],
  gateTimeoutMs: Number.isFinite(gateTimeoutMsRaw) ? gateTimeoutMsRaw : 600000,
  slack,
  telegram,
};

module.exports = config;
|
124
|
+
// Allow `node lib/config.cjs` to print the resolved config.
// Secret-bearing fields are collapsed to "(set)" / "" so tokens never hit stdout.
if (require.main === module) {
  const mask = (v) => (v ? "(set)" : "");
  const printable = JSON.parse(JSON.stringify(config)); // deep copy — never mutate the live config
  printable.slack = {
    ...printable.slack,
    botToken: mask(printable.slack.botToken),
    appToken: mask(printable.slack.appToken),
    webhookUrl: mask(printable.slack.webhookUrl),
  };
  printable.telegram = {
    ...printable.telegram,
    botToken: mask(printable.telegram.botToken),
  };
  console.log(JSON.stringify(printable, null, 2));
}
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Detector template — copy this to lib/detect/<name>.cjs, register it in
|
|
3
|
+
* lib/watch/scheduler.cjs (the `detectors` map), add a `{ "detector": "<name>" ... }`
|
|
4
|
+
* block to lib/detect/rules.json, and enable it in config.js → detectors.
|
|
5
|
+
*
|
|
6
|
+
* A detector exports `run(ruleSet, state)` and returns an array of alert objects.
|
|
7
|
+
* It must:
|
|
8
|
+
* - degrade gracefully when its credentials/config are missing (return a single
|
|
9
|
+
* low-severity "config-missing" alert, deduped, instead of throwing);
|
|
10
|
+
* - never throw out of `run` for a transient API error — catch it and return a
|
|
11
|
+
* low-severity "api-error" alert;
|
|
12
|
+
* - keep state it needs (baselines, "already seen" markers) on the `state` object —
|
|
13
|
+
* the scheduler persists it to data/runs/watch-state.json between sweeps.
|
|
14
|
+
*
|
|
15
|
+
* Alert shape (only `source`, `signal`, `severity` are required):
|
|
16
|
+
* {
|
|
17
|
+
* source: "<detector name>",
|
|
18
|
+
* signal: "<machine-readable signal id, matches rules.json>",
|
|
19
|
+
* severity: "low" | "medium" | "high" | "critical",
|
|
20
|
+
* message: "human-readable one-liner",
|
|
21
|
+
* threshold: "what the limit was", // optional, for the task body
|
|
22
|
+
* value: "what we observed", // optional
|
|
23
|
+
* routesTo: "frontend-agent", // optional; which agent the task goes to
|
|
24
|
+
* evidence: { ...arbitrary JSON... } // optional; gets dumped into the task
|
|
25
|
+
* }
|
|
26
|
+
*
|
|
27
|
+
* Example skeleton for "datadog" — replace with your monitoring of choice
|
|
28
|
+
* (CloudWatch, Prometheus, a /healthz endpoint, a custom metrics API, …):
|
|
29
|
+
*/
|
|
30
|
+
// Credentials come straight from the environment (env values are strings or
// undefined, so defaulting to "" here is exactly equivalent to `|| ""`).
const { DATADOG_API_KEY: API_KEY = "", DATADOG_APP_KEY: APP_KEY = "" } = process.env;
|
33
|
+
/**
 * Detector sweep entry point (see the contract documented in the header above).
 *
 * @param {object} ruleSet - thresholds for this detector (from lib/detect/rules.json).
 * @param {object} state - persisted between sweeps by the scheduler; baselines
 *   and dedupe timestamps live here. May arrive fresh/empty on first run.
 * @returns {Promise<Array<object>>} zero or more alert objects; never throws.
 */
async function run(ruleSet, state) {
  // 1. Config check — surface the gap once a day, don't crash.
  if (!API_KEY || !APP_KEY) {
    state.alerts = state.alerts || {}; // tolerate a fresh state object (no TypeError)
    const k = "datadog:config-missing";
    if (state.alerts[k] && Date.now() - state.alerts[k] < 24 * 3600 * 1000) return [];
    // Record the emission time — without this write the 24h dedupe above can
    // never fire and the alert repeats every sweep.
    state.alerts[k] = Date.now();
    return [{
      source: "datadog", signal: "config-missing", severity: "low",
      message: "DATADOG_API_KEY / DATADOG_APP_KEY not set — this detector is disabled.",
      threshold: "env present", value: "missing", routesTo: "orchestrator",
      evidence: { needs: ["DATADOG_API_KEY", "DATADOG_APP_KEY"] },
    }];
  }

  const alerts = [];
  try {
    // 2. Fetch a metric / query a log search / hit a health endpoint.
    // const data = await fetch("https://api.datadoghq.com/api/v1/query?...", { headers: { "DD-API-KEY": API_KEY, "DD-APPLICATION-KEY": APP_KEY } }).then(r => r.json());

    // 3. Compare against a threshold (and/or a rolling baseline you keep on `state`).
    // const baseline = state["datadog:baseline:errors"] || null;
    // if (baseline && observed > baseline * 3) alerts.push({ source: "datadog", signal: "...", severity: "high", ... });
    // state["datadog:baseline:errors"] = baseline ? baseline * 0.8 + observed * 0.2 : observed;

    // 4. Optional heartbeat (deduped, so the operator knows polling is alive).
  } catch (e) {
    // Transient API failure — report it as a low-severity alert, never throw.
    return [{ source: "datadog", signal: "api-error", severity: "low", message: `Datadog API error: ${e.message}`, routesTo: "orchestrator", evidence: { error: e.message } }];
  }
  return alerts;
}
|
62
|
+
|
|
63
|
+
// Detector public surface: the scheduler calls run(ruleSet, state) each sweep
// (per the registration steps described in the header comment above).
module.exports = { run };