npm - @zibby/workflow-templates - Versions diffs - 0.7.1 → 0.9.1 - Mend

@zibby/workflow-templates 0.7.1 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/browser-test-automation/icon.png +0 -0
package/code-analysis/icon.png +0 -0
package/generate-test-cases/icon.png +0 -0
package/index.js +353 -3
package/notify-lark/icon.png +0 -0
package/notify-lark/package.json +2 -1
package/notify-notion/icon.png +0 -0
package/notify-slack/icon.png +0 -0
package/notify-slack/package.json +2 -1
package/package.json +4 -1
package/pipeline-supervisor/README.md +51 -0
package/pipeline-supervisor/graph.mjs +75 -0
package/pipeline-supervisor/icon.png +0 -0
package/pipeline-supervisor/node_modules/.vite/vitest/da39a3ee5e6b4b0d3255bfef95601890afd80709/results.json +1 -0
package/pipeline-supervisor/nodes/notify-node.js +162 -0
package/pipeline-supervisor/nodes/propose-node.js +91 -0
package/pipeline-supervisor/nodes/scan-pipelines-node.js +316 -0
package/pipeline-supervisor/package.json +19 -0
package/pipeline-supervisor/state.js +151 -0
package/sentry-triage/graph.mjs +25 -18
package/sentry-triage/icon.png +0 -0
package/sentry-triage/nodes/dispatch-node.js +120 -59
package/browser-test-automation/node_modules/.vite/vitest/da39a3ee5e6b4b0d3255bfef95601890afd80709/results.json +0 -1
package/code-analysis/node_modules/.vite/vitest/da39a3ee5e6b4b0d3255bfef95601890afd80709/results.json +0 -1

package/pipeline-supervisor/nodes/notify-node.js ADDED Viewed

@@ -0,0 +1,162 @@
+/**
+ * notify — LLM + SKILLS.CHAT_NOTIFY. Posts ONE human-reviewable card
+ * summarizing the improvement proposals to the configured chat destination.
+ * This is the "NOTIFY" half of READ → PROPOSE → NOTIFY: the human reads the
+ * card and acts manually. The supervisor never applies a change itself in v1.
+ *
+ * Mirrors sentry-triage's dispatch node: chatNotifySkill.resolve() picks
+ * slack or lark from which env var is set (SLACK_CHANNEL vs LARK_RECEIVE_ID),
+ * so the LLM only ever sees one provider's tools.
+ *
+ * ENV tab config — required:
+ *   SLACK_CHANNEL OR LARK_RECEIVE_ID  — provider selector + destination
+ * ENV tab config — optional:
+ *   SLACK_MENTIONS / LARK_MENTIONS    — JSON array of mentions on the card
+ */
+import { z, SKILLS } from '@zibby/core';
+const DispatchedRecordSchema = z.object({
+  status: z.enum(['sent', 'skipped', 'failed']),
+  // nullish (not optional) on purpose — the LLM emits explicit null rather
+  // than omitting keys; .optional() would reject null and fail the node.
+  recipient: z.object({
+    kind: z.enum(['channel', 'user_dm', 'usergroup']).nullish(),
+    id: z.string().nullish(),
+    label: z.string().nullish(),
+  }).nullish(),
+  proposalCount: z.number().nullish(),
+  messageTs: z.string().nullish(),   // Slack
+  messageId: z.string().nullish(),   // Lark
+  detail: z.string().nullish(),
+});
+const NotifyOutputSchema = z.object({
+  dispatched: z.array(DispatchedRecordSchema),
+  summary: z.object({
+    total: z.number(),
+    sent: z.number(),
+    skipped: z.number(),
+    failed: z.number(),
+  }),
+});
+const CHANGE_KIND_LABEL = {
+  add_test_gate: 'Add a test gate',
+  tweak_prompt: 'Tweak the prompt',
+  add_human_approval_gate: 'Add a human-approval gate',
+  drop_redundant_step: 'Drop a redundant step',
+  other: 'Other',
+};
+/**
+ * Build the prompt text for the notify node from the workflow state.
+ *
+ * Reads `state.propose_improvements.proposals` and the lookback window
+ * (`state.scan_pipelines.lookbackHours`, then `state.lookbackHours`, then 24),
+ * plus the SLACK_CHANNEL / LARK_RECEIVE_ID environment variables. When both
+ * variables are set, Slack is chosen because it is checked first.
+ *
+ * @param {object} [state] - Workflow state; missing keys fall back to defaults.
+ * @returns {string} A fixed "return this envelope, call no tools" instruction
+ *   when there are no proposals; otherwise posting instructions for the
+ *   selected provider with the proposals inlined as JSON.
+ * @throws {Error} When there are proposals but neither SLACK_CHANNEL nor
+ *   LARK_RECEIVE_ID is set.
+ */
+const NOTIFY_PROMPT = (state = {}) => {
+  const proposals = state?.propose_improvements?.proposals || [];
+  const scan = state?.scan_pipelines || {};
+  const lookbackHours = scan.lookbackHours || state?.lookbackHours || 24;
+  const slackChannel = process.env.SLACK_CHANNEL || '';
+  const larkReceiveId = process.env.LARK_RECEIVE_ID || '';
+  // ── No-op short-circuit ─────────────────────────────────────────
+  // Nothing flagged → keep the run green without a model round-trip or
+  // forcing channel setup. Return the empty envelope verbatim.
+  // NOTE(review): this envelope reports "total": 0 next to one skipped
+  // record, while the posted-card example below uses "total": 1 for one
+  // record — confirm which convention the consumer of `summary` expects.
+  if (proposals.length === 0) {
+    return `pipeline-supervisor found no problem pipelines this run — nothing to propose.
+Return this exact JSON envelope and call no tools:
+\`\`\`json
+{ "dispatched": [{ "status": "skipped", "proposalCount": 0, "detail": "no flagged pipelines" }], "summary": { "total": 0, "sent": 0, "skipped": 1, "failed": 0 } }
+\`\`\`
+`;
+  }
+  // ── Provider selection ──────────────────────────────────────────
+  // The first configured destination wins; the tool name and mentions
+  // variable follow from it.
+  let provider, postTool, channelId, mentionsRaw;
+  if (slackChannel) {
+    provider = 'slack';
+    postTool = 'slack_post_message';
+    channelId = slackChannel;
+    mentionsRaw = process.env.SLACK_MENTIONS || '[]';
+  } else if (larkReceiveId) {
+    provider = 'lark';
+    postTool = 'lark_send_message';
+    channelId = larkReceiveId;
+    mentionsRaw = process.env.LARK_MENTIONS || '[]';
+  } else {
+    throw new Error(
+      'pipeline-supervisor has proposals to post but no destination configured. ' +
+      'Go to Project Settings → ENV and set ONE of:\n' +
+      '  - SLACK_CHANNEL=#your-channel    (uses connected Slack integration)\n' +
+      '  - LARK_RECEIVE_ID=oc_xxxxxxxx    (uses connected Lark integration)'
+    );
+  }
+  // Mentions are best-effort: malformed JSON or a non-array value means
+  // "no mentions" rather than a failed run.
+  let mentions;
+  try { mentions = JSON.parse(mentionsRaw); } catch { mentions = []; }
+  if (!Array.isArray(mentions)) mentions = [];
+  // Under 48h the window is phrased in hours, otherwise in rounded days.
+  const windowLabel = lookbackHours < 48
+    ? `the past ${lookbackHours} hours`
+    : `the past ${Math.round(lookbackHours / 24)} days`;
+  // Provider-specific writing instructions: Block Kit card for Slack,
+  // plain conversational text for Lark.
+  const writeGuide = provider === 'slack'
+    ? `# How to post it — a Slack review card
+Post ONCE with \`slack_post_message({ channel, text, blocks })\`. \`text\` = a one-line fallback. \`blocks\` = real Block Kit objects only:
+1. \`header\` — { "type": "header", "text": { "type": "plain_text", "text": "🛠️ Pipeline Supervisor — ${windowLabel}", "emoji": true } }
+2. \`context\` — one line: how many pipelines flagged, scanned over ${windowLabel}.
+3. Per proposal — a \`section\` then a small \`context\`:
+   { "type": "divider" }
+   { "type": "section", "text": { "type": "mrkdwn", "text": "*<pipeline>* — <problem>\\n*Suggestion (${'`'}<changeKind label>${'`'}):* <suggestion>" } }
+   { "type": "context", "elements": [{ "type": "mrkdwn", "text": "↳ <evidence — the concrete metric>" }] }
+4. final \`context\` — make clear these are PROPOSALS for a human to review and apply; the supervisor did NOT change anything.
+- header text is plain_text; section & context are mrkdwn.
+- Real Block Kit types only (header / section / divider / context).`
+    : `# How to write it — a Lark review note, talk like a teammate
+Post ONCE with \`lark_send_message({ receive_id, msg_type:"text", content })\`. Open with one sentence about ${windowLabel} and how many pipelines you flagged. Then, per proposal: the pipeline name, the problem, your suggested change (say which of the four moves it is), and the evidence number. End by making clear these are PROPOSALS for a human to review and apply — the supervisor changed nothing. No form blocks; sound like a person.`;
+  // Final prompt: destination, framing, label table, provider guide, the
+  // expected output envelope, and the proposals themselves as JSON.
+  return `You are the notify node of pipeline-supervisor. Post ONE chat message with the **${postTool}** tool summarizing the improvement proposals for a human to review.
+# Destination
+Channel/receive_id: ${JSON.stringify(channelId)} (${provider}). Post with \`${postTool}\`.
+${mentions.length > 0 ? `Prepend these mentions: ${JSON.stringify(mentions.join(' '))}` : ''}
+# Framing (important)
+These are PROPOSALS. The supervisor read other pipelines' run history and is SUGGESTING changes a human will review and apply by hand. Do NOT imply anything was already changed. Each card line should read like "Pipeline X failed 4/5 runs on step Y — I'd suggest <change>. (review)".
+# changeKind → human label
+${Object.entries(CHANGE_KIND_LABEL).map(([k, v]) => `- ${k} → ${v}`).join('\n')}
+${writeGuide}
+# Output (outputSchema-enforced)
+Return ONE record for the message you posted (status "sent"), or "failed" with a \`detail\`. \`proposalCount\` = number of proposals in the card. \`recipient\` records where it went.
+\`\`\`json
+{
+  "dispatched": [
+    { "status": "sent", "recipient": { "kind": "channel", "id": ${JSON.stringify(channelId)} }, "proposalCount": ${proposals.length}${provider === 'slack' ? ',\n      "messageTs": "1716109330.555"' : ',\n      "messageId": "om_xxxxx"'} }
+  ],
+  "summary": { "total": 1, "sent": 1, "skipped": 0, "failed": 0 }
+}
+\`\`\`
+# Proposals to post
+\`\`\`json
+${JSON.stringify(proposals, null, 2)}
+\`\`\`
+# Rules
+- ONE message → ONE \`sent\` record.
+- Don't invent pipelines, metrics, or suggestions — only what's in the data above.
+- Keep it tight. If there are 2 proposals, a short card is the right answer.
+`;
+};
+/**
+ * Node definition for the notify step: an LLM node named 'notify' that is
+ * given the CHAT_NOTIFY skill, prompted by NOTIFY_PROMPT, and whose output
+ * is declared as NotifyOutputSchema.
+ */
+export const notifyNode = {
+  name: 'notify',
+  skills: [SKILLS.CHAT_NOTIFY],
+  outputSchema: NotifyOutputSchema,
+  prompt: NOTIFY_PROMPT,
+};

package/pipeline-supervisor/nodes/propose-node.js ADDED Viewed

@@ -0,0 +1,91 @@
+/**
+ * propose_improvements — LLM. Reads the per-pipeline health summary from
+ * scan_pipelines and emits ONE concrete, reviewable improvement proposal
+ * per FLAGGED pipeline. No tools — everything it needs is inlined as JSON.
+ *
+ * This is the "propose" half of READ → PROPOSE → NOTIFY. It does NOT touch
+ * any other workflow's graph. It only describes a change a human can apply.
+ *
+ * ─────────────────────────────────────────────────────────────────────────
+ * TODO (future, DELIBERATELY NOT IMPLEMENTED in v1 — the safe L3 boundary):
+ *   Auto-PATCH the target pipeline's graph. When we promote this template
+ *   from "notify only" to "self-iterating", a new node AFTER human approval
+ *   would call the workflow-update API to actually apply an accepted
+ *   `changeKind` (e.g. insert a test-gate node, edit a prompt). That step
+ *   must be gated behind explicit human approval + snapshot/dry-run/verify/
+ *   rollback (see MEMORY: app-upgrade-strategy-agentic). v1 stops at the
+ *   proposal so a human reviews and applies the change by hand.
+ * ─────────────────────────────────────────────────────────────────────────
+ */
+import { z } from '@zibby/core';
+const ProposalSchema = z.object({
+  workflowType: z.string(),
+  problem: z.string(),
+  changeKind: z.enum([
+    'add_test_gate',
+    'tweak_prompt',
+    'add_human_approval_gate',
+    'drop_redundant_step',
+    'other',
+  ]),
+  suggestion: z.string(),
+  evidence: z.string().optional(),
+  confidence: z.number().min(0).max(1).optional(),
+});
+const ProposeOutputSchema = z.object({
+  proposals: z.array(ProposalSchema),
+});
+// Static instruction text for the propose_improvements LLM node. It is the
+// run-independent part of the prompt: the role, the allowed `changeKind`
+// values and when to pick each, the quality bar for a proposal, and the
+// rules. PROPOSE_PROMPT appends the per-run context and data after it.
+const GUIDE = `You are the propose_improvements node of pipeline-supervisor — a workflow that watches a Zibby project's OTHER pipelines and proposes concrete fixes. This is "Zibby managing Zibby."
+You are given a per-pipeline health summary as JSON below. Each entry is one pipeline (a workflow type) with its recent run stats: total / failed / succeeded / running, failRate, medianDurationMs, a worstRun example, and whether it's \`flagged\` (+ \`flagReason\`).
+# Your job
+For EACH pipeline where \`flagged === true\`, emit ONE proposal. Do NOT propose anything for un-flagged pipelines. If nothing is flagged, return an empty \`proposals\` array.
+# Pick ONE concrete change per problem — \`changeKind\` must be one of:
+- **add_test_gate** — the pipeline ships broken output / fails late. Propose inserting a validation/test step that catches the failure earlier (before the expensive/irreversible step).
+- **tweak_prompt** — an LLM node is making the same mistake repeatedly (e.g. wrong format, hallucinated tool call). Propose a specific prompt change.
+- **add_human_approval_gate** — the pipeline takes a risky/irreversible action and keeps getting it wrong. Propose a human-approval gate before that step.
+- **drop_redundant_step** — a step adds latency or failure surface with no value (e.g. an LLM round-trip that adds no judgment). Propose dropping it. Use this for clear "slow outlier" flags.
+- **other** — only when none of the above fit; explain in \`suggestion\`.
+# Each proposal must be:
+- **Specific**: name the pipeline, the symptom, and the exact change. Not "improve reliability" — instead "add a JSON-schema validation gate after the 'generate' node; 3 of the last 4 runs failed there with a malformed-output error."
+- **Evidence-backed**: put the concrete number / worstRun detail in \`evidence\` ("failRate 75% over 4 runs; worst run exec_abc failed on step 'deploy'"). Pull it straight from the data — never invent a metric.
+- **Reviewable, not auto-applied**: phrase \`suggestion\` as a recommendation a human will read and apply. You are NOT editing any graph.
+- **confidence** reflects how clean the signal is. A pipeline failing 4/4 on the same step → 0.9. A borderline slow outlier → 0.5.
+# Rules
+- ONE proposal per flagged pipeline. No duplicates.
+- Only use pipelines/numbers present in the data block. Don't invent pipelines, steps, or error messages.
+- \`problem\` is one sentence (the symptom). \`suggestion\` is one-to-three sentences (the fix).
+- Temperature 0. This is analysis, not creative writing.
+- Call NO tools — you have everything you need below.`;
+const PROPOSE_PROMPT = (state = {}) => {
+  const pipelines = state?.scan_pipelines?.pipelines || [];
+  const flagged = pipelines.filter((p) => p.flagged);
+  return `${GUIDE}
+## Context for this run
+- Scanned project: ${state?.scan_pipelines?.projectId || '(unknown)'}
+- Lookback: ${state?.scan_pipelines?.lookbackHours || '?'}h
+- Pipelines analyzed: ${pipelines.length}; flagged as problems: ${flagged.length}
+## Pipeline health summary (propose ONLY for flagged === true)
+\`\`\`json
+${JSON.stringify(pipelines, null, 2)}
+\`\`\`
+`;
+};
+/**
+ * Node definition for the propose step: an LLM node named
+ * 'propose_improvements', prompted by PROPOSE_PROMPT, whose output is
+ * declared as ProposeOutputSchema. No skills are attached.
+ */
+export const proposeNode = {
+  name: 'propose_improvements',
+  outputSchema: ProposeOutputSchema,
+  prompt: PROPOSE_PROMPT,
+};

package/pipeline-supervisor/nodes/scan-pipelines-node.js ADDED Viewed

@@ -0,0 +1,316 @@
+/**
+ * scan_pipelines — DETERMINISTIC. Reads the project's recent executions
+ * across ALL pipelines via the Zibby REST API, then rolls them up per
+ * pipeline (workflow type) into a health summary the proposer reasons over.
+ *
+ * ── How the supervisor reads OTHER pipelines' results (the chosen mechanism) ──
+ *
+ * Mechanism: a DIRECT, authed HTTPS call to the Zibby REST API
+ *   GET {ZIBBY_ACCOUNT_API_URL|api-prod.zibby.app}/executions?projectId=<id>&limit=<n>
+ * (the same `listExecutions` route the dashboard + MCP server use), carrying
+ *   Authorization: Bearer <ZIBBY_PAT>
+ *
+ * Auth — why a user PAT, and NOT the injected PROJECT_API_TOKEN:
+ *   The executor injects PROJECT_API_TOKEN (a `zby_*` PROJECT token) into
+ *   every Fargate task. That token authenticates as the PROJECT
+ *   (authType:'project') and carries NO userId. But every cross-pipeline
+ *   READ route — /executions, /jobs/:projectId, /all/:projectId — pulls
+ *   `userId` out of the authorizer context and 401/403s when it's absent
+ *   (executions.js listExecutions: `if (!userId) return 401`;
+ *   workflow-logs.js: verifyProjectAccess(userId, …)). The remote MCP server
+ *   (mcp-server.js) goes further and validates a `zby_pat_*` PAT specifically.
+ *   So the project token literally cannot read these routes.
+ *
+ *   The credential that works is a USER personal access token (zby_pat_…),
+ *   supplied at deploy time as ZIBBY_PAT in the ENV tab. It resolves to a
+ *   userId via the authorizer's PAT path, and verifyProjectAccess then
+ *   confirms that user can see the supervised project. This is the same
+ *   credential class the MCP server requires, so the auth model is identical
+ *   whether you reach the data via REST (this node) or via the MCP tools
+ *   (zibby_list_executions / zibby_get_all_workflow_logs).
+ *
+ * Why REST over the MCP tools:
+ *   - No MCP client to stand up inside the workflow process; one fetch().
+ *   - The MCP `zibby_list_executions` tool is a thin proxy to THIS SAME
+ *     REST route, so we lose nothing by calling it directly.
+ *   - Deterministic + free: no LLM round-trip to drive a tool call for a
+ *     pure data pull.
+ *
+ * Per-pipeline rollup:
+ *   - "pipeline" = one workflow type/slug in the project. We group the
+ *     recent executions by `workflowType` and compute total / failed /
+ *     succeeded / running, failRate, and a median completed-run duration.
+ *   - A pipeline is `flagged` when failRate >= minFailRate (with >= 3
+ *     terminal runs, so a single fluke doesn't trip it) OR its median
+ *     duration is > 3× the median of all pipelines' medians ("slow" outlier).
+ *   - worstRun cites the single worst recent run so the proposer has a
+ *     concrete example to point at. failedStep/errorSummary are best-effort
+ *     — populated from whatever the execution row carries; absent is fine.
+ */
+import { z } from 'zod';
+const PipelineHealthSchema = z.object({
+  workflowType: z.string(),
+  workflowUuid: z.string().optional(),
+  total: z.number(),
+  failed: z.number(),
+  succeeded: z.number(),
+  running: z.number(),
+  failRate: z.number(),
+  medianDurationMs: z.number().optional(),
+  worstRun: z.object({
+    executionId: z.string().optional(),
+    status: z.string().optional(),
+    durationMs: z.number().optional(),
+    failedStep: z.string().optional(),
+    errorSummary: z.string().optional(),
+    startedAt: z.string().optional(),
+  }).optional(),
+  flagged: z.boolean(),
+  flagReason: z.string().optional(),
+});
+const ScanOutputSchema = z.object({
+  projectId: z.string(),
+  lookbackHours: z.number(),
+  scannedAt: z.string(),
+  totalExecutions: z.number(),
+  pipelines: z.array(PipelineHealthSchema),
+});
+// Statuses the executions API uses. Anything in FAILED_STATUSES counts
+// against the pipeline; SUCCEEDED is the clean path; the rest are in-flight.
+const FAILED_STATUSES = new Set(['failed', 'cancelled', 'blocked', 'insufficient_context']);
+const SUCCEEDED_STATUSES = new Set(['completed']);
+const RUNNING_STATUSES = new Set(['running', 'queued', 'starting', 'uploading']);
+function getAccountApiUrl() {
+  const raw = process.env.ZIBBY_ACCOUNT_API_URL;
+  if (raw) return raw.replace(/\/$/, '');
+  const env = process.env.ZIBBY_ENV || 'prod';
+  if (env === 'local') return 'http://localhost:3001';
+  return process.env.ZIBBY_PROD_ACCOUNT_API_URL || 'https://api-prod.zibby.app';
+}
+function median(nums) {
+  const xs = nums.filter((n) => typeof n === 'number' && isFinite(n)).sort((a, b) => a - b);
+  if (xs.length === 0) return undefined;
+  const mid = Math.floor(xs.length / 2);
+  return xs.length % 2 ? xs[mid] : Math.round((xs[mid - 1] + xs[mid]) / 2);
+}
+// Best-effort duration extraction. Execution rows don't carry a uniform
+// durationMs; derive it from start/end timestamps when both exist.
+function durationMsOf(exec) {
+  if (typeof exec.durationMs === 'number') return exec.durationMs;
+  const start = exec.startedAt || exec.createdAt;
+  const end = exec.completedAt || exec.finishedAt || exec.updatedAt;
+  if (start && end) {
+    const d = new Date(end).getTime() - new Date(start).getTime();
+    if (isFinite(d) && d >= 0) return d;
+  }
+  return undefined;
+}
+// Best-effort "what step failed / why" — execution rows vary; surface
+// whatever's there without inventing anything.
+function failureDetail(exec) {
+  return {
+    failedStep: exec.failedStep || exec.currentStep || exec.lastNode || undefined,
+    errorSummary: (exec.error || exec.errorMessage || exec.failureReason || '')
+      .toString().slice(0, 280) || undefined,
+  };
+}
+export const scanPipelinesNode = {
+  name: 'scan_pipelines',
+  outputSchema: ScanOutputSchema,
+  // 2 min — a single paginated /executions pull is usually <2s; headroom
+  // for a large project's history + transient API slowness.
+  timeout: 2 * 60 * 1000,
+  execute: async (context) => {
+    const state = (context?.state && typeof context.state.getAll === 'function')
+      ? context.state.getAll()
+      : context;
+    const lookbackHours = Number(state?.lookbackHours) || 24;
+    const minFailRate = typeof state?.minFailRate === 'number' ? state.minFailRate : 0.4;
+    const maxPipelines = Number(state?.maxPipelines) || 25;
+    const filters = Array.isArray(state?.targetWorkflowTypes)
+      ? state.targetWorkflowTypes.map((s) => String(s).toLowerCase())
+      : null;
+    // Supervised project: explicit override, else the running project.
+    const projectId = process.env.SUPERVISOR_PROJECT_ID || process.env.PROJECT_ID;
+    const pat = process.env.ZIBBY_PAT || process.env.ZIBBY_USER_TOKEN;
+    if (!projectId) {
+      throw new Error(
+        'pipeline-supervisor: no project to supervise. PROJECT_ID is injected by the ' +
+        'executor; set SUPERVISOR_PROJECT_ID in the ENV tab to point at a different project.'
+      );
+    }
+    if (!pat) {
+      throw new Error(
+        'pipeline-supervisor: ZIBBY_PAT is not set. The supervisor reads OTHER pipelines\' ' +
+        'executions via the Zibby REST API, which requires a USER personal access token ' +
+        '(zby_pat_…). The Fargate-injected PROJECT_API_TOKEN is a project token and the ' +
+        '/executions route rejects it (no userId). Create a PAT in the dashboard and set it ' +
+        'as ZIBBY_PAT in Project Settings → ENV.'
+      );
+    }
+    const base = getAccountApiUrl();
+    // limit=200 is the API ceiling; one page covers lookback for any realistic
+    // project. We post-filter by lookbackHours below rather than relying on a
+    // server-side time filter the route doesn't expose.
+    const url = `${base}/executions?projectId=${encodeURIComponent(projectId)}&limit=200`;
+    console.log(`Scanning executions: ${url}`);
+    console.log(`Lookback: ${lookbackHours}h · minFailRate: ${minFailRate} · maxPipelines: ${maxPipelines}`);
+    const res = await fetch(url, {
+      method: 'GET',
+      headers: { Authorization: `Bearer ${pat}` },
+    });
+    if (!res.ok) {
+      const body = await res.text().catch(() => '');
+      if (res.status === 401 || res.status === 403) {
+        throw new Error(
+          `pipeline-supervisor: ${res.status} reading /executions. ZIBBY_PAT is invalid, ` +
+          `expired, or its owner can't access project ${projectId}. ${body.slice(0, 200)}`
+        );
+      }
+      throw new Error(`pipeline-supervisor: /executions returned ${res.status}: ${body.slice(0, 300)}`);
+    }
+    const payload = await res.json().catch(() => ({}));
+    const all = Array.isArray(payload?.executions) ? payload.executions : [];
+    // Window + name filter.
+    const cutoff = Date.now() - lookbackHours * 3600 * 1000;
+    const inWindow = all.filter((e) => {
+      const t = new Date(e.createdAt || e.startedAt || 0).getTime();
+      return isFinite(t) && t >= cutoff;
+    });
+    const considered = filters
+      ? inWindow.filter((e) => {
+          const wt = String(e.workflowType || '').toLowerCase();
+          return filters.some((f) => wt.includes(f));
+        })
+      : inWindow;
+    console.log(
+      `Fetched ${all.length} execution(s); ${inWindow.length} in the last ${lookbackHours}h` +
+      `${filters ? `, ${considered.length} after type filter` : ''}.`
+    );
+    // ── Group by pipeline (workflowType) ──────────────────────────────
+    const byPipeline = new Map();
+    for (const e of considered) {
+      const wt = e.workflowType || '(unknown)';
+      if (!byPipeline.has(wt)) byPipeline.set(wt, []);
+      byPipeline.get(wt).push(e);
+    }
+    let pipelines = [];
+    for (const [workflowType, runs] of byPipeline.entries()) {
+      const failed = runs.filter((r) => FAILED_STATUSES.has(r.status));
+      const succeeded = runs.filter((r) => SUCCEEDED_STATUSES.has(r.status));
+      const running = runs.filter((r) => RUNNING_STATUSES.has(r.status));
+      // Fail rate over TERMINAL runs only — in-flight runs aren't a verdict yet.
+      const terminal = failed.length + succeeded.length;
+      const failRate = terminal > 0 ? failed.length / terminal : 0;
+      const durations = succeeded.map(durationMsOf).filter((d) => typeof d === 'number');
+      const medianDurationMs = median(durations);
+      // Worst run = a failure if any, else the slowest succeeded run.
+      let worstRun;
+      const worstFail = failed
+        .slice()
+        .sort((a, b) => new Date(b.createdAt || 0) - new Date(a.createdAt || 0))[0];
+      if (worstFail) {
+        const det = failureDetail(worstFail);
+        worstRun = {
+          executionId: worstFail.executionId,
+          status: worstFail.status,
+          durationMs: durationMsOf(worstFail),
+          startedAt: worstFail.createdAt || worstFail.startedAt,
+          ...det,
+        };
+      } else {
+        const slow = succeeded
+          .slice()
+          .sort((a, b) => (durationMsOf(b) || 0) - (durationMsOf(a) || 0))[0];
+        if (slow) {
+          worstRun = {
+            executionId: slow.executionId,
+            status: slow.status,
+            durationMs: durationMsOf(slow),
+            startedAt: slow.createdAt || slow.startedAt,
+          };
+        }
+      }
+      // Flag: enough terminal runs AND failRate over threshold. The >= 3
+      // guard keeps a single failed run from flagging a pipeline that's
+      // otherwise fine.
+      let flagged = false;
+      let flagReason;
+      if (terminal >= 3 && failRate >= minFailRate) {
+        flagged = true;
+        flagReason = `failRate ${(failRate * 100).toFixed(0)}% over ${terminal} terminal run(s) (≥ ${(minFailRate * 100).toFixed(0)}% threshold)`;
+      }
+      pipelines.push({
+        workflowType,
+        total: runs.length,
+        failed: failed.length,
+        succeeded: succeeded.length,
+        running: running.length,
+        failRate: Number(failRate.toFixed(3)),
+        medianDurationMs,
+        worstRun,
+        flagged,
+        flagReason,
+      });
+    }
+    // ── Slow-outlier flag (cross-pipeline) ────────────────────────────
+    // A pipeline whose median run is > 3× the median-of-medians is "slow",
+    // even if it's not failing. Only meaningful with a few pipelines that
+    // actually have durations.
+    const meds = pipelines.map((p) => p.medianDurationMs).filter((d) => typeof d === 'number');
+    const globalMed = median(meds);
+    if (globalMed && globalMed > 0) {
+      for (const p of pipelines) {
+        if (!p.flagged && typeof p.medianDurationMs === 'number' && p.medianDurationMs > globalMed * 3) {
+          p.flagged = true;
+          p.flagReason = `median run ${(p.medianDurationMs / 1000).toFixed(0)}s is >3× the project median (${(globalMed / 1000).toFixed(0)}s) — slow outlier`;
+        }
+      }
+    }
+    // Flagged first, then worst failRate, then most runs. Cap to maxPipelines.
+    pipelines.sort((a, b) =>
+      (Number(b.flagged) - Number(a.flagged)) ||
+      (b.failRate - a.failRate) ||
+      (b.total - a.total)
+    );
+    pipelines = pipelines.slice(0, maxPipelines);
+    const flaggedCount = pipelines.filter((p) => p.flagged).length;
+    console.log(`Rolled up ${pipelines.length} pipeline(s); ${flaggedCount} flagged.`);
+    for (const p of pipelines.filter((x) => x.flagged)) {
+      console.log(`  ⚠ ${p.workflowType}: ${p.flagReason}`);
+    }
+    return {
+      projectId,
+      lookbackHours,
+      scannedAt: new Date().toISOString(),
+      totalExecutions: considered.length,
+      pipelines,
+    };
+  },
+};

package/pipeline-supervisor/package.json ADDED Viewed

@@ -0,0 +1,19 @@
+{
+  "name": "pipeline-supervisor",
+  "version": "1.0.0",
+  "private": true,
+  "type": "module",
+  "description": "Zibby managing Zibby — a scheduled supervisor that scans the project's other pipelines, finds the failing/slow ones, and posts human-reviewable improvement proposals to Slack or Lark. Read + propose + notify only (v1 never edits other workflows).",
+  "main": "graph.mjs",
+  "scripts": {
+    "test": "vitest run"
+  },
+  "dependencies": {
+    "@zibby/core": "^0.5.1",
+    "@zibby/skills": "^0.1.26",
+    "zod": "^3.23.0"
+  },
+  "devDependencies": {
+    "vitest": "^2.1.5"
+  }
+}