@integrity-labs/agt-cli 0.27.150-test.15 → 0.27.150-test.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21097,6 +21097,22 @@ var AdminDebugClient = class _AdminDebugClient {
21097
21097
  listAlerts(args) {
21098
21098
  return this.get("/admin/debug/alerts", _AdminDebugClient.cleanQuery(args));
21099
21099
  }
21100
+ // ─────────────────── request-scoped customer grant (ENG-6279) ───────────────────
21101
+ /**
21102
+ * Open scoped READ access to a SELF-MANAGED customer org so the read tools can
21103
+ * reach it (ENG-6279). POSTs to /admin/debug/grants, which mints the short-TTL,
21104
+ * logged, customer-revocable read grant; the read tools then return that org's
21105
+ * data for the grant's lifetime. Reuses `post()` — same token + 401-re-exchange
21106
+ * flow as the reads. For an already-standing org (is_internal / fully_managed)
21107
+ * the API returns `{ already_authorized: true }` instead of minting a redundant
21108
+ * grant — not an error.
21109
+ */
21110
+ requestAccess(args) {
21111
+ return this.post("/admin/debug/grants", {
21112
+ target_org_id: args.target_org_id,
21113
+ reason: args.reason
21114
+ });
21115
+ }
21100
21116
  // ─────────────────────── remedial actions (ENG-6197) ───────────────────────
21101
21117
  async post(path, body, retried = false) {
21102
21118
  const token = await this.getValidToken();
@@ -21167,6 +21183,12 @@ var listAlertsSchema = external_exports.object({
21167
21183
  open: external_exports.boolean().optional().describe("When true, only currently-open (unclosed) alerts."),
21168
21184
  limit: limitSchema
21169
21185
  });
21186
+ var requestAccessSchema = external_exports.object({
21187
+ target_org_id: external_exports.string().min(1).max(64).describe("UUID of the customer organization to open scoped read access to."),
21188
+ reason: external_exports.string().min(1).max(2e3).describe(
21189
+ "Why access is needed \u2014 recorded on the customer-visible access log and shown to the org when they review or revoke it. Be specific."
21190
+ )
21191
+ });
21170
21192
  var requestActionSchema = external_exports.object({
21171
21193
  action: external_exports.enum(["restart", "clear_pending_inbound"]).describe(
21172
21194
  "The remedial action. restart = bounce the agent (reversible; backlog replays). clear_pending_inbound = move stuck pending-inbound markers aside (drops queued inbound messages; they are NOT reprocessed)."
@@ -21181,7 +21203,7 @@ var checkActionStatusSchema = external_exports.object({
21181
21203
  // package.json
21182
21204
  var package_default = {
21183
21205
  name: "@integrity-labs/augmented-admin-mcp",
21184
- version: "0.1.0",
21206
+ version: "0.1.1",
21185
21207
  description: "Augmented Admin Debug \u2014 Integrity Labs staff-only MCP server for cross-org agent/host/integration/alert diagnostics. Thin client over the Augmented API's /admin/debug/* surface; authority + the diagnostic projection live server-side. ENG-6195.",
21186
21208
  type: "module",
21187
21209
  private: true,
@@ -21249,7 +21271,7 @@ function formatError2(err) {
21249
21271
  var server = new McpServer({ name: "augmented-admin-mcp", version: package_default.version });
21250
21272
  server.tool(
21251
21273
  "debug_search_agents",
21252
- "Search managed end-user agents across all orgs for troubleshooting. Returns a diagnostic projection per agent (code_name, status, environment, risk_tier, org, heartbeat verdict) \u2014 never credentials or transcripts. Filter with { q, status, environment, limit }.",
21274
+ "Search managed end-user agents across authorized orgs for troubleshooting. Returns a diagnostic projection per agent (code_name, status, environment, risk_tier, org, heartbeat verdict) \u2014 never credentials or transcripts. Authorized = IL-owned + fully-managed orgs (standing) plus any self-managed org you hold an active grant for (see debug_request_access). Filter with { q, status, environment, limit }.",
21253
21275
  searchAgentsSchema.shape,
21254
21276
  async (args) => {
21255
21277
  try {
@@ -21275,7 +21297,7 @@ server.tool(
21275
21297
  );
21276
21298
  server.tool(
21277
21299
  "debug_search_hosts",
21278
- "Search managed hosts across all orgs. Returns a projection per host (name, org, status, framework version, last-seen verdict, EC2 + Claude auth status) \u2014 no api-key fingerprints or secrets. Filter with { q, status, limit }.",
21300
+ "Search managed hosts across authorized orgs. Returns a projection per host (name, org, status, framework version, last-seen verdict, EC2 + Claude auth status) \u2014 no api-key fingerprints or secrets. Filter with { q, status, limit }.",
21279
21301
  searchHostsSchema.shape,
21280
21302
  async (args) => {
21281
21303
  try {
@@ -21288,7 +21310,7 @@ server.tool(
21288
21310
  );
21289
21311
  server.tool(
21290
21312
  "debug_list_alerts",
21291
- "List recent platform alerts across all orgs (host-down, agent-stale, probe-timeout, auth-failed), including NULL-team host alerts. Returns a projection per alert (kind, severity, message, source, open/closed state). Filter with { severity, open, limit }.",
21313
+ "List recent platform alerts across authorized orgs (host-down, agent-stale, probe-timeout, auth-failed), including NULL-team host alerts. Returns a projection per alert (kind, severity, message, source, open/closed state). Filter with { severity, open, limit }.",
21292
21314
  listAlertsSchema.shape,
21293
21315
  async (args) => {
21294
21316
  try {
@@ -21299,6 +21321,22 @@ server.tool(
21299
21321
  }
21300
21322
  }
21301
21323
  );
21324
+ server.tool(
21325
+ "debug_request_access",
21326
+ "Open scoped READ access to a SELF-MANAGED customer org so the read tools (debug_search_agents / debug_get_agent / debug_search_hosts / debug_list_alerts) can reach it. Use this when a read returns nothing for an org you have a legitimate support reason to troubleshoot. It mints a short-TTL, logged, customer-revocable read grant \u2014 the customer sees the access in their Support-access console and can revoke it, and it auto-expires; it does NOT grant any write/remedial capability. For an already-authorized org (Integrity-Labs-internal or fully-managed) this returns { already_authorized: true } instead of minting a redundant grant \u2014 just use the read tools. Pass { target_org_id, reason }; the reason is recorded verbatim on the customer-visible access log, so be specific.",
21327
+ requestAccessSchema.shape,
21328
+ async (args) => {
21329
+ try {
21330
+ const result = await client.requestAccess({
21331
+ target_org_id: args.target_org_id,
21332
+ reason: args.reason
21333
+ });
21334
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
21335
+ } catch (err) {
21336
+ return { content: [{ type: "text", text: formatError2(err) }], isError: true };
21337
+ }
21338
+ }
21339
+ );
21302
21340
  server.tool(
21303
21341
  "request_action",
21304
21342
  `Request a HITL-gated remedial action on a customer agent: "restart" (bounce the agent \u2014 reversible, backlog replays) or "clear_pending_inbound" (move stuck pending-inbound markers aside \u2014 drops queued inbound messages, NOT reprocessed). A HUMAN must approve in Slack before anything happens, and only if writes are armed for this stage; in shadow mode the approval runs but nothing executes. Returns { request_id, status, write_mode, notification_status }. Surface a brief, jargon-free note to the user (e.g. "I've asked an admin to approve restarting that agent"). Pass { action, target_agent_id, reason }.`,
@@ -25,9 +25,8 @@ import {
25
25
  takeAcpxExecFailureCount,
26
26
  takeZombieDetection,
27
27
  writePersistentClaudeWrapper
28
- } from "./chunk-7GKJZBTB.js";
29
- import "./chunk-WOOYOAPG.js";
30
- import "./chunk-354FAVQR.js";
28
+ } from "./chunk-HXMLMIR4.js";
29
+ import "./chunk-FZTGR2AQ.js";
31
30
  import "./chunk-XWVM4KPK.js";
32
31
  export {
33
32
  SEND_KEYS_ENTER_DELAY_MS,
@@ -57,4 +56,4 @@ export {
57
56
  takeZombieDetection,
58
57
  writePersistentClaudeWrapper
59
58
  };
60
- //# sourceMappingURL=persistent-session-35PWSTLO.js.map
59
+ //# sourceMappingURL=persistent-session-IKQLTZZ6.js.map
@@ -1,8 +1,7 @@
1
1
  import {
2
2
  paneLogPath
3
- } from "./chunk-7GKJZBTB.js";
4
- import "./chunk-WOOYOAPG.js";
5
- import "./chunk-354FAVQR.js";
3
+ } from "./chunk-HXMLMIR4.js";
4
+ import "./chunk-FZTGR2AQ.js";
6
5
  import "./chunk-XWVM4KPK.js";
7
6
 
8
7
  // src/lib/responsiveness-probe.ts
@@ -155,4 +154,4 @@ export {
155
154
  livePendingInboundOldestAgeSeconds,
156
155
  oldestLivePendingInboundMtimeMs
157
156
  };
158
- //# sourceMappingURL=responsiveness-probe-MA4M2QM4.js.map
157
+ //# sourceMappingURL=responsiveness-probe-2K4QHOWW.js.map
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/lib/responsiveness-probe.ts"],"sourcesContent":["/**\n * ENG-5399 — Tier 1 responsiveness probe (manager-side).\n *\n * Cheap, fast-cadence canary that catches \"agent went silent\" inside\n * minutes, well before the existing synthetic-probe cron's ~35 min\n * staleness window (`SyntheticReplyAgeSeconds`, ENG-5122).\n *\n * Mechanism: for each managed agent, read the mtime of the agent's\n * `pane.log` and report `now - mtime` as `PaneActivityAgeSeconds` via\n * a new `/host/responsiveness-probe` endpoint. `pane.log` is the\n * tmux pipe-pane sink set up by `setupPaneLog()` — any visible\n * activity (assistant turns, tool calls, in-place progress\n * heartbeats) bumps its mtime. A silent agent has a steadily\n * climbing age that lands in CloudWatch and trips a per-agent alarm.\n *\n * ENG-6017 adds a second per-agent signal on the same cadence:\n * `pending_inbound_oldest_age_seconds` — the age of the oldest marker\n * file across the agent's `*-pending-inbound/` directories (written by\n * the channel MCP servers for inbounds awaiting delivery). This is the\n * one artifact of the \"message typed but never submitted\" failure mode\n * that every other canary is blind to: in the koda incident\n * (2026-06-04) an operator Slack DM sat undelivered for 40+ minutes\n * while pane-activity stayed fresh (health checks), synthetic probes\n * were answered by the one-shot fallback, and heartbeat/session-alive\n * only reflect manager health. The field is OMITTED (not zero) when the\n * agent has no pending-inbound markers — the API treats absent as\n * \"no signal\", never as \"healthy\" (absent-vs-zero matters for\n * mixed-version fleets where old CLIs don't report it at all).\n *\n * Run from `pollCycle()` in `manager-worker.ts` on a configurable\n * interval (default 5 min via `AUGMENTED_RESPONSIVENESS_INTERVAL_MS`).\n */\n\nimport { mkdirSync, readdirSync, readFileSync, renameSync, statSync } from 'node:fs';\nimport { dirname, join } from 'node:path';\nimport { paneLogPath } from './persistent-session.js';\n\nexport interface ResponsivenessProbeResult {\n code_name: string;\n pane_activity_age_seconds: number;\n /**\n * ENG-6017: age (s) of the oldest marker file across the agent's\n * `*-pending-inbound/` directories. Omitted when no markers exist —\n * absent means \"no signal\", NOT \"zero / healthy\".\n */\n pending_inbound_oldest_age_seconds?: number;\n}\n\nconst DEFAULT_INTERVAL_MS = 5 * 60 * 1000;\n\nexport function getResponsivenessIntervalMs(): number {\n const raw = process.env.AUGMENTED_RESPONSIVENESS_INTERVAL_MS;\n if (!raw) return DEFAULT_INTERVAL_MS;\n const parsed = Number.parseInt(raw, 10);\n return Number.isFinite(parsed) && parsed > 0 ? parsed : DEFAULT_INTERVAL_MS;\n}\n\n/**\n * ENG-6017: oldest pending-inbound marker mtime (ms epoch) for an agent,\n * or null when the agent has no markers / no pending-inbound dirs.\n *\n * The channel MCP servers (slack-channel, telegram-channel, …) write one\n * marker file per inbound into `~/.augmented/<codeName>/<channel>-pending-\n * inbound/` and clear it when the agent acknowledges the message. The\n * directory layout is the contract here — read-only, no IPC with the MCP\n * (the MCP and CLI release independently; file mtimes need no protocol).\n *\n * ENG-6072: only plain, non-hidden files count as markers. The msteams MCP\n * keeps `.markers/` and `.processed/` housekeeping SUBDIRECTORIES inside its\n * pending-inbound dir; their mtimes never advance, so statting every dirent\n * made the gauge climb forever and fired pending-inbound-stale on agents with\n * zero stranded messages (kylie ~3.4d / scout ~34h false ALARMs the moment\n * ENG-6023 activated the alarm). Dot-entries are skipped wholesale — the\n * hidden namespace is reserved for MCP bookkeeping, never for markers.\n */\nfunction oldestPendingInboundMtimeMs(agentHomeDir: string): number | null {\n let oldest: number | null = null;\n let entries;\n try {\n entries = readdirSync(agentHomeDir, { withFileTypes: true });\n } catch {\n return null; // agent home missing — nothing to report\n }\n for (const entry of entries) {\n if (!entry.isDirectory() || !entry.name.endsWith('-pending-inbound')) continue;\n const dir = join(agentHomeDir, entry.name);\n let files;\n try {\n files = readdirSync(dir, { withFileTypes: true });\n } catch {\n continue;\n }\n for (const file of files) {\n if (!file.isFile() || file.name.startsWith('.')) continue;\n try {\n const mtimeMs = statSync(join(dir, file.name)).mtimeMs;\n if (oldest === null || mtimeMs < oldest) oldest = mtimeMs;\n } catch {\n // Marker drained between readdir and stat — that's the happy path.\n }\n }\n }\n return oldest;\n}\n\n/**\n * ENG-6160: classify a marker file for the LIVE-inbound scan.\n * - `true` → flagged `\"undeliverable\": true` (dead-letter, exclude).\n * - `null` → vanished mid-scan (ENOENT) — drained between stat and read, the\n * happy path; exclude it rather than count an already-gone file.\n * - `false` → still present but malformed / unreadable for another reason —\n * treated as LIVE so a corrupt marker can never mask a real wedge.\n */\nfunction isUndeliverableMarker(markerPath: string): boolean | null {\n try {\n const parsed = JSON.parse(readFileSync(markerPath, 'utf8')) as { undeliverable?: unknown };\n return parsed?.undeliverable === true;\n } catch (error) {\n return (error as NodeJS.ErrnoException).code === 'ENOENT' ? null : false;\n }\n}\n\n/**\n * ENG-6160: oldest *LIVE* pending-inbound marker mtime (ms epoch) for an agent,\n * or null when there is no live marker. \"Live\" excludes:\n *\n * - markers older than `sessionStartMs` — a marker written before the current\n * session started is a leftover from a PREVIOUS session and cannot mean\n * *this* session is failing to drain. This is the load-bearing exclusion:\n * without it, an orphan marker survives a fresh respawn and the wedge\n * detector re-fires forever on a healthy idle agent (the sherlock enforce\n * loop, 2026-06-08: `inboundAge=3389s` on a `● Ready.` session).\n * - markers flagged `undeliverable: true` — already dead-lettered by the channel.\n *\n * Distinct from `oldestPendingInboundMtimeMs` (which counts ALL markers and\n * feeds the ENG-6017 `pending-inbound-stale` CloudWatch alarm — that alarm\n * *wants* to fire on a stuck inbound, so its semantics must NOT change). This\n * variant is wedge-detection-only.\n */\nexport function oldestLivePendingInboundMtimeMs(\n agentHomeDir: string,\n opts: { sessionStartMs?: number | null } = {},\n): number | null {\n const sessionStartMs = opts.sessionStartMs ?? null;\n let oldest: number | null = null;\n let entries;\n try {\n entries = readdirSync(agentHomeDir, { withFileTypes: true });\n } catch {\n return null;\n }\n for (const entry of entries) {\n if (!entry.isDirectory() || !entry.name.endsWith('-pending-inbound')) continue;\n const dir = join(agentHomeDir, entry.name);\n let files;\n try {\n files = readdirSync(dir, { withFileTypes: true });\n } catch {\n continue;\n }\n for (const file of files) {\n if (!file.isFile() || file.name.startsWith('.')) continue;\n const full = join(dir, file.name);\n let mtimeMs: number;\n try {\n mtimeMs = statSync(full).mtimeMs;\n } catch {\n continue; // drained between readdir and stat — happy path\n }\n if (sessionStartMs !== null && mtimeMs < sessionStartMs) continue; // pre-session leftover\n const undeliverable = isUndeliverableMarker(full);\n if (undeliverable === null) continue; // vanished between stat and read — drained, exclude\n if (undeliverable) continue; // already dead-lettered\n if (oldest === null || mtimeMs < oldest) oldest = mtimeMs;\n }\n }\n return oldest;\n}\n\n/**\n * ENG-6160: age (s) of the oldest LIVE pending-inbound marker for an agent, or\n * null when none. The wedge detector uses this instead of the alarm-facing\n * `pending_inbound_oldest_age_seconds` so a stale/dead-letter marker can't\n * false-fire a respawn.\n */\nexport function livePendingInboundOldestAgeSeconds(\n codeName: string,\n sessionStartMs: number | null,\n now: Date = new Date(),\n): number | null {\n const oldest = oldestLivePendingInboundMtimeMs(dirname(paneLogPath(codeName)), { sessionStartMs });\n if (oldest === null) return null;\n return Math.max(0, Math.floor((now.getTime() - oldest) / 1000));\n}\n\n/**\n * ENG-6160: move every pending-inbound marker for an agent aside into a sibling\n * `<channel>-pending-inbound-stale/` directory (NOT silently deleted — the\n * payload pointer is preserved for forensics), returning the count moved.\n *\n * Called on a force-fresh wedge respawn: the markers belonged to the wedged\n * session that is being torn down; the fresh session cannot meaningfully\n * process a stale, out-of-context message, and leaving them on disk both keeps\n * the ENG-6017 alarm lit and (pre-ENG-6160) re-fed the wedge loop. The stale\n * dir does not end in `-pending-inbound`, so neither the probe nor this scan\n * re-counts moved markers.\n */\nexport function deadLetterPendingInbound(codeName: string, _now: Date = new Date()): number {\n const home = dirname(paneLogPath(codeName));\n let moved = 0;\n let entries;\n try {\n entries = readdirSync(home, { withFileTypes: true });\n } catch {\n return 0;\n }\n for (const entry of entries) {\n if (!entry.isDirectory() || !entry.name.endsWith('-pending-inbound')) continue;\n const dir = join(home, entry.name);\n const deadDir = join(home, `${entry.name}-stale`);\n let files;\n try {\n files = readdirSync(dir, { withFileTypes: true });\n } catch {\n continue;\n }\n for (const file of files) {\n if (!file.isFile() || file.name.startsWith('.')) continue;\n try {\n mkdirSync(deadDir, { recursive: true });\n renameSync(join(dir, file.name), join(deadDir, file.name));\n moved++;\n } catch {\n // best-effort — a marker that vanished or can't move is left as-is\n }\n }\n }\n return moved;\n}\n\n/**\n * Compute the pane.log age for each agent. Missing or unreadable\n * pane.log returns null — the caller should drop those entries\n * rather than fabricate a \"fresh\" or \"ancient\" value. A missing\n * file means the agent has never spawned in this manager generation,\n * which is a separate problem covered by SessionAliveAgeSeconds.\n */\nexport function collectResponsivenessProbes(\n codeNames: string[],\n now: Date = new Date(),\n): ResponsivenessProbeResult[] {\n const nowMs = now.getTime();\n const results: ResponsivenessProbeResult[] = [];\n for (const codeName of codeNames) {\n try {\n const panePath = paneLogPath(codeName);\n const mtimeMs = statSync(panePath).mtimeMs;\n const ageSeconds = Math.max(0, Math.floor((nowMs - mtimeMs) / 1000));\n const result: ResponsivenessProbeResult = {\n code_name: codeName,\n pane_activity_age_seconds: ageSeconds,\n };\n // ENG-6017: piggyback the pending-inbound drain-age scan on the same\n // cadence. Field omitted (not 0) when there are no markers.\n const oldestMarkerMs = oldestPendingInboundMtimeMs(dirname(panePath));\n if (oldestMarkerMs !== null) {\n result.pending_inbound_oldest_age_seconds = Math.max(\n 0,\n Math.floor((nowMs - oldestMarkerMs) / 1000),\n );\n }\n results.push(result);\n } catch {\n // No pane.log yet (fresh agent, never spawned) — skip. The\n // session-alive monitor already covers the \"should be running\n // but isn't\" case.\n }\n }\n return results;\n}\n"],"mappings":";;;;;;;;AAiCA,SAAS,WAAW,aAAa,cAAc,YAAY,gBAAgB;AAC3E,SAAS,SAAS,YAAY;AAc9B,IAAM,sBAAsB,IAAI,KAAK;AAE9B,SAAS,8BAAsC;AACpD,QAAM,MAAM,QAAQ,IAAI;AACxB,MAAI,CAAC,IAAK,QAAO;AACjB,QAAM,SAAS,OAAO,SAAS,KAAK,EAAE;AACtC,SAAO,OAAO,SAAS,MAAM,KAAK,SAAS,IAAI,SAAS;AAC1D;AAoBA,SAAS,4BAA4B,cAAqC;AACxE,MAAI,SAAwB;AAC5B,MAAI;AACJ,MAAI;AACF,cAAU,YAAY,cAAc,EAAE,eAAe,KAAK,CAAC;AAAA,EAC7D,QAAQ;AACN,WAAO;AAAA,EACT;AACA,aAAW,SAAS,SAAS;AAC3B,QAAI,CAAC,MAAM,YAAY,KAAK,CAAC,MAAM,KAAK,SAAS,kBAAkB,EAAG;AACtE,UAAM,MAAM,KAAK,cAAc,MAAM,IAAI;AACzC,QAAI;AACJ,QAAI;AACF,cAAQ,YAAY,KAAK,EAAE,eAAe,KAAK,CAAC;AAAA,IAClD,QAAQ;AACN;AAAA,IACF;AACA,eAAW,QAAQ,OAAO;AACxB,UAAI,CAAC,KAAK,OAAO,KAAK,KAAK,KAAK,WAAW,GAAG,EAAG;AACjD,UAAI;AACF,cAAM,UAAU,SAAS,KAAK,KAAK,KAAK,IAAI,CAAC,EAAE;AAC/C,YAAI,WAAW,QAAQ,UAAU,OAAQ,UAAS;AAAA,MACpD,QAAQ;AAAA,MAER;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AAUA,SAAS,sBAAsB,YAAoC;AACjE,MAAI;AACF,UAAM,SAAS,KAAK,MAAM,aAAa,YAAY,MAAM,CAAC;AAC1D,WAAO,QAAQ,kBAAkB;AAAA,EACnC,SAAS,OAAO;AACd,WAAQ,MAAgC,SAAS,WAAW,OAAO;AAAA,EACrE;AACF;AAmBO,SAAS,gCACd,cACA,OAA2C,CAAC,GAC7B;AACf,QAAM,iBAAiB,KAAK,kBAAkB;AAC9C,MAAI,SAAwB;AAC5B,MAAI;AACJ,MAAI;AACF,cAAU,YAAY,cAAc,EAAE,eAAe,KAAK,CAAC;AAAA,EAC7D,QAAQ;AACN,WAAO;AAAA,EACT;AACA,aAAW,SAAS,SAAS;AAC3B,QAAI,CAAC,MAAM,YAAY,KAAK,CAAC,MAAM,KAAK,SAAS,kBAAkB,EAAG;AACtE,UAAM,MAAM,KAAK,cAAc,MAAM,IAAI;AACzC,QAAI;AACJ,QAAI;AACF,cAAQ,YAAY,KAAK,EAAE,eAAe,KAAK,CAAC;AAAA,IAClD,QAAQ;AACN;AAAA,IACF;AACA,eAAW,QAAQ,OAAO;AACxB,UAAI,CAAC,KAAK,OAAO,KAAK,KAAK,KAAK,WAAW,GAAG,EAAG;AACjD,YAAM,OAAO,KAAK,KAAK,KAAK,IAAI;AAChC,UAAI;AACJ,UAAI;AACF,kBAAU,SAAS,IAAI,EAAE;AAAA,MAC3B,QAAQ;AACN;AAAA,MACF;AACA,UAAI,mBAAmB,QAAQ,UAAU,eAAgB;AACzD,YAAM,gBAAgB,sBAAsB,IAAI;AAChD,UAAI,kBAAkB,KAAM;AAC5B,UAAI,cAAe;AACnB,UAAI,WAAW,QAAQ,UAAU,OAAQ,UAAS;AAAA,IACpD;AAAA,EACF;AACA,SAAO;AACT;AAQO,SAAS,mCACd,UACA,gBACA,MAAY,oBAAI,KAAK,GACN;AACf,QAAM,SAAS,gCAAgC,QAAQ,YAAY,QAAQ,CAAC,GAAG,EAAE,eAAe,CAAC;AACjG,MAAI,WAAW,KAAM,QAAO;AAC5B,SAAO,KAAK,IAAI,GAAG,KAAK,OAAO,IAAI,QAAQ,IAAI,UAAU,GAAI,CAAC;AAChE;AAcO,SAAS,yBAAyB,UAAkB,OAAa,oBAAI,KAAK,GAAW;AAC1F,QAAM,OAAO,QAAQ,YAAY,QAAQ,CAAC;AAC1C,MAAI,QAAQ;AACZ,MAAI;AACJ,MAAI;AACF,cAAU,YAAY,MAAM,EAAE,eAAe,KAAK,CAAC;AAAA,EACrD,QAAQ;AACN,WAAO;AAAA,EACT;AACA,aAAW,SAAS,SAAS;AAC3B,QAAI,CAAC,MAAM,YAAY,KAAK,CAAC,MAAM,KAAK,SAAS,kBAAkB,EAAG;AACtE,UAAM,MAAM,KAAK,MAAM,MAAM,IAAI;AACjC,UAAM,UAAU,KAAK,MAAM,GAAG,MAAM,IAAI,QAAQ;AAChD,QAAI;AACJ,QAAI;AACF,cAAQ,YAAY,KAAK,EAAE,eAAe,KAAK,CAAC;AAAA,IAClD,QAAQ;AACN;AAAA,IACF;AACA,eAAW,QAAQ,OAAO;AACxB,UAAI,CAAC,KAAK,OAAO,KAAK,KAAK,KAAK,WAAW,GAAG,EAAG;AACjD,UAAI;AACF,kBAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AACtC,mBAAW,KAAK,KAAK,KAAK,IAAI,GAAG,KAAK,SAAS,KAAK,IAAI,CAAC;AACzD;AAAA,MACF,QAAQ;AAAA,MAER;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AASO,SAAS,4BACd,WACA,MAAY,oBAAI,KAAK,GACQ;AAC7B,QAAM,QAAQ,IAAI,QAAQ;AAC1B,QAAM,UAAuC,CAAC;AAC9C,aAAW,YAAY,WAAW;AAChC,QAAI;AACF,YAAM,WAAW,YAAY,QAAQ;AACrC,YAAM,UAAU,SAAS,QAAQ,EAAE;AACnC,YAAM,aAAa,KAAK,IAAI,GAAG,KAAK,OAAO,QAAQ,WAAW,GAAI,CAAC;AACnE,YAAM,SAAoC;AAAA,QACxC,WAAW;AAAA,QACX,2BAA2B;AAAA,MAC7B;AAGA,YAAM,iBAAiB,4BAA4B,QAAQ,QAAQ,CAAC;AACpE,UAAI,mBAAmB,MAAM;AAC3B,eAAO,qCAAqC,KAAK;AAAA,UAC/C;AAAA,UACA,KAAK,OAAO,QAAQ,kBAAkB,GAAI;AAAA,QAC5C;AAAA,MACF;AACA,cAAQ,KAAK,MAAM;AAAA,IACrB,QAAQ;AAAA,IAIR;AAAA,EACF;AACA,SAAO;AACT;","names":[]}
1
+ {"version":3,"sources":["../src/lib/responsiveness-probe.ts"],"sourcesContent":["/**\n * ENG-5399 — Tier 1 responsiveness probe (manager-side).\n *\n * Cheap, fast-cadence canary that catches \"agent went silent\" inside\n * minutes, well before the existing synthetic-probe cron's ~35 min\n * staleness window (`SyntheticReplyAgeSeconds`, ENG-5122).\n *\n * Mechanism: for each managed agent, read the mtime of the agent's\n * `pane.log` and report `now - mtime` as `PaneActivityAgeSeconds` via\n * a new `/host/responsiveness-probe` endpoint. `pane.log` is the\n * tmux pipe-pane sink set up by `setupPaneLog()` — any visible\n * activity (assistant turns, tool calls, in-place progress\n * heartbeats) bumps its mtime. A silent agent has a steadily\n * climbing age that lands in CloudWatch and trips a per-agent alarm.\n *\n * ENG-6017 adds a second per-agent signal on the same cadence:\n * `pending_inbound_oldest_age_seconds` — the age of the oldest marker\n * file across the agent's `*-pending-inbound/` directories (written by\n * the channel MCP servers for inbounds awaiting delivery). This is the\n * one artifact of the \"message typed but never submitted\" failure mode\n * that every other canary is blind to: in the koda incident\n * (2026-06-04) an operator Slack DM sat undelivered for 40+ minutes\n * while pane-activity stayed fresh (health checks), synthetic probes\n * were answered by the one-shot fallback, and heartbeat/session-alive\n * only reflect manager health. The field is OMITTED (not zero) when the\n * agent has no pending-inbound markers — the API treats absent as\n * \"no signal\", never as \"healthy\" (absent-vs-zero matters for\n * mixed-version fleets where old CLIs don't report it at all).\n *\n * Run from `pollCycle()` in `manager-worker.ts` on a configurable\n * interval (default 5 min via `AUGMENTED_RESPONSIVENESS_INTERVAL_MS`).\n */\n\nimport { mkdirSync, readdirSync, readFileSync, renameSync, statSync } from 'node:fs';\nimport { dirname, join } from 'node:path';\nimport { paneLogPath } from './persistent-session.js';\n\nexport interface ResponsivenessProbeResult {\n code_name: string;\n pane_activity_age_seconds: number;\n /**\n * ENG-6017: age (s) of the oldest marker file across the agent's\n * `*-pending-inbound/` directories. Omitted when no markers exist —\n * absent means \"no signal\", NOT \"zero / healthy\".\n */\n pending_inbound_oldest_age_seconds?: number;\n}\n\nconst DEFAULT_INTERVAL_MS = 5 * 60 * 1000;\n\nexport function getResponsivenessIntervalMs(): number {\n const raw = process.env.AUGMENTED_RESPONSIVENESS_INTERVAL_MS;\n if (!raw) return DEFAULT_INTERVAL_MS;\n const parsed = Number.parseInt(raw, 10);\n return Number.isFinite(parsed) && parsed > 0 ? parsed : DEFAULT_INTERVAL_MS;\n}\n\n/**\n * ENG-6017: oldest pending-inbound marker mtime (ms epoch) for an agent,\n * or null when the agent has no markers / no pending-inbound dirs.\n *\n * The channel MCP servers (slack-channel, telegram-channel, …) write one\n * marker file per inbound into `~/.augmented/<codeName>/<channel>-pending-\n * inbound/` and clear it when the agent acknowledges the message. The\n * directory layout is the contract here — read-only, no IPC with the MCP\n * (the MCP and CLI release independently; file mtimes need no protocol).\n *\n * ENG-6072: only plain, non-hidden files count as markers. The msteams MCP\n * keeps `.markers/` and `.processed/` housekeeping SUBDIRECTORIES inside its\n * pending-inbound dir; their mtimes never advance, so statting every dirent\n * made the gauge climb forever and fired pending-inbound-stale on agents with\n * zero stranded messages (kylie ~3.4d / scout ~34h false ALARMs the moment\n * ENG-6023 activated the alarm). Dot-entries are skipped wholesale — the\n * hidden namespace is reserved for MCP bookkeeping, never for markers.\n */\nfunction oldestPendingInboundMtimeMs(agentHomeDir: string): number | null {\n let oldest: number | null = null;\n let entries;\n try {\n entries = readdirSync(agentHomeDir, { withFileTypes: true });\n } catch {\n return null; // agent home missing — nothing to report\n }\n for (const entry of entries) {\n if (!entry.isDirectory() || !entry.name.endsWith('-pending-inbound')) continue;\n const dir = join(agentHomeDir, entry.name);\n let files;\n try {\n files = readdirSync(dir, { withFileTypes: true });\n } catch {\n continue;\n }\n for (const file of files) {\n if (!file.isFile() || file.name.startsWith('.')) continue;\n try {\n const mtimeMs = statSync(join(dir, file.name)).mtimeMs;\n if (oldest === null || mtimeMs < oldest) oldest = mtimeMs;\n } catch {\n // Marker drained between readdir and stat — that's the happy path.\n }\n }\n }\n return oldest;\n}\n\n/**\n * ENG-6160: classify a marker file for the LIVE-inbound scan.\n * - `true` → flagged `\"undeliverable\": true` (dead-letter, exclude).\n * - `null` → vanished mid-scan (ENOENT) — drained between stat and read, the\n * happy path; exclude it rather than count an already-gone file.\n * - `false` → still present but malformed / unreadable for another reason —\n * treated as LIVE so a corrupt marker can never mask a real wedge.\n */\nfunction isUndeliverableMarker(markerPath: string): boolean | null {\n try {\n const parsed = JSON.parse(readFileSync(markerPath, 'utf8')) as { undeliverable?: unknown };\n return parsed?.undeliverable === true;\n } catch (error) {\n return (error as NodeJS.ErrnoException).code === 'ENOENT' ? null : false;\n }\n}\n\n/**\n * ENG-6160: oldest *LIVE* pending-inbound marker mtime (ms epoch) for an agent,\n * or null when there is no live marker. \"Live\" excludes:\n *\n * - markers older than `sessionStartMs` — a marker written before the current\n * session started is a leftover from a PREVIOUS session and cannot mean\n * *this* session is failing to drain. This is the load-bearing exclusion:\n * without it, an orphan marker survives a fresh respawn and the wedge\n * detector re-fires forever on a healthy idle agent (the sherlock enforce\n * loop, 2026-06-08: `inboundAge=3389s` on a `● Ready.` session).\n * - markers flagged `undeliverable: true` — already dead-lettered by the channel.\n *\n * Distinct from `oldestPendingInboundMtimeMs` (which counts ALL markers and\n * feeds the ENG-6017 `pending-inbound-stale` CloudWatch alarm — that alarm\n * *wants* to fire on a stuck inbound, so its semantics must NOT change). This\n * variant is wedge-detection-only.\n */\nexport function oldestLivePendingInboundMtimeMs(\n agentHomeDir: string,\n opts: { sessionStartMs?: number | null } = {},\n): number | null {\n const sessionStartMs = opts.sessionStartMs ?? null;\n let oldest: number | null = null;\n let entries;\n try {\n entries = readdirSync(agentHomeDir, { withFileTypes: true });\n } catch {\n return null;\n }\n for (const entry of entries) {\n if (!entry.isDirectory() || !entry.name.endsWith('-pending-inbound')) continue;\n const dir = join(agentHomeDir, entry.name);\n let files;\n try {\n files = readdirSync(dir, { withFileTypes: true });\n } catch {\n continue;\n }\n for (const file of files) {\n if (!file.isFile() || file.name.startsWith('.')) continue;\n const full = join(dir, file.name);\n let mtimeMs: number;\n try {\n mtimeMs = statSync(full).mtimeMs;\n } catch {\n continue; // drained between readdir and stat — happy path\n }\n if (sessionStartMs !== null && mtimeMs < sessionStartMs) continue; // pre-session leftover\n const undeliverable = isUndeliverableMarker(full);\n if (undeliverable === null) continue; // vanished between stat and read — drained, exclude\n if (undeliverable) continue; // already dead-lettered\n if (oldest === null || mtimeMs < oldest) oldest = mtimeMs;\n }\n }\n return oldest;\n}\n\n/**\n * ENG-6160: age (s) of the oldest LIVE pending-inbound marker for an agent, or\n * null when none. The wedge detector uses this instead of the alarm-facing\n * `pending_inbound_oldest_age_seconds` so a stale/dead-letter marker can't\n * false-fire a respawn.\n */\nexport function livePendingInboundOldestAgeSeconds(\n codeName: string,\n sessionStartMs: number | null,\n now: Date = new Date(),\n): number | null {\n const oldest = oldestLivePendingInboundMtimeMs(dirname(paneLogPath(codeName)), { sessionStartMs });\n if (oldest === null) return null;\n return Math.max(0, Math.floor((now.getTime() - oldest) / 1000));\n}\n\n/**\n * ENG-6160: move every pending-inbound marker for an agent aside into a sibling\n * `<channel>-pending-inbound-stale/` directory (NOT silently deleted — the\n * payload pointer is preserved for forensics), returning the count moved.\n *\n * Called on a force-fresh wedge respawn: the markers belonged to the wedged\n * session that is being torn down; the fresh session cannot meaningfully\n * process a stale, out-of-context message, and leaving them on disk both keeps\n * the ENG-6017 alarm lit and (pre-ENG-6160) re-fed the wedge loop. The stale\n * dir does not end in `-pending-inbound`, so neither the probe nor this scan\n * re-counts moved markers.\n */\nexport function deadLetterPendingInbound(codeName: string, _now: Date = new Date()): number {\n const home = dirname(paneLogPath(codeName));\n let moved = 0;\n let entries;\n try {\n entries = readdirSync(home, { withFileTypes: true });\n } catch {\n return 0;\n }\n for (const entry of entries) {\n if (!entry.isDirectory() || !entry.name.endsWith('-pending-inbound')) continue;\n const dir = join(home, entry.name);\n const deadDir = join(home, `${entry.name}-stale`);\n let files;\n try {\n files = readdirSync(dir, { withFileTypes: true });\n } catch {\n continue;\n }\n for (const file of files) {\n if (!file.isFile() || file.name.startsWith('.')) continue;\n try {\n mkdirSync(deadDir, { recursive: true });\n renameSync(join(dir, file.name), join(deadDir, file.name));\n moved++;\n } catch {\n // best-effort — a marker that vanished or can't move is left as-is\n }\n }\n }\n return moved;\n}\n\n/**\n * Compute the pane.log age for each agent. Missing or unreadable\n * pane.log returns null — the caller should drop those entries\n * rather than fabricate a \"fresh\" or \"ancient\" value. A missing\n * file means the agent has never spawned in this manager generation,\n * which is a separate problem covered by SessionAliveAgeSeconds.\n */\nexport function collectResponsivenessProbes(\n codeNames: string[],\n now: Date = new Date(),\n): ResponsivenessProbeResult[] {\n const nowMs = now.getTime();\n const results: ResponsivenessProbeResult[] = [];\n for (const codeName of codeNames) {\n try {\n const panePath = paneLogPath(codeName);\n const mtimeMs = statSync(panePath).mtimeMs;\n const ageSeconds = Math.max(0, Math.floor((nowMs - mtimeMs) / 1000));\n const result: ResponsivenessProbeResult = {\n code_name: codeName,\n pane_activity_age_seconds: ageSeconds,\n };\n // ENG-6017: piggyback the pending-inbound drain-age scan on the same\n // cadence. Field omitted (not 0) when there are no markers.\n const oldestMarkerMs = oldestPendingInboundMtimeMs(dirname(panePath));\n if (oldestMarkerMs !== null) {\n result.pending_inbound_oldest_age_seconds = Math.max(\n 0,\n Math.floor((nowMs - oldestMarkerMs) / 1000),\n );\n }\n results.push(result);\n } catch {\n // No pane.log yet (fresh agent, never spawned) — skip. The\n // session-alive monitor already covers the \"should be running\n // but isn't\" case.\n }\n }\n return results;\n}\n"],"mappings":";;;;;;;AAiCA,SAAS,WAAW,aAAa,cAAc,YAAY,gBAAgB;AAC3E,SAAS,SAAS,YAAY;AAc9B,IAAM,sBAAsB,IAAI,KAAK;AAE9B,SAAS,8BAAsC;AACpD,QAAM,MAAM,QAAQ,IAAI;AACxB,MAAI,CAAC,IAAK,QAAO;AACjB,QAAM,SAAS,OAAO,SAAS,KAAK,EAAE;AACtC,SAAO,OAAO,SAAS,MAAM,KAAK,SAAS,IAAI,SAAS;AAC1D;AAoBA,SAAS,4BAA4B,cAAqC;AACxE,MAAI,SAAwB;AAC5B,MAAI;AACJ,MAAI;AACF,cAAU,YAAY,cAAc,EAAE,eAAe,KAAK,CAAC;AAAA,EAC7D,QAAQ;AACN,WAAO;AAAA,EACT;AACA,aAAW,SAAS,SAAS;AAC3B,QAAI,CAAC,MAAM,YAAY,KAAK,CAAC,MAAM,KAAK,SAAS,kBAAkB,EAAG;AACtE,UAAM,MAAM,KAAK,cAAc,MAAM,IAAI;AACzC,QAAI;AACJ,QAAI;AACF,cAAQ,YAAY,KAAK,EAAE,eAAe,KAAK,CAAC;AAAA,IAClD,QAAQ;AACN;AAAA,IACF;AACA,eAAW,QAAQ,OAAO;AACxB,UAAI,CAAC,KAAK,OAAO,KAAK,KAAK,KAAK,WAAW,GAAG,EAAG;AACjD,UAAI;AACF,cAAM,UAAU,SAAS,KAAK,KAAK,KAAK,IAAI,CAAC,EAAE;AAC/C,YAAI,WAAW,QAAQ,UAAU,OAAQ,UAAS;AAAA,MACpD,QAAQ;AAAA,MAER;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AAUA,SAAS,sBAAsB,YAAoC;AACjE,MAAI;AACF,UAAM,SAAS,KAAK,MAAM,aAAa,YAAY,MAAM,CAAC;AAC1D,WAAO,QAAQ,kBAAkB;AAAA,EACnC,SAAS,OAAO;AACd,WAAQ,MAAgC,SAAS,WAAW,OAAO;AAAA,EACrE;AACF;AAmBO,SAAS,gCACd,cACA,OAA2C,CAAC,GAC7B;AACf,QAAM,iBAAiB,KAAK,kBAAkB;AAC9C,MAAI,SAAwB;AAC5B,MAAI;AACJ,MAAI;AACF,cAAU,YAAY,cAAc,EAAE,eAAe,KAAK,CAAC;AAAA,EAC7D,QAAQ;AACN,WAAO;AAAA,EACT;AACA,aAAW,SAAS,SAAS;AAC3B,QAAI,CAAC,MAAM,YAAY,KAAK,CAAC,MAAM,KAAK,SAAS,kBAAkB,EAAG;AACtE,UAAM,MAAM,KAAK,cAAc,MAAM,IAAI;AACzC,QAAI;AACJ,QAAI;AACF,cAAQ,YAAY,KAAK,EAAE,eAAe,KAAK,CAAC;AAAA,IAClD,QAAQ;AACN;AAAA,IACF;AACA,eAAW,QAAQ,OAAO;AACxB,UAAI,CAAC,KAAK,OAAO,KAAK,KAAK,KAAK,WAAW,GAAG,EAAG;AACjD,YAAM,OAAO,KAAK,KAAK,KAAK,IAAI;AAChC,UAAI;AACJ,UAAI;AACF,kBAAU,SAAS,IAAI,EAAE;AAAA,MAC3B,QAAQ;AACN;AAAA,MACF;AACA,UAAI,mBAAmB,QAAQ,UAAU,eAAgB;AACzD,YAAM,gBAAgB,sBAAsB,IAAI;AAChD,UAAI,kBAAkB,KAAM;AAC5B,UAAI,cAAe;AACnB,UAAI,WAAW,QAAQ,UAAU,OAAQ,UAAS;AAAA,IACpD;AAAA,EACF;AACA,SAAO;AACT;AAQO,SAAS,mCACd,UACA,gBACA,MAAY,oBAAI,KAAK,GACN;AACf,QAAM,SAAS,gCAAgC,QAAQ,YAAY,QAAQ,CAAC,GAAG,EAAE,eAAe,CAAC;AACjG,MAAI,WAAW,KAAM,QAAO;AAC5B,SAAO,KAAK,IAAI,GAAG,KAAK,OAAO,IAAI,QAAQ,IAAI,UAAU,GAAI,CAAC;AAChE;AAcO,SAAS,yBAAyB,UAAkB,OAAa,oBAAI,KAAK,GAAW;AAC1F,QAAM,OAAO,QAAQ,YAAY,QAAQ,CAAC;AAC1C,MAAI,QAAQ;AACZ,MAAI;AACJ,MAAI;AACF,cAAU,YAAY,MAAM,EAAE,eAAe,KAAK,CAAC;AAAA,EACrD,QAAQ;AACN,WAAO;AAAA,EACT;AACA,aAAW,SAAS,SAAS;AAC3B,QAAI,CAAC,MAAM,YAAY,KAAK,CAAC,MAAM,KAAK,SAAS,kBAAkB,EAAG;AACtE,UAAM,MAAM,KAAK,MAAM,MAAM,IAAI;AACjC,UAAM,UAAU,KAAK,MAAM,GAAG,MAAM,IAAI,QAAQ;AAChD,QAAI;AACJ,QAAI;AACF,cAAQ,YAAY,KAAK,EAAE,eAAe,KAAK,CAAC;AAAA,IAClD,QAAQ;AACN;AAAA,IACF;AACA,eAAW,QAAQ,OAAO;AACxB,UAAI,CAAC,KAAK,OAAO,KAAK,KAAK,KAAK,WAAW,GAAG,EAAG;AACjD,UAAI;AACF,kBAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AACtC,mBAAW,KAAK,KAAK,KAAK,IAAI,GAAG,KAAK,SAAS,KAAK,IAAI,CAAC;AACzD;AAAA,MACF,QAAQ;AAAA,MAER;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AASO,SAAS,4BACd,WACA,MAAY,oBAAI,KAAK,GACQ;AAC7B,QAAM,QAAQ,IAAI,QAAQ;AAC1B,QAAM,UAAuC,CAAC;AAC9C,aAAW,YAAY,WAAW;AAChC,QAAI;AACF,YAAM,WAAW,YAAY,QAAQ;AACrC,YAAM,UAAU,SAAS,QAAQ,EAAE;AACnC,YAAM,aAAa,KAAK,IAAI,GAAG,KAAK,OAAO,QAAQ,WAAW,GAAI,CAAC;AACnE,YAAM,SAAoC;AAAA,QACxC,WAAW;AAAA,QACX,2BAA2B;AAAA,MAC7B;AAGA,YAAM,iBAAiB,4BAA4B,QAAQ,QAAQ,CAAC;AACpE,UAAI,mBAAmB,MAAM;AAC3B,eAAO,qCAAqC,KAAK;AAAA,UAC/C;AAAA,UACA,KAAK,OAAO,QAAQ,kBAAkB,GAAI;AAAA,QAC5C;AAAA,MACF;AACA,cAAQ,KAAK,MAAM;AAAA,IACrB,QAAQ;AAAA,IAIR;AAAA,EACF;AACA,SAAO;AACT;","names":[]}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@integrity-labs/agt-cli",
3
- "version": "0.27.150-test.15",
3
+ "version": "0.27.150-test.17",
4
4
  "description": "Augmented Team CLI — agent provisioning and management",
5
5
  "type": "module",
6
6
  "engines": {