aui-agent-builder 0.3.85 → 0.3.86

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,7 @@ import { AUIClient, applyScopeLevel } from "../api-client/index.js";
10
10
  import { findAuiFiles, parseAuiFile } from "../utils/index.js";
11
11
  import { validate } from "./validate.js";
12
12
  import { getTracer, SpanStatusCode, setUserContext } from "../telemetry.js";
13
+ import { trace } from "@opentelemetry/api";
13
14
  import { getItemLevelDiff } from "../utils/git.js";
14
15
  import { AuthenticationError, CLIError, ConfigError, ValidationError } from "../errors/index.js";
15
16
  import { StatusLine, Spinner, ErrorDisplay, Hint, } from "../ui/components/index.js";
@@ -23,20 +24,54 @@ function log(node) {
23
24
  }
24
25
  function startSpinner(label) {
25
26
  const inst = render(_jsx(Spinner, { label: label }));
27
+ let unmounted = false;
28
+ const safeUnmount = () => {
29
+ if (unmounted)
30
+ return;
31
+ unmounted = true;
32
+ inst.unmount();
33
+ };
26
34
  return {
27
35
  succeed(msg) {
28
- inst.unmount();
36
+ safeUnmount();
29
37
  log(_jsx(StatusLine, { kind: "success", label: msg }));
30
38
  },
31
39
  fail(msg) {
32
- inst.unmount();
40
+ safeUnmount();
33
41
  log(_jsx(StatusLine, { kind: "error", label: msg }));
34
42
  },
35
43
  stop() {
36
- inst.unmount();
44
+ safeUnmount();
45
+ },
46
+ /**
47
+ * Internal: unconditionally unmount, no log line. Used by `withSpinner`
48
+ * to guarantee the spinner stops even when the wrapped body throws an
49
+ * exception that escapes the surrounding try/catch (which would
50
+ * otherwise leave a phantom spinner spinning forever).
51
+ */
52
+ _forceUnmount() {
53
+ safeUnmount();
37
54
  },
38
55
  };
39
56
  }
57
/**
 * Run `fn` with a freshly started spinner and guarantee the spinner is
 * unmounted afterwards, whether `fn` returns, resolves, throws, or rejects.
 *
 * `fn` receives the spinner controller and may call `.succeed()` / `.fail()`
 * on it to print a final status line; if it does neither, the spinner is
 * simply stopped. Because the controller's teardown is idempotent, the
 * cleanup in `finally` is harmless when `fn` already stopped the spinner.
 *
 * This closes the "phantom spinner" gap where an exception raised between
 * `startSpinner(...)` and `.succeed/.fail` would leave the render mounted.
 *
 * @param {string} label - Text shown next to the spinner.
 * @param {Function} fn - Sync or async callback invoked with the spinner.
 * @returns {Promise<*>} Whatever `fn` returns/resolves to; errors from `fn`
 *   propagate to the caller after cleanup.
 */
async function withSpinner(label, fn) {
  const handle = startSpinner(label);
  try {
    const outcome = await fn(handle);
    return outcome;
  } finally {
    // Always runs — success, early return, or exception.
    handle._forceUnmount();
  }
}
40
75
  /**
41
76
  * Push local agent configuration to the backend
42
77
  */
@@ -90,13 +125,47 @@ async function _push(pushSpan, agentCode, options = {}) {
90
125
  log(_jsx(StatusLine, { kind: "info", label: "Validating configuration..." }));
91
126
  else
92
127
  stderrLog("Validating configuration...");
93
- const valid = await validate(projectRoot, { verbose: false });
128
+ // Wrap the validate call in its own span so a "stuck at validate"
129
+ // hang shows up clearly in Logfire as `aui.push.preflight.validate`
130
+ // with status = unset (still running) — instead of the parent
131
+ // `aui.push` span just sitting there with no clue why.
132
+ const validateTracer = getTracer();
133
+ const valid = await validateTracer.startActiveSpan("aui.push.preflight.validate", async (vSpan) => {
134
+ vSpan.setAttribute("push.preflight.step", "validate");
135
+ vSpan.setAttribute("push.preflight.skipValidation", false);
136
+ vSpan.setAttribute("push.preflight.force", options.force === true);
137
+ try {
138
+ const ok = await validate(projectRoot, { verbose: false });
139
+ vSpan.setAttribute("push.preflight.validate.ok", ok);
140
+ vSpan.setStatus({ code: SpanStatusCode.OK });
141
+ return ok;
142
+ }
143
+ catch (err) {
144
+ // validate() shouldn't throw under normal conditions, but if a
145
+ // schema fetch or git call inside it does, surface it here so
146
+ // we don't lose the error to the parent span's generic handler.
147
+ const msg = err instanceof Error ? err.message : String(err);
148
+ vSpan.setStatus({ code: SpanStatusCode.ERROR, message: msg });
149
+ vSpan.recordException(err instanceof Error ? err : new Error(msg));
150
+ throw err;
151
+ }
152
+ finally {
153
+ vSpan.end();
154
+ }
155
+ });
94
156
  if (!valid && !options.force) {
95
157
  pushSpan.setAttribute("push.exit_reason", "validation_failed");
158
+ pushSpan.addEvent("preflight.validation_rejected_push");
96
159
  throw new ValidationError("Push aborted due to validation errors.", {
97
160
  suggestion: "Fix the errors above, or use --force to push anyway.",
98
161
  });
99
162
  }
163
+ if (!valid && options.force) {
164
+ pushSpan.addEvent("preflight.validation_failed_but_forced");
165
+ }
166
+ }
167
+ else {
168
+ pushSpan.addEvent("preflight.validation_skipped");
100
169
  }
101
170
  if (!json)
102
171
  log(_jsx(StatusLine, { kind: "info", label: "Pushing agent changes..." }));
@@ -284,12 +353,49 @@ async function _push(pushSpan, agentCode, options = {}) {
284
353
  // If the project has version_id in .auirc or --version-id is passed,
285
354
  // we validate it's a draft. If no version context exists, we auto-detect
286
355
  // available drafts. Push is rejected if no draft is found.
356
+ //
357
+ // Wrapped in an `aui.push.preflight.resolve-version` span so a hang on
358
+ // listAgents / listVersions / getVersion shows up clearly in Logfire
359
+ // instead of being lumped under the parent push span. This is the
360
+ // step that hits agent-management with up to 3 sequential calls.
287
361
  let prePushDraft = null;
288
362
  if (projectConfig.version_id || options.versionId) {
289
- prePushDraft = await resolveVersionDraft(config, projectConfig, session, options.versionId);
363
+ const resolveTracer = getTracer();
364
+ prePushDraft = await resolveTracer.startActiveSpan("aui.push.preflight.resolve-version", async (rSpan) => {
365
+ rSpan.setAttribute("push.preflight.step", "resolve-version");
366
+ rSpan.setAttribute("push.preflight.has_explicit_version_id", !!options.versionId);
367
+ rSpan.setAttribute("push.preflight.has_auirc_version_id", !!projectConfig.version_id);
368
+ if (projectConfig.agent_id) {
369
+ rSpan.setAttribute("push.preflight.network_id", projectConfig.agent_id);
370
+ }
371
+ try {
372
+ const draft = await resolveVersionDraft(config, projectConfig, session, options.versionId);
373
+ rSpan.setAttribute("push.preflight.resolved_version_id", draft.versionId);
374
+ rSpan.setAttribute("push.preflight.resolved_version_label", draft.label);
375
+ rSpan.setAttribute("push.preflight.resolved_agent_id", draft.agentId);
376
+ rSpan.setStatus({ code: SpanStatusCode.OK });
377
+ return draft;
378
+ }
379
+ catch (err) {
380
+ const msg = err instanceof Error ? err.message : String(err);
381
+ rSpan.setStatus({ code: SpanStatusCode.ERROR, message: msg });
382
+ rSpan.recordException(err instanceof Error ? err : new Error(msg));
383
+ throw err;
384
+ }
385
+ finally {
386
+ rSpan.end();
387
+ }
388
+ });
290
389
  agentSettingsParams.version_id = prePushDraft.versionId;
390
+ pushSpan.setAttribute("push.version_id", prePushDraft.versionId);
391
+ pushSpan.setAttribute("push.version_label", prePushDraft.label);
291
392
  log(_jsx(StatusLine, { kind: "info", label: `Pushing into draft version: ${prePushDraft.label}` }));
292
393
  }
394
+ else {
395
+ pushSpan.addEvent("preflight.no_draft_version_required", {
396
+ reason: "legacy push (no version_id in .auirc or --version-id flag)",
397
+ });
398
+ }
293
399
  const pushTasks = buildPushTasks(diff, fileData, projectRoot, getFileDiff);
294
400
  pushSpan.setAttribute("push.task_count", pushTasks.length);
295
401
  if (diff) {
@@ -418,7 +524,12 @@ async function _push(pushSpan, agentCode, options = {}) {
418
524
  // JSON envelope, and the non-zero exit code (BFF contract: zero silent
419
525
  // errors anywhere in the push pipeline).
420
526
  const kbResult = await pushKnowledgeHubs(projectRoot, projectConfig);
527
+ pushSpan.setAttribute("push.kb.ok", kbResult.ok);
528
+ pushSpan.setAttribute("push.kb.failures", kbResult.failures.length);
421
529
  if (!kbResult.ok) {
530
+ pushSpan.addEvent("kb.failures_folded_into_pushFailures", {
531
+ count: kbResult.failures.length,
532
+ });
422
533
  for (const kbFailure of kbResult.failures) {
423
534
  failed++;
424
535
  pushFailures.push(kbFailure);
@@ -450,10 +561,16 @@ async function _push(pushSpan, agentCode, options = {}) {
450
561
  process.stdout.isTTY === true;
451
562
  if (!isInteractive) {
452
563
  failed += authFailedTasks.length;
564
+ pushSpan.addEvent("auth.fallback.non_interactive_rejected", {
565
+ failed_task_count: authFailedTasks.length,
566
+ });
453
567
  throw new AuthenticationError(`Authentication failed for ${authFailedTasks.length} push task(s); cannot prompt for an API key (non-interactive session).`, {
454
568
  suggestion: "Pass --api-key <key>, set AUI_AGENT_TOOLS_API_KEY, or run `aui login` to refresh credentials.",
455
569
  });
456
570
  }
571
+ pushSpan.addEvent("auth.fallback.api_key_prompted", {
572
+ failed_task_count: authFailedTasks.length,
573
+ });
457
574
  log(_jsxs(Box, { flexDirection: "column", paddingX: 1, children: [_jsx(StatusLine, { kind: "warning", label: "Authentication failed. Your access token may not have permission." }), _jsx(Hint, { message: "You can provide an API key as a fallback. It will be saved to ~/.aui/agent-settings-key" })] }));
458
575
  const { key } = await inquirer.prompt([
459
576
  {
@@ -466,6 +583,9 @@ async function _push(pushSpan, agentCode, options = {}) {
466
583
  if (key && key.trim()) {
467
584
  saveAgentSettingsApiKey(key.trim());
468
585
  client.setAgentSettingsApiKey(key.trim());
586
+ pushSpan.addEvent("auth.fallback.api_key_provided", {
587
+ retrying_task_count: authFailedTasks.length,
588
+ });
469
589
  log(_jsx(StatusLine, { kind: "success", label: "Key saved." }));
470
590
  log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "info", label: `Retrying ${authFailedTasks.length} change(s) with API key...` }) }));
471
591
  authFailed = false;
@@ -495,10 +615,16 @@ async function _push(pushSpan, agentCode, options = {}) {
495
615
  }
496
616
  else {
497
617
  failed += authFailedTasks.length;
618
+ pushSpan.addEvent("auth.fallback.api_key_skipped", {
619
+ uncovered_task_count: authFailedTasks.length,
620
+ });
498
621
  }
499
622
  }
500
623
  else if (authFailed && authFailedTasks.length > 0) {
501
624
  failed += authFailedTasks.length;
625
+ pushSpan.addEvent("auth.fallback.saved_key_still_failed", {
626
+ failed_task_count: authFailedTasks.length,
627
+ });
502
628
  log(_jsx(ErrorDisplay, { error: new AuthenticationError("Auth failed even with saved API key.", {
503
629
  suggestion: "Try: rm ~/.aui/agent-settings-key",
504
630
  }) }));
@@ -532,6 +658,7 @@ async function _push(pushSpan, agentCode, options = {}) {
532
658
  if (prePushDraft) {
533
659
  const SNAPSHOT_MAX_ATTEMPTS = 4;
534
660
  const SNAPSHOT_RETRY_BASE_MS = 1000;
661
+ const snapshotTracer = getTracer();
535
662
  for (let attempt = 1; attempt <= SNAPSHOT_MAX_ATTEMPTS; attempt++) {
536
663
  snapshotAttempts = attempt;
537
664
  const label = attempt === 1
@@ -540,26 +667,65 @@ async function _push(pushSpan, agentCode, options = {}) {
540
667
  if (json)
541
668
  stderrLog(label);
542
669
  const snapshotSpinner = json ? null : startSpinner(label);
670
+ // Per-attempt span — each snapshot upload is a network call that can
671
+ // hang for minutes (large multipart upload). Surfacing each attempt
672
+ // separately in Logfire lets us see retry behavior, attempt latency,
673
+ // and which attempt finally succeeded. Logfire query:
674
+ // `name:"aui.push.task.snapshot" AND attributes."snapshot.attempt":3`
675
+ // finds every push that needed a third try.
543
676
  let attemptError;
544
- try {
545
- const snapshotResult = await pushSnapshot(client, prePushDraft.agentId, prePushDraft.versionId, projectRoot, fileData);
546
- if (snapshotResult.success) {
547
- const okMsg = attempt === 1
548
- ? `Snapshot pushed (${fileData.length} file(s))`
549
- : `Snapshot pushed (${fileData.length} file(s), attempt ${attempt}/${SNAPSHOT_MAX_ATTEMPTS})`;
550
- if (snapshotSpinner)
551
- snapshotSpinner.succeed(okMsg);
552
- else
553
- stderrLog(okMsg);
554
- snapshotSucceeded = true;
555
- snapshotError = undefined;
556
- break;
677
+ const attemptResolved = await snapshotTracer.startActiveSpan("aui.push.task.snapshot", async (snapSpan) => {
678
+ snapSpan.setAttribute("push.task.type", "snapshot");
679
+ snapSpan.setAttribute("push.task.label", label);
680
+ snapSpan.setAttribute("snapshot.attempt", attempt);
681
+ snapSpan.setAttribute("snapshot.max_attempts", SNAPSHOT_MAX_ATTEMPTS);
682
+ snapSpan.setAttribute("snapshot.file_count", fileData.length);
683
+ snapSpan.setAttribute("snapshot.agent_id", prePushDraft.agentId);
684
+ snapSpan.setAttribute("snapshot.version_id", prePushDraft.versionId);
685
+ try {
686
+ const snapshotResult = await pushSnapshot(client, prePushDraft.agentId, prePushDraft.versionId, projectRoot, fileData);
687
+ if (snapshotResult.success) {
688
+ snapSpan.setStatus({ code: SpanStatusCode.OK });
689
+ snapSpan.setAttribute("snapshot.outcome", "success");
690
+ return { ok: true, error: undefined };
691
+ }
692
+ const errMsg = snapshotResult.error || "Unknown snapshot error";
693
+ snapSpan.setStatus({ code: SpanStatusCode.ERROR, message: errMsg });
694
+ snapSpan.setAttribute("snapshot.outcome", "failed");
695
+ snapSpan.setAttribute("push.task.error", errMsg);
696
+ if (attempt < SNAPSHOT_MAX_ATTEMPTS) {
697
+ snapSpan.addEvent("snapshot.retry_will_follow", {
698
+ next_attempt: attempt + 1,
699
+ backoff_ms: SNAPSHOT_RETRY_BASE_MS * Math.pow(2, attempt - 1),
700
+ });
701
+ }
702
+ return { ok: false, error: errMsg };
557
703
  }
558
- attemptError = snapshotResult.error || "Unknown snapshot error";
559
- }
560
- catch (error) {
561
- attemptError = error instanceof Error ? error.message : String(error);
704
+ catch (error) {
705
+ const errMsg = error instanceof Error ? error.message : String(error);
706
+ snapSpan.setStatus({ code: SpanStatusCode.ERROR, message: errMsg });
707
+ snapSpan.recordException(error instanceof Error ? error : new Error(errMsg));
708
+ snapSpan.setAttribute("snapshot.outcome", "exception");
709
+ snapSpan.setAttribute("push.task.error", errMsg);
710
+ return { ok: false, error: errMsg };
711
+ }
712
+ finally {
713
+ snapSpan.end();
714
+ }
715
+ });
716
+ if (attemptResolved.ok) {
717
+ const okMsg = attempt === 1
718
+ ? `Snapshot pushed (${fileData.length} file(s))`
719
+ : `Snapshot pushed (${fileData.length} file(s), attempt ${attempt}/${SNAPSHOT_MAX_ATTEMPTS})`;
720
+ if (snapshotSpinner)
721
+ snapshotSpinner.succeed(okMsg);
722
+ else
723
+ stderrLog(okMsg);
724
+ snapshotSucceeded = true;
725
+ snapshotError = undefined;
726
+ break;
562
727
  }
728
+ attemptError = attemptResolved.error;
563
729
  snapshotError = attemptError;
564
730
  const isLast = attempt === SNAPSHOT_MAX_ATTEMPTS;
565
731
  const failMsg = isLast
@@ -617,13 +783,27 @@ async function _push(pushSpan, agentCode, options = {}) {
617
783
  if (filesSafeToCommit.length > 0) {
618
784
  commitBaselineFiles(projectRoot, filesSafeToCommit, `pushed ${succeeded} change(s) (${failedFiles.size} file(s) held back due to per-task failures)`);
619
785
  baselineUpdated = true;
786
+ pushSpan.addEvent("baseline.partial_commit", {
787
+ committed_files: filesSafeToCommit.length,
788
+ held_back_files: failedFiles.size,
789
+ });
790
+ }
791
+ else {
792
+ pushSpan.addEvent("baseline.fully_held_back", {
793
+ failed_files: failedFiles.size,
794
+ });
620
795
  }
621
796
  }
622
797
  else if (failed === 0) {
623
798
  commitBaseline(projectRoot, "pushed changes");
624
799
  baselineUpdated = true;
800
+ pushSpan.addEvent("baseline.full_commit");
625
801
  }
626
802
  }
803
+ else {
804
+ pushSpan.addEvent("baseline.skipped_due_to_snapshot_failure");
805
+ }
806
+ pushSpan.setAttribute("push.baseline_updated", baselineUpdated);
627
807
  log(_jsx(PushFinalSummary, { succeeded: succeeded, failed: failed, baselineUpdated: baselineUpdated, logDir: logRelPath, memoryPath: memoryPath, snapshotStatus: snapshotStatus, snapshotError: snapshotError }));
628
808
  if (failed > 0) {
629
809
  log(_jsxs(Box, { flexDirection: "column", paddingX: 1, children: [_jsx(StatusLine, { kind: "warning", label: `${failed} entity change(s) failed to push to DB.` }), pushFailures.map((f) => (_jsxs(Box, { flexDirection: "column", marginLeft: 2, children: [_jsxs(Text, { color: "red", children: [" ", icons.error, " ", f.label] }), _jsxs(Text, { color: colors.muted, children: [" Error: ", f.error] }), f.file && _jsxs(Text, { color: colors.muted, children: [" File: ", f.file] })] }, f.label))), _jsxs(Box, { marginTop: 1, children: [_jsx(Text, { color: colors.info, bold: true, children: "What to do next: " }), _jsxs(Text, { color: colors.muted, children: ["Fix the issues above and re-run ", _jsx(Text, { bold: true, children: "aui push" }), " to retry the failed changes."] })] })] }));
@@ -1029,26 +1209,50 @@ async function pushKnowledgeHubs(projectRoot, projectConfig) {
1029
1209
  log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "warning", label: `Cannot delete "${kbName}" — no knowledge_base_id stored. Push the KB first, then delete.` }) }));
1030
1210
  continue;
1031
1211
  }
1032
- try {
1033
- await kbViewClient.deleteKnowledgeBase(kbId, scope, kbName);
1034
- log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "success", label: `Deleted: ${kbName}` }) }));
1035
- }
1036
- catch (delErr) {
1037
- // Per-KB error: count it, keep going so partial work shows up.
1038
- if (isNotFoundError(delErr)) {
1039
- log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "success", label: `Deleted: ${kbName} (already absent)` }) }));
1212
+ // Per-KB delete in its own span so each one shows up in Logfire as
1213
+ // `aui.push.task.kb-delete` with status, kb name, kb id, and error
1214
+ // body. Same observability shape as agent-settings entity tasks.
1215
+ const kbDelTracer = getTracer();
1216
+ await kbDelTracer.startActiveSpan("aui.push.task.kb-delete", async (span) => {
1217
+ span.setAttribute("push.task.type", "kb-delete");
1218
+ span.setAttribute("push.task.label", `Delete knowledge base: ${kbName}`);
1219
+ span.setAttribute("push.task.file", `knowledge-hubs/${kbDirName}/kb.json`);
1220
+ span.setAttribute("push.task.kb_id", kbId);
1221
+ span.setAttribute("push.task.kb_name", kbName);
1222
+ try {
1223
+ await kbViewClient.deleteKnowledgeBase(kbId, scope, kbName);
1224
+ span.setStatus({ code: SpanStatusCode.OK });
1225
+ log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "success", label: `Deleted: ${kbName}` }) }));
1040
1226
  }
1041
- else {
1042
- kbDeleteSucceeded = false;
1043
- const errMsg = delErr instanceof Error ? delErr.message : String(delErr);
1044
- failures.push({
1045
- label: `Delete knowledge base: ${kbName}`,
1046
- file: `knowledge-hubs/${kbDirName}/kb.json`,
1047
- error: errMsg,
1048
- });
1049
- log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "error", label: `Failed to delete "${kbName}": ${errMsg}` }) }));
1227
+ catch (delErr) {
1228
+ // Per-KB error: count it, keep going so partial work shows up.
1229
+ if (isNotFoundError(delErr)) {
1230
+ span.setStatus({ code: SpanStatusCode.OK });
1231
+ span.addEvent("fallback.delete_404_already_absent");
1232
+ span.setAttribute("push.task.fallback", "delete_404_already_absent");
1233
+ log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "success", label: `Deleted: ${kbName} (already absent)` }) }));
1234
+ }
1235
+ else {
1236
+ kbDeleteSucceeded = false;
1237
+ const errMsg = delErr instanceof Error ? delErr.message : String(delErr);
1238
+ span.setStatus({ code: SpanStatusCode.ERROR, message: errMsg });
1239
+ span.recordException(delErr instanceof Error ? delErr : new Error(errMsg));
1240
+ span.setAttribute("push.task.error", errMsg);
1241
+ if (delErr.statusCode) {
1242
+ span.setAttribute("push.task.error_status_code", delErr.statusCode);
1243
+ }
1244
+ failures.push({
1245
+ label: `Delete knowledge base: ${kbName}`,
1246
+ file: `knowledge-hubs/${kbDirName}/kb.json`,
1247
+ error: errMsg,
1248
+ });
1249
+ log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "error", label: `Failed to delete "${kbName}": ${errMsg}` }) }));
1250
+ }
1050
1251
  }
1051
- }
1252
+ finally {
1253
+ span.end();
1254
+ }
1255
+ });
1052
1256
  }
1053
1257
  if (kbDeleteSucceeded) {
1054
1258
  deleteSpinner.succeed(`${deletedKBDirs.length} knowledge base(s) deleted`);
@@ -1085,41 +1289,64 @@ async function pushKnowledgeHubs(projectRoot, projectConfig) {
1085
1289
  log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "warning", label: `Skipped unsupported file: ${path.basename(skipped)} (only .pdf, .md, .txt, .json)` }) }));
1086
1290
  }
1087
1291
  if (supportedFiles.length > 0) {
1088
- try {
1089
- const importResult = await kbViewClient.importFiles({
1090
- files: supportedFiles,
1091
- scope,
1092
- created_by: userId,
1093
- knowledge_base_name: kbData.name,
1094
- knowledge_base_description: kbData.description,
1095
- });
1096
- if (importResult.knowledge_base_id) {
1097
- const kbJsonPath = path.join(kbDir, "kb.json");
1098
- try {
1099
- const raw = JSON.parse(fs.readFileSync(kbJsonPath, "utf-8"));
1100
- raw.knowledge_base_id = importResult.knowledge_base_id;
1101
- fs.writeFileSync(kbJsonPath, JSON.stringify(raw, null, 2) + "\n");
1102
- }
1103
- catch (writeErr) {
1104
- // kb.json id write fail is non-fatal but tell the user so the
1105
- // next push doesn't surprise them with "no knowledge_base_id stored".
1106
- if (process.env.AUI_DEBUG) {
1107
- console.warn(`[debug] failed to write knowledge_base_id back to ${kbJsonPath}:`, writeErr);
1292
+ // Per-KB upload in its own span — Logfire query
1293
+ // `name:"aui.push.task.kb-upload" AND status_code:ERROR` finds
1294
+ // every KB push failure across all agents.
1295
+ const kbUpTracer = getTracer();
1296
+ await kbUpTracer.startActiveSpan("aui.push.task.kb-upload", async (span) => {
1297
+ span.setAttribute("push.task.type", "kb-upload");
1298
+ span.setAttribute("push.task.label", `Push knowledge base: ${kbData.name || kbDirName}`);
1299
+ span.setAttribute("push.task.file", `knowledge-hubs/${kbDirName}/kb.json`);
1300
+ span.setAttribute("push.task.kb_name", kbData.name || kbDirName);
1301
+ span.setAttribute("push.task.file_count", supportedFiles.length);
1302
+ try {
1303
+ const importResult = await kbViewClient.importFiles({
1304
+ files: supportedFiles,
1305
+ scope,
1306
+ created_by: userId,
1307
+ knowledge_base_name: kbData.name,
1308
+ knowledge_base_description: kbData.description,
1309
+ });
1310
+ span.setStatus({ code: SpanStatusCode.OK });
1311
+ if (importResult.knowledge_base_id) {
1312
+ span.setAttribute("push.task.kb_id", importResult.knowledge_base_id);
1313
+ const kbJsonPath = path.join(kbDir, "kb.json");
1314
+ try {
1315
+ const raw = JSON.parse(fs.readFileSync(kbJsonPath, "utf-8"));
1316
+ raw.knowledge_base_id = importResult.knowledge_base_id;
1317
+ fs.writeFileSync(kbJsonPath, JSON.stringify(raw, null, 2) + "\n");
1318
+ }
1319
+ catch (writeErr) {
1320
+ // kb.json id write fail is non-fatal but tell the user so the
1321
+ // next push doesn't surprise them with "no knowledge_base_id stored".
1322
+ span.addEvent("kb_id_writeback_failed");
1323
+ if (process.env.AUI_DEBUG) {
1324
+ console.warn(`[debug] failed to write knowledge_base_id back to ${kbJsonPath}:`, writeErr);
1325
+ }
1326
+ log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "warning", label: `Could not persist knowledge_base_id back to ${path.basename(kbJsonPath)} — re-import or run \`aui pull\` to recover.` }) }));
1108
1327
  }
1109
- log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "warning", label: `Could not persist knowledge_base_id back to ${path.basename(kbJsonPath)} — re-import or run \`aui pull\` to recover.` }) }));
1110
1328
  }
1111
1329
  }
1112
- }
1113
- catch (uploadErr) {
1114
- hadUploadFailure = true;
1115
- const errMsg = uploadErr instanceof Error ? uploadErr.message : String(uploadErr);
1116
- failures.push({
1117
- label: `Push knowledge base: ${kbData.name || kbDirName}`,
1118
- file: `knowledge-hubs/${kbDirName}/kb.json`,
1119
- error: errMsg,
1120
- });
1121
- log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "error", label: `Failed to push "${kbData.name || kbDirName}": ${errMsg}` }) }));
1122
- }
1330
+ catch (uploadErr) {
1331
+ hadUploadFailure = true;
1332
+ const errMsg = uploadErr instanceof Error ? uploadErr.message : String(uploadErr);
1333
+ span.setStatus({ code: SpanStatusCode.ERROR, message: errMsg });
1334
+ span.recordException(uploadErr instanceof Error ? uploadErr : new Error(errMsg));
1335
+ span.setAttribute("push.task.error", errMsg);
1336
+ if (uploadErr.statusCode) {
1337
+ span.setAttribute("push.task.error_status_code", uploadErr.statusCode);
1338
+ }
1339
+ failures.push({
1340
+ label: `Push knowledge base: ${kbData.name || kbDirName}`,
1341
+ file: `knowledge-hubs/${kbDirName}/kb.json`,
1342
+ error: errMsg,
1343
+ });
1344
+ log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "error", label: `Failed to push "${kbData.name || kbDirName}": ${errMsg}` }) }));
1345
+ }
1346
+ finally {
1347
+ span.end();
1348
+ }
1349
+ });
1123
1350
  }
1124
1351
  }
1125
1352
  if (hadUploadFailure) {
@@ -1622,6 +1849,32 @@ function isTransient5xx(err) {
1622
1849
  ?? err.status;
1623
1850
  return code === 500 || code === 502 || code === 503 || code === 504;
1624
1851
  }
1852
/**
 * Annotate the currently active span with a fallback decision, so the trace
 * shows which adaptive layer fired during a push.
 *
 * Three things are written to the span:
 *   1. an event named `fallback.<kind>` carrying `detail` as its attributes;
 *   2. the attribute `push.task.fallback` set to `kind`;
 *   3. one `push.task.fallback.<key>` attribute per entry in `detail`.
 *
 * Kinds emitted by callers in this file, and what a query on
 * `attributes."push.task.fallback"` for each one surfaces:
 *   - `patch_404_to_post`: a PATCH hit 404 and was retried as a POST.
 *   - `post_409_to_patch`: a "create" hit 409 and was converted to an update.
 *   - `delete_404_already_absent`: a delete no-op'd because the row was gone.
 *   - `transient_retry`: a transient 5xx was absorbed by the retry layer.
 *
 * Does nothing when no span is active (e.g. when called outside the push flow).
 *
 * @param {string} kind - Fallback identifier, used in the event name and attribute.
 * @param {Object} [detail] - Extra key/value context; may be omitted.
 * @returns {void}
 */
function recordFallbackEvent(kind, detail) {
  const activeSpan = trace.getActiveSpan();
  if (activeSpan) {
    activeSpan.addEvent(`fallback.${kind}`, detail);
    activeSpan.setAttribute("push.task.fallback", kind);
    // Mirror each detail entry as a flat attribute so it is directly queryable.
    Object.entries(detail ?? {}).forEach(([key, value]) => {
      activeSpan.setAttribute(`push.task.fallback.${key}`, value);
    });
  }
}
1625
1878
  /**
1626
1879
  * Run one entity-settings write call once, and retry exactly once on a
1627
1880
  * transient 5xx after a 1s back-off. The snapshot upload has its own
@@ -1640,6 +1893,11 @@ async function withTransientRetry(label, fn) {
1640
1893
  if (process.env.AUI_DEBUG) {
1641
1894
  console.log(`[debug] ${label} got ${code}, retrying once after 1000ms`);
1642
1895
  }
1896
+ recordFallbackEvent("transient_retry", {
1897
+ label,
1898
+ status_code: code ?? 0,
1899
+ backoff_ms: 1000,
1900
+ });
1643
1901
  await new Promise((r) => setTimeout(r, 1000));
1644
1902
  return await fn();
1645
1903
  }
@@ -1671,6 +1929,7 @@ async function _executePushTask(client, params, task) {
1671
1929
  if (process.env.AUI_DEBUG) {
1672
1930
  console.log(`[debug] patch-tool ${task.toolName}: 404 not found, falling back to POST`);
1673
1931
  }
1932
+ recordFallbackEvent("patch_404_to_post", { task_type: "patch-tool", tool: String(task.toolName ?? "") });
1674
1933
  return client.createTool(params, task.body);
1675
1934
  }
1676
1935
  throw err;
@@ -1689,6 +1948,7 @@ async function _executePushTask(client, params, task) {
1689
1948
  const body = task.body;
1690
1949
  const toolCode = body.code || "";
1691
1950
  const toolName = toolCode.toUpperCase().replace(/-/g, "_");
1951
+ recordFallbackEvent("post_409_to_patch", { task_type: "create-tool", tool: toolName });
1692
1952
  return client.patchTool(params, toolName, body);
1693
1953
  }
1694
1954
  throw err;
@@ -1704,6 +1964,7 @@ async function _executePushTask(client, params, task) {
1704
1964
  if (process.env.AUI_DEBUG) {
1705
1965
  console.log(`[debug] delete-tool ${task.toolName}: 404 already absent`);
1706
1966
  }
1967
+ recordFallbackEvent("delete_404_already_absent", { task_type: "delete-tool", tool: String(task.toolName ?? "") });
1707
1968
  return DELETE_ALREADY_ABSENT;
1708
1969
  }
1709
1970
  throw err;
@@ -1727,6 +1988,7 @@ async function _executePushTask(client, params, task) {
1727
1988
  if (process.env.AUI_DEBUG) {
1728
1989
  console.log(`[debug] create-parameter ${task.itemCode}: 409, falling back to PATCH`);
1729
1990
  }
1991
+ recordFallbackEvent("post_409_to_patch", { task_type: "create-parameter", code: String(task.itemCode ?? "") });
1730
1992
  return client.patchParameter(params, task.itemCode, task.body);
1731
1993
  }
1732
1994
  throw err;
@@ -1742,6 +2004,7 @@ async function _executePushTask(client, params, task) {
1742
2004
  if (process.env.AUI_DEBUG) {
1743
2005
  console.log(`[debug] patch-parameter ${task.itemCode}: 404 not found, falling back to POST`);
1744
2006
  }
2007
+ recordFallbackEvent("patch_404_to_post", { task_type: "patch-parameter", code: String(task.itemCode ?? "") });
1745
2008
  return client.createParameter(params, task.body);
1746
2009
  }
1747
2010
  throw err;
@@ -1757,6 +2020,7 @@ async function _executePushTask(client, params, task) {
1757
2020
  if (process.env.AUI_DEBUG) {
1758
2021
  console.log(`[debug] delete-parameter ${task.itemCode}: 404 already absent`);
1759
2022
  }
2023
+ recordFallbackEvent("delete_404_already_absent", { task_type: "delete-parameter", code: String(task.itemCode ?? "") });
1760
2024
  return DELETE_ALREADY_ABSENT;
1761
2025
  }
1762
2026
  throw err;
@@ -1772,6 +2036,7 @@ async function _executePushTask(client, params, task) {
1772
2036
  if (process.env.AUI_DEBUG) {
1773
2037
  console.log(`[debug] create-entity ${task.itemCode}: 409, falling back to PATCH`);
1774
2038
  }
2039
+ recordFallbackEvent("post_409_to_patch", { task_type: "create-entity", code: String(task.itemCode ?? "") });
1775
2040
  return client.patchEntity(params, task.itemCode, task.body);
1776
2041
  }
1777
2042
  throw err;
@@ -1787,6 +2052,7 @@ async function _executePushTask(client, params, task) {
1787
2052
  if (process.env.AUI_DEBUG) {
1788
2053
  console.log(`[debug] patch-entity ${task.itemCode}: 404, falling back to POST`);
1789
2054
  }
2055
+ recordFallbackEvent("patch_404_to_post", { task_type: "patch-entity", code: String(task.itemCode ?? "") });
1790
2056
  return client.createEntity(params, task.body);
1791
2057
  }
1792
2058
  throw err;
@@ -1802,6 +2068,7 @@ async function _executePushTask(client, params, task) {
1802
2068
  if (process.env.AUI_DEBUG) {
1803
2069
  console.log(`[debug] delete-entity ${task.itemCode}: 404 already absent`);
1804
2070
  }
2071
+ recordFallbackEvent("delete_404_already_absent", { task_type: "delete-entity", code: String(task.itemCode ?? "") });
1805
2072
  return DELETE_ALREADY_ABSENT;
1806
2073
  }
1807
2074
  throw err;
@@ -1817,6 +2084,7 @@ async function _executePushTask(client, params, task) {
1817
2084
  if (process.env.AUI_DEBUG) {
1818
2085
  console.log(`[debug] create-integration ${task.itemCode}: 409, falling back to PATCH`);
1819
2086
  }
2087
+ recordFallbackEvent("post_409_to_patch", { task_type: "create-integration", code: String(task.itemCode ?? "") });
1820
2088
  return client.patchIntegration(params, task.itemCode, task.body);
1821
2089
  }
1822
2090
  throw err;
@@ -1832,6 +2100,7 @@ async function _executePushTask(client, params, task) {
1832
2100
  if (process.env.AUI_DEBUG) {
1833
2101
  console.log(`[debug] patch-integration ${task.itemCode}: 404 not found, falling back to POST`);
1834
2102
  }
2103
+ recordFallbackEvent("patch_404_to_post", { task_type: "patch-integration", code: String(task.itemCode ?? "") });
1835
2104
  return client.createIntegration(params, task.body);
1836
2105
  }
1837
2106
  throw err;
@@ -1847,6 +2116,7 @@ async function _executePushTask(client, params, task) {
1847
2116
  if (process.env.AUI_DEBUG) {
1848
2117
  console.log(`[debug] delete-integration ${task.itemCode}: 404 already absent`);
1849
2118
  }
2119
+ recordFallbackEvent("delete_404_already_absent", { task_type: "delete-integration", code: String(task.itemCode ?? "") });
1850
2120
  return DELETE_ALREADY_ABSENT;
1851
2121
  }
1852
2122
  throw err;