aui-agent-builder 0.3.85 → 0.3.87

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,7 @@ import { AUIClient, applyScopeLevel } from "../api-client/index.js";
10
10
  import { findAuiFiles, parseAuiFile } from "../utils/index.js";
11
11
  import { validate } from "./validate.js";
12
12
  import { getTracer, SpanStatusCode, setUserContext } from "../telemetry.js";
13
+ import { trace } from "@opentelemetry/api";
13
14
  import { getItemLevelDiff } from "../utils/git.js";
14
15
  import { AuthenticationError, CLIError, ConfigError, ValidationError } from "../errors/index.js";
15
16
  import { StatusLine, Spinner, ErrorDisplay, Hint, } from "../ui/components/index.js";
@@ -23,20 +24,54 @@ function log(node) {
23
24
  }
24
25
  function startSpinner(label) {
25
26
  const inst = render(_jsx(Spinner, { label: label }));
27
+ let unmounted = false;
28
+ const safeUnmount = () => {
29
+ if (unmounted)
30
+ return;
31
+ unmounted = true;
32
+ inst.unmount();
33
+ };
26
34
  return {
27
35
  succeed(msg) {
28
- inst.unmount();
36
+ safeUnmount();
29
37
  log(_jsx(StatusLine, { kind: "success", label: msg }));
30
38
  },
31
39
  fail(msg) {
32
- inst.unmount();
40
+ safeUnmount();
33
41
  log(_jsx(StatusLine, { kind: "error", label: msg }));
34
42
  },
35
43
  stop() {
36
- inst.unmount();
44
+ safeUnmount();
45
+ },
46
+ /**
47
+ * Internal: unconditionally unmount, no log line. Used by `withSpinner`
48
+ * to guarantee the spinner stops even when the wrapped body throws an
49
+ * exception that escapes the surrounding try/catch (which would
50
+ * otherwise leave a phantom spinner spinning forever).
51
+ */
52
+ _forceUnmount() {
53
+ safeUnmount();
37
54
  },
38
55
  };
39
56
  }
57
+ /**
58
+ * Wrap a sync- or async-returning callback so the spinner ALWAYS unmounts,
59
+ * even on uncaught exceptions. The callback can call `.succeed()` / `.fail()`
60
+ * itself to render a final status line; otherwise the spinner just stops.
61
+ *
62
+ * This pattern eliminates the "phantom spinner" foot-gun where an exception
63
+ * thrown between `startSpinner(...)` and `.succeed/.fail` leaves the Ink
64
+ * render mounted forever — blocking the chat UI's "Still thinking…" state.
65
+ */
66
+ async function withSpinner(label, fn) {
67
+ const spinner = startSpinner(label);
68
+ try {
69
+ return await fn(spinner);
70
+ }
71
+ finally {
72
+ spinner._forceUnmount();
73
+ }
74
+ }
40
75
  /**
41
76
  * Push local agent configuration to the backend
42
77
  */
@@ -90,13 +125,47 @@ async function _push(pushSpan, agentCode, options = {}) {
90
125
  log(_jsx(StatusLine, { kind: "info", label: "Validating configuration..." }));
91
126
  else
92
127
  stderrLog("Validating configuration...");
93
- const valid = await validate(projectRoot, { verbose: false });
128
+ // Wrap the validate call in its own span so a "stuck at validate"
129
+ // hang shows up clearly in Logfire as `aui.push.preflight.validate`
130
+ // with status = unset (still running) — instead of the parent
131
+ // `aui.push` span just sitting there with no clue why.
132
+ const validateTracer = getTracer();
133
+ const valid = await validateTracer.startActiveSpan("aui.push.preflight.validate", async (vSpan) => {
134
+ vSpan.setAttribute("push.preflight.step", "validate");
135
+ vSpan.setAttribute("push.preflight.skipValidation", false);
136
+ vSpan.setAttribute("push.preflight.force", options.force === true);
137
+ try {
138
+ const ok = await validate(projectRoot, { verbose: false });
139
+ vSpan.setAttribute("push.preflight.validate.ok", ok);
140
+ vSpan.setStatus({ code: SpanStatusCode.OK });
141
+ return ok;
142
+ }
143
+ catch (err) {
144
+ // validate() shouldn't throw under normal conditions, but if a
145
+ // schema fetch or git call inside it does, surface it here so
146
+ // we don't lose the error to the parent span's generic handler.
147
+ const msg = err instanceof Error ? err.message : String(err);
148
+ vSpan.setStatus({ code: SpanStatusCode.ERROR, message: msg });
149
+ vSpan.recordException(err instanceof Error ? err : new Error(msg));
150
+ throw err;
151
+ }
152
+ finally {
153
+ vSpan.end();
154
+ }
155
+ });
94
156
  if (!valid && !options.force) {
95
157
  pushSpan.setAttribute("push.exit_reason", "validation_failed");
158
+ pushSpan.addEvent("preflight.validation_rejected_push");
96
159
  throw new ValidationError("Push aborted due to validation errors.", {
97
160
  suggestion: "Fix the errors above, or use --force to push anyway.",
98
161
  });
99
162
  }
163
+ if (!valid && options.force) {
164
+ pushSpan.addEvent("preflight.validation_failed_but_forced");
165
+ }
166
+ }
167
+ else {
168
+ pushSpan.addEvent("preflight.validation_skipped");
100
169
  }
101
170
  if (!json)
102
171
  log(_jsx(StatusLine, { kind: "info", label: "Pushing agent changes..." }));
@@ -284,11 +353,78 @@ async function _push(pushSpan, agentCode, options = {}) {
284
353
  // If the project has version_id in .auirc or --version-id is passed,
285
354
  // we validate it's a draft. If no version context exists, we auto-detect
286
355
  // available drafts. Push is rejected if no draft is found.
356
+ //
357
+ // Wrapped in an `aui.push.preflight.resolve-version` span so a hang on
358
+ // listAgents / listVersions / getVersion shows up clearly in Logfire
359
+ // instead of being lumped under the parent push span. This is the
360
+ // step that hits agent-management with up to 3 sequential calls.
287
361
  let prePushDraft = null;
288
362
  if (projectConfig.version_id || options.versionId) {
289
- prePushDraft = await resolveVersionDraft(config, projectConfig, session, options.versionId);
363
+ const resolveTracer = getTracer();
364
+ prePushDraft = await resolveTracer.startActiveSpan("aui.push.preflight.resolve-version", async (rSpan) => {
365
+ rSpan.setAttribute("push.preflight.step", "resolve-version");
366
+ rSpan.setAttribute("push.preflight.has_explicit_version_id", !!options.versionId);
367
+ rSpan.setAttribute("push.preflight.has_auirc_version_id", !!projectConfig.version_id);
368
+ if (projectConfig.agent_id) {
369
+ rSpan.setAttribute("push.preflight.network_id", projectConfig.agent_id);
370
+ }
371
+ try {
372
+ const draft = await resolveVersionDraft(config, projectConfig, session, options.versionId);
373
+ rSpan.setAttribute("push.preflight.resolved_version_id", draft.versionId);
374
+ rSpan.setAttribute("push.preflight.resolved_version_label", draft.label);
375
+ rSpan.setAttribute("push.preflight.resolved_agent_id", draft.agentId);
376
+ rSpan.setStatus({ code: SpanStatusCode.OK });
377
+ return draft;
378
+ }
379
+ catch (err) {
380
+ const msg = err instanceof Error ? err.message : String(err);
381
+ rSpan.setStatus({ code: SpanStatusCode.ERROR, message: msg });
382
+ rSpan.recordException(err instanceof Error ? err : new Error(msg));
383
+ throw err;
384
+ }
385
+ finally {
386
+ rSpan.end();
387
+ }
388
+ });
290
389
  agentSettingsParams.version_id = prePushDraft.versionId;
291
- log(_jsx(StatusLine, { kind: "info", label: `Pushing into draft version: ${prePushDraft.label}` }));
390
+ // Per a117251 (alboim): every agent-settings write body must carry the
391
+ // agent-management UUID. Setting it on `agentSettingsParams` here means
392
+ // every subsequent `client.<entity>` call funnels through `versionBody`
393
+ // and includes `agent_id` automatically.
394
+ agentSettingsParams.agent_id = prePushDraft.agentId;
395
+ pushSpan.setAttribute("push.version_id", prePushDraft.versionId);
396
+ pushSpan.setAttribute("push.version_label", prePushDraft.label);
397
+ pushSpan.setAttribute("push.agent_management_id", prePushDraft.agentId);
398
+ // Persist agent_management_id back to .auirc on first push so subsequent
399
+ // pushes skip the listAgents lookup. Mirrors what
400
+ // `resolvePushAgentManagementId` does in the legacy branch — keeps both
401
+ // paths converging on the same .auirc state. Non-fatal if the write
402
+ // fails (we already have the id in memory for this push).
403
+ if (!projectConfig.agent_management_id) {
404
+ try {
405
+ saveProjectConfig({ ...projectConfig, agent_management_id: prePushDraft.agentId }, projectRoot);
406
+ pushSpan.addEvent("auirc.agent_management_id_persisted_from_draft", {
407
+ agent_management_id: prePushDraft.agentId,
408
+ });
409
+ }
410
+ catch (err) {
411
+ if (process.env.AUI_DEBUG) {
412
+ console.warn("[debug] failed to persist agent_management_id back to .auirc:", err instanceof Error ? err.message : err);
413
+ }
414
+ }
415
+ }
416
+ }
417
+ else {
418
+ // Legacy push (no version_id) — still need agent_id on write bodies.
419
+ // resolvePushAgentManagementId reads from .auirc first (cached by
420
+ // import-agent / pull-agent), falls back to listAgents lookup + writes
421
+ // back to .auirc so subsequent pushes skip the lookup.
422
+ pushSpan.addEvent("preflight.no_draft_version_required", {
423
+ reason: "legacy push (no version_id in .auirc or --version-id flag)",
424
+ });
425
+ const legacyAgentId = await resolvePushAgentManagementId(config, projectConfig, session, projectRoot);
426
+ agentSettingsParams.agent_id = legacyAgentId;
427
+ pushSpan.setAttribute("push.agent_management_id", legacyAgentId);
292
428
  }
293
429
  const pushTasks = buildPushTasks(diff, fileData, projectRoot, getFileDiff);
294
430
  pushSpan.setAttribute("push.task_count", pushTasks.length);
@@ -418,7 +554,12 @@ async function _push(pushSpan, agentCode, options = {}) {
418
554
  // JSON envelope, and the non-zero exit code (BFF contract: zero silent
419
555
  // errors anywhere in the push pipeline).
420
556
  const kbResult = await pushKnowledgeHubs(projectRoot, projectConfig);
557
+ pushSpan.setAttribute("push.kb.ok", kbResult.ok);
558
+ pushSpan.setAttribute("push.kb.failures", kbResult.failures.length);
421
559
  if (!kbResult.ok) {
560
+ pushSpan.addEvent("kb.failures_folded_into_pushFailures", {
561
+ count: kbResult.failures.length,
562
+ });
422
563
  for (const kbFailure of kbResult.failures) {
423
564
  failed++;
424
565
  pushFailures.push(kbFailure);
@@ -450,10 +591,16 @@ async function _push(pushSpan, agentCode, options = {}) {
450
591
  process.stdout.isTTY === true;
451
592
  if (!isInteractive) {
452
593
  failed += authFailedTasks.length;
594
+ pushSpan.addEvent("auth.fallback.non_interactive_rejected", {
595
+ failed_task_count: authFailedTasks.length,
596
+ });
453
597
  throw new AuthenticationError(`Authentication failed for ${authFailedTasks.length} push task(s); cannot prompt for an API key (non-interactive session).`, {
454
598
  suggestion: "Pass --api-key <key>, set AUI_AGENT_TOOLS_API_KEY, or run `aui login` to refresh credentials.",
455
599
  });
456
600
  }
601
+ pushSpan.addEvent("auth.fallback.api_key_prompted", {
602
+ failed_task_count: authFailedTasks.length,
603
+ });
457
604
  log(_jsxs(Box, { flexDirection: "column", paddingX: 1, children: [_jsx(StatusLine, { kind: "warning", label: "Authentication failed. Your access token may not have permission." }), _jsx(Hint, { message: "You can provide an API key as a fallback. It will be saved to ~/.aui/agent-settings-key" })] }));
458
605
  const { key } = await inquirer.prompt([
459
606
  {
@@ -466,6 +613,9 @@ async function _push(pushSpan, agentCode, options = {}) {
466
613
  if (key && key.trim()) {
467
614
  saveAgentSettingsApiKey(key.trim());
468
615
  client.setAgentSettingsApiKey(key.trim());
616
+ pushSpan.addEvent("auth.fallback.api_key_provided", {
617
+ retrying_task_count: authFailedTasks.length,
618
+ });
469
619
  log(_jsx(StatusLine, { kind: "success", label: "Key saved." }));
470
620
  log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "info", label: `Retrying ${authFailedTasks.length} change(s) with API key...` }) }));
471
621
  authFailed = false;
@@ -495,10 +645,16 @@ async function _push(pushSpan, agentCode, options = {}) {
495
645
  }
496
646
  else {
497
647
  failed += authFailedTasks.length;
648
+ pushSpan.addEvent("auth.fallback.api_key_skipped", {
649
+ uncovered_task_count: authFailedTasks.length,
650
+ });
498
651
  }
499
652
  }
500
653
  else if (authFailed && authFailedTasks.length > 0) {
501
654
  failed += authFailedTasks.length;
655
+ pushSpan.addEvent("auth.fallback.saved_key_still_failed", {
656
+ failed_task_count: authFailedTasks.length,
657
+ });
502
658
  log(_jsx(ErrorDisplay, { error: new AuthenticationError("Auth failed even with saved API key.", {
503
659
  suggestion: "Try: rm ~/.aui/agent-settings-key",
504
660
  }) }));
@@ -532,6 +688,7 @@ async function _push(pushSpan, agentCode, options = {}) {
532
688
  if (prePushDraft) {
533
689
  const SNAPSHOT_MAX_ATTEMPTS = 4;
534
690
  const SNAPSHOT_RETRY_BASE_MS = 1000;
691
+ const snapshotTracer = getTracer();
535
692
  for (let attempt = 1; attempt <= SNAPSHOT_MAX_ATTEMPTS; attempt++) {
536
693
  snapshotAttempts = attempt;
537
694
  const label = attempt === 1
@@ -540,26 +697,65 @@ async function _push(pushSpan, agentCode, options = {}) {
540
697
  if (json)
541
698
  stderrLog(label);
542
699
  const snapshotSpinner = json ? null : startSpinner(label);
700
+ // Per-attempt span — each snapshot upload is a network call that can
701
+ // hang for minutes (large multipart upload). Surfacing each attempt
702
+ // separately in Logfire lets us see retry behavior, attempt latency,
703
+ // and which attempt finally succeeded. Logfire query:
704
+ // `name:"aui.push.task.snapshot" AND attributes."snapshot.attempt":3`
705
+ // finds every push that needed a third try.
543
706
  let attemptError;
544
- try {
545
- const snapshotResult = await pushSnapshot(client, prePushDraft.agentId, prePushDraft.versionId, projectRoot, fileData);
546
- if (snapshotResult.success) {
547
- const okMsg = attempt === 1
548
- ? `Snapshot pushed (${fileData.length} file(s))`
549
- : `Snapshot pushed (${fileData.length} file(s), attempt ${attempt}/${SNAPSHOT_MAX_ATTEMPTS})`;
550
- if (snapshotSpinner)
551
- snapshotSpinner.succeed(okMsg);
552
- else
553
- stderrLog(okMsg);
554
- snapshotSucceeded = true;
555
- snapshotError = undefined;
556
- break;
707
+ const attemptResolved = await snapshotTracer.startActiveSpan("aui.push.task.snapshot", async (snapSpan) => {
708
+ snapSpan.setAttribute("push.task.type", "snapshot");
709
+ snapSpan.setAttribute("push.task.label", label);
710
+ snapSpan.setAttribute("snapshot.attempt", attempt);
711
+ snapSpan.setAttribute("snapshot.max_attempts", SNAPSHOT_MAX_ATTEMPTS);
712
+ snapSpan.setAttribute("snapshot.file_count", fileData.length);
713
+ snapSpan.setAttribute("snapshot.agent_id", prePushDraft.agentId);
714
+ snapSpan.setAttribute("snapshot.version_id", prePushDraft.versionId);
715
+ try {
716
+ const snapshotResult = await pushSnapshot(client, prePushDraft.agentId, prePushDraft.versionId, projectRoot, fileData);
717
+ if (snapshotResult.success) {
718
+ snapSpan.setStatus({ code: SpanStatusCode.OK });
719
+ snapSpan.setAttribute("snapshot.outcome", "success");
720
+ return { ok: true, error: undefined };
721
+ }
722
+ const errMsg = snapshotResult.error || "Unknown snapshot error";
723
+ snapSpan.setStatus({ code: SpanStatusCode.ERROR, message: errMsg });
724
+ snapSpan.setAttribute("snapshot.outcome", "failed");
725
+ snapSpan.setAttribute("push.task.error", errMsg);
726
+ if (attempt < SNAPSHOT_MAX_ATTEMPTS) {
727
+ snapSpan.addEvent("snapshot.retry_will_follow", {
728
+ next_attempt: attempt + 1,
729
+ backoff_ms: SNAPSHOT_RETRY_BASE_MS * Math.pow(2, attempt - 1),
730
+ });
731
+ }
732
+ return { ok: false, error: errMsg };
557
733
  }
558
- attemptError = snapshotResult.error || "Unknown snapshot error";
559
- }
560
- catch (error) {
561
- attemptError = error instanceof Error ? error.message : String(error);
734
+ catch (error) {
735
+ const errMsg = error instanceof Error ? error.message : String(error);
736
+ snapSpan.setStatus({ code: SpanStatusCode.ERROR, message: errMsg });
737
+ snapSpan.recordException(error instanceof Error ? error : new Error(errMsg));
738
+ snapSpan.setAttribute("snapshot.outcome", "exception");
739
+ snapSpan.setAttribute("push.task.error", errMsg);
740
+ return { ok: false, error: errMsg };
741
+ }
742
+ finally {
743
+ snapSpan.end();
744
+ }
745
+ });
746
+ if (attemptResolved.ok) {
747
+ const okMsg = attempt === 1
748
+ ? `Snapshot pushed (${fileData.length} file(s))`
749
+ : `Snapshot pushed (${fileData.length} file(s), attempt ${attempt}/${SNAPSHOT_MAX_ATTEMPTS})`;
750
+ if (snapshotSpinner)
751
+ snapshotSpinner.succeed(okMsg);
752
+ else
753
+ stderrLog(okMsg);
754
+ snapshotSucceeded = true;
755
+ snapshotError = undefined;
756
+ break;
562
757
  }
758
+ attemptError = attemptResolved.error;
563
759
  snapshotError = attemptError;
564
760
  const isLast = attempt === SNAPSHOT_MAX_ATTEMPTS;
565
761
  const failMsg = isLast
@@ -617,13 +813,27 @@ async function _push(pushSpan, agentCode, options = {}) {
617
813
  if (filesSafeToCommit.length > 0) {
618
814
  commitBaselineFiles(projectRoot, filesSafeToCommit, `pushed ${succeeded} change(s) (${failedFiles.size} file(s) held back due to per-task failures)`);
619
815
  baselineUpdated = true;
816
+ pushSpan.addEvent("baseline.partial_commit", {
817
+ committed_files: filesSafeToCommit.length,
818
+ held_back_files: failedFiles.size,
819
+ });
820
+ }
821
+ else {
822
+ pushSpan.addEvent("baseline.fully_held_back", {
823
+ failed_files: failedFiles.size,
824
+ });
620
825
  }
621
826
  }
622
827
  else if (failed === 0) {
623
828
  commitBaseline(projectRoot, "pushed changes");
624
829
  baselineUpdated = true;
830
+ pushSpan.addEvent("baseline.full_commit");
625
831
  }
626
832
  }
833
+ else {
834
+ pushSpan.addEvent("baseline.skipped_due_to_snapshot_failure");
835
+ }
836
+ pushSpan.setAttribute("push.baseline_updated", baselineUpdated);
627
837
  log(_jsx(PushFinalSummary, { succeeded: succeeded, failed: failed, baselineUpdated: baselineUpdated, logDir: logRelPath, memoryPath: memoryPath, snapshotStatus: snapshotStatus, snapshotError: snapshotError }));
628
838
  if (failed > 0) {
629
839
  log(_jsxs(Box, { flexDirection: "column", paddingX: 1, children: [_jsx(StatusLine, { kind: "warning", label: `${failed} entity change(s) failed to push to DB.` }), pushFailures.map((f) => (_jsxs(Box, { flexDirection: "column", marginLeft: 2, children: [_jsxs(Text, { color: "red", children: [" ", icons.error, " ", f.label] }), _jsxs(Text, { color: colors.muted, children: [" Error: ", f.error] }), f.file && _jsxs(Text, { color: colors.muted, children: [" File: ", f.file] })] }, f.label))), _jsxs(Box, { marginTop: 1, children: [_jsx(Text, { color: colors.info, bold: true, children: "What to do next: " }), _jsxs(Text, { color: colors.muted, children: ["Fix the issues above and re-run ", _jsx(Text, { bold: true, children: "aui push" }), " to retry the failed changes."] })] })] }));
@@ -717,11 +927,13 @@ async function _push(pushSpan, agentCode, options = {}) {
717
927
  throw error;
718
928
  }
719
929
  }
720
- async function resolveVersionDraft(config, projectConfig, session, explicitVersionId) {
721
- // Every error path below MUST throw a typed CLIError (not return null).
722
- // Returning null silently exits the CLI with code 0 — the BFF then thinks
723
- // the push succeeded when nothing actually happened, and the failure
724
- // never reaches Logfire because no exception bubbled to handleError.
930
+ /**
931
+ * Lookup the agent-management record for the current `.auirc` project
932
+ * (preferred) or the active session fallback — same precedence as draft
933
+ * resolution. Each attempt records its error so callers can surface the full
934
+ * picture instead of silently dropping `agent_id` from request bodies.
935
+ */
936
+ async function lookupAgentManagementInfoForPush(config, projectConfig, session) {
725
937
  const client = new AUIClient({
726
938
  baseUrl: config.apiUrl,
727
939
  authToken: config.authToken,
@@ -733,55 +945,110 @@ async function resolveVersionDraft(config, projectConfig, session, explicitVersi
733
945
  if (key)
734
946
  client.setAgentSettingsApiKey(key);
735
947
  let agentInfo;
948
+ const errors = [];
736
949
  const agentMgmtId = session.agent_management_id;
737
- // Project's network_id (from .auirc) takes priority over session — when
738
- // you're inside a project, that's the agent you mean. Session agent may
739
- // point at a different agent (e.g. last `aui agents --switch`).
740
950
  const projectNetworkId = projectConfig.agent_id;
741
951
  const fallbackNetworkId = session.network_id;
742
952
  if (projectNetworkId) {
743
953
  try {
744
954
  const resp = await client.agentManagement.listAgents(client.getOrganizationId(), 1, 50, { network_id: projectNetworkId });
745
- agentInfo = resp.items.find((a) => a.scope.network_id === projectNetworkId || a.id === projectNetworkId);
955
+ agentInfo = resp.items.find((a) => a.scope.network_id === projectNetworkId ||
956
+ a.id === projectNetworkId);
957
+ if (!agentInfo) {
958
+ errors.push(`listAgents(network_id=${projectNetworkId}) returned ${resp.items.length} item(s), none matched.`);
959
+ }
746
960
  }
747
961
  catch (err) {
748
- // Listing fall-through is fine because the next two branches try other
749
- // resolution paths AND a final ConfigError is thrown below if none
750
- // succeed. But emit a debug warning so an operator with AUI_DEBUG=1
751
- // can see WHICH branch failed and why (zero silent errors policy).
962
+ // Accumulate into `errors` so the eventual ConfigError can list every
963
+ // resolution path that failed (alboim's a117251). Also emit AUI_DEBUG
964
+ // warning for live operator observability (zero silent errors policy).
965
+ errors.push(`listAgents(network_id=${projectNetworkId}) threw: ${err instanceof Error ? err.message : String(err)}`);
752
966
  if (process.env.AUI_DEBUG) {
753
967
  console.warn(`[debug] resolveVersionDraft: listAgents(network_id=${projectNetworkId}) failed:`, err instanceof Error ? err.message : err);
754
968
  }
755
969
  }
756
970
  }
757
- // Fall back to session's agent_management_id only when not inside a project
758
- if (!agentInfo && !projectNetworkId && agentMgmtId) {
971
+ // Try the session's agent_management_id even when the project has a network
972
+ // id — it's a direct getAgent call, no list scan, and it gracefully covers
973
+ // the case where listAgents fell through above.
974
+ if (!agentInfo && agentMgmtId) {
759
975
  try {
760
976
  agentInfo = await client.agentManagement.getAgent(agentMgmtId);
761
977
  }
762
978
  catch (err) {
979
+ errors.push(`getAgent(${agentMgmtId}) threw: ${err instanceof Error ? err.message : String(err)}`);
763
980
  if (process.env.AUI_DEBUG) {
764
981
  console.warn(`[debug] resolveVersionDraft: getAgent(${agentMgmtId}) failed (stale id?):`, err instanceof Error ? err.message : err);
765
982
  }
766
983
  }
767
984
  }
768
- // Last resort: session's network_id
769
- if (!agentInfo && fallbackNetworkId) {
985
+ if (!agentInfo && fallbackNetworkId && fallbackNetworkId !== projectNetworkId) {
770
986
  try {
771
987
  const resp = await client.agentManagement.listAgents(client.getOrganizationId(), 1, 50, { network_id: fallbackNetworkId });
772
- agentInfo = resp.items.find((a) => a.scope.network_id === fallbackNetworkId || a.id === fallbackNetworkId);
988
+ agentInfo = resp.items.find((a) => a.scope.network_id === fallbackNetworkId ||
989
+ a.id === fallbackNetworkId);
990
+ if (!agentInfo) {
991
+ errors.push(`listAgents(network_id=${fallbackNetworkId}) returned ${resp.items.length} item(s), none matched.`);
992
+ }
773
993
  }
774
994
  catch (err) {
995
+ errors.push(`listAgents(network_id=${fallbackNetworkId}) threw: ${err instanceof Error ? err.message : String(err)}`);
775
996
  if (process.env.AUI_DEBUG) {
776
997
  console.warn(`[debug] resolveVersionDraft: listAgents(network_id=${fallbackNetworkId}) failed:`, err instanceof Error ? err.message : err);
777
998
  }
778
999
  }
779
1000
  }
1001
+ return { agentInfo, errors };
1002
+ }
1003
+ /**
1004
+ * Return the agent-management UUID to send as `agent_id` on agent-settings
1005
+ * write bodies. Reads `.auirc` first; falls back to `lookupAgentManagementInfoForPush`
1006
+ * and **persists** the resolved id back to `.auirc` so subsequent pushes don't
1007
+ * pay the lookup cost. Throws `ConfigError` if no id can be resolved — never
1008
+ * silently returns undefined, because that's how entities ended up in the DB
1009
+ * without `agent_id`.
1010
+ */
1011
+ async function resolvePushAgentManagementId(config, projectConfig, session, projectRoot) {
1012
+ if (projectConfig.agent_management_id)
1013
+ return projectConfig.agent_management_id;
1014
+ const { agentInfo, errors } = await lookupAgentManagementInfoForPush(config, projectConfig, session);
780
1015
  if (!agentInfo) {
781
- throw new ConfigError("Could not resolve agent for version management.", {
1016
+ const detail = errors.length > 0 ? `\n - ${errors.join("\n - ")}` : "";
1017
+ throw new ConfigError(`Could not resolve agent-management id for this project.${detail}`, {
1018
+ suggestion: "Re-run `aui import-agent` (will populate .auirc.agent_management_id) or `aui pull` to back-fill it.",
1019
+ });
1020
+ }
1021
+ // Migrate legacy projects: persist back so the next push skips the lookup.
1022
+ try {
1023
+ saveProjectConfig({ ...projectConfig, agent_management_id: agentInfo.id }, projectRoot);
1024
+ }
1025
+ catch {
1026
+ // .auirc write failure is non-fatal — we already have the id in memory.
1027
+ }
1028
+ return agentInfo.id;
1029
+ }
1030
+ async function resolveVersionDraft(config, projectConfig, session, explicitVersionId) {
1031
+ // Every error path below MUST throw a typed CLIError (not return null).
1032
+ // Returning null silently exits the CLI with code 0 — the BFF then thinks
1033
+ // the push succeeded when nothing actually happened, and the failure
1034
+ // never reaches Logfire because no exception bubbled to handleError.
1035
+ const { agentInfo, errors: lookupErrors } = await lookupAgentManagementInfoForPush(config, projectConfig, session);
1036
+ if (!agentInfo) {
1037
+ const detail = lookupErrors.length > 0 ? `\n - ${lookupErrors.join("\n - ")}` : "";
1038
+ throw new ConfigError(`Could not resolve agent for version management.${detail}`, {
782
1039
  suggestion: "Run `aui import-agent` to link an agent, or check your session with `aui status`.",
783
1040
  });
784
1041
  }
1042
+ const client = new AUIClient({
1043
+ baseUrl: config.apiUrl,
1044
+ authToken: config.authToken,
1045
+ accountId: config.accountId,
1046
+ organizationId: config.organizationId,
1047
+ environment: config.environment,
1048
+ });
1049
+ const key = loadAgentSettingsApiKey();
1050
+ if (key)
1051
+ client.setAgentSettingsApiKey(key);
785
1052
  // If user passed --version-id, validate it's a draft
786
1053
  if (explicitVersionId) {
787
1054
  let ver;
@@ -1029,26 +1296,50 @@ async function pushKnowledgeHubs(projectRoot, projectConfig) {
1029
1296
  log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "warning", label: `Cannot delete "${kbName}" — no knowledge_base_id stored. Push the KB first, then delete.` }) }));
1030
1297
  continue;
1031
1298
  }
1032
- try {
1033
- await kbViewClient.deleteKnowledgeBase(kbId, scope, kbName);
1034
- log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "success", label: `Deleted: ${kbName}` }) }));
1035
- }
1036
- catch (delErr) {
1037
- // Per-KB error: count it, keep going so partial work shows up.
1038
- if (isNotFoundError(delErr)) {
1039
- log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "success", label: `Deleted: ${kbName} (already absent)` }) }));
1299
+ // Per-KB delete in its own span so each one shows up in Logfire as
1300
+ // `aui.push.task.kb-delete` with status, kb name, kb id, and error
1301
+ // body. Same observability shape as agent-settings entity tasks.
1302
+ const kbDelTracer = getTracer();
1303
+ await kbDelTracer.startActiveSpan("aui.push.task.kb-delete", async (span) => {
1304
+ span.setAttribute("push.task.type", "kb-delete");
1305
+ span.setAttribute("push.task.label", `Delete knowledge base: ${kbName}`);
1306
+ span.setAttribute("push.task.file", `knowledge-hubs/${kbDirName}/kb.json`);
1307
+ span.setAttribute("push.task.kb_id", kbId);
1308
+ span.setAttribute("push.task.kb_name", kbName);
1309
+ try {
1310
+ await kbViewClient.deleteKnowledgeBase(kbId, scope, kbName);
1311
+ span.setStatus({ code: SpanStatusCode.OK });
1312
+ log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "success", label: `Deleted: ${kbName}` }) }));
1040
1313
  }
1041
- else {
1042
- kbDeleteSucceeded = false;
1043
- const errMsg = delErr instanceof Error ? delErr.message : String(delErr);
1044
- failures.push({
1045
- label: `Delete knowledge base: ${kbName}`,
1046
- file: `knowledge-hubs/${kbDirName}/kb.json`,
1047
- error: errMsg,
1048
- });
1049
- log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "error", label: `Failed to delete "${kbName}": ${errMsg}` }) }));
1314
+ catch (delErr) {
1315
+ // Per-KB error: count it, keep going so partial work shows up.
1316
+ if (isNotFoundError(delErr)) {
1317
+ span.setStatus({ code: SpanStatusCode.OK });
1318
+ span.addEvent("fallback.delete_404_already_absent");
1319
+ span.setAttribute("push.task.fallback", "delete_404_already_absent");
1320
+ log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "success", label: `Deleted: ${kbName} (already absent)` }) }));
1321
+ }
1322
+ else {
1323
+ kbDeleteSucceeded = false;
1324
+ const errMsg = delErr instanceof Error ? delErr.message : String(delErr);
1325
+ span.setStatus({ code: SpanStatusCode.ERROR, message: errMsg });
1326
+ span.recordException(delErr instanceof Error ? delErr : new Error(errMsg));
1327
+ span.setAttribute("push.task.error", errMsg);
1328
+ if (delErr.statusCode) {
1329
+ span.setAttribute("push.task.error_status_code", delErr.statusCode);
1330
+ }
1331
+ failures.push({
1332
+ label: `Delete knowledge base: ${kbName}`,
1333
+ file: `knowledge-hubs/${kbDirName}/kb.json`,
1334
+ error: errMsg,
1335
+ });
1336
+ log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "error", label: `Failed to delete "${kbName}": ${errMsg}` }) }));
1337
+ }
1050
1338
  }
1051
- }
1339
+ finally {
1340
+ span.end();
1341
+ }
1342
+ });
1052
1343
  }
1053
1344
  if (kbDeleteSucceeded) {
1054
1345
  deleteSpinner.succeed(`${deletedKBDirs.length} knowledge base(s) deleted`);
@@ -1085,41 +1376,64 @@ async function pushKnowledgeHubs(projectRoot, projectConfig) {
1085
1376
  log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "warning", label: `Skipped unsupported file: ${path.basename(skipped)} (only .pdf, .md, .txt, .json)` }) }));
1086
1377
  }
1087
1378
  if (supportedFiles.length > 0) {
1088
- try {
1089
- const importResult = await kbViewClient.importFiles({
1090
- files: supportedFiles,
1091
- scope,
1092
- created_by: userId,
1093
- knowledge_base_name: kbData.name,
1094
- knowledge_base_description: kbData.description,
1095
- });
1096
- if (importResult.knowledge_base_id) {
1097
- const kbJsonPath = path.join(kbDir, "kb.json");
1098
- try {
1099
- const raw = JSON.parse(fs.readFileSync(kbJsonPath, "utf-8"));
1100
- raw.knowledge_base_id = importResult.knowledge_base_id;
1101
- fs.writeFileSync(kbJsonPath, JSON.stringify(raw, null, 2) + "\n");
1102
- }
1103
- catch (writeErr) {
1104
- // kb.json id write fail is non-fatal but tell the user so the
1105
- // next push doesn't surprise them with "no knowledge_base_id stored".
1106
- if (process.env.AUI_DEBUG) {
1107
- console.warn(`[debug] failed to write knowledge_base_id back to ${kbJsonPath}:`, writeErr);
1379
+ // Per-KB upload in its own span — Logfire query
1380
+ // `name:"aui.push.task.kb-upload" AND status_code:ERROR` finds
1381
+ // every KB push failure across all agents.
1382
+ const kbUpTracer = getTracer();
1383
+ await kbUpTracer.startActiveSpan("aui.push.task.kb-upload", async (span) => {
1384
+ span.setAttribute("push.task.type", "kb-upload");
1385
+ span.setAttribute("push.task.label", `Push knowledge base: ${kbData.name || kbDirName}`);
1386
+ span.setAttribute("push.task.file", `knowledge-hubs/${kbDirName}/kb.json`);
1387
+ span.setAttribute("push.task.kb_name", kbData.name || kbDirName);
1388
+ span.setAttribute("push.task.file_count", supportedFiles.length);
1389
+ try {
1390
+ const importResult = await kbViewClient.importFiles({
1391
+ files: supportedFiles,
1392
+ scope,
1393
+ created_by: userId,
1394
+ knowledge_base_name: kbData.name,
1395
+ knowledge_base_description: kbData.description,
1396
+ });
1397
+ span.setStatus({ code: SpanStatusCode.OK });
1398
+ if (importResult.knowledge_base_id) {
1399
+ span.setAttribute("push.task.kb_id", importResult.knowledge_base_id);
1400
+ const kbJsonPath = path.join(kbDir, "kb.json");
1401
+ try {
1402
+ const raw = JSON.parse(fs.readFileSync(kbJsonPath, "utf-8"));
1403
+ raw.knowledge_base_id = importResult.knowledge_base_id;
1404
+ fs.writeFileSync(kbJsonPath, JSON.stringify(raw, null, 2) + "\n");
1405
+ }
1406
+ catch (writeErr) {
1407
+ // kb.json id write fail is non-fatal but tell the user so the
1408
+ // next push doesn't surprise them with "no knowledge_base_id stored".
1409
+ span.addEvent("kb_id_writeback_failed");
1410
+ if (process.env.AUI_DEBUG) {
1411
+ console.warn(`[debug] failed to write knowledge_base_id back to ${kbJsonPath}:`, writeErr);
1412
+ }
1413
+ log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "warning", label: `Could not persist knowledge_base_id back to ${path.basename(kbJsonPath)} — re-import or run \`aui pull\` to recover.` }) }));
1108
1414
  }
1109
- log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "warning", label: `Could not persist knowledge_base_id back to ${path.basename(kbJsonPath)} — re-import or run \`aui pull\` to recover.` }) }));
1110
1415
  }
1111
1416
  }
1112
- }
1113
- catch (uploadErr) {
1114
- hadUploadFailure = true;
1115
- const errMsg = uploadErr instanceof Error ? uploadErr.message : String(uploadErr);
1116
- failures.push({
1117
- label: `Push knowledge base: ${kbData.name || kbDirName}`,
1118
- file: `knowledge-hubs/${kbDirName}/kb.json`,
1119
- error: errMsg,
1120
- });
1121
- log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "error", label: `Failed to push "${kbData.name || kbDirName}": ${errMsg}` }) }));
1122
- }
1417
+ catch (uploadErr) {
1418
+ hadUploadFailure = true;
1419
+ const errMsg = uploadErr instanceof Error ? uploadErr.message : String(uploadErr);
1420
+ span.setStatus({ code: SpanStatusCode.ERROR, message: errMsg });
1421
+ span.recordException(uploadErr instanceof Error ? uploadErr : new Error(errMsg));
1422
+ span.setAttribute("push.task.error", errMsg);
1423
+ if (uploadErr.statusCode) {
1424
+ span.setAttribute("push.task.error_status_code", uploadErr.statusCode);
1425
+ }
1426
+ failures.push({
1427
+ label: `Push knowledge base: ${kbData.name || kbDirName}`,
1428
+ file: `knowledge-hubs/${kbDirName}/kb.json`,
1429
+ error: errMsg,
1430
+ });
1431
+ log(_jsx(Box, { paddingX: 1, children: _jsx(StatusLine, { kind: "error", label: `Failed to push "${kbData.name || kbDirName}": ${errMsg}` }) }));
1432
+ }
1433
+ finally {
1434
+ span.end();
1435
+ }
1436
+ });
1123
1437
  }
1124
1438
  }
1125
1439
  if (hadUploadFailure) {
@@ -1622,6 +1936,32 @@ function isTransient5xx(err) {
1622
1936
  ?? err.status;
1623
1937
  return code === 500 || code === 502 || code === 503 || code === 504;
1624
1938
  }
1939
/**
 * Annotate the span that is active at call time with a record of which
 * adaptive fallback just fired during a push.
 *
 * Three things are written to the span:
 *   1. an event named `fallback.<kind>` carrying `detail` as its attributes,
 *   2. the attribute `push.task.fallback` set to `kind`,
 *   3. one attribute `push.task.fallback.<key>` per own enumerable entry
 *      of `detail`.
 *
 * Fallback kinds passed by the callers in this file, usable as Logfire
 * filters on `attributes."push.task.fallback"`:
 *   - "patch_404_to_post": a PATCH got a 404 and was re-issued as a POST.
 *   - "post_409_to_patch": a POST got a 409 and was re-issued as a PATCH.
 *   - "delete_404_already_absent": a DELETE got a 404 and was treated as done.
 *   - "transient_retry": a 5xx response was retried once after a back-off.
 *
 * When no span is active (for example, code running outside the push flow)
 * the function does nothing.
 *
 * @param {string} kind - Identifier of the fallback that fired.
 * @param {object} [detail] - Optional key/value pairs describing the fallback;
 *   each value is forwarded to the span as-is.
 * @returns {void}
 */
function recordFallbackEvent(kind, detail) {
    const activeSpan = trace.getActiveSpan();
    if (activeSpan) {
        // The event preserves the full detail payload at the moment it fired.
        activeSpan.addEvent(`fallback.${kind}`, detail);
        // The flat attributes make the span itself filterable by fallback kind.
        activeSpan.setAttribute("push.task.fallback", kind);
        Object.entries(detail ?? {}).forEach(([field, value]) => {
            activeSpan.setAttribute(`push.task.fallback.${field}`, value);
        });
    }
}
1625
1965
  /**
1626
1966
  * Run one entity-settings write call once, and retry exactly once on a
1627
1967
  * transient 5xx after a 1s back-off. The snapshot upload has its own
@@ -1640,6 +1980,11 @@ async function withTransientRetry(label, fn) {
1640
1980
  if (process.env.AUI_DEBUG) {
1641
1981
  console.log(`[debug] ${label} got ${code}, retrying once after 1000ms`);
1642
1982
  }
1983
+ recordFallbackEvent("transient_retry", {
1984
+ label,
1985
+ status_code: code ?? 0,
1986
+ backoff_ms: 1000,
1987
+ });
1643
1988
  await new Promise((r) => setTimeout(r, 1000));
1644
1989
  return await fn();
1645
1990
  }
@@ -1671,6 +2016,7 @@ async function _executePushTask(client, params, task) {
1671
2016
  if (process.env.AUI_DEBUG) {
1672
2017
  console.log(`[debug] patch-tool ${task.toolName}: 404 not found, falling back to POST`);
1673
2018
  }
2019
+ recordFallbackEvent("patch_404_to_post", { task_type: "patch-tool", tool: String(task.toolName ?? "") });
1674
2020
  return client.createTool(params, task.body);
1675
2021
  }
1676
2022
  throw err;
@@ -1689,6 +2035,7 @@ async function _executePushTask(client, params, task) {
1689
2035
  const body = task.body;
1690
2036
  const toolCode = body.code || "";
1691
2037
  const toolName = toolCode.toUpperCase().replace(/-/g, "_");
2038
+ recordFallbackEvent("post_409_to_patch", { task_type: "create-tool", tool: toolName });
1692
2039
  return client.patchTool(params, toolName, body);
1693
2040
  }
1694
2041
  throw err;
@@ -1704,6 +2051,7 @@ async function _executePushTask(client, params, task) {
1704
2051
  if (process.env.AUI_DEBUG) {
1705
2052
  console.log(`[debug] delete-tool ${task.toolName}: 404 already absent`);
1706
2053
  }
2054
+ recordFallbackEvent("delete_404_already_absent", { task_type: "delete-tool", tool: String(task.toolName ?? "") });
1707
2055
  return DELETE_ALREADY_ABSENT;
1708
2056
  }
1709
2057
  throw err;
@@ -1727,6 +2075,7 @@ async function _executePushTask(client, params, task) {
1727
2075
  if (process.env.AUI_DEBUG) {
1728
2076
  console.log(`[debug] create-parameter ${task.itemCode}: 409, falling back to PATCH`);
1729
2077
  }
2078
+ recordFallbackEvent("post_409_to_patch", { task_type: "create-parameter", code: String(task.itemCode ?? "") });
1730
2079
  return client.patchParameter(params, task.itemCode, task.body);
1731
2080
  }
1732
2081
  throw err;
@@ -1742,6 +2091,7 @@ async function _executePushTask(client, params, task) {
1742
2091
  if (process.env.AUI_DEBUG) {
1743
2092
  console.log(`[debug] patch-parameter ${task.itemCode}: 404 not found, falling back to POST`);
1744
2093
  }
2094
+ recordFallbackEvent("patch_404_to_post", { task_type: "patch-parameter", code: String(task.itemCode ?? "") });
1745
2095
  return client.createParameter(params, task.body);
1746
2096
  }
1747
2097
  throw err;
@@ -1757,6 +2107,7 @@ async function _executePushTask(client, params, task) {
1757
2107
  if (process.env.AUI_DEBUG) {
1758
2108
  console.log(`[debug] delete-parameter ${task.itemCode}: 404 already absent`);
1759
2109
  }
2110
+ recordFallbackEvent("delete_404_already_absent", { task_type: "delete-parameter", code: String(task.itemCode ?? "") });
1760
2111
  return DELETE_ALREADY_ABSENT;
1761
2112
  }
1762
2113
  throw err;
@@ -1772,6 +2123,7 @@ async function _executePushTask(client, params, task) {
1772
2123
  if (process.env.AUI_DEBUG) {
1773
2124
  console.log(`[debug] create-entity ${task.itemCode}: 409, falling back to PATCH`);
1774
2125
  }
2126
+ recordFallbackEvent("post_409_to_patch", { task_type: "create-entity", code: String(task.itemCode ?? "") });
1775
2127
  return client.patchEntity(params, task.itemCode, task.body);
1776
2128
  }
1777
2129
  throw err;
@@ -1787,6 +2139,7 @@ async function _executePushTask(client, params, task) {
1787
2139
  if (process.env.AUI_DEBUG) {
1788
2140
  console.log(`[debug] patch-entity ${task.itemCode}: 404, falling back to POST`);
1789
2141
  }
2142
+ recordFallbackEvent("patch_404_to_post", { task_type: "patch-entity", code: String(task.itemCode ?? "") });
1790
2143
  return client.createEntity(params, task.body);
1791
2144
  }
1792
2145
  throw err;
@@ -1802,6 +2155,7 @@ async function _executePushTask(client, params, task) {
1802
2155
  if (process.env.AUI_DEBUG) {
1803
2156
  console.log(`[debug] delete-entity ${task.itemCode}: 404 already absent`);
1804
2157
  }
2158
+ recordFallbackEvent("delete_404_already_absent", { task_type: "delete-entity", code: String(task.itemCode ?? "") });
1805
2159
  return DELETE_ALREADY_ABSENT;
1806
2160
  }
1807
2161
  throw err;
@@ -1817,6 +2171,7 @@ async function _executePushTask(client, params, task) {
1817
2171
  if (process.env.AUI_DEBUG) {
1818
2172
  console.log(`[debug] create-integration ${task.itemCode}: 409, falling back to PATCH`);
1819
2173
  }
2174
+ recordFallbackEvent("post_409_to_patch", { task_type: "create-integration", code: String(task.itemCode ?? "") });
1820
2175
  return client.patchIntegration(params, task.itemCode, task.body);
1821
2176
  }
1822
2177
  throw err;
@@ -1832,6 +2187,7 @@ async function _executePushTask(client, params, task) {
1832
2187
  if (process.env.AUI_DEBUG) {
1833
2188
  console.log(`[debug] patch-integration ${task.itemCode}: 404 not found, falling back to POST`);
1834
2189
  }
2190
+ recordFallbackEvent("patch_404_to_post", { task_type: "patch-integration", code: String(task.itemCode ?? "") });
1835
2191
  return client.createIntegration(params, task.body);
1836
2192
  }
1837
2193
  throw err;
@@ -1847,6 +2203,7 @@ async function _executePushTask(client, params, task) {
1847
2203
  if (process.env.AUI_DEBUG) {
1848
2204
  console.log(`[debug] delete-integration ${task.itemCode}: 404 already absent`);
1849
2205
  }
2206
+ recordFallbackEvent("delete_404_already_absent", { task_type: "delete-integration", code: String(task.itemCode ?? "") });
1850
2207
  return DELETE_ALREADY_ABSENT;
1851
2208
  }
1852
2209
  throw err;