@clawling/clawchat-plugin-openclaw 2026.5.12-39 → 2026.5.13-dev.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/runtime.ts CHANGED
@@ -16,6 +16,8 @@ import { createPluginRuntimeStore } from "openclaw/plugin-sdk/runtime-store";
16
16
  import { createOpenclawClawlingClient } from "./client.ts";
17
17
  import { createOpenclawClawlingApiClient } from "./api-client.ts";
18
18
  import { ClawlingApiError } from "./api-types.ts";
19
+ import { RefreshManager } from "./refresh-manager.ts";
20
+ import type { OpenclawClawchatMutateConfigFile } from "./login.runtime.ts";
19
21
  import {
20
22
  CHANNEL_ID,
21
23
  effectiveOutputVisibility,
@@ -35,7 +37,7 @@ import {
35
37
  setAlignedOutboundLogContext,
36
38
  } from "./outbound.ts";
37
39
  import { formatWsLog } from "./ws-log.ts";
38
- import { createProtocolControlHandler, createReconnectTracker } from "./ws-alignment.ts";
40
+ import { createNotifySignalObserver, createProtocolControlHandler, createReconnectTracker } from "./ws-alignment.ts";
39
41
  import {
40
42
  clawChatDbPathForStateDir,
41
43
  getClawChatStore,
@@ -77,6 +79,8 @@ type RuntimeConnectionStore = Pick<
77
79
  Pick<
78
80
  ClawChatStore,
79
81
  | "getActivationCredentials"
82
+ | "rotateActivationTokens"
83
+ | "clearActivationCredentials"
80
84
  | "insertMessage"
81
85
  | "claimMessageOnce"
82
86
  | "markMessageAcknowledged"
@@ -85,6 +89,7 @@ type RuntimeConnectionStore = Pick<
85
89
  | "releaseActivationBootstrapClaim"
86
90
  | "markActivationBootstrapSent"
87
91
  | "getActivationConversation"
92
+ | "getLastResolvedDeviceId"
88
93
  >
89
94
  >;
90
95
 
@@ -111,11 +116,44 @@ const OPENCLAW_CONFIRM_SLASH_COMMANDS = new Set([
111
116
  "nevermind",
112
117
  ]);
113
118
  const GROUP_OWNER_ATTENTION_TITLE = "requires owner attention";
/**
 * §C.1 — text surfaced to the user when the token has permanently expired.
 * Must remain byte-identical to the Hermes plugin (parity spec §C.1.4).
 */
const CLAWCHAT_TOKEN_EXPIRED_MESSAGE =
  "ClawChat token expired and could not be refreshed. Re-pair with `/clawchat-activate <code>`.";

/** Short re-pair hint paired with the permanent-expiry message above. */
const CLAWCHAT_TOKEN_EXPIRED_LAST_ERROR = "token expired — re-pair required";
114
124
 
/**
 * Type guard: true only for non-null, non-array objects, so callers can index
 * the value by string key.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") return false;
  return !Array.isArray(value);
}
118
128
 
/**
 * §A.2 — classify a WS `hello-fail` reason for refresh gating.
 * - "token-rejected": reason names an authentication failure → refresh.
 * - "auth-unavailable": 5xx auth-backend outage → backoff, DO NOT refresh.
 * - "generic": unattributed → refresh only if the token is at/near expiry.
 *
 * `auth service unavailable` is already split off by the ws-client into a
 * TransportError (backoff), but we classify defensively here too. The
 * unavailable check runs first, so a reason that mentions both an outage and a
 * token problem is treated as an outage.
 *
 * @param reason Free-form reason text; matching is case-insensitive. A
 *   missing or non-string value is treated as an empty reason.
 * @returns The refresh-gating class for the reason.
 */
export function classifyHelloFailReason(
  reason: string,
): "token-rejected" | "auth-unavailable" | "generic" {
  // Guard against a non-string slipping past the type system (for example an
  // untyped payload field): calling `.toLowerCase()` on it would throw.
  const normalized = typeof reason === "string" ? reason.toLowerCase() : "";
  if (/auth service unavailable|temporarily unavailable/.test(normalized)) {
    return "auth-unavailable";
  }
  if (
    /authentication failed|invalid token|token expired|unauthorized|auth failed|invalid credentials/.test(
      normalized,
    )
  ) {
    return "token-rejected";
  }
  return "generic";
}
148
+
/**
 * Look up `channels.<CHANNEL_ID>.refreshToken` in a live config.
 * Returns the trimmed token, or null when any level of the path is missing,
 * is not an object, or the token is not a non-blank string.
 */
function readConfigRefreshToken(cfg: OpenClawConfig): string | null {
  const { channels } = cfg as { channels?: Record<string, unknown> };
  if (!isRecord(channels)) return null;
  const section = channels[CHANNEL_ID];
  if (!isRecord(section)) return null;
  const raw = section.refreshToken;
  if (typeof raw !== "string") return null;
  const trimmed = raw.trim();
  return trimmed ? trimmed : null;
}
156
+
119
157
  function withFullVerboseDispatchConfig(cfg: OpenClawConfig, agentId: string): OpenClawConfig {
120
158
  const cfgRecord = cfg as Record<string, unknown>;
121
159
  const agents = isRecord(cfgRecord.agents) ? cfgRecord.agents : {};
@@ -339,7 +377,11 @@ function metadataScopesFromEnvelope(env: Envelope): string[] {
339
377
  }
340
378
 
/**
 * §9.3 — decide whether a metadata invalidation must refetch agent behavior.
 * An empty scope list means "refetch everything". Otherwise any scope other
 * than the known non-behavior ones (`title`, `description`) triggers a
 * refresh, which includes unknown scope strings.
 */
function shouldRefreshBehaviorForScopes(scopes: string[]): boolean {
  if (scopes.length === 0) return true;
  for (const scope of scopes) {
    if (scope === "title" || scope === "description") continue;
    return true;
  }
  return false;
}
344
386
 
345
387
  function shouldRefreshConversationForScopes(scopes: string[]): boolean {
@@ -419,8 +461,56 @@ export interface StartGatewayParams {
419
461
  activationPollIntervalMs?: number;
420
462
  /** Test hook only. */
421
463
  rejectedActivationToken?: string;
464
+ /** Test hook only — fetch impl used by the refresh manager (`/v1/auth/refresh`). */
465
+ refreshFetchImpl?: typeof fetch;
466
+ /** Test hook only — setTimeout override for the refresh manager's proactive timer. */
467
+ refreshSetTimer?: (cb: () => void, ms: number) => ReturnType<typeof setTimeout> | number;
468
+ /** Test hook only — clearTimeout override for the refresh manager's proactive timer. */
469
+ refreshClearTimer?: (handle: ReturnType<typeof setTimeout> | number) => void;
470
+ /** Test hook only — jitter override (ms) for the proactive timer. */
471
+ refreshJitter?: () => number;
472
+ /** Test hook only — config-file mutator used to persist rotated/blanked creds. */
473
+ mutateConfigFile?: OpenclawClawchatMutateConfigFile;
474
+ /** Internal — set when the current attempt is a refresh-driven reconnect. */
475
+ refreshReconnectDepth?: number;
476
+ /**
477
+ * Internal — epoch-ms at which the current refresh-driven reconnect streak
478
+ * began. Used with `refreshReconnectDepth` to bound a rotate-then-reject loop
479
+ * (§A.3/§A.4): once depth exceeds the cap inside the window we stop re-entering
480
+ * via refresh and fall back to plain transport backoff with the current token.
481
+ */
482
+ refreshReconnectWindowStartedAt?: number;
483
+ /**
484
+ * Internal — the refresh manager's single-flight latch + min-interval state,
485
+ * carried across gateway re-enters so the §A.3 guards (rejected-token latch,
486
+ * min-interval floor) actually constrain a cross-reconnect refresh loop instead
487
+ * of resetting on every fresh `RefreshManager`.
488
+ */
489
+ refreshManagerState?: { rejectedToken: string | null; lastAttemptAt: number };
490
+ /**
491
+ * Internal — set when the current attempt is a plain transport-backoff
492
+ * re-enter (transient/skipped reactive refresh). Carries the current (unchanged)
493
+ * token so creds stay untouched; the auth-failure teardown is suppressed.
494
+ */
495
+ transportBackoffReconnect?: boolean;
496
+ /** Test hook only — setTimeout override for the transport-backoff delay. */
497
+ backoffTimer?: (cb: () => void, ms: number) => void;
498
+ /** Test hook only — fixed transport-backoff delay (ms) for re-enter. */
499
+ transportBackoffDelayMs?: number;
422
500
  }
423
501
 
/**
 * §A.3/§A.4 — cap on consecutive refresh-driven reconnects inside
 * `REFRESH_RECONNECT_WINDOW_MS`. Past the cap the refresh loop is abandoned in
 * favour of plain transport backoff (or auto-logout on a permanent reject), so
 * a server that keeps rotating-then-rejecting fresh tokens cannot loop forever.
 */
export const MAX_REFRESH_RECONNECTS = 3;
/** Length of the refresh-reconnect streak window: five minutes, in ms. */
export const REFRESH_RECONNECT_WINDOW_MS = 300_000;
/** §B — base transport-backoff delay (ms) before a reactive re-enter. */
export const TRANSPORT_BACKOFF_BASE_MS = 1_000;
/** §B — upper bound (30s, in ms) for the transport-backoff delay. */
export const TRANSPORT_BACKOFF_MAX_MS = 30_000;
513
+
424
514
  function resolveConnectionStore(
425
515
  params: StartGatewayParams,
426
516
  runtime: PluginRuntime,
@@ -563,15 +653,95 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
563
653
  });
564
654
  if (!activationAccount) return;
565
655
  account = activationAccount.account;
656
+ // §A.0 — fallback expiry source. Prefer the SQLite `activated_at`; null when
657
+ // the credentials came from config (no activation row yet) — in that case the
658
+ // refresh manager relies on the JWT `exp` alone.
659
+ let activatedAtMs: number | null =
660
+ activationAccount.source === "sqlite" && store?.getActivationCredentials
661
+ ? store.getActivationCredentials({ platform: "openclaw", accountId })?.activatedAt ?? null
662
+ : null;
566
663
  let conversationApiClient: ReturnType<typeof createOpenclawClawlingApiClient> | undefined;
567
- const getConversationApiClient = () => {
568
- conversationApiClient ??= createOpenclawClawlingApiClient({
664
+ const buildConversationApiClient = (): ReturnType<typeof createOpenclawClawlingApiClient> =>
665
+ createOpenclawClawlingApiClient({
569
666
  baseUrl: account.baseUrl,
570
667
  mediaBaseUrl: account.mediaBaseUrl,
571
668
  token: account.token,
572
669
  userId: account.userId,
573
670
  });
574
- return conversationApiClient;
671
+ // §A.2.1 — forward reference to the single-flight REST refresh wrapper; set
672
+ // once the refresh manager exists. Until then, calls run un-wrapped.
673
+ let restWithRefresh: (<T>(call: () => Promise<T>) => Promise<T>) | null = null;
// Returns a proxy whose every method call runs through `restWithRefresh` (when
// it has been set), so a 401/403 transparently triggers one single-flight
// refresh + retry. The proxy reads the cached client lazily on each call so a
// post-refresh rebuild is picked up automatically.
//
// Symbol keys and `then` resolve to `undefined` instead of a callable:
// otherwise awaiting the proxy, or returning it from an async function, would
// treat it as a thenable and invoke a non-existent `then` method on the
// api-client.
const getConversationApiClient = (): ReturnType<typeof createOpenclawClawlingApiClient> => {
  return new Proxy({} as ReturnType<typeof createOpenclawClawlingApiClient>, {
    get: (_target, prop) => {
      if (typeof prop !== "string" || prop === "then") return undefined;
      return (...args: unknown[]) => {
        const invoke = () => {
          // Build on first use, and again after an invalidation cleared the cache.
          conversationApiClient ??= buildConversationApiClient();
          const fn = (conversationApiClient as unknown as Record<string, unknown>)[prop];
          if (typeof fn !== "function") {
            throw new TypeError(`clawchat api-client has no method ${String(prop)}`);
          }
          return (fn as (...a: unknown[]) => unknown).apply(conversationApiClient, args);
        };
        // Until the refresh wrapper is installed, calls run un-wrapped.
        return restWithRefresh
          ? restWithRefresh(() => Promise.resolve(invoke()) as Promise<unknown>)
          : invoke();
      };
    },
  });
};
699
+ // Rebuilt after every in-memory token swap so REST calls use the fresh token.
700
+ const invalidateConversationApiClient = () => {
701
+ conversationApiClient = undefined;
702
+ };
703
+
// Pick the config-file mutator to use: an injected `params.mutateConfigFile`
// wins; otherwise fall back to `runtime.config.mutateConfigFile` when that is a
// function. Returns undefined when neither is available.
const resolveMutateConfigFile = (): OpenclawClawchatMutateConfigFile | undefined => {
  const injected = params.mutateConfigFile;
  if (injected) return injected;
  const fromRuntime = (
    runtime.config as unknown as { mutateConfigFile?: OpenclawClawchatMutateConfigFile }
  )?.mutateConfigFile;
  return typeof fromRuntime === "function" ? fromRuntime : undefined;
};
713
+
// §0/§C.1 — update the `token` / `refreshToken` keys of this channel's config
// section. `tokens = null` blanks both keys (auto-logout); a token pair stores
// the rotated values. When no config mutator can be resolved this logs an error
// and returns without writing.
const persistConfigTokens = async (
  tokens: { accessToken: string; refreshToken: string } | null,
): Promise<void> => {
  const mutateConfigFile = resolveMutateConfigFile();
  if (!mutateConfigFile) {
    log?.error?.(
      `[${accountId}] clawchat-plugin-openclaw config persistence unavailable; cannot ${tokens ? "rotate" : "clear"} tokens in config`,
    );
    return;
  }
  // Blank strings stand in for "no credentials".
  const token = tokens ? tokens.accessToken : "";
  const refreshToken = tokens ? tokens.refreshToken : "";
  await mutateConfigFile({
    afterWrite: { mode: "none", reason: "clawchat-plugin-openclaw token refresh" },
    mutate(draft) {
      const priorChannels = ((draft as { channels?: Record<string, unknown> }).channels ??
        {}) as Record<string, unknown>;
      const priorSection = (priorChannels[CHANNEL_ID] ?? {}) as Record<string, unknown>;
      // Keep every other channel and every other key of this channel's section.
      const nextChannels = {
        ...priorChannels,
        [CHANNEL_ID]: { ...priorSection, token, refreshToken },
      };
      Object.assign(draft, { ...draft, channels: nextChannels });
    },
  });
};
576
746
 
577
747
  let lastHelloFailTraceId = "-";
@@ -583,6 +753,141 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
583
753
  let authFailureLogged = false;
584
754
  let closingForAbort = false;
585
755
  let wsReady = false;
756
+ // True once this gateway attempt reached "connected" at least once — used to
757
+ // route a later auth-fail through the live-session reactive refresh path
758
+ // rather than the initial-connect catch.
759
+ let wsReadyEverThisAttempt = false;
760
+ // §D — set when a refresh succeeded and we are closing the live WS to
761
+ // reconnect with the new token; suppresses the auth-failed teardown path and
762
+ // drives a clean re-enter into the gateway with the rotated account.
763
+ let reconnectWithRefreshedToken = false;
764
+ // §C — set once auto-logout has fired so we don't double-emit or reconnect.
765
+ let autoLoggedOut = false;
766
+
767
+ // §E — connect-time device id for `X-Device-Id` on refresh. Prefer the value
768
+ // recorded in SQLite at connect; backfill legacy rows (no column) to the
769
+ // deterministic constant `CHANNEL_ID` actually sent by `authHeaders`.
770
+ const refreshDeviceId =
771
+ (activationAccount.source === "sqlite" && store?.getActivationCredentials
772
+ ? store.getActivationCredentials({ platform: "openclaw", accountId })?.deviceId
773
+ : null) || CHANNEL_ID;
774
+
// §C — auto-logout after a permanent refresh failure. Runs at most once per
// gateway attempt (guarded by `autoLoggedOut`). Steps, in order:
//   1. blank the SQLite activation credentials (identity columns are kept),
//   2. blank the config `token` / `refreshToken` keys,
//   3. flip the status to not-configured with the re-pair hint as `lastError`,
//   4. emit the user-visible expiry message.
const performAutoLogout = async (info: { code: number; message: string }): Promise<void> => {
  if (autoLoggedOut) return;
  autoLoggedOut = true;
  log?.error?.(
    `[${accountId}] clawchat-plugin-openclaw auto-logout (token permanently expired) code=${info.code}: ${info.message}`,
  );

  // SQLite side: only attempted when the store exposes the clearing operation.
  if (store?.clearActivationCredentials) {
    recordConnection("clear activation credentials", () => {
      return store.clearActivationCredentials?.({ platform: "openclaw", accountId });
    });
  }

  // Config side: a failed write is logged but does not abort the logout.
  try {
    await persistConfigTokens(null);
  } catch (err) {
    log?.error?.(
      `[${accountId}] clawchat-plugin-openclaw failed to clear config credentials on auto-logout: ${err instanceof Error ? err.message : String(err)}`,
    );
  }

  // Reuse the existing auth-failure status shape, overriding the live flags.
  const current = getStatus();
  setStatus({
    ...current,
    connected: false,
    configured: false,
    running: false,
    lastError: CLAWCHAT_TOKEN_EXPIRED_LAST_ERROR,
  });

  // User-visible notification in addition to the logs; best-effort.
  emitUserVisibleAuthLogout();
};
810
+
// §C.1.4 — surface the permanent-expiry message to the user/operator. The
// plugin has no guaranteed live chat target after creds are cleared, so the
// message is always logged and additionally routed through
// `runtime.notifications.notify` when that function exists. Never throws.
const emitUserVisibleAuthLogout = (): void => {
  log?.error?.(`[${accountId}] clawchat-plugin-openclaw ${CLAWCHAT_TOKEN_EXPIRED_MESSAGE}`);
  try {
    const notifications = (runtime as unknown as {
      notifications?: { notify?: (input: { level?: string; message: string }) => void };
    }).notifications;
    if (typeof notifications?.notify === "function") {
      // Call `notify` as a method of `notifications`. Pulling it into a local
      // first would drop its `this`, and an implementation that relies on
      // `this` would then throw into the catch below and the notification
      // would be lost silently.
      notifications.notify({ level: "error", message: CLAWCHAT_TOKEN_EXPIRED_MESSAGE });
    }
  } catch {
    // Best effort only.
  }
};
827
+
828
+ // The refresh token is not part of the resolved account; source it from
829
+ // SQLite first (authoritative after a rotation) then the config channel
830
+ // section. Kept in a mutable cell so a swap updates it in place.
831
+ let latestRefreshToken: string | null =
832
+ (activationAccount.source === "sqlite" && store?.getActivationCredentials
833
+ ? store.getActivationCredentials({ platform: "openclaw", accountId })?.refreshToken
834
+ : null) ?? readConfigRefreshToken(cfg);
835
+
836
+ const refreshManager = new RefreshManager({
837
+ baseUrl: account.baseUrl,
838
+ deviceId: refreshDeviceId,
839
+ getAccessToken: () => account.token,
840
+ getRefreshToken: () => latestRefreshToken,
841
+ persistRotatedTokens: async (tokens) => {
842
+ // §0 — persist to BOTH stores BEFORE the in-memory swap. A failure in
843
+ // EITHER store must REJECT so the manager skips the in-memory swap and
844
+ // treats the refresh as transient (keep the current tokens, back off). Do
845
+ // NOT swallow the SQLite write error: `rotateActivationTokens` returns
846
+ // `null` when its internal `write()` caught an exception (a real write
847
+ // failure), `false` only when no activation row exists yet (config-sourced
848
+ // agent — legitimately nothing to update). A swallowed write failure must
849
+ // not leave the SQLite row holding the now-dead refresh token while the
850
+ // in-memory token is rotated, which would brick a sqlite-sourced agent on
851
+ // restart.
852
+ if (store?.rotateActivationTokens) {
853
+ const rotateResult = store.rotateActivationTokens({
854
+ platform: "openclaw",
855
+ accountId,
856
+ accessToken: tokens.accessToken,
857
+ refreshToken: tokens.refreshToken,
858
+ });
859
+ if (rotateResult === null) {
860
+ throw new Error("clawchat-plugin-openclaw sqlite rotate activation tokens failed");
861
+ }
862
+ }
863
+ // A config write failure rejects out of `mutateConfigFile` and propagates
864
+ // here, which is what we want — persistence incomplete ⇒ no swap.
865
+ await persistConfigTokens(tokens);
866
+ },
867
+ swapInMemoryTokens: (tokens) => {
868
+ account = { ...account, token: tokens.accessToken, configured: true };
869
+ latestRefreshToken = tokens.refreshToken;
870
+ activatedAtMs = Date.now();
871
+ invalidateConversationApiClient();
872
+ },
873
+ onPermanentFailure: performAutoLogout,
874
+ // §A.1/§D — proactive-timer success closes the live WS and re-enters with the
875
+ // rotated token. The running ws-client captured the OLD token at `connect`
876
+ // time, so the in-memory swap alone never reaches a `connect` envelope.
877
+ onProactiveRefreshed: async () => {
878
+ await runRefreshReconnect("proactive-timer");
879
+ },
880
+ ...(params.refreshFetchImpl ? { fetchImpl: params.refreshFetchImpl } : {}),
881
+ ...(params.refreshSetTimer ? { setTimer: params.refreshSetTimer } : {}),
882
+ ...(params.refreshClearTimer ? { clearTimer: params.refreshClearTimer } : {}),
883
+ ...(params.refreshJitter ? { jitter: params.refreshJitter } : {}),
884
+ log,
885
+ });
886
+ // §A.3/§A.4 — carry the single-flight latch + min-interval across re-enters so
887
+ // the guards bound a rotate-then-reject loop instead of resetting each time.
888
+ if (params.refreshManagerState) {
889
+ refreshManager.restoreState(params.refreshManagerState);
890
+ }
586
891
  let currentConnectionId: number | null = null;
587
892
  let currentConnectionFinished = false;
588
893
  const reconnectTracker = createReconnectTracker({
@@ -721,7 +1026,10 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
721
1026
  const memoryRoot = resolveMemoryRootForPeer(peer);
722
1027
  if (!memoryRoot) return;
723
1028
 
724
- if (refreshBehavior) {
1029
+ // §9.3: agent behavior is per-agent metadata that lives only on the agent's
1030
+ // DIRECT conversation — never refetch it for a group invalidation, even on a
1031
+ // "refetch everything" (empty/unknown) scope.
1032
+ if (refreshBehavior && peer.kind === "direct") {
725
1033
  await refreshAgentBehavior({
726
1034
  source: "metadata_invalidation",
727
1035
  ...(version !== undefined ? { metadataVersion: version } : {}),
@@ -792,8 +1100,41 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
792
1100
  log: { error: (message) => log?.error?.(`[${accountId}] ${message}`) },
793
1101
  });
794
1102
  };
1103
+ // §A.4 — startup refresh-if-near-expiry, BEFORE the first WS connect. Recovers
1104
+ // a long-stopped pod with no manual re-pair. On a permanent refresh failure
1105
+ // auto-logout immediately and skip the doomed connect.
1106
+ if (
1107
+ !abortSignal.aborted &&
1108
+ latestRefreshToken &&
1109
+ refreshManager.isNearExpiry(activatedAtMs)
1110
+ ) {
1111
+ log?.info?.(
1112
+ `[${accountId}] clawchat-plugin-openclaw access token near expiry at startup; refreshing before connect`,
1113
+ );
1114
+ const startupOutcome = await refreshManager.refresh("startup-near-expiry");
1115
+ if (abortSignal.aborted) return;
1116
+ if (startupOutcome.kind === "permanent") {
1117
+ // Auto-logout already performed by the manager's onPermanentFailure.
1118
+ return;
1119
+ }
1120
+ // success swaps the in-memory token in place; transient/skipped just connect
1121
+ // with the current token (the WS handshake will then drive reactive refresh).
1122
+ }
1123
+ // Reuse the device id the server resolved on a previous connection so a pod
1124
+ // restart (fresh hostname → fresh hostname-derived id) does not present a
1125
+ // brand-new device, which would force a full inbox replay and orphan the
1126
+ // prior device's cursor. Persisted from `hello-ok` via markConnectionReady.
1127
+ const persistedDeviceId = store?.getLastResolvedDeviceId
1128
+ ? store.getLastResolvedDeviceId({ platform: "openclaw", accountId })
1129
+ : null;
1130
+ if (persistedDeviceId) {
1131
+ log?.info?.(
1132
+ `[${accountId}] clawchat-plugin-openclaw reusing persisted resolved_device_id`,
1133
+ );
1134
+ }
795
1135
  const client = createOpenclawClawlingClient(account, {
796
1136
  ...(params.transport ? { transport: params.transport } : {}),
1137
+ ...(persistedDeviceId ? { deviceIdOverride: persistedDeviceId } : {}),
797
1138
  wsLifecycle: {
798
1139
  onConnectFrameSent: (env) => {
799
1140
  lastConnectTraceId = typeof env.trace_id === "string" ? env.trace_id : "-";
@@ -823,6 +1164,202 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
823
1164
  });
824
1165
  log?.info?.(`[${accountId}] clawchat-plugin-openclaw runtime client created`);
825
1166
 
// §A.3/§A.4 — work out the refresh-driven reconnect streak for the NEXT
// re-enter. If the streak window carried in `params` is still open, the depth
// continues from `params.refreshReconnectDepth`; otherwise a new window starts
// now at depth 1. `capped` reports whether the new depth exceeds
// `MAX_REFRESH_RECONNECTS`, meaning the caller must not re-enter via refresh.
const nextRefreshReconnectStreak = (): {
  depth: number;
  windowStartedAt: number;
  capped: boolean;
} => {
  const now = Date.now();
  const priorStart = params.refreshReconnectWindowStartedAt ?? 0;
  // A start of 0 means "no window recorded".
  const windowStillOpen = priorStart !== 0 && now - priorStart < REFRESH_RECONNECT_WINDOW_MS;
  if (!windowStillOpen) {
    return { depth: 1, windowStartedAt: now, capped: 1 > MAX_REFRESH_RECONNECTS };
  }
  const depth = (params.refreshReconnectDepth ?? 0) + 1;
  return { depth, windowStartedAt: priorStart, capped: depth > MAX_REFRESH_RECONNECTS };
};
1183
+
// §B/§D — re-enter the gateway after a plain transport-backoff delay, with the
// CURRENT (unchanged) token and creds left untouched. Used when a reactive
// refresh is transient/skipped (§B: a transient refresh failure NEVER
// auto-logs-out and NEVER stops — keep retrying with the current token) and
// when the refresh-reconnect loop is capped (§A.4). Carries the refresh
// manager's latch + min-interval state so the guards keep bounding the loop.
//
// No-op when the gateway is aborted, auto-logged-out, or a reconnect is already
// pending. The delay is `params.transportBackoffDelayMs` when provided,
// otherwise exponential from `TRANSPORT_BACKOFF_BASE_MS` (doubling per prior
// reconnect depth) and capped at `TRANSPORT_BACKOFF_MAX_MS`.
const scheduleTransportBackoffReconnect = (reason: string): void => {
  if (abortSignal.aborted || autoLoggedOut || reconnectWithRefreshedToken) return;
  reconnectWithRefreshedToken = true; // suppress the auth-failed teardown path.
  refreshManager.stop();
  activeClients.delete(accountId);
  finishCurrentConnection({
    state: "disconnected",
    closeCode: 1000,
    closeReason: "transport backoff reconnect",
  });
  try {
    client.close();
  } catch {
    // best effort
  }
  const attempt = (params.refreshReconnectDepth ?? 0) + 1;
  const delayMs =
    params.transportBackoffDelayMs ??
    Math.min(TRANSPORT_BACKOFF_MAX_MS, TRANSPORT_BACKOFF_BASE_MS * 2 ** Math.max(0, attempt - 1));
  log?.info?.(
    `[${accountId}] clawchat-plugin-openclaw reactive refresh ${reason}; backoff-reconnect with current token delayMs=${delayMs}`,
  );
  const managerState = refreshManager.exportState();
  const streak = nextRefreshReconnectStreak();
  const reEnter = () => {
    if (abortSignal.aborted) return;
    // Fire-and-forget, but with a rejection handler: nothing awaits this
    // promise, so without the `.catch` a failed re-enter would surface as an
    // unhandled promise rejection.
    void startOpenclawClawlingGateway({
      ...params,
      account: { ...params.account },
      transportBackoffReconnect: true,
      refreshReconnectDepth: streak.depth,
      refreshReconnectWindowStartedAt: streak.windowStartedAt,
      refreshManagerState: managerState,
    }).catch((err: unknown) => {
      log?.error?.(
        `[${accountId}] clawchat-plugin-openclaw transport-backoff re-enter failed: ${err instanceof Error ? err.message : String(err)}`,
      );
    });
  };
  const timer = params.backoffTimer ?? ((cb, ms) => void setTimeout(cb, ms));
  timer(reEnter, delayMs);
};
1228
+
// §A/§D — close the live WS and re-enter the gateway with the rotated token.
// A token only enters via a fresh `connect` envelope; it cannot be hot-swapped
// onto a live socket. Precondition: a refresh has ALREADY succeeded and swapped
// the in-memory token — either on the proactive path, or in
// `runRefreshAndReconnect` on the reactive path. The refresh manager's exported
// state and the reconnect streak are threaded into the re-enter.
const closeAndReconnectWithRefreshedToken = async (reason: string): Promise<void> => {
  const nothingToDo = abortSignal.aborted || autoLoggedOut || reconnectWithRefreshedToken;
  if (nothingToDo) return;
  reconnectWithRefreshedToken = true;

  // Snapshot the manager state before stopping it.
  const managerState = refreshManager.exportState();
  refreshManager.stop();
  activeClients.delete(accountId);
  log?.info?.(
    `[${accountId}] clawchat-plugin-openclaw token refreshed (${reason}); closing WS to reconnect with new token`,
  );
  finishCurrentConnection({
    state: "disconnected",
    closeCode: 1000,
    closeReason: "token refresh",
  });
  try {
    client.close();
  } catch {
    // best effort
  }
  if (abortSignal.aborted) return;

  const { depth, windowStartedAt } = nextRefreshReconnectStreak();
  // Re-enter with the rotated in-memory account; SQLite/config already hold
  // the rotated pair (persisted before the swap). Reuse the same device id.
  await startOpenclawClawlingGateway({
    ...params,
    account: {
      ...params.account,
      configured: true,
      token: account.token,
      userId: account.userId,
      ownerUserId: account.ownerUserId,
    },
    refreshReconnectDepth: depth,
    refreshReconnectWindowStartedAt: windowStartedAt,
    refreshManagerState: managerState,
  });
};
// Alias used by the proactive path (refresh already succeeded + swapped).
const runRefreshReconnect = closeAndReconnectWithRefreshedToken;
1274
+
// §A/§B/§D — run one single-flight refresh and act on its outcome:
//   - success   → close the live WS and re-enter with the rotated token (§D).
//   - permanent → the manager already auto-logged-out (§C); nothing more here.
//   - transient / skipped → §B: NEVER teardown. Backoff-reconnect with the
//     CURRENT token, creds + configured untouched, and keep retrying.
// Resolves to "handled" when this function took ownership of the next
// connection lifecycle (reconnect scheduled / auto-logout), and to
// "fallthrough" only when the gateway was aborted while the refresh was in
// flight, in which case the caller runs its own path.
const runRefreshAndReconnect = async (reason: string): Promise<"handled" | "fallthrough"> => {
  if (abortSignal.aborted || autoLoggedOut || reconnectWithRefreshedToken) return "handled";

  // §A.4 — once the refresh-driven reconnect loop is capped, do not refresh
  // again; fall back to plain transport backoff with the current token so a
  // rotate-then-reject server cannot loop forever with no backoff.
  const priorDepth = params.refreshReconnectDepth ?? 0;
  if (priorDepth >= MAX_REFRESH_RECONNECTS) {
    log?.error?.(
      `[${accountId}] clawchat-plugin-openclaw refresh-reconnect loop capped (depth=${priorDepth}); backoff-reconnect with current token`,
    );
    scheduleTransportBackoffReconnect("refresh-reconnect-capped");
    return "handled";
  }

  const outcome = await refreshManager.refresh(reason);
  if (abortSignal.aborted) return "fallthrough";
  if (autoLoggedOut) return "handled"; // permanent → manager auto-logged-out.

  if (outcome.kind !== "success") {
    // §B — transient / skipped (in-flight / min-interval / rejected-latch /
    // no-refresh-token): keep the WS in backoff with the CURRENT token rather
    // than tearing down.
    scheduleTransportBackoffReconnect(`refresh-${outcome.kind}`);
    return "handled";
  }

  await closeAndReconnectWithRefreshedToken(reason);
  return "handled";
};
1309
+
// §A.2.1 — authenticated REST call wrapper. When the call fails with a
// `ClawlingApiError` of kind "auth" (401/403), run the single-flight refresh
// and retry the call ONCE. Any other error, an aborted gateway, or a
// non-successful refresh rethrows the original error.
const isRestAuthError = (err: unknown): boolean => {
  if (!(err instanceof ClawlingApiError)) return false;
  return err.kind === "auth";
};
const withRefresh = async <T>(call: () => Promise<T>): Promise<T> => {
  let firstError: unknown;
  try {
    return await call();
  } catch (err) {
    firstError = err;
  }
  if (!isRestAuthError(firstError) || abortSignal.aborted) throw firstError;
  const outcome = await refreshManager.refresh("rest-401");
  if (outcome.kind !== "success") throw firstError;
  // The in-memory swap already invalidated the cached api-client; this second
  // `call()` rebuilds it with the fresh token.
  return await call();
};
// Activate the REST proxy's refresh wrapper now that the manager exists.
restWithRefresh = withRefresh;
1331
+
// §A.2/§B — react to a WS hello-fail(auth), gating a reactive refresh on how
// the reason classifies:
//   - token-rejected → refresh via `runRefreshAndReconnect`, which reconnects
//     on success, leaves a permanent failure to the manager's auto-logout, and
//     backoff-reconnects with the CURRENT token on transient/skipped.
//   - generic + token near expiry → same refresh path.
//   - anything else (including generic + token NOT near expiry) → transport
//     backoff with the current token: NO refresh, NO teardown, so a backend
//     outage emitting a generic reason causes neither a refresh storm nor a
//     spurious logout.
// Does nothing when the gateway is aborted, a reconnect is already pending, or
// auto-logout has fired.
const handleWsAuthFailure = async (reason: string): Promise<void> => {
  if (abortSignal.aborted || reconnectWithRefreshedToken || autoLoggedOut) return;

  const klass = classifyHelloFailReason(reason);
  let eligible = klass === "token-rejected";
  if (!eligible && klass === "generic") {
    // Only consult the expiry check for unattributed failures.
    eligible = refreshManager.isNearExpiry(activatedAtMs);
  }

  if (!eligible) {
    scheduleTransportBackoffReconnect("hello-fail-generic-not-near");
    return;
  }
  await runRefreshAndReconnect("ws-hello-fail");
};
1362
+
826
1363
  setAlignedOutboundLogContext(client, wsLogContext);
827
1364
  client.on("hello:ok", (env: Envelope) => {
828
1365
  const payload = env.payload && typeof env.payload === "object"
@@ -837,6 +1374,11 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
837
1374
  send: () => {},
838
1375
  context: wsLogContext,
839
1376
  });
1377
+ const notifySignalObserver = createNotifySignalObserver({
1378
+ accountId,
1379
+ log: (msg) => log?.info?.(msg),
1380
+ context: wsLogContext,
1381
+ });
840
1382
  const logAuthFailure = (reason: string) => {
841
1383
  if (authFailureLogged) return;
842
1384
  authFailureLogged = true;
@@ -861,6 +1403,7 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
861
1403
  client.on("state", ({ from, to }) => {
862
1404
  log?.info?.(`[${accountId}] clawchat-plugin-openclaw state ${from} -> ${to}`);
863
1405
  wsReady = to === "connected";
1406
+ if (to === "connected") wsReadyEverThisAttempt = true;
864
1407
  if (to === "connecting") {
865
1408
  reconnectTracker.connectStart();
866
1409
  currentAttemptStartedAt = Date.now();
@@ -931,8 +1474,15 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
931
1474
  }
932
1475
  void refreshConversationCacheAfterReady();
933
1476
  void dispatchActivationBootstrap();
1477
+ // §A.1 — arm the proactive refresh timer from the live token's `exp`
1478
+ // every time a connection becomes ready (re-armed after every refresh via
1479
+ // the gateway re-enter).
1480
+ refreshManager.armProactiveTimer(activatedAtMs);
934
1481
  } else if (to === "disconnected") {
935
1482
  reconnectTracker.markClosed();
1483
+ // §A.1 — clear the proactive timer on disconnect unless `reconnectWithRefreshedToken`
1484
+ // is set; it re-arms on the next ready, or the gateway re-enter arms a fresh one.
1485
+ if (!reconnectWithRefreshedToken) refreshManager.disarmProactiveTimer();
936
1486
  }
937
1487
  const next = { ...getStatus(), ...mapClawlingStateToStatus(to as ClawlingState) };
938
1488
  setStatus(next);
@@ -1076,21 +1626,50 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
1076
1626
  void handleMetadataInvalidation(env);
1077
1627
  });
1078
1628
 
1629
+ client.on("notify:signal", (env: Envelope) => {
1630
+ // §9.4 reliable system notification. The plugin holds no friend/roster
1631
+ // cache (friends are fetched on demand via REST tools), so there is nothing
1632
+ // to invalidate — observe + dedup only. The live frame and its reliable
1633
+ // inbox replay carry the same event_id and collapse to one observation.
1634
+ notifySignalObserver.observe(env);
1635
+ });
1636
+
1637
+ client.on("replay:done", (env: Envelope) => {
1638
+ // §11.5 terminal control frame: device replay drained, live delivery begins.
1639
+ // Fires on every reconnect (even zero-backlog). Replayed messages are
1640
+ // processed inline, so this is a logged boundary marker, not a gate.
1641
+ log?.info?.(`[${accountId}] clawchat-plugin-openclaw replay.done trace=${env.trace_id}`);
1642
+ });
1643
+
1079
1644
  client.on("error", (err: unknown) => {
1080
1645
  const classified = classifyClawlingClientError(err);
1081
1646
  if (classified.kind === "auth") {
1082
- finishCurrentConnection({
1083
- state: "auth_failed",
1084
- error: lastHelloFailReason || classified.message,
1085
- });
1086
- logAuthFailure(classified.message);
1087
- setStatus({
1088
- ...getStatus(),
1089
- connected: false,
1090
- configured: false,
1091
- running: false,
1092
- lastError: classified.message,
1093
- });
1647
+ // §A.2 — a WS hello-fail(auth) on a LIVE (already-connected) session.
1648
+ // Attempt a gated reactive refresh before tearing the account down. The
1649
+ // INITIAL-connect auth failure is owned by the `client.connect()` catch
1650
+ // below (which runs the refresh/backoff/teardown decision and the recursive
1651
+ // re-enter), so only react here once the session was previously ready —
1652
+ // otherwise we'd double-handle and the error handler's teardown would race
1653
+ // the catch's transient-backoff branch (wrongly flipping configured:false).
1654
+ if (!reconnectWithRefreshedToken && !autoLoggedOut && wsReadyEverThisAttempt) {
1655
+ void handleWsAuthFailure(lastHelloFailReason || classified.message);
1656
+ return;
1657
+ }
1658
+ // Not-ready (initial connect): the `client.connect()` catch below owns the
1659
+ // refresh/backoff/teardown DECISION and the status flip. Record the
1660
+ // connection as auth_failed here (bookkeeping — the ws-client's own 4001
1661
+ // close would otherwise finish it as a plain "disconnected") and log the
1662
+ // auth failure, but do NOT flip status here: a transient refresh / generic
1663
+ // backoff must leave configured untouched, and that decision lives in the
1664
+ // catch.
1665
+ if (!reconnectWithRefreshedToken && !autoLoggedOut) {
1666
+ finishCurrentConnection({
1667
+ state: "auth_failed",
1668
+ error: lastHelloFailReason || classified.message,
1669
+ });
1670
+ logAuthFailure(classified.message);
1671
+ }
1672
+ return;
1094
1673
  } else if (classified.kind === "transport") {
1095
1674
  finishCurrentConnection({ state: "transport_error", error: classified.message });
1096
1675
  const current = wsLogContext();
@@ -1672,14 +2251,47 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
1672
2251
  log?.info?.(`[${accountId}] clawchat-plugin-openclaw runtime client.connect() resolved`);
1673
2252
  } catch (err) {
1674
2253
  const classified = classifyClawlingClientError(err);
1675
- setStatus({
1676
- ...getStatus(),
1677
- connected: false,
1678
- configured: classified.kind !== "auth",
1679
- running: false,
1680
- lastError: classified.message,
1681
- });
1682
2254
  if (classified.kind === "auth") {
2255
+ // §A.2/§B — initial-connect hello-fail(auth). Do NOT pre-flip
2256
+ // configured:false here: a transient refresh must leave creds + configured
2257
+ // untouched (§B). Branch on the refresh-eligibility classification first.
2258
+ const klass = classifyHelloFailReason(lastHelloFailReason || classified.message);
2259
+ const eligible =
2260
+ !reconnectWithRefreshedToken &&
2261
+ !autoLoggedOut &&
2262
+ !abortSignal.aborted &&
2263
+ Boolean(latestRefreshToken) &&
2264
+ (klass === "token-rejected" ||
2265
+ (klass === "generic" && refreshManager.isNearExpiry(activatedAtMs)));
2266
+ if (eligible) {
2267
+ // Total: success reconnects, permanent auto-logs-out, transient/skipped
2268
+ // backoff-reconnects with the current token (creds + configured intact).
2269
+ await runRefreshAndReconnect("ws-initial-connect-auth");
2270
+ return;
2271
+ }
2272
+ // §A.2 / Finding 5 — generic + token NOT near expiry (a refresh token exists
2273
+ // but the failure is not refresh-eligible): keep the WS in transport backoff
2274
+ // with the current token instead of tearing the account down.
2275
+ if (
2276
+ klass === "generic" &&
2277
+ Boolean(latestRefreshToken) &&
2278
+ !reconnectWithRefreshedToken &&
2279
+ !autoLoggedOut &&
2280
+ !abortSignal.aborted
2281
+ ) {
2282
+ scheduleTransportBackoffReconnect("initial-connect-generic-not-near");
2283
+ return;
2284
+ }
2285
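+ // Everything else (no refresh token, `reconnectWithRefreshedToken`/`autoLoggedOut` set, an aborted
2286
+ // run, or a reason class that is neither token-rejected nor generic) — fall back to the legacy
2287
+ // auth-failed teardown so the gateway flips not-configured and (for a sqlite-sourced account) re-enters wait-for-activation.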
2288
+ setStatus({
2289
+ ...getStatus(),
2290
+ connected: false,
2291
+ configured: false,
2292
+ running: false,
2293
+ lastError: classified.message,
2294
+ });
1683
2295
  finishCurrentConnection({
1684
2296
  state: "auth_failed",
1685
2297
  error: lastHelloFailReason || classified.message,
@@ -1700,6 +2312,13 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
1700
2312
  }
1701
2313
  return;
1702
2314
  }
2315
+ setStatus({
2316
+ ...getStatus(),
2317
+ connected: false,
2318
+ configured: true,
2319
+ running: false,
2320
+ lastError: classified.message,
2321
+ });
1703
2322
  log?.error?.(
1704
2323
  `[${accountId}] clawchat-plugin-openclaw connect failed (${classified.kind}): ${classified.message}`,
1705
2324
  );
@@ -1719,6 +2338,8 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
1719
2338
  log?.info?.(`[${accountId}] clawchat-plugin-openclaw runtime abort received; closing client`);
1720
2339
  activeClients.delete(accountId);
1721
2340
  closingForAbort = true;
2341
+ // §A.1 — stop the proactive refresh timer on shutdown.
2342
+ refreshManager.stop();
1722
2343
  groupCoalescer.cancelAll();
1723
2344
  finishCurrentConnection({
1724
2345
  state: "disconnected",