@clawling/clawchat-plugin-openclaw 2026.5.12-39 → 2026.5.13-1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/runtime.ts CHANGED
@@ -13,9 +13,12 @@ import { waitUntilAbort } from "openclaw/plugin-sdk/channel-lifecycle";
13
13
  import { hasControlCommand } from "openclaw/plugin-sdk/command-detection";
14
14
  import type { OpenClawConfig, PluginRuntime } from "openclaw/plugin-sdk/core";
15
15
  import { createPluginRuntimeStore } from "openclaw/plugin-sdk/runtime-store";
16
- import { createOpenclawClawlingClient } from "./client.ts";
16
+ import { createOpenclawClawlingClient, resolveOpenclawClawlingDeviceId } from "./client.ts";
17
17
  import { createOpenclawClawlingApiClient } from "./api-client.ts";
18
+ import { reportPluginVersionSafe, resolvePluginVersion } from "./plugin-report.ts";
18
19
  import { ClawlingApiError } from "./api-types.ts";
20
+ import { RefreshManager } from "./refresh-manager.ts";
21
+ import type { OpenclawClawchatMutateConfigFile } from "./login.runtime.ts";
19
22
  import {
20
23
  CHANNEL_ID,
21
24
  effectiveOutputVisibility,
@@ -35,7 +38,7 @@ import {
35
38
  setAlignedOutboundLogContext,
36
39
  } from "./outbound.ts";
37
40
  import { formatWsLog } from "./ws-log.ts";
38
- import { createProtocolControlHandler, createReconnectTracker } from "./ws-alignment.ts";
41
+ import { createNotifySignalObserver, createProtocolControlHandler, createReconnectTracker } from "./ws-alignment.ts";
39
42
  import {
40
43
  clawChatDbPathForStateDir,
41
44
  getClawChatStore,
@@ -77,6 +80,8 @@ type RuntimeConnectionStore = Pick<
77
80
  Pick<
78
81
  ClawChatStore,
79
82
  | "getActivationCredentials"
83
+ | "rotateActivationTokens"
84
+ | "clearActivationCredentials"
80
85
  | "insertMessage"
81
86
  | "claimMessageOnce"
82
87
  | "markMessageAcknowledged"
@@ -85,6 +90,7 @@ type RuntimeConnectionStore = Pick<
85
90
  | "releaseActivationBootstrapClaim"
86
91
  | "markActivationBootstrapSent"
87
92
  | "getActivationConversation"
93
+ | "getLastResolvedDeviceId"
88
94
  >
89
95
  >;
90
96
 
@@ -111,11 +117,44 @@ const OPENCLAW_CONFIRM_SLASH_COMMANDS = new Set([
111
117
  "nevermind",
112
118
  ]);
113
119
  const GROUP_OWNER_ATTENTION_TITLE = "requires owner attention";
120
+ // §C.1 — user-visible message emitted on permanent token expiry. Kept
121
+ // byte-identical to the Hermes plugin (parity spec §C.1.4).
122
+ const CLAWCHAT_TOKEN_EXPIRED_MESSAGE =
123
+ "ClawChat token expired and could not be refreshed. Re-pair with `/clawchat-activate <code>`.";
124
+ const CLAWCHAT_TOKEN_EXPIRED_LAST_ERROR = "token expired — re-pair required";
114
125
 
115
126
  function isRecord(value: unknown): value is Record<string, unknown> {
116
127
  return Boolean(value && typeof value === "object" && !Array.isArray(value));
117
128
  }
118
129
 
130
/**
 * §A.2 — classify a WS `hello-fail` reason for refresh gating.
 * - "token-rejected": reason names an authentication failure → refresh.
 * - "auth-unavailable": 5xx auth-backend outage → backoff, DO NOT refresh.
 * - "generic": unattributed → refresh only if the token is at/near expiry.
 *
 * `auth service unavailable` is already split off by the ws-client into a
 * TransportError (backoff), but we classify defensively here too.
 *
 * Matching is case-insensitive substring matching. The outage check runs first,
 * so a reason that mentions both an outage and a token problem is treated as an
 * outage (no refresh).
 *
 * @param reason - The `hello-fail` reason text. Missing or non-string values
 *   are tolerated and classified as "generic" instead of throwing.
 * @returns The refresh-gating class for the reason.
 */
export function classifyHelloFailReason(
  reason: string,
): "token-rejected" | "auth-unavailable" | "generic" {
  // Narrow at runtime: a `|| ""` fallback only covers falsy values, while a
  // truthy non-string (e.g. an object from a malformed frame) would throw on
  // `.toLowerCase()`.
  const normalized = typeof reason === "string" ? reason.toLowerCase() : "";
  if (/auth service unavailable|temporarily unavailable/.test(normalized)) {
    return "auth-unavailable";
  }
  if (
    /authentication failed|invalid token|token expired|unauthorized|auth failed|invalid credentials/.test(
      normalized,
    )
  ) {
    return "token-rejected";
  }
  return "generic";
}
149
+
150
/**
 * Look up `channels.<CHANNEL_ID>.refreshToken` in a live config.
 *
 * @param cfg - The config object to inspect.
 * @returns The trimmed refresh token, or `null` when the channel section is
 *   missing, the value is not a string, or it is blank after trimming.
 */
function readConfigRefreshToken(cfg: OpenClawConfig): string | null {
  const { channels } = cfg as { channels?: Record<string, unknown> };
  if (!isRecord(channels)) return null;

  const section = channels[CHANNEL_ID];
  if (!isRecord(section)) return null;

  const raw = section.refreshToken;
  if (typeof raw !== "string") return null;

  const trimmed = raw.trim();
  return trimmed ? trimmed : null;
}
157
+
119
158
  function withFullVerboseDispatchConfig(cfg: OpenClawConfig, agentId: string): OpenClawConfig {
120
159
  const cfgRecord = cfg as Record<string, unknown>;
121
160
  const agents = isRecord(cfgRecord.agents) ? cfgRecord.agents : {};
@@ -339,7 +378,11 @@ function metadataScopesFromEnvelope(env: Envelope): string[] {
339
378
  }
340
379
 
341
380
  function shouldRefreshBehaviorForScopes(scopes: string[]): boolean {
342
- return scopes.includes("behavior");
381
+ // §9.3: empty/absent scope ⇒ "refetch everything"; unknown scope strings must
382
+ // also trigger a refresh. Only the known non-behavior scopes (title,
383
+ // description) leave agent behavior untouched.
384
+ if (scopes.length === 0) return true;
385
+ return scopes.some((scope) => scope !== "title" && scope !== "description");
343
386
  }
344
387
 
345
388
  function shouldRefreshConversationForScopes(scopes: string[]): boolean {
@@ -419,8 +462,56 @@ export interface StartGatewayParams {
419
462
  activationPollIntervalMs?: number;
420
463
  /** Test hook only. */
421
464
  rejectedActivationToken?: string;
465
+ /** Test hook only — fetch impl used by the refresh manager (`/v1/auth/refresh`). */
466
+ refreshFetchImpl?: typeof fetch;
467
+ /** Test hook only — setTimeout override for the refresh manager's proactive timer. */
468
+ refreshSetTimer?: (cb: () => void, ms: number) => ReturnType<typeof setTimeout> | number;
469
+ /** Test hook only — clearTimeout override for the refresh manager's proactive timer. */
470
+ refreshClearTimer?: (handle: ReturnType<typeof setTimeout> | number) => void;
471
+ /** Test hook only — jitter override (ms) for the proactive timer. */
472
+ refreshJitter?: () => number;
473
+ /** Test hook only — config-file mutator used to persist rotated/blanked creds. */
474
+ mutateConfigFile?: OpenclawClawchatMutateConfigFile;
475
+ /** Internal — set when the current attempt is a refresh-driven reconnect. */
476
+ refreshReconnectDepth?: number;
477
+ /**
478
+ * Internal — epoch-ms at which the current refresh-driven reconnect streak
479
+ * began. Used with `refreshReconnectDepth` to bound a rotate-then-reject loop
480
+ * (§A.3/§A.4): once depth exceeds the cap inside the window we stop re-entering
481
+ * via refresh and fall back to plain transport backoff with the current token.
482
+ */
483
+ refreshReconnectWindowStartedAt?: number;
484
+ /**
485
+ * Internal — the refresh manager's single-flight latch + min-interval state,
486
+ * carried across gateway re-enters so the §A.3 guards (rejected-token latch,
487
+ * min-interval floor) actually constrain a cross-reconnect refresh loop instead
488
+ * of resetting on every fresh `RefreshManager`.
489
+ */
490
+ refreshManagerState?: { rejectedToken: string | null; lastAttemptAt: number };
491
+ /**
492
+ * Internal — set when the current attempt is a plain transport-backoff
493
+ * re-enter (transient/skipped reactive refresh). Carries the current (unchanged)
494
+ * token so creds stay untouched; the auth-failure teardown is suppressed.
495
+ */
496
+ transportBackoffReconnect?: boolean;
497
+ /** Test hook only — setTimeout override for the transport-backoff delay. */
498
+ backoffTimer?: (cb: () => void, ms: number) => void;
499
+ /** Test hook only — fixed transport-backoff delay (ms) for re-enter. */
500
+ transportBackoffDelayMs?: number;
422
501
  }
423
502
 
503
/**
 * §A.3/§A.4 — cap on consecutive refresh-driven reconnects inside one
 * `REFRESH_RECONNECT_WINDOW_MS` window. Past the cap the refresh loop is
 * abandoned in favour of plain transport backoff (or auto-logout on a permanent
 * reject), which bounds a server that keeps rotating-then-rejecting fresh tokens.
 */
export const MAX_REFRESH_RECONNECTS = 3;

/** Length of the refresh-reconnect streak window: five minutes, in ms. */
export const REFRESH_RECONNECT_WINDOW_MS = 5 * 60 * 1_000;

/** §B — default transport-backoff delay before a reactive re-enter, in ms. */
export const TRANSPORT_BACKOFF_BASE_MS = 1_000;

/** §B — upper bound on the transport-backoff delay (30s), in ms. */
export const TRANSPORT_BACKOFF_MAX_MS = 30_000;
514
+
424
515
  function resolveConnectionStore(
425
516
  params: StartGatewayParams,
426
517
  runtime: PluginRuntime,
@@ -551,6 +642,19 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
551
642
  log?.info?.(
552
643
  `[${accountId}] clawchat-plugin-openclaw runtime start entered configured=${account.configured} enabled=${account.enabled} hasToken=${Boolean(account.token)} hasUserId=${Boolean(account.userId)} hasOwnerUserId=${Boolean(account.ownerUserId)} websocketUrl=${account.websocketUrl || "(empty)"}`,
553
644
  );
645
+ // Freeze one report device_id for this process; reused for the paired report
646
+ // so the backend links both to the same row. Imported from ./client.ts.
647
+ const reportDeviceId = resolveOpenclawClawlingDeviceId(account);
648
+ const pluginVersion = resolvePluginVersion();
649
+ void reportPluginVersionSafe({
650
+ baseUrl: account.baseUrl,
651
+ mediaBaseUrl: account.mediaBaseUrl,
652
+ token: "",
653
+ deviceId: reportDeviceId,
654
+ pluginVersion,
655
+ authenticated: false,
656
+ log,
657
+ });
554
658
  const activationAccount = await waitForActivationCredentials({
555
659
  account,
556
660
  abortSignal,
@@ -563,15 +667,106 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
563
667
  });
564
668
  if (!activationAccount) return;
565
669
  account = activationAccount.account;
670
+ // Paired: link the report row via the authenticated endpoint, reusing the
671
+ // SAME frozen device_id so the backend upserts the existing unpaired row.
672
+ void reportPluginVersionSafe({
673
+ baseUrl: account.baseUrl,
674
+ mediaBaseUrl: account.mediaBaseUrl,
675
+ token: account.token,
676
+ deviceId: reportDeviceId,
677
+ pluginVersion,
678
+ authenticated: true,
679
+ log,
680
+ });
681
+ // §A.0 — fallback expiry source. Prefer the SQLite `activated_at`; null when
682
+ // the credentials came from config (no activation row yet) — in that case the
683
+ // refresh manager relies on the JWT `exp` alone.
684
+ let activatedAtMs: number | null =
685
+ activationAccount.source === "sqlite" && store?.getActivationCredentials
686
+ ? store.getActivationCredentials({ platform: "openclaw", accountId })?.activatedAt ?? null
687
+ : null;
566
688
  let conversationApiClient: ReturnType<typeof createOpenclawClawlingApiClient> | undefined;
567
- const getConversationApiClient = () => {
568
- conversationApiClient ??= createOpenclawClawlingApiClient({
689
+ const buildConversationApiClient = (): ReturnType<typeof createOpenclawClawlingApiClient> =>
690
+ createOpenclawClawlingApiClient({
569
691
  baseUrl: account.baseUrl,
570
692
  mediaBaseUrl: account.mediaBaseUrl,
571
693
  token: account.token,
572
694
  userId: account.userId,
573
695
  });
574
- return conversationApiClient;
696
+ // §A.2.1 — forward reference to the single-flight REST refresh wrapper; set
697
+ // once the refresh manager exists. Until then, calls run un-wrapped.
698
+ let restWithRefresh: (<T>(call: () => Promise<T>) => Promise<T>) | null = null;
699
+ // Returns a proxy whose every method call runs through `restWithRefresh`, so a
700
+ // 401/403 transparently triggers one single-flight refresh + retry. The proxy
701
+ // reads the cached client lazily on each call so a post-refresh rebuild is
702
+ // picked up automatically.
703
+ const getConversationApiClient = (): ReturnType<typeof createOpenclawClawlingApiClient> => {
704
+ return new Proxy({} as ReturnType<typeof createOpenclawClawlingApiClient>, {
705
+ get: (_target, prop) => {
706
+ return (...args: unknown[]) => {
707
+ const invoke = () => {
708
+ conversationApiClient ??= buildConversationApiClient();
709
+ const fn = (conversationApiClient as unknown as Record<string, unknown>)[
710
+ prop as string
711
+ ];
712
+ if (typeof fn !== "function") {
713
+ throw new TypeError(`clawchat api-client has no method ${String(prop)}`);
714
+ }
715
+ return (fn as (...a: unknown[]) => unknown).apply(conversationApiClient, args);
716
+ };
717
+ return restWithRefresh
718
+ ? restWithRefresh(() => Promise.resolve(invoke()) as Promise<unknown>)
719
+ : invoke();
720
+ };
721
+ },
722
+ });
723
+ };
724
+ // Rebuilt after every in-memory token swap so REST calls use the fresh token.
725
+ const invalidateConversationApiClient = () => {
726
+ conversationApiClient = undefined;
727
+ };
728
+
729
+ const resolveMutateConfigFile = (): OpenclawClawchatMutateConfigFile | undefined => {
730
+ if (params.mutateConfigFile) return params.mutateConfigFile;
731
+ const runtimeConfig = runtime.config as unknown as {
732
+ mutateConfigFile?: OpenclawClawchatMutateConfigFile;
733
+ };
734
+ return typeof runtimeConfig?.mutateConfigFile === "function"
735
+ ? runtimeConfig.mutateConfigFile
736
+ : undefined;
737
+ };
738
+
739
+ // §0/§C.1 — write the channel-config `token`/`refreshToken` keys. `tokens=null`
740
+ // blanks them (auto-logout); otherwise persists the rotated pair.
741
+ const persistConfigTokens = async (
742
+ tokens: { accessToken: string; refreshToken: string } | null,
743
+ ): Promise<void> => {
744
+ const mutateConfigFile = resolveMutateConfigFile();
745
+ if (!mutateConfigFile) {
746
+ log?.error?.(
747
+ `[${accountId}] clawchat-plugin-openclaw config persistence unavailable; cannot ${tokens ? "rotate" : "clear"} tokens in config`,
748
+ );
749
+ return;
750
+ }
751
+ await mutateConfigFile({
752
+ afterWrite: { mode: "none", reason: "clawchat-plugin-openclaw token refresh" },
753
+ mutate(draft) {
754
+ const channels = ((draft as { channels?: Record<string, unknown> }).channels ?? {}) as Record<
755
+ string,
756
+ unknown
757
+ >;
758
+ const existing = (channels[CHANNEL_ID] ?? {}) as Record<string, unknown>;
759
+ const nextSection: Record<string, unknown> = {
760
+ ...existing,
761
+ token: tokens ? tokens.accessToken : "",
762
+ refreshToken: tokens ? tokens.refreshToken : "",
763
+ };
764
+ Object.assign(draft, {
765
+ ...draft,
766
+ channels: { ...channels, [CHANNEL_ID]: nextSection },
767
+ });
768
+ },
769
+ });
575
770
  };
576
771
 
577
772
  let lastHelloFailTraceId = "-";
@@ -583,6 +778,141 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
583
778
  let authFailureLogged = false;
584
779
  let closingForAbort = false;
585
780
  let wsReady = false;
781
+ // True once this gateway attempt reached "connected" at least once — used to
782
+ // route a later auth-fail through the live-session reactive refresh path
783
+ // rather than the initial-connect catch.
784
+ let wsReadyEverThisAttempt = false;
785
+ // §D — set when a refresh succeeded and we are closing the live WS to
786
+ // reconnect with the new token; suppresses the auth-failed teardown path and
787
+ // drives a clean re-enter into the gateway with the rotated account.
788
+ let reconnectWithRefreshedToken = false;
789
+ // §C — set once auto-logout has fired so we don't double-emit or reconnect.
790
+ let autoLoggedOut = false;
791
+
792
+ // §E — connect-time device id for `X-Device-Id` on refresh. Prefer the value
793
+ // recorded in SQLite at connect; backfill legacy rows (no column) to the
794
+ // deterministic constant `CHANNEL_ID` actually sent by `authHeaders`.
795
+ const refreshDeviceId =
796
+ (activationAccount.source === "sqlite" && store?.getActivationCredentials
797
+ ? store.getActivationCredentials({ platform: "openclaw", accountId })?.deviceId
798
+ : null) || CHANNEL_ID;
799
+
800
+ // §C — auto-logout on permanent refresh failure. Blank creds in BOTH stores
801
+ // (KEEP identity), flip not-configured via the auth-failure status path, and
802
+ // emit the user-visible message. Idempotent.
803
+ const performAutoLogout = async (info: { code: number; message: string }): Promise<void> => {
804
+ if (autoLoggedOut) return;
805
+ autoLoggedOut = true;
806
+ log?.error?.(
807
+ `[${accountId}] clawchat-plugin-openclaw auto-logout (token permanently expired) code=${info.code}: ${info.message}`,
808
+ );
809
+ // SQLite: blank access/refresh, keep user/owner/device for re-pair.
810
+ if (store?.clearActivationCredentials) {
811
+ recordConnection("clear activation credentials", () =>
812
+ store.clearActivationCredentials?.({ platform: "openclaw", accountId }),
813
+ );
814
+ }
815
+ // Config: blank token/refreshToken keys.
816
+ try {
817
+ await persistConfigTokens(null);
818
+ } catch (err) {
819
+ log?.error?.(
820
+ `[${accountId}] clawchat-plugin-openclaw failed to clear config credentials on auto-logout: ${err instanceof Error ? err.message : String(err)}`,
821
+ );
822
+ }
823
+ // Flip not-configured (existing auth-failure status path) with the re-pair
824
+ // hint as `lastError`.
825
+ setStatus({
826
+ ...getStatus(),
827
+ connected: false,
828
+ configured: false,
829
+ running: false,
830
+ lastError: CLAWCHAT_TOKEN_EXPIRED_LAST_ERROR,
831
+ });
832
+ // User-visible notification (in addition to logs). Best-effort; never throws.
833
+ emitUserVisibleAuthLogout();
834
+ };
835
+
836
+ // §C.1.4 — surface the permanent-expiry message to the user/operator. The
837
+ // plugin has no guaranteed live chat target after creds are cleared, so we
838
+ // route through the runtime notification surface when present and always log.
839
+ const emitUserVisibleAuthLogout = (): void => {
840
+ log?.error?.(`[${accountId}] clawchat-plugin-openclaw ${CLAWCHAT_TOKEN_EXPIRED_MESSAGE}`);
841
+ try {
842
+ const notify = (runtime as unknown as {
843
+ notifications?: { notify?: (input: { level?: string; message: string }) => void };
844
+ }).notifications?.notify;
845
+ if (typeof notify === "function") {
846
+ notify({ level: "error", message: CLAWCHAT_TOKEN_EXPIRED_MESSAGE });
847
+ }
848
+ } catch {
849
+ // Best effort only.
850
+ }
851
+ };
852
+
853
+ // The refresh token is not part of the resolved account; source it from
854
+ // SQLite first (authoritative after a rotation) then the config channel
855
+ // section. Kept in a mutable cell so a swap updates it in place.
856
+ let latestRefreshToken: string | null =
857
+ (activationAccount.source === "sqlite" && store?.getActivationCredentials
858
+ ? store.getActivationCredentials({ platform: "openclaw", accountId })?.refreshToken
859
+ : null) ?? readConfigRefreshToken(cfg);
860
+
861
+ const refreshManager = new RefreshManager({
862
+ baseUrl: account.baseUrl,
863
+ deviceId: refreshDeviceId,
864
+ getAccessToken: () => account.token,
865
+ getRefreshToken: () => latestRefreshToken,
866
+ persistRotatedTokens: async (tokens) => {
867
+ // §0 — persist to BOTH stores BEFORE the in-memory swap. A failure in
868
+ // EITHER store must REJECT so the manager skips the in-memory swap and
869
+ // treats the refresh as transient (keep the current tokens, back off). Do
870
+ // NOT swallow the SQLite write error: `rotateActivationTokens` returns
871
+ // `null` when its internal `write()` caught an exception (a real write
872
+ // failure), `false` only when no activation row exists yet (config-sourced
873
+ // agent — legitimately nothing to update). A swallowed write failure must
874
+ // not leave the SQLite row holding the now-dead refresh token while the
875
+ // in-memory token is rotated, which would brick a sqlite-sourced agent on
876
+ // restart.
877
+ if (store?.rotateActivationTokens) {
878
+ const rotateResult = store.rotateActivationTokens({
879
+ platform: "openclaw",
880
+ accountId,
881
+ accessToken: tokens.accessToken,
882
+ refreshToken: tokens.refreshToken,
883
+ });
884
+ if (rotateResult === null) {
885
+ throw new Error("clawchat-plugin-openclaw sqlite rotate activation tokens failed");
886
+ }
887
+ }
888
+ // A config write failure rejects out of `mutateConfigFile` and propagates
889
+ // here, which is what we want — persistence incomplete ⇒ no swap.
890
+ await persistConfigTokens(tokens);
891
+ },
892
+ swapInMemoryTokens: (tokens) => {
893
+ account = { ...account, token: tokens.accessToken, configured: true };
894
+ latestRefreshToken = tokens.refreshToken;
895
+ activatedAtMs = Date.now();
896
+ invalidateConversationApiClient();
897
+ },
898
+ onPermanentFailure: performAutoLogout,
899
+ // §A.1/§D — proactive-timer success closes the live WS and re-enters with the
900
+ // rotated token. The running ws-client captured the OLD token at `connect`
901
+ // time, so the in-memory swap alone never reaches a `connect` envelope.
902
+ onProactiveRefreshed: async () => {
903
+ await runRefreshReconnect("proactive-timer");
904
+ },
905
+ ...(params.refreshFetchImpl ? { fetchImpl: params.refreshFetchImpl } : {}),
906
+ ...(params.refreshSetTimer ? { setTimer: params.refreshSetTimer } : {}),
907
+ ...(params.refreshClearTimer ? { clearTimer: params.refreshClearTimer } : {}),
908
+ ...(params.refreshJitter ? { jitter: params.refreshJitter } : {}),
909
+ log,
910
+ });
911
+ // §A.3/§A.4 — carry the single-flight latch + min-interval across re-enters so
912
+ // the guards bound a rotate-then-reject loop instead of resetting each time.
913
+ if (params.refreshManagerState) {
914
+ refreshManager.restoreState(params.refreshManagerState);
915
+ }
586
916
  let currentConnectionId: number | null = null;
587
917
  let currentConnectionFinished = false;
588
918
  const reconnectTracker = createReconnectTracker({
@@ -721,7 +1051,10 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
721
1051
  const memoryRoot = resolveMemoryRootForPeer(peer);
722
1052
  if (!memoryRoot) return;
723
1053
 
724
- if (refreshBehavior) {
1054
+ // §9.3: agent behavior is per-agent metadata that lives only on the agent's
1055
+ // DIRECT conversation — never refetch it for a group invalidation, even on a
1056
+ // "refetch everything" (empty/unknown) scope.
1057
+ if (refreshBehavior && peer.kind === "direct") {
725
1058
  await refreshAgentBehavior({
726
1059
  source: "metadata_invalidation",
727
1060
  ...(version !== undefined ? { metadataVersion: version } : {}),
@@ -792,8 +1125,41 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
792
1125
  log: { error: (message) => log?.error?.(`[${accountId}] ${message}`) },
793
1126
  });
794
1127
  };
1128
+ // §A.4 — startup refresh-if-near-expiry, BEFORE the first WS connect. Recovers
1129
+ // a long-stopped pod with no manual re-pair. On a permanent refresh failure
1130
+ // auto-logout immediately and skip the doomed connect.
1131
+ if (
1132
+ !abortSignal.aborted &&
1133
+ latestRefreshToken &&
1134
+ refreshManager.isNearExpiry(activatedAtMs)
1135
+ ) {
1136
+ log?.info?.(
1137
+ `[${accountId}] clawchat-plugin-openclaw access token near expiry at startup; refreshing before connect`,
1138
+ );
1139
+ const startupOutcome = await refreshManager.refresh("startup-near-expiry");
1140
+ if (abortSignal.aborted) return;
1141
+ if (startupOutcome.kind === "permanent") {
1142
+ // Auto-logout already performed by the manager's onPermanentFailure.
1143
+ return;
1144
+ }
1145
+ // success swaps the in-memory token in place; transient/skipped just connect
1146
+ // with the current token (the WS handshake will then drive reactive refresh).
1147
+ }
1148
+ // Reuse the device id the server resolved on a previous connection so a pod
1149
+ // restart (fresh hostname → fresh hostname-derived id) does not present a
1150
+ // brand-new device, which would force a full inbox replay and orphan the
1151
+ // prior device's cursor. Persisted from `hello-ok` via markConnectionReady.
1152
+ const persistedDeviceId = store?.getLastResolvedDeviceId
1153
+ ? store.getLastResolvedDeviceId({ platform: "openclaw", accountId })
1154
+ : null;
1155
+ if (persistedDeviceId) {
1156
+ log?.info?.(
1157
+ `[${accountId}] clawchat-plugin-openclaw reusing persisted resolved_device_id`,
1158
+ );
1159
+ }
795
1160
  const client = createOpenclawClawlingClient(account, {
796
1161
  ...(params.transport ? { transport: params.transport } : {}),
1162
+ ...(persistedDeviceId ? { deviceIdOverride: persistedDeviceId } : {}),
797
1163
  wsLifecycle: {
798
1164
  onConnectFrameSent: (env) => {
799
1165
  lastConnectTraceId = typeof env.trace_id === "string" ? env.trace_id : "-";
@@ -823,6 +1189,202 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
823
1189
  });
824
1190
  log?.info?.(`[${accountId}] clawchat-plugin-openclaw runtime client created`);
825
1191
 
1192
+ // §A.3/§A.4 — recompute the refresh-driven reconnect streak (depth + window).
1193
+ // Returns the next depth/window to thread into the re-enter, and whether the
1194
+ // cap is exceeded so the caller must NOT re-enter via refresh again.
1195
+ const nextRefreshReconnectStreak = (): {
1196
+ depth: number;
1197
+ windowStartedAt: number;
1198
+ capped: boolean;
1199
+ } => {
1200
+ const now = Date.now();
1201
+ const priorWindowStart = params.refreshReconnectWindowStartedAt ?? 0;
1202
+ const withinWindow =
1203
+ priorWindowStart !== 0 && now - priorWindowStart < REFRESH_RECONNECT_WINDOW_MS;
1204
+ const windowStartedAt = withinWindow ? priorWindowStart : now;
1205
+ const depth = (withinWindow ? params.refreshReconnectDepth ?? 0 : 0) + 1;
1206
+ return { depth, windowStartedAt, capped: depth > MAX_REFRESH_RECONNECTS };
1207
+ };
1208
+
1209
+ // §B/§D — re-enter the gateway after a plain transport-backoff delay, with the
1210
+ // CURRENT (unchanged) token and creds left untouched. Used when a reactive
1211
+ // refresh is transient/skipped (§B: a transient refresh failure NEVER
1212
+ // auto-logs-out and NEVER stops — keep retrying with the current token) and
1213
+ // when the refresh-reconnect loop is capped (§A.4). Carries the refresh
1214
+ // manager's latch + min-interval state so the guards keep bounding the loop.
1215
+ const scheduleTransportBackoffReconnect = (reason: string): void => {
1216
+ if (abortSignal.aborted || autoLoggedOut || reconnectWithRefreshedToken) return;
1217
+ reconnectWithRefreshedToken = true; // suppress the auth-failed teardown path.
1218
+ refreshManager.stop();
1219
+ activeClients.delete(accountId);
1220
+ finishCurrentConnection({
1221
+ state: "disconnected",
1222
+ closeCode: 1000,
1223
+ closeReason: "transport backoff reconnect",
1224
+ });
1225
+ try {
1226
+ client.close();
1227
+ } catch {
1228
+ // best effort
1229
+ }
1230
+ const attempt = (params.refreshReconnectDepth ?? 0) + 1;
1231
+ const delayMs =
1232
+ params.transportBackoffDelayMs ??
1233
+ Math.min(TRANSPORT_BACKOFF_MAX_MS, TRANSPORT_BACKOFF_BASE_MS * 2 ** Math.max(0, attempt - 1));
1234
+ log?.info?.(
1235
+ `[${accountId}] clawchat-plugin-openclaw reactive refresh ${reason}; backoff-reconnect with current token delayMs=${delayMs}`,
1236
+ );
1237
+ const managerState = refreshManager.exportState();
1238
+ const streak = nextRefreshReconnectStreak();
1239
+ const reEnter = () => {
1240
+ if (abortSignal.aborted) return;
1241
+ void startOpenclawClawlingGateway({
1242
+ ...params,
1243
+ account: { ...params.account },
1244
+ transportBackoffReconnect: true,
1245
+ refreshReconnectDepth: streak.depth,
1246
+ refreshReconnectWindowStartedAt: streak.windowStartedAt,
1247
+ refreshManagerState: managerState,
1248
+ });
1249
+ };
1250
+ const timer = params.backoffTimer ?? ((cb, ms) => void setTimeout(cb, ms));
1251
+ timer(reEnter, delayMs);
1252
+ };
1253
+
1254
+ // §A/§D — close the live WS and re-enter the gateway with the rotated token
1255
+ // (a token only enters via a fresh `connect` envelope; it cannot be hot-swapped
1256
+ // onto a live socket). Assumes a refresh ALREADY succeeded and swapped the
1257
+ // in-memory token (proactive path), or is called by `runRefreshAndReconnect`
1258
+ // after its own successful refresh (reactive path). Carries the refresh
1259
+ // manager's latch + min-interval + reconnect-streak state across the re-enter.
1260
+ const closeAndReconnectWithRefreshedToken = async (reason: string): Promise<void> => {
1261
+ if (abortSignal.aborted || autoLoggedOut || reconnectWithRefreshedToken) return;
1262
+ reconnectWithRefreshedToken = true;
1263
+ const managerState = refreshManager.exportState();
1264
+ refreshManager.stop();
1265
+ activeClients.delete(accountId);
1266
+ log?.info?.(
1267
+ `[${accountId}] clawchat-plugin-openclaw token refreshed (${reason}); closing WS to reconnect with new token`,
1268
+ );
1269
+ finishCurrentConnection({
1270
+ state: "disconnected",
1271
+ closeCode: 1000,
1272
+ closeReason: "token refresh",
1273
+ });
1274
+ try {
1275
+ client.close();
1276
+ } catch {
1277
+ // best effort
1278
+ }
1279
+ if (abortSignal.aborted) return;
1280
+ const streak = nextRefreshReconnectStreak();
1281
+ // Re-enter with the rotated in-memory account; SQLite/config already hold
1282
+ // the rotated pair (persisted before the swap). Reuse the same device id.
1283
+ await startOpenclawClawlingGateway({
1284
+ ...params,
1285
+ account: {
1286
+ ...params.account,
1287
+ configured: true,
1288
+ token: account.token,
1289
+ userId: account.userId,
1290
+ ownerUserId: account.ownerUserId,
1291
+ },
1292
+ refreshReconnectDepth: streak.depth,
1293
+ refreshReconnectWindowStartedAt: streak.windowStartedAt,
1294
+ refreshManagerState: managerState,
1295
+ });
1296
+ };
1297
+ // Alias used by the proactive port (refresh already succeeded + swapped).
1298
+ const runRefreshReconnect = closeAndReconnectWithRefreshedToken;
1299
+
1300
+ // §A/§B/§D — run a single-flight refresh and act on the outcome:
1301
+ // - success → close the live WS + re-enter with the rotated token (§D).
1302
+ // - permanent→ the manager already auto-logged-out (§C); nothing more here.
1303
+ // - transient/skipped → §B: NEVER teardown. Backoff-reconnect with the CURRENT
1304
+ // token, creds + configured untouched, and keep retrying.
1305
+ // Returns "handled" when it took ownership of the next connection lifecycle
1306
+ // (reconnect scheduled / auto-logout), "fallthrough" when the caller should run
1307
+ // its own path (only when aborted mid-flight).
1308
+ const runRefreshAndReconnect = async (reason: string): Promise<"handled" | "fallthrough"> => {
1309
+ if (abortSignal.aborted || autoLoggedOut || reconnectWithRefreshedToken) return "handled";
1310
+ // §A.4 — if the refresh-driven reconnect loop is already capped, do not run
1311
+ // another refresh; fall back to plain transport backoff with the current
1312
+ // token so a rotate-then-reject server cannot loop forever with no backoff.
1313
+ if ((params.refreshReconnectDepth ?? 0) >= MAX_REFRESH_RECONNECTS) {
1314
+ log?.error?.(
1315
+ `[${accountId}] clawchat-plugin-openclaw refresh-reconnect loop capped (depth=${params.refreshReconnectDepth}); backoff-reconnect with current token`,
1316
+ );
1317
+ scheduleTransportBackoffReconnect("refresh-reconnect-capped");
1318
+ return "handled";
1319
+ }
1320
+ const outcome = await refreshManager.refresh(reason);
1321
+ if (abortSignal.aborted) return "fallthrough";
1322
+ if (autoLoggedOut) return "handled"; // permanent → manager auto-logged-out.
1323
+ if (outcome.kind === "success") {
1324
+ await closeAndReconnectWithRefreshedToken(reason);
1325
+ return "handled";
1326
+ }
1327
+ // §B — transient / skipped (in-flight / min-interval / rejected-latch /
1328
+ // no-refresh-token): keep the WS in backoff with the CURRENT token; do NOT
1329
+ // teardown. (no-refresh-token has no path to recover, but tearing down is
1330
+ // wrong per §B; backoff keeps the supervisor alive without a refresh storm.)
1331
+ scheduleTransportBackoffReconnect(`refresh-${outcome.kind}`);
1332
+ return "handled";
1333
+ };
1334
+
1335
+ // §A.2.1 — run an authenticated REST call; on a 401/403 (`ClawlingApiError`
1336
+ // kind "auth") run the single-flight refresh and retry the call ONCE with a
1337
+ // rebuilt api-client. Any other error propagates. Used to wrap the REST
1338
+ // api-client so metadata/profile calls survive an expired access token
1339
+ // without waiting for the WS handshake.
1340
+ const isRestAuthError = (err: unknown): boolean =>
1341
+ err instanceof ClawlingApiError && err.kind === "auth";
1342
+ const withRefresh = async <T>(call: () => Promise<T>): Promise<T> => {
1343
+ try {
1344
+ return await call();
1345
+ } catch (err) {
1346
+ if (!isRestAuthError(err) || abortSignal.aborted) throw err;
1347
+ const outcome = await refreshManager.refresh("rest-401");
1348
+ if (outcome.kind !== "success") throw err;
1349
+ // The in-memory swap already invalidated the cached api-client; the next
1350
+ // `call()` rebuilds it with the fresh token.
1351
+ return await call();
1352
+ }
1353
+ };
1354
+ // Activate the REST proxy's refresh wrapper now that the manager exists.
1355
+ restWithRefresh = withRefresh;
1356
+
1357
+ // §A.2/§B — handle a WS hello-fail(auth) by gating a reactive refresh on the
1358
+ // reason classification:
1359
+ // - token-rejected → refresh. Success reconnects with the fresh token;
1360
+ // permanent auto-logs-out; transient/skipped backoff-reconnects with the
1361
+ // CURRENT token (§B: a transient refresh failure NEVER auto-logs-out and
1362
+ // NEVER stops — `runRefreshAndReconnect` owns all three).
1363
+ // - generic + token near expiry → same refresh path.
1364
+ // - generic + token NOT near expiry → §A.2: transient backoff with the current
1365
+ // token (NO refresh, NO teardown). A backend outage emitting a generic
1366
+ // reason must not trigger a refresh storm OR a spurious logout.
1367
+ // - auth-unavailable never reaches here (the ws-client routes it as a
1368
+ // TransportError so its own backoff loop handles it).
1369
+ const handleWsAuthFailure = async (reason: string): Promise<void> => {
1370
+ if (abortSignal.aborted || reconnectWithRefreshedToken || autoLoggedOut) return;
1371
+ const klass = classifyHelloFailReason(reason);
1372
+ const eligible =
1373
+ klass === "token-rejected" ||
1374
+ (klass === "generic" && refreshManager.isNearExpiry(activatedAtMs));
1375
+ if (eligible) {
1376
+ // `runRefreshAndReconnect` is total: it either reconnects (success),
1377
+ // auto-logs-out (permanent), or backoff-reconnects with the current token
1378
+ // (transient/skipped). No teardown path remains for an eligible hello-fail.
1379
+ await runRefreshAndReconnect("ws-hello-fail");
1380
+ return;
1381
+ }
1382
+ // §A.2 / Finding 5 — generic + token NOT near expiry: keep the WS in
1383
+ // transport backoff with the current token. Do NOT refresh and do NOT tear
1384
+ // the account down (the old teardown wrongly flipped configured:false).
1385
+ scheduleTransportBackoffReconnect("hello-fail-generic-not-near");
1386
+ };
1387
+
826
1388
  setAlignedOutboundLogContext(client, wsLogContext);
827
1389
  client.on("hello:ok", (env: Envelope) => {
828
1390
  const payload = env.payload && typeof env.payload === "object"
@@ -837,6 +1399,11 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
837
1399
  send: () => {},
838
1400
  context: wsLogContext,
839
1401
  });
1402
+ const notifySignalObserver = createNotifySignalObserver({
1403
+ accountId,
1404
+ log: (msg) => log?.info?.(msg),
1405
+ context: wsLogContext,
1406
+ });
840
1407
  const logAuthFailure = (reason: string) => {
841
1408
  if (authFailureLogged) return;
842
1409
  authFailureLogged = true;
@@ -861,6 +1428,7 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
861
1428
  client.on("state", ({ from, to }) => {
862
1429
  log?.info?.(`[${accountId}] clawchat-plugin-openclaw state ${from} -> ${to}`);
863
1430
  wsReady = to === "connected";
1431
+ if (to === "connected") wsReadyEverThisAttempt = true;
864
1432
  if (to === "connecting") {
865
1433
  reconnectTracker.connectStart();
866
1434
  currentAttemptStartedAt = Date.now();
@@ -931,8 +1499,15 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
931
1499
  }
932
1500
  void refreshConversationCacheAfterReady();
933
1501
  void dispatchActivationBootstrap();
1502
+ // §A.1 — arm the proactive refresh timer from the live token's `exp`
1503
+ // every time a connection becomes ready (re-armed after every refresh via
1504
+ // the gateway re-enter).
1505
+ refreshManager.armProactiveTimer(activatedAtMs);
934
1506
  } else if (to === "disconnected") {
935
1507
  reconnectTracker.markClosed();
1508
+ // §A.1 — clear the proactive timer on disconnect; it re-arms on the next
1509
+ // ready, or the gateway re-enter arms a fresh one.
1510
+ if (!reconnectWithRefreshedToken) refreshManager.disarmProactiveTimer();
936
1511
  }
937
1512
  const next = { ...getStatus(), ...mapClawlingStateToStatus(to as ClawlingState) };
938
1513
  setStatus(next);
@@ -1076,21 +1651,50 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
1076
1651
  void handleMetadataInvalidation(env);
1077
1652
  });
1078
1653
 
1654
+ client.on("notify:signal", (env: Envelope) => {
1655
+ // §9.4 reliable system notification. The plugin holds no friend/roster
1656
+ // cache (friends are fetched on demand via REST tools), so there is nothing
1657
+ // to invalidate — observe + dedup only. The live frame and its reliable
1658
+ // inbox replay carry the same event_id and collapse to one observation.
1659
+ notifySignalObserver.observe(env);
1660
+ });
1661
+
1662
+ client.on("replay:done", (env: Envelope) => {
1663
+ // §11.5 terminal control frame: device replay drained, live delivery begins.
1664
+ // Fires on every reconnect (even zero-backlog). Replayed messages are
1665
+ // processed inline, so this is a logged boundary marker, not a gate.
1666
+ log?.info?.(`[${accountId}] clawchat-plugin-openclaw replay.done trace=${env.trace_id}`);
1667
+ });
1668
+
1079
1669
  client.on("error", (err: unknown) => {
1080
1670
  const classified = classifyClawlingClientError(err);
1081
1671
  if (classified.kind === "auth") {
1082
- finishCurrentConnection({
1083
- state: "auth_failed",
1084
- error: lastHelloFailReason || classified.message,
1085
- });
1086
- logAuthFailure(classified.message);
1087
- setStatus({
1088
- ...getStatus(),
1089
- connected: false,
1090
- configured: false,
1091
- running: false,
1092
- lastError: classified.message,
1093
- });
1672
+ // §A.2 — a WS hello-fail(auth) on a LIVE (already-connected) session.
1673
+ // Attempt a gated reactive refresh instead of immediately tearing the account down. The
1674
+ // INITIAL-connect auth failure is owned by the `client.connect()` catch
1675
+ // below (which runs the refresh/backoff/teardown decision and the recursive
1676
+ // re-enter), so only react here once the session was previously ready —
1677
+ // otherwise we'd double-handle and the error handler's teardown would race
1678
+ // the catch's transient-backoff branch (wrongly flipping configured:false).
1679
+ if (!reconnectWithRefreshedToken && !autoLoggedOut && wsReadyEverThisAttempt) {
1680
+ void handleWsAuthFailure(lastHelloFailReason || classified.message);
1681
+ return;
1682
+ }
1683
+ // Not-ready (initial connect): the `client.connect()` catch below owns the
1684
+ // refresh/backoff/teardown DECISION and the status flip. Record the
1685
+ // connection as auth_failed here (bookkeeping — the ws-client's own 4001
1686
+ // close would otherwise finish it as a plain "disconnected") and log the
1687
+ // auth failure, but do NOT flip status here: a transient refresh / generic
1688
+ // backoff must leave configured untouched, and that decision lives in the
1689
+ // catch.
1690
+ if (!reconnectWithRefreshedToken && !autoLoggedOut) {
1691
+ finishCurrentConnection({
1692
+ state: "auth_failed",
1693
+ error: lastHelloFailReason || classified.message,
1694
+ });
1695
+ logAuthFailure(classified.message);
1696
+ }
1697
+ return;
1094
1698
  } else if (classified.kind === "transport") {
1095
1699
  finishCurrentConnection({ state: "transport_error", error: classified.message });
1096
1700
  const current = wsLogContext();
@@ -1348,7 +1952,13 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
1348
1952
  : {}),
1349
1953
  },
1350
1954
  ...(memoryRoot ? { extra: { memoryRoot } } : {}),
1351
- ...(turn.peer.kind === "group"
1955
+ // Deliver the rendered ClawChat per-turn prompt (owner agent_behavior,
1956
+ // metadata, peer/sender profile) to the host for ALL chat types. The host
1957
+ // appends `GroupSystemPrompt` to the system prompt regardless of chat
1958
+ // kind. Direct chats previously relied only on the `before_prompt_build`
1959
+ // staging hook, which is not applied by the host for DM sessions, so the
1960
+ // owner-configured behavior never reached the LLM in 1:1 chats.
1961
+ ...(turnPrompt
1352
1962
  ? { supplemental: { groupSystemPrompt: turnPrompt } }
1353
1963
  : {}),
1354
1964
  }) as MutableOpenClawReplyContext;
@@ -1672,14 +2282,47 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
1672
2282
  log?.info?.(`[${accountId}] clawchat-plugin-openclaw runtime client.connect() resolved`);
1673
2283
  } catch (err) {
1674
2284
  const classified = classifyClawlingClientError(err);
1675
- setStatus({
1676
- ...getStatus(),
1677
- connected: false,
1678
- configured: classified.kind !== "auth",
1679
- running: false,
1680
- lastError: classified.message,
1681
- });
1682
2285
  if (classified.kind === "auth") {
2286
+ // §A.2/§B — initial-connect hello-fail(auth). Do NOT pre-flip
2287
+ // configured:false here: a transient refresh must leave creds + configured
2288
+ // untouched (§B). Branch on the refresh-eligibility classification first.
2289
+ const klass = classifyHelloFailReason(lastHelloFailReason || classified.message);
2290
+ const eligible =
2291
+ !reconnectWithRefreshedToken &&
2292
+ !autoLoggedOut &&
2293
+ !abortSignal.aborted &&
2294
+ Boolean(latestRefreshToken) &&
2295
+ (klass === "token-rejected" ||
2296
+ (klass === "generic" && refreshManager.isNearExpiry(activatedAtMs)));
2297
+ if (eligible) {
2298
+ // Total: success reconnects, permanent auto-logs-out, transient/skipped
2299
+ // backoff-reconnects with the current token (creds + configured intact).
2300
+ await runRefreshAndReconnect("ws-initial-connect-auth");
2301
+ return;
2302
+ }
2303
+ // §A.2 / Finding 5 — generic + token NOT near expiry (a refresh token is
2304
+ // present, but the failure isn't refresh-eligible): keep the WS in transport backoff
2305
+ // with the current token instead of tearing the account down.
2306
+ if (
2307
+ klass === "generic" &&
2308
+ Boolean(latestRefreshToken) &&
2309
+ !reconnectWithRefreshedToken &&
2310
+ !autoLoggedOut &&
2311
+ !abortSignal.aborted
2312
+ ) {
2313
+ scheduleTransportBackoffReconnect("initial-connect-generic-not-near");
2314
+ return;
2315
+ }
2316
+ // Neither refresh-eligible nor a generic backoff case (e.g. no usable refresh token) — fall back to the
2317
+ // legacy auth-failed teardown so the gateway flips not-configured and (for
2318
+ // a sqlite-sourced account) re-enters wait-for-activation.
2319
+ setStatus({
2320
+ ...getStatus(),
2321
+ connected: false,
2322
+ configured: false,
2323
+ running: false,
2324
+ lastError: classified.message,
2325
+ });
1683
2326
  finishCurrentConnection({
1684
2327
  state: "auth_failed",
1685
2328
  error: lastHelloFailReason || classified.message,
@@ -1700,6 +2343,13 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
1700
2343
  }
1701
2344
  return;
1702
2345
  }
2346
+ setStatus({
2347
+ ...getStatus(),
2348
+ connected: false,
2349
+ configured: true,
2350
+ running: false,
2351
+ lastError: classified.message,
2352
+ });
1703
2353
  log?.error?.(
1704
2354
  `[${accountId}] clawchat-plugin-openclaw connect failed (${classified.kind}): ${classified.message}`,
1705
2355
  );
@@ -1719,6 +2369,8 @@ export async function startOpenclawClawlingGateway(params: StartGatewayParams):
1719
2369
  log?.info?.(`[${accountId}] clawchat-plugin-openclaw runtime abort received; closing client`);
1720
2370
  activeClients.delete(accountId);
1721
2371
  closingForAbort = true;
2372
+ // §A.1 — stop the proactive refresh timer on shutdown.
2373
+ refreshManager.stop();
1722
2374
  groupCoalescer.cancelAll();
1723
2375
  finishCurrentConnection({
1724
2376
  state: "disconnected",