metame-cli 1.6.1 → 1.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -282,8 +282,6 @@ function ensureLaunchdPlist({ daemonScript, daemonLog }) {
282
282
  <string>${LAUNCHD_LABEL}</string>
283
283
  <key>ProgramArguments</key>
284
284
  <array>
285
- <string>/usr/bin/caffeinate</string>
286
- <string>-i</string>
287
285
  <string>${nodePath}</string>
288
286
  <string>${daemonScript}</string>
289
287
  </array>
@@ -2717,10 +2715,8 @@ try {
2717
2715
  } catch { /* PID file stale, daemon not running */ }
2718
2716
  }
2719
2717
  if (!daemonRunning) {
2720
- const _isMac = process.platform === 'darwin';
2721
- const dCmd = _isMac ? 'caffeinate' : process.execPath;
2722
- const dArgs = _isMac ? ['-i', process.execPath, _daemonScript] : [_daemonScript];
2723
- const bg = spawn(dCmd, dArgs, {
2718
+ const dArgs = [_daemonScript];
2719
+ const bg = spawn(process.execPath, dArgs, {
2724
2720
  detached: true,
2725
2721
  stdio: 'ignore',
2726
2722
  windowsHide: true,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "metame-cli",
3
- "version": "1.6.1",
3
+ "version": "1.6.2",
4
4
  "description": "The Cognitive Profile Layer for Claude Code. Knows how you think, not just what you said.",
5
5
  "main": "index.js",
6
6
  "bin": {
@@ -736,21 +736,24 @@ function createBridgeStarter(deps) {
736
736
  const { createBot } = require('./feishu-adapter.js');
737
737
  const bot = createBot(config.feishu);
738
738
 
739
- // Validate credentials before starting WebSocket — fail loud, not silent
739
+ // Credential pre-check is informational only. We always start the WS
740
+ // pipeline — it has its own network-ready-probe + backoff reconnect, so
741
+ // even if startup lands in a "just woke / network flaky" window, recovery
742
+ // is automatic instead of requiring a manual daemon restart.
740
743
  try {
741
744
  const validation = await bot.validateCredentials();
742
745
  if (!validation.ok) {
743
- log('ERROR', `Feishu credential check FAILED: ${validation.error}`);
744
746
  if (validation.isAuthError) {
745
- log('ERROR', 'Feishu bridge will NOT start fix app_id/app_secret in ~/.metame/daemon.yaml and restart daemon');
746
- return null;
747
+ log('ERROR', `Feishu credential check FAILED (likely bad app_id/app_secret): ${validation.error}`);
748
+ log('WARN', 'Starting bridge anyway — if this persists, fix ~/.metame/daemon.yaml and restart daemon');
749
+ } else {
750
+ log('WARN', `Feishu credential pre-check failed (transient): ${validation.error} — WS pipeline will retry`);
747
751
  }
748
- log('WARN', 'Feishu credential check failed (possibly network issue) — attempting to start anyway');
749
752
  } else {
750
753
  log('INFO', 'Feishu credentials validated OK');
751
754
  }
752
755
  } catch (e) {
753
- log('WARN', `Feishu credential pre-check error: ${e.message} — attempting to start anyway`);
756
+ log('WARN', `Feishu credential pre-check error: ${e.message} — WS pipeline will retry`);
754
757
  }
755
758
 
756
759
  try {
@@ -8,6 +8,7 @@
8
8
 
9
9
  const fs = require('fs');
10
10
  const path = require('path');
11
+ const dns = require('dns');
11
12
 
12
13
  let Lark;
13
14
  function _tryRequireLark() {
@@ -58,6 +59,40 @@ function withTimeout(promise, ms = 10000) {
58
59
  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
59
60
  }
60
61
 
62
+ // Wait for DNS to resolve a target host with exponential backoff.
63
+ // Used after system wake / before reconnect: the OS may report clock/events
64
+ // restored before WiFi+DNS are actually usable. Retries 1/2/4/8s, total cap 30s.
65
+ async function waitForNetworkReady(hostname, opts = {}) {
66
+ const log = opts.log || (() => {});
67
+ const totalBudget = Number.isFinite(opts.totalBudgetMs) ? opts.totalBudgetMs : 30000;
68
+ const lookup = opts.lookup || dns.promises.lookup;
69
+ const sleep = opts.sleep || ((ms) => new Promise((r) => setTimeout(r, ms)));
70
+ const startedAt = Date.now();
71
+ let attempt = 0;
72
+ let lastError = null;
73
+ // Backoff schedule: 0s, 1s, 2s, 4s, 8s between attempts (before the next attempt)
74
+ const backoff = [0, 1000, 2000, 4000, 8000];
75
+ // Always make at least one attempt; subsequent attempts are budget-gated.
76
+ do {
77
+ const wait = backoff[Math.min(attempt, backoff.length - 1)];
78
+ if (wait > 0) await sleep(wait);
79
+ attempt += 1;
80
+ try {
81
+ await lookup(hostname);
82
+ return { ok: true, attempts: attempt, elapsed: Date.now() - startedAt };
83
+ } catch (err) {
84
+ lastError = err;
85
+ log('DEBUG', `[net-ready] ${hostname} attempt ${attempt} failed: ${err.code || err.message}`);
86
+ }
87
+ } while (Date.now() - startedAt < totalBudget);
88
+ return {
89
+ ok: false,
90
+ attempts: attempt,
91
+ elapsed: Date.now() - startedAt,
92
+ error: lastError && (lastError.message || String(lastError)),
93
+ };
94
+ }
95
+
61
96
  // Max chars per lark_md element (Feishu limit ~4000)
62
97
  const MAX_CHUNK = 3800;
63
98
 
@@ -101,12 +136,25 @@ function createBot(config) {
101
136
  return { ok: true };
102
137
  } catch (err) {
103
138
  const msg = err && err.message || String(err);
104
- const isAuthError = /invalid|unauthorized|forbidden|token|credential|app_id|app_secret|permission|99991663|99991664|99991665/i.test(msg);
139
+ // Only flag as auth error when we have strong evidence: known Feishu
140
+ // auth error codes, HTTP 401/403, or explicit 'invalid app_id/secret'.
141
+ // Previously a loose /token/ regex false-positived on SDK-internal
142
+ // messages like "Cannot destructure 'tenant_access_token' of undefined"
143
+ // (which is really a network/empty-response failure) and caused the
144
+ // bridge to refuse to start across a lid-close/wake cycle.
145
+ const authPatterns = [
146
+ /\b(99991663|99991664|99991665)\b/, // Feishu token invalid codes
147
+ /\b(401|403)\b/, // HTTP 401/403
148
+ /invalid\s+(app_?id|app_?secret|tenant_access_token|access_?token)/i,
149
+ /unauthorized/i,
150
+ /\bforbidden\b/i,
151
+ ];
152
+ const isAuthError = authPatterns.some((p) => p.test(msg));
105
153
  return {
106
154
  ok: false,
107
155
  error: isAuthError
108
156
  ? `Feishu credential validation failed (app_id/app_secret may be incorrect): ${msg}`
109
- : `Feishu API probe failed (network or config issue): ${msg}`,
157
+ : `Feishu API probe failed (network or transient issue): ${msg}`,
110
158
  isAuthError,
111
159
  };
112
160
  }
@@ -395,15 +443,22 @@ function createBot(config) {
395
443
  let healthTimer = null;
396
444
  let sleepWakeTimer = null;
397
445
  let reconnectTimer = null;
398
- let reconnectDelay = 5000; // start 5s, doubles up to 60s
446
+ let aliveTimer = null;
447
+ let reconnectScheduled = false; // dedup flag: true while a reconnect is pending
448
+ let wsEpoch = 0; // increments each connect(); underlying-ws hooks capture their own epoch
449
+ const INITIAL_RECONNECT_DELAY = 5000;
399
450
  const MAX_RECONNECT_DELAY = 60000;
400
- const HEALTH_CHECK_INTERVAL = 90000; // check every 90s
401
- const SILENT_THRESHOLD = 300000; // 5 min no SDK activity → suspect dead
402
- const SLEEP_DETECT_INTERVAL = 5000; // tick every 5s to detect clock jump
403
- const SLEEP_JUMP_THRESHOLD = 30000; // clock jump >30s = system was sleeping
451
+ let reconnectDelay = INITIAL_RECONNECT_DELAY;
452
+ const HEALTH_CHECK_INTERVAL = 30000; // tighter bottom-line probe (was 90s)
453
+ const SILENT_THRESHOLD = 90000; // 90s of no SDK activity → probe (was 300s)
454
+ const SLEEP_DETECT_INTERVAL = 5000;
455
+ const SLEEP_JUMP_THRESHOLD = 30000; // clock jump >30s = was sleeping
456
+ const ALIVE_CHECK_WINDOW = 15000; // after connect, must see activity within 15s
457
+ const FEISHU_HOST = 'open.feishu.cn';
404
458
 
405
459
  // Track last SDK activity (any event received = alive)
406
460
  let _lastActivityAt = Date.now();
461
+ let _connectedAt = 0; // when the current WSClient was (re)started
407
462
  function touchActivity() { _lastActivityAt = Date.now(); }
408
463
 
409
464
  // Dedup: track recent message_ids (Feishu may redeliver on slow ack)
@@ -490,58 +545,162 @@ function createBot(config) {
490
545
  });
491
546
  }
492
547
 
548
+ // Hook the underlying ws instance for first-class close/error notification.
549
+ // Lark SDK stores the live WebSocket via wsConfig.setWSInstance; we wrap it
550
+ // so we learn about 'close' immediately instead of waiting for silence.
551
+ // Defensive: SDK internals can change between versions — any failure just
552
+ // downgrades to the silent/health/sleep bottom-lines.
553
+ function hookUnderlyingWs(wsClient, epoch) {
554
+ try {
555
+ const cfg = wsClient && wsClient.wsConfig;
556
+ if (!cfg || typeof cfg.setWSInstance !== 'function') return;
557
+ const orig = cfg.setWSInstance.bind(cfg);
558
+ cfg.setWSInstance = (inst) => {
559
+ orig(inst);
560
+ if (!inst || inst._metameHooked) return;
561
+ inst._metameHooked = true;
562
+ try {
563
+ inst.on('close', () => {
564
+ if (stopped) return;
565
+ if (epoch !== wsEpoch) return; // stale: a newer connect() has superseded this one
566
+ _log('INFO', 'Feishu underlying WS closed — scheduling reconnect');
567
+ scheduleReconnect({ immediate: true, reason: 'ws-close' });
568
+ });
569
+ inst.on('error', (e) => {
570
+ if (epoch !== wsEpoch) return;
571
+ _log('WARN', `Feishu underlying WS error: ${e && e.message || e}`);
572
+ });
573
+ } catch (hookErr) {
574
+ _log('WARN', `Feishu ws event hook failed: ${hookErr.message}`);
575
+ }
576
+ };
577
+ } catch (err) {
578
+ _log('WARN', `Feishu SDK hook unavailable (${err.message}) — falling back to silence/sleep detection`);
579
+ }
580
+ }
581
+
493
582
  function connect() {
494
583
  if (stopped) return;
584
+ clearTimeout(aliveTimer);
585
+ wsEpoch += 1;
586
+ const myEpoch = wsEpoch;
587
+ let ws;
495
588
  try {
496
- currentWs = new Lark.WSClient({
589
+ ws = new Lark.WSClient({
497
590
  appId: app_id,
498
591
  appSecret: app_secret,
499
592
  loggerLevel: Lark.LoggerLevel.info,
593
+ autoReconnect: false, // we own the reconnect lifecycle
500
594
  });
595
+ currentWs = ws;
596
+ hookUnderlyingWs(ws, myEpoch);
501
597
  const eventDispatcher = buildDispatcher();
502
- currentWs.start({ eventDispatcher });
598
+ const startResult = ws.start({ eventDispatcher });
599
+ _connectedAt = Date.now();
503
600
  touchActivity();
504
- reconnectDelay = 5000; // reset backoff on successful start
505
601
  _log('INFO', 'Feishu WebSocket connecting...');
602
+ startAliveCheck();
603
+ // start() may return a Promise. Surface async failures into the reconnect pipeline
604
+ // so we don't depend solely on the 15s alive-check to recover.
605
+ if (startResult && typeof startResult.then === 'function') {
606
+ startResult.catch((err) => {
607
+ if (stopped) return;
608
+ if (myEpoch !== wsEpoch) return; // superseded
609
+ _log('ERROR', `Feishu WSClient.start rejected: ${err && err.message || err}`);
610
+ scheduleReconnect({ immediate: true, reason: 'start-rejected', failed: true });
611
+ });
612
+ }
506
613
  } catch (err) {
507
614
  _log('ERROR', `Feishu WSClient.start failed: ${err.message}`);
508
- scheduleReconnect();
615
+ scheduleReconnect({ immediate: true, reason: 'start-failed', failed: true });
509
616
  }
510
617
  }
511
618
 
512
- function scheduleReconnect() {
619
+ // Single entry point for all reconnect signals. Dedup'd via reconnectScheduled
620
+ // so concurrent ws-close + alive-probe-fail + sleep events collapse into one
621
+ // reconnect. Backoff only grows when the caller marks this as a failure recovery
622
+ // (failed:true) — known-cause resets (manual / system-wake) start from 0s.
623
+ function scheduleReconnect({ immediate = false, reason = '', failed = false } = {}) {
513
624
  if (stopped) return;
625
+ if (reconnectScheduled) {
626
+ _log('DEBUG', `Feishu reconnect already scheduled — dropping duplicate (reason: ${reason})`);
627
+ return;
628
+ }
629
+ reconnectScheduled = true;
514
630
  clearTimeout(reconnectTimer);
515
- _log('INFO', `Feishu reconnecting in ${reconnectDelay / 1000}s...`);
516
- reconnectTimer = setTimeout(() => {
517
- _log('INFO', 'Feishu reconnecting now...');
631
+ clearTimeout(aliveTimer);
632
+ try { currentWs?.stop?.(); } catch { /* ignore */ }
633
+ currentWs = null;
634
+ if (failed) {
635
+ // Only failure paths grow the backoff ceiling for the *next* attempt.
636
+ reconnectDelay = Math.min(reconnectDelay * 2, MAX_RECONNECT_DELAY);
637
+ }
638
+ const delay = immediate ? 0 : reconnectDelay;
639
+ _log('INFO', `Feishu reconnect in ${Math.round(delay / 1000)}s (reason: ${reason || 'unspecified'})`);
640
+ reconnectTimer = setTimeout(async () => {
641
+ reconnectScheduled = false;
642
+ if (stopped) return;
643
+ const net = await waitForNetworkReady(FEISHU_HOST, { log: _log });
644
+ if (stopped) return;
645
+ if (!net.ok) {
646
+ _log('WARN', `Feishu network still down after ${Math.round(net.elapsed / 1000)}s (${net.error || 'unknown'}) — retrying`);
647
+ scheduleReconnect({ immediate: false, reason: 'network-wait-timeout', failed: true });
648
+ return;
649
+ }
650
+ if (net.attempts > 1) {
651
+ _log('INFO', `Feishu network ready after ${net.attempts} attempts (${Math.round(net.elapsed / 1000)}s)`);
652
+ }
518
653
  connect();
519
- }, reconnectDelay);
520
- reconnectDelay = Math.min(reconnectDelay * 2, MAX_RECONNECT_DELAY);
654
+ }, delay);
521
655
  }
522
656
 
523
- // Health check: detect silent WebSocket death via API probe
657
+ // Alive-check: after each connect, require either SDK activity or a
658
+ // successful API probe within ALIVE_CHECK_WINDOW. Otherwise reconnect.
659
+ // This catches the "WSClient.start returned but underlying socket is
660
+ // dead" case that the 120s SDK loop would otherwise sit on.
661
+ function startAliveCheck() {
662
+ clearTimeout(aliveTimer);
663
+ const connectedAt = _connectedAt;
664
+ aliveTimer = setTimeout(async () => {
665
+ if (stopped) return;
666
+ if (_lastActivityAt > connectedAt) {
667
+ // SDK delivered at least one event strictly after connect → healthy.
668
+ // Using `>` (not `>=`) because connect() calls touchActivity(), so
669
+ // _lastActivityAt === _connectedAt at connect time — `>=` would
670
+ // false-positive immediately without any real post-connect activity.
671
+ reconnectDelay = INITIAL_RECONNECT_DELAY;
672
+ return;
673
+ }
674
+ try {
675
+ await withTimeout(client.im.chat.list({ params: { page_size: 1 } }), 8000);
676
+ touchActivity();
677
+ reconnectDelay = INITIAL_RECONNECT_DELAY;
678
+ _log('INFO', 'Feishu alive probe ok');
679
+ } catch (err) {
680
+ _log('WARN', `Feishu alive probe failed: ${err.message} — reconnecting`);
681
+ scheduleReconnect({ immediate: true, reason: 'alive-probe-failed', failed: true });
682
+ }
683
+ }, ALIVE_CHECK_WINDOW);
684
+ }
685
+
686
+ // Health check: bottom-line probe for silent dead-sockets the hooks missed.
524
687
  function startHealthCheck() {
525
688
  clearInterval(healthTimer);
526
689
  healthTimer = setInterval(async () => {
527
690
  if (stopped) return;
528
691
  const silentMs = Date.now() - _lastActivityAt;
529
- if (silentMs < SILENT_THRESHOLD) return; // recently active, skip
530
- // Probe: try a lightweight API call to verify token + connectivity
692
+ if (silentMs < SILENT_THRESHOLD) return;
531
693
  try {
532
- await withTimeout(client.im.chat.list({ params: { page_size: 1 } }), 10000);
533
- // API works — connection might still be alive, just quiet. Reset activity.
694
+ await withTimeout(client.im.chat.list({ params: { page_size: 1 } }), 8000);
534
695
  touchActivity();
535
696
  } catch (err) {
536
697
  _log('WARN', `Feishu health check failed after ${Math.round(silentMs / 1000)}s silence: ${err.message} — reconnecting`);
537
- try { currentWs?.stop?.(); } catch { /* ignore */ }
538
- currentWs = null;
539
- connect();
698
+ scheduleReconnect({ immediate: true, reason: 'health-probe-failed', failed: true });
540
699
  }
541
700
  }, HEALTH_CHECK_INTERVAL);
542
701
  }
543
702
 
544
- // Sleep/wake detector: if the JS clock jumps >30s, system was sleeping → force reconnect
703
+ // Sleep/wake detector: a JS clock jump >30s means the system was suspended.
545
704
  function startSleepWakeDetector() {
546
705
  let _lastTickAt = Date.now();
547
706
  sleepWakeTimer = setInterval(() => {
@@ -550,13 +709,9 @@ function createBot(config) {
550
709
  const elapsed = now - _lastTickAt;
551
710
  _lastTickAt = now;
552
711
  if (elapsed > SLEEP_JUMP_THRESHOLD) {
553
- _log('INFO', `System wake detected (${Math.round(elapsed / 1000)}s gap) — forcing reconnect`);
554
- reconnectDelay = 5000;
555
- clearTimeout(reconnectTimer);
556
- try { currentWs?.stop?.(); } catch { /* ignore */ }
557
- currentWs = null;
558
- touchActivity(); // reset silence counter so health check doesn't double-fire
559
- connect();
712
+ _log('INFO', `Feishu system wake detected (${Math.round(elapsed / 1000)}s gap) — reconnecting`);
713
+ reconnectDelay = INITIAL_RECONNECT_DELAY; // wake is a known cause, not a failure
714
+ scheduleReconnect({ immediate: true, reason: 'system-wake' });
560
715
  }
561
716
  }, SLEEP_DETECT_INTERVAL);
562
717
  }
@@ -570,17 +725,16 @@ function createBot(config) {
570
725
  stop() {
571
726
  stopped = true;
572
727
  clearTimeout(reconnectTimer);
728
+ clearTimeout(aliveTimer);
573
729
  clearInterval(healthTimer);
574
730
  clearInterval(sleepWakeTimer);
731
+ try { currentWs?.stop?.(); } catch { /* ignore */ }
575
732
  currentWs = null;
576
733
  },
577
734
  reconnect() {
578
735
  _log('INFO', 'Feishu manual reconnect triggered');
579
- reconnectDelay = 5000;
580
- clearTimeout(reconnectTimer);
581
- try { currentWs?.stop?.(); } catch { /* ignore */ }
582
- currentWs = null;
583
- connect();
736
+ reconnectDelay = INITIAL_RECONNECT_DELAY;
737
+ scheduleReconnect({ immediate: true, reason: 'manual' });
584
738
  },
585
739
  isAlive() {
586
740
  return !stopped && (Date.now() - _lastActivityAt) < SILENT_THRESHOLD;
@@ -592,4 +746,4 @@ function createBot(config) {
592
746
  };
593
747
  }
594
748
 
595
- module.exports = { createBot };
749
+ module.exports = { createBot, _internal: { waitForNetworkReady } };
@@ -159,23 +159,53 @@ function saveProviders(config) {
159
159
  // PROVIDER ENV BUILDER (Core mechanism)
160
160
  // ---------------------------------------------------------
161
161
 
162
+ /**
163
+ * Read the env mapping defined in ~/.claude/settings.json.
164
+ * Returns a plain string→string object (only string values are kept).
165
+ * Returns {} on any error or if the file/env block is missing.
166
+ */
167
+ function readClaudeSettingsEnv() {
168
+ const home = process.env.HOME || os.homedir();
169
+ const settingsPath = path.join(home, '.claude', 'settings.json');
170
+ try {
171
+ if (!fs.existsSync(settingsPath)) return {};
172
+ const data = JSON.parse(fs.readFileSync(settingsPath, 'utf8'));
173
+ if (!data || typeof data.env !== 'object' || data.env === null) return {};
174
+ const out = {};
175
+ for (const [k, v] of Object.entries(data.env)) {
176
+ if (typeof v === 'string') out[k] = v;
177
+ }
178
+ return out;
179
+ } catch {
180
+ return {};
181
+ }
182
+ }
183
+
162
184
  /**
163
185
  * Build env var overrides for a named provider.
164
- * Returns {} for 'anthropic' (official) — use Claude Code defaults.
165
- * Returns { ANTHROPIC_BASE_URL, ANTHROPIC_API_KEY } for relays.
186
+ *
187
+ * Always inherits the env mapping from ~/.claude/settings.json (slot mappings
188
+ * like ANTHROPIC_DEFAULT_*_MODEL stay in place across providers).
189
+ * For 'anthropic' (official): returns the inherited Claude settings env unchanged.
190
+ * For custom providers: overrides ANTHROPIC_BASE_URL plus both
191
+ * ANTHROPIC_API_KEY and ANTHROPIC_AUTH_TOKEN with the provider's credentials.
166
192
  */
167
193
  function buildEnv(providerName) {
168
194
  const config = loadProviders();
169
195
  const name = providerName || config.active;
170
196
 
171
- if (name === 'anthropic') return {};
197
+ const env = readClaudeSettingsEnv();
198
+
199
+ if (name === 'anthropic') return env;
172
200
 
173
201
  const provider = config.providers[name];
174
- if (!provider) return {};
202
+ if (!provider) return env;
175
203
 
176
- const env = {};
177
204
  if (provider.base_url) env.ANTHROPIC_BASE_URL = provider.base_url;
178
- if (provider.api_key) env.ANTHROPIC_API_KEY = provider.api_key;
205
+ if (provider.api_key) {
206
+ env.ANTHROPIC_API_KEY = provider.api_key;
207
+ env.ANTHROPIC_AUTH_TOKEN = provider.api_key;
208
+ }
179
209
  return env;
180
210
  }
181
211
 
@@ -390,6 +420,7 @@ function getEngine() { return _currentEngine; }
390
420
  const api = {
391
421
  loadProviders,
392
422
  saveProviders,
423
+ readClaudeSettingsEnv,
393
424
  buildEnv,
394
425
  buildSpawnEnv,
395
426
  buildActiveEnv,