@debugg-ai/debugg-ai-mcp 2.0.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -7,12 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ### Added — tunnel fault-injection + trace harness for diagnosis
11
+
12
+ - New `DEBUGG_TUNNEL_FAULT_MODE` env var (dev/test only — inert when `NODE_ENV=production`) lets developers force specific ngrok-side failures without mocking, to reproduce client-reported transient "Tunnel setup failed" incidents. Modes: `fail-connect-N:<count>`, `empty-url-N:<count>`, `delay-connect:<ms>`, combinable with commas. Bead `42g`.
13
+ - Structured `TunnelTrace` captures timestamped lifecycle events per tunnel-create call (start, each connect attempt, fault inject, agent reset, backoff, success/fail). Dumped to WARN logs on any tunnel creation failure so real-world flakes get a post-mortem trail instead of an opaque error message.
14
+
10
15
  ### Fixed — tunnel provisioning flakiness surfaces as user-facing errors
11
16
 
12
17
  - `check_app_in_browser` / `trigger_crawl` now automatically retry transient tunnel-provision failures (5xx, 408, 429, network errors like ECONNRESET) with exponential backoff (500ms → 1500ms → 3000ms, 3 attempts). Previously a single ngrok/backend blip forced the caller to manually retry the tool call. Bead `7nx`.
18
+ - **ngrok.connect() retry widened from 2 to 3 attempts** with 500ms / 1500ms backoff. A client still hit "Tunnel setup failed" after `7nx` shipped — the failure was in the ngrok-listener-bringup path, not the backend-provision path. Auth errors still fail fast. Bead `ixh`.
13
19
  - Tunnel-provision error messages now carry structured diagnostic context — HTTP status, ngrok error code, backend `x-request-id`, retryable flag — so users have something actionable to file bug reports against instead of opaque "Tunnel setup failed". Bead `5wz`.
14
20
  - 4xx auth/quota errors (401/403/404) fail fast without retry to avoid loops against a bad API key.
15
- - New posthog telemetry event `tunnel.provision_retry` fires per retry attempt with outcome, status, and diagnostic fields so flaky provision rates become measurable.
21
+ - New posthog telemetry event `tunnel.provision_retry` fires per retry attempt with outcome, status, stage (`ngrok_connect` vs backend-provision), and diagnostic fields so flaky rates become measurable.
16
22
 
17
23
  ## [2.0.0] - 2026-04-23
18
24
 
@@ -0,0 +1,115 @@
1
+ /**
2
+ * Fault injection + trace collection for tunnel lifecycle debugging (bead 42g).
3
+ *
4
+ * This is a TEST/DEV harness. Activation requires BOTH:
5
+ * - NODE_ENV !== 'production'
6
+ * - DEBUGG_TUNNEL_FAULT_MODE env var explicitly set
7
+ *
8
+ * Modes (comma-separated, parseable by parseFaultMode):
9
+ * fail-connect-N:<count> — fail the first <count> ngrok.connect() attempts
10
+ * empty-url-N:<count> — return empty URL from first <count> connect() attempts
11
+ * delay-connect:<ms> — sleep <ms> before each connect() call
12
+ *
13
+ * Examples:
14
+ * DEBUGG_TUNNEL_FAULT_MODE=fail-connect-N:2
15
+ * DEBUGG_TUNNEL_FAULT_MODE=delay-connect:2000,fail-connect-N:1
16
+ */
17
/**
 * Parse a DEBUGG_TUNNEL_FAULT_MODE value into a fault-mode object.
 *
 * The value is a comma-separated list of `<name>:<digits>` tokens, where
 * `<name>` is one of `fail-connect-N`, `empty-url-N` or `delay-connect`.
 * Whitespace around tokens is ignored, and tokens that do not match are
 * skipped. When a name appears more than once, the last occurrence wins.
 *
 * @param {string | null | undefined} raw - Raw env-var value.
 * @returns {{ failConnectN?: number, emptyUrlN?: number, delayConnectMs?: number } | null}
 *   The parsed mode, or `null` when `raw` is empty, not a string, or contains
 *   no valid token.
 */
export function parseFaultMode(raw) {
    // Guard against non-string input so `.split` below can never throw.
    if (typeof raw !== 'string' || raw === '')
        return null;
    const keyByName = {
        'fail-connect-N': 'failConnectN',
        'empty-url-N': 'emptyUrlN',
        'delay-connect': 'delayConnectMs',
    };
    const mode = {};
    for (const token of raw.split(',').map((s) => s.trim()).filter(Boolean)) {
        const m = token.match(/^(fail-connect-N|empty-url-N|delay-connect):(\d+)$/);
        if (!m)
            continue;
        const [, name, valStr] = m;
        const val = Number.parseInt(valStr, 10);
        // A very long digit run parses to an imprecise number or Infinity, which
        // would make a "first N" counter never run out. Treat it as malformed.
        if (!Number.isSafeInteger(val))
            continue;
        mode[keyByName[name]] = val;
    }
    return Object.keys(mode).length > 0 ? mode : null;
}
36
/**
 * Resolve the active fault mode from the process environment.
 *
 * Returns `null` whenever NODE_ENV is exactly 'production'. Otherwise the
 * value of DEBUGG_TUNNEL_FAULT_MODE is handed to parseFaultMode and its
 * result is returned as-is.
 */
export function getFaultModeFromEnv() {
    const { NODE_ENV, DEBUGG_TUNNEL_FAULT_MODE } = process.env;
    return NODE_ENV === 'production' ? null : parseFaultMode(DEBUGG_TUNNEL_FAULT_MODE);
}
41
/**
 * Mutable fault-injection state built from a parsed fault mode.
 *
 * The two "first N" counters are decremented each time they trigger, so a
 * mode such as fail-connect-N:2 affects only the first two checks made on
 * this instance. A missing mode, or a missing field, counts as zero.
 */
export class FaultInjector {
    failConnectRemaining;
    emptyUrlRemaining;
    delayMs;
    /** @param mode Parsed fault mode, or null/undefined for "no faults". */
    constructor(mode) {
        const { failConnectN, emptyUrlN, delayConnectMs } = mode ?? {};
        this.failConnectRemaining = failConnectN ?? 0;
        this.emptyUrlRemaining = emptyUrlN ?? 0;
        this.delayMs = delayConnectMs ?? 0;
    }
    /** True while forced connect failures remain; each true result uses one up. */
    shouldFailConnect() {
        const inject = this.failConnectRemaining > 0;
        if (inject) {
            this.failConnectRemaining -= 1;
        }
        return inject;
    }
    /** True while forced empty-URL results remain; each true result uses one up. */
    shouldReturnEmptyUrl() {
        const inject = this.emptyUrlRemaining > 0;
        if (inject) {
            this.emptyUrlRemaining -= 1;
        }
        return inject;
    }
    /** Configured delay in ms; the same value is returned on every call. */
    delayMsForAttempt() {
        return this.delayMs;
    }
    /** Copy of the current counters and delay, for diagnostic logging. */
    snapshot() {
        const { failConnectRemaining, emptyUrlRemaining, delayMs } = this;
        return { failConnectRemaining, emptyUrlRemaining, delayMs };
    }
}
83
/**
 * Ordered, timestamped record of lifecycle events for one operation.
 *
 * Each event stores the wall-clock time it was emitted and the milliseconds
 * elapsed since `startTime`. The trace is meant to be dumped on failure
 * paths, so the read methods must not throw and hide the error being
 * reported.
 */
export class TunnelTrace {
    startTime;
    events = [];
    /** @param {number} [startTime] Epoch ms the trace is measured from; defaults to now. */
    constructor(startTime = Date.now()) {
        this.startTime = startTime;
    }
    /**
     * Append an event.
     * @param {string} event Event name.
     * @param {object} [context] Optional structured details for the event.
     */
    emit(event, context) {
        const now = Date.now();
        this.events.push({
            timestamp: now,
            elapsedMs: now - this.startTime,
            event,
            context,
        });
    }
    /**
     * Snapshot of the trace. `durationMs` is the elapsed time of the last
     * event (0 when empty). `events` is a copy, so later emit() calls do not
     * change a snapshot that was already handed out.
     */
    toJSON() {
        const last = this.events[this.events.length - 1];
        return {
            startTime: this.startTime,
            durationMs: last ? last.elapsedMs : 0,
            events: [...this.events],
        };
    }
    /** Human-readable one-line-per-event dump, newest last. Never throws on odd context values. */
    format() {
        // JSON.stringify throws on circular structures and BigInt values. Swallow
        // that here so a diagnostic dump cannot replace the error being reported.
        const describe = (context) => {
            try {
                return JSON.stringify(context);
            }
            catch (err) {
                const reason = err instanceof Error ? err.message : String(err);
                // Keep only the first line so each event stays on one line.
                return `[unserializable context: ${reason.split('\n')[0]}]`;
            }
        };
        return this.events
            .map((e) => {
            const ctx = e.context ? ` ${describe(e.context)}` : '';
            return `+${e.elapsedMs.toString().padStart(6)}ms ${e.event}${ctx}`;
        })
            .join('\n');
    }
}
@@ -20,6 +20,7 @@ import { Logger } from '../../utils/logger.js';
20
20
  import { Telemetry, TelemetryEvents } from '../../utils/telemetry.js';
21
21
  import { isLocalhostUrl, extractLocalhostPort, generateTunnelUrl } from '../../utils/urlParser.js';
22
22
  import { v4 as uuidv4 } from 'uuid';
23
+ import { FaultInjector, TunnelTrace, getFaultModeFromEnv } from './tunnelFaultInjection.js';
23
24
  import { getDefaultRegistry, } from './tunnelRegistry.js';
24
25
  let ngrokModule = null;
25
26
  async function getNgrok() {
@@ -48,6 +49,12 @@ class TunnelManager {
48
49
  pendingTunnels = new Map();
49
50
  initialized = false;
50
51
  TUNNEL_TIMEOUT_MS = 55 * 60 * 1000;
52
+ /**
53
+ * Backoff schedule (ms) between ngrok.connect() retry attempts. Bead ixh.
54
+ * Exposed on the class so tests can override with short delays without
55
+ * changing the public API or depending on jest fake timers.
56
+ */
57
+ connectBackoffMs = [500, 1500];
51
58
  constructor(reg = getDefaultRegistry()) {
52
59
  this.reg = reg;
53
60
  }
@@ -266,37 +273,109 @@ class TunnelManager {
266
273
  else {
267
274
  localAddr = inDocker ? `${dockerHost}:${port}` : port;
268
275
  }
276
+ // Bead ixh: 3-attempt retry for ngrok.connect transient failures. Previously
277
+ // only retried ONCE (with agent reset), which is insufficient against real
278
+ // ngrok / network flakes (client-reported incident 2026-04-24).
279
+ // - Attempt 1: fresh connect
280
+ // - Attempt 2: after 500ms backoff, reset the ngrok agent module and retry
281
+ // (existing "agent died" recovery path)
282
+ // - Attempt 3: after 1500ms backoff, retry with the already-reset agent
283
+ // Auth-token errors short-circuit at any attempt — no point looping.
284
+ const self = this;
285
+ // Bead 42g: fault injection + trace. Fault injection is only active when
286
+ // NODE_ENV !== 'production' AND DEBUGG_TUNNEL_FAULT_MODE is set; the trace is always collected.
287
+ const faultMode = getFaultModeFromEnv();
288
+ const faults = new FaultInjector(faultMode);
289
+ const trace = new TunnelTrace();
290
+ trace.emit('createTunnel.start', { port, tunnelId, hasFaultMode: !!faultMode });
269
291
  const connectWithRetry = async () => {
270
- try {
271
- const ngrok = await getNgrok();
272
- const url = await ngrok.connect({
273
- proto: 'http',
274
- addr: localAddr,
275
- hostname: tunnelDomain,
276
- authtoken: authToken,
277
- });
278
- if (!url)
279
- throw new Error('ngrok.connect() returned empty URL');
280
- return url;
281
- }
282
- catch (firstError) {
283
- // The ngrok agent process may have died after a previous disconnect.
284
- // Reset module state and retry once with a fresh agent.
285
- logger.warn(`ngrok.connect() failed, retrying with fresh agent: ${firstError}`);
286
- resetNgrokModule();
287
- this.initialized = false;
288
- await this.ensureInitialized();
289
- const ngrok = await getNgrok();
290
- const url = await ngrok.connect({
291
- proto: 'http',
292
- addr: localAddr,
293
- hostname: tunnelDomain,
294
- authtoken: authToken,
295
- });
296
- if (!url)
297
- throw new Error('ngrok.connect() returned empty URL after retry');
298
- return url;
292
+ const sleep = (ms) => new Promise((r) => setTimeout(r, ms));
293
+ const BACKOFF_MS = self.connectBackoffMs; // bead ixh: test-overridable
294
+ const MAX_ATTEMPTS = BACKOFF_MS.length + 1; // N sleeps between N+1 attempts
295
+ const connectOpts = {
296
+ proto: 'http',
297
+ addr: localAddr,
298
+ hostname: tunnelDomain,
299
+ authtoken: authToken,
300
+ };
301
+ let lastError;
302
+ for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
303
+ trace.emit('connect.attempt.start', { attempt });
304
+ // Optional fault-injected delay before each attempt.
305
+ const delayMs = faults.delayMsForAttempt();
306
+ if (delayMs > 0) {
307
+ trace.emit('connect.fault.delay', { attempt, delayMs });
308
+ await sleep(delayMs);
309
+ }
310
+ try {
311
+ const ngrok = await getNgrok();
312
+ // Fault-inject a synthetic failure BEFORE ngrok.connect runs so we
313
+ // can simulate connect-layer failures without hitting the real API.
314
+ if (faults.shouldFailConnect()) {
315
+ trace.emit('connect.fault.inject', { attempt, mode: 'fail-connect-N' });
316
+ throw new Error(`[fault-inject] synthetic connect failure (attempt ${attempt})`);
317
+ }
318
+ const url = faults.shouldReturnEmptyUrl() ? '' : await ngrok.connect(connectOpts);
319
+ if (!url) {
320
+ trace.emit('connect.attempt.empty-url', { attempt });
321
+ throw new Error(`ngrok.connect() returned empty URL (attempt ${attempt})`);
322
+ }
323
+ trace.emit('connect.attempt.success', { attempt });
324
+ if (attempt > 1) {
325
+ Telemetry.capture(TelemetryEvents.TUNNEL_PROVISION_RETRY, {
326
+ attempt,
327
+ outcome: 'success',
328
+ stage: 'ngrok_connect',
329
+ });
330
+ }
331
+ return url;
332
+ }
333
+ catch (err) {
334
+ lastError = err;
335
+ const msg = err instanceof Error ? err.message : String(err);
336
+ trace.emit('connect.attempt.fail', { attempt, message: msg.slice(0, 200) });
337
+ // Auth-class errors are non-retryable — retrying with the same token
338
+ // would loop. Let the outer catch classify the message.
339
+ if (/authtoken|unauthorized|\b401\b|\b403\b/i.test(msg)) {
340
+ trace.emit('connect.giving-up', { reason: 'auth-error' });
341
+ Telemetry.capture(TelemetryEvents.TUNNEL_PROVISION_RETRY, {
342
+ attempt,
343
+ outcome: 'giving-up',
344
+ stage: 'ngrok_connect',
345
+ reason: 'auth-error',
346
+ });
347
+ throw err;
348
+ }
349
+ const isLastAttempt = attempt >= MAX_ATTEMPTS;
350
+ Telemetry.capture(TelemetryEvents.TUNNEL_PROVISION_RETRY, {
351
+ attempt,
352
+ outcome: isLastAttempt ? 'giving-up' : 'will-retry',
353
+ stage: 'ngrok_connect',
354
+ });
355
+ if (isLastAttempt) {
356
+ trace.emit('connect.giving-up', { reason: 'max-attempts' });
357
+ throw err;
358
+ }
359
+ // Between attempt 1→2, do an agent-reset (covers the "agent died"
360
+ // failure mode that used to be the only retried case). Between 2→3,
361
+ // just wait — the reset already happened.
362
+ if (attempt === 1) {
363
+ logger.warn(`ngrok.connect() failed (attempt 1/${MAX_ATTEMPTS}), resetting agent: ${msg}`);
364
+ trace.emit('agent.reset');
365
+ resetNgrokModule();
366
+ this.initialized = false;
367
+ await this.ensureInitialized();
368
+ }
369
+ else {
370
+ logger.warn(`ngrok.connect() failed (attempt ${attempt}/${MAX_ATTEMPTS}), will retry: ${msg}`);
371
+ }
372
+ const backoffMs = BACKOFF_MS[attempt - 1] ?? BACKOFF_MS[BACKOFF_MS.length - 1];
373
+ trace.emit('connect.backoff', { attempt, backoffMs });
374
+ await sleep(backoffMs);
375
+ }
299
376
  }
377
+ // Unreachable (loop always returns or throws), but satisfy TS
378
+ throw lastError ?? new Error('connectWithRetry: exhausted attempts without error');
300
379
  };
301
380
  try {
302
381
  const tunnelUrl = await connectWithRetry();
@@ -332,12 +411,18 @@ class TunnelManager {
332
411
  // best-effort
333
412
  }
334
413
  this.resetTunnelTimer(tunnelInfo);
414
+ trace.emit('createTunnel.success', { tunnelId, publicUrl });
335
415
  logger.info(`Tunnel created: ${publicUrl} → localhost:${port}`);
336
416
  Telemetry.capture(TelemetryEvents.TUNNEL_PROVISIONED, { port, how: 'created' });
337
417
  return tunnelInfo;
338
418
  }
339
419
  catch (error) {
340
420
  const msg = error instanceof Error ? error.message : 'Unknown error';
421
+ trace.emit('createTunnel.fail', { message: msg.slice(0, 200) });
422
+ // Bead 42g: on any tunnel-creation failure, log the collected trace at
423
+ // WARN so operators can post-mortem. Keeping it out of the thrown error
424
+ // text so we don't leak internals to users.
425
+ logger.warn(`Tunnel lifecycle trace (fail path):\n${trace.format()}`);
341
426
  if (msg.includes('authtoken')) {
342
427
  throw new Error(`Failed to create tunnel: invalid auth token. ${msg}`);
343
428
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@debugg-ai/debugg-ai-mcp",
3
- "version": "2.0.3",
3
+ "version": "2.1.0",
4
4
  "description": "Zero-Config, Fully AI-Managed End-to-End Testing for all code gen platforms.",
5
5
  "type": "module",
6
6
  "bin": {