@debugg-ai/debugg-ai-mcp 2.0.4 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
### Added — tunnel fault-injection + trace harness for diagnosis
|
|
11
|
+
|
|
12
|
+
- New `DEBUGG_TUNNEL_FAULT_MODE` env var (dev/test only — inert when `NODE_ENV=production`) lets developers force specific ngrok-side failures without mocking, to reproduce client-reported transient "Tunnel setup failed" incidents. Modes: `fail-connect-N:<count>`, `empty-url-N:<count>`, `delay-connect:<ms>`, combinable with commas. Bead `42g`.
|
|
13
|
+
- Structured `TunnelTrace` captures timestamped lifecycle events per tunnel-create call (start, each connect attempt, fault inject, agent reset, backoff, success/fail). Dumped to WARN logs on any tunnel creation failure so real-world flakes get a post-mortem trail instead of an opaque error message.
|
|
14
|
+
|
|
10
15
|
### Fixed — tunnel provisioning flakiness surfaces as user-facing errors
|
|
11
16
|
|
|
12
17
|
- `check_app_in_browser` / `trigger_crawl` now automatically retry transient tunnel-provision failures (5xx, 408, 429, network errors like ECONNRESET) with exponential backoff (500ms → 1500ms → 3000ms, 3 attempts). Previously a single ngrok/backend blip forced the caller to manually retry the tool call. Bead `7nx`.
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Fault injection + trace collection for tunnel lifecycle debugging (bead 42g).
|
|
3
|
+
*
|
|
4
|
+
* This is a TEST/DEV harness. Activation requires BOTH:
|
|
5
|
+
* - NODE_ENV !== 'production'
|
|
6
|
+
* - DEBUGG_TUNNEL_FAULT_MODE env var explicitly set
|
|
7
|
+
*
|
|
8
|
+
* Modes (comma-separated, parseable by parseFaultMode):
|
|
9
|
+
* fail-connect-N:<count> — fail the first <count> ngrok.connect() attempts
|
|
10
|
+
* empty-url-N:<count> — return empty URL from first <count> connect() attempts
|
|
11
|
+
* delay-connect:<ms> — sleep <ms> before each connect() call
|
|
12
|
+
*
|
|
13
|
+
* Examples:
|
|
14
|
+
* DEBUGG_TUNNEL_FAULT_MODE=fail-connect-N:2
|
|
15
|
+
* DEBUGG_TUNNEL_FAULT_MODE=delay-connect:2000,fail-connect-N:1
|
|
16
|
+
*/
|
|
17
|
+
/**
 * Parse a DEBUGG_TUNNEL_FAULT_MODE string into a fault-mode object.
 *
 * Accepts a comma-separated list of `name:<number>` tokens; unknown or
 * malformed tokens are silently ignored so a typo never breaks startup.
 *
 * @param {string|undefined} raw - Raw env-var value (may be unset/empty).
 * @returns {{failConnectN?: number, emptyUrlN?: number, delayConnectMs?: number}|null}
 *   Parsed mode, or null when nothing usable was supplied.
 */
export function parseFaultMode(raw) {
    if (!raw)
        return null;
    // Recognized token names mapped to their property on the mode object.
    const KEY_FOR = {
        'fail-connect-N': 'failConnectN',
        'empty-url-N': 'emptyUrlN',
        'delay-connect': 'delayConnectMs',
    };
    const TOKEN_RE = /^(fail-connect-N|empty-url-N|delay-connect):(\d+)$/;
    const parsed = {};
    for (const rawToken of raw.split(',')) {
        const match = rawToken.trim().match(TOKEN_RE);
        if (match === null)
            continue; // unrecognized token — skip, don't fail
        const [, name, digits] = match;
        parsed[KEY_FOR[name]] = Number.parseInt(digits, 10);
    }
    // An input with no valid tokens is treated the same as no input at all.
    return Object.keys(parsed).length === 0 ? null : parsed;
}
|
|
36
|
+
/**
 * Read the fault mode from the environment.
 *
 * Hard-disabled in production: regardless of DEBUGG_TUNNEL_FAULT_MODE, a
 * NODE_ENV of 'production' always yields null so the harness stays inert.
 *
 * @returns {{failConnectN?: number, emptyUrlN?: number, delayConnectMs?: number}|null}
 */
export function getFaultModeFromEnv() {
    const { NODE_ENV, DEBUGG_TUNNEL_FAULT_MODE } = process.env;
    return NODE_ENV === 'production'
        ? null
        : parseFaultMode(DEBUGG_TUNNEL_FAULT_MODE);
}
|
|
41
|
+
/**
|
|
42
|
+
* Per-call, mutable fault-injection state. Tracks remaining fault counts so
|
|
43
|
+
* a 'fail first N' mode applies to the first N attempts within one call, not
|
|
44
|
+
* forever.
|
|
45
|
+
*/
|
|
46
|
+
/**
 * Per-call, mutable fault-injection state.
 *
 * Each `should*` check consumes its counter, so a "fail first N" mode fires
 * on exactly the first N attempts of one tunnel-create call and never again.
 * Constructed from a parsed mode object (or null/undefined for a no-op
 * injector whose checks always return false).
 */
export class FaultInjector {
    failConnectRemaining;
    emptyUrlRemaining;
    delayMs;
    /** @param {{failConnectN?: number, emptyUrlN?: number, delayConnectMs?: number}|null} mode */
    constructor(mode) {
        this.failConnectRemaining = mode?.failConnectN ?? 0;
        this.emptyUrlRemaining = mode?.emptyUrlN ?? 0;
        this.delayMs = mode?.delayConnectMs ?? 0;
    }
    /** Returns true if this attempt should be forced to fail. Consumes the counter. */
    shouldFailConnect() {
        const fire = this.failConnectRemaining > 0;
        if (fire)
            this.failConnectRemaining -= 1;
        return fire;
    }
    /** Returns true if this attempt should return an empty URL. Consumes the counter. */
    shouldReturnEmptyUrl() {
        const fire = this.emptyUrlRemaining > 0;
        if (fire)
            this.emptyUrlRemaining -= 1;
        return fire;
    }
    /** Fixed per-attempt delay in ms (0 when no delay fault is configured). */
    delayMsForAttempt() {
        return this.delayMs;
    }
    /** For diagnostic logging — what's left after in-flight consumption. */
    snapshot() {
        const { failConnectRemaining, emptyUrlRemaining, delayMs } = this;
        return { failConnectRemaining, emptyUrlRemaining, delayMs };
    }
}
|
|
83
|
+
/**
 * Timestamped event trail for one tunnel-create call.
 *
 * Callers `emit()` lifecycle events as they happen; on a failure the whole
 * trail can be serialized (`toJSON`) or rendered (`format`) for post-mortem
 * logging.
 */
export class TunnelTrace {
    startTime;
    events = [];
    /** @param {number} [startTime] - Epoch ms the call began; defaults to now. */
    constructor(startTime = Date.now()) {
        this.startTime = startTime;
    }
    /**
     * Record one event with wall-clock and elapsed-since-start timestamps.
     * @param {string} event - Event name (dot-separated convention).
     * @param {object} [context] - Optional structured detail for the event.
     */
    emit(event, context) {
        const timestamp = Date.now();
        const elapsedMs = timestamp - this.startTime;
        this.events.push({ timestamp, elapsedMs, event, context });
    }
    /** Structured form: start time, total duration, and the raw event list. */
    toJSON() {
        const count = this.events.length;
        // Duration is the elapsed time of the most recent event; 0 if empty.
        const durationMs = count > 0 ? this.events[count - 1].elapsedMs : 0;
        return { startTime: this.startTime, durationMs, events: this.events };
    }
    /** Human-readable one-line-per-event dump, newest last. */
    format() {
        const lines = [];
        for (const entry of this.events) {
            const suffix = entry.context ? ` ${JSON.stringify(entry.context)}` : '';
            lines.push(`+${String(entry.elapsedMs).padStart(6)}ms ${entry.event}${suffix}`);
        }
        return lines.join('\n');
    }
}
|
|
@@ -20,6 +20,7 @@ import { Logger } from '../../utils/logger.js';
|
|
|
20
20
|
import { Telemetry, TelemetryEvents } from '../../utils/telemetry.js';
|
|
21
21
|
import { isLocalhostUrl, extractLocalhostPort, generateTunnelUrl } from '../../utils/urlParser.js';
|
|
22
22
|
import { v4 as uuidv4 } from 'uuid';
|
|
23
|
+
import { FaultInjector, TunnelTrace, getFaultModeFromEnv } from './tunnelFaultInjection.js';
|
|
23
24
|
import { getDefaultRegistry, } from './tunnelRegistry.js';
|
|
24
25
|
let ngrokModule = null;
|
|
25
26
|
async function getNgrok() {
|
|
@@ -281,6 +282,12 @@ class TunnelManager {
|
|
|
281
282
|
// - Attempt 3: after 1500ms backoff, retry with the already-reset agent
|
|
282
283
|
// Auth-token errors short-circuit at any attempt — no point looping.
|
|
283
284
|
const self = this;
|
|
285
|
+
// Bead 42g: fault injection + trace. Only active when NODE_ENV !== 'production'
|
|
286
|
+
// AND DEBUGG_TUNNEL_FAULT_MODE env var is set. Zero overhead when disabled.
|
|
287
|
+
const faultMode = getFaultModeFromEnv();
|
|
288
|
+
const faults = new FaultInjector(faultMode);
|
|
289
|
+
const trace = new TunnelTrace();
|
|
290
|
+
trace.emit('createTunnel.start', { port, tunnelId, hasFaultMode: !!faultMode });
|
|
284
291
|
const connectWithRetry = async () => {
|
|
285
292
|
const sleep = (ms) => new Promise((r) => setTimeout(r, ms));
|
|
286
293
|
const BACKOFF_MS = self.connectBackoffMs; // bead ixh: test-overridable
|
|
@@ -293,11 +300,27 @@ class TunnelManager {
|
|
|
293
300
|
};
|
|
294
301
|
let lastError;
|
|
295
302
|
for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
|
|
303
|
+
trace.emit('connect.attempt.start', { attempt });
|
|
304
|
+
// Optional fault-injected delay before each attempt.
|
|
305
|
+
const delayMs = faults.delayMsForAttempt();
|
|
306
|
+
if (delayMs > 0) {
|
|
307
|
+
trace.emit('connect.fault.delay', { attempt, delayMs });
|
|
308
|
+
await sleep(delayMs);
|
|
309
|
+
}
|
|
296
310
|
try {
|
|
297
311
|
const ngrok = await getNgrok();
|
|
298
|
-
|
|
299
|
-
|
|
312
|
+
// Fault-inject a synthetic failure BEFORE ngrok.connect runs so we
|
|
313
|
+
// can simulate connect-layer failures without hitting the real API.
|
|
314
|
+
if (faults.shouldFailConnect()) {
|
|
315
|
+
trace.emit('connect.fault.inject', { attempt, mode: 'fail-connect-N' });
|
|
316
|
+
throw new Error(`[fault-inject] synthetic connect failure (attempt ${attempt})`);
|
|
317
|
+
}
|
|
318
|
+
const url = faults.shouldReturnEmptyUrl() ? '' : await ngrok.connect(connectOpts);
|
|
319
|
+
if (!url) {
|
|
320
|
+
trace.emit('connect.attempt.empty-url', { attempt });
|
|
300
321
|
throw new Error(`ngrok.connect() returned empty URL (attempt ${attempt})`);
|
|
322
|
+
}
|
|
323
|
+
trace.emit('connect.attempt.success', { attempt });
|
|
301
324
|
if (attempt > 1) {
|
|
302
325
|
Telemetry.capture(TelemetryEvents.TUNNEL_PROVISION_RETRY, {
|
|
303
326
|
attempt,
|
|
@@ -310,9 +333,11 @@ class TunnelManager {
|
|
|
310
333
|
catch (err) {
|
|
311
334
|
lastError = err;
|
|
312
335
|
const msg = err instanceof Error ? err.message : String(err);
|
|
336
|
+
trace.emit('connect.attempt.fail', { attempt, message: msg.slice(0, 200) });
|
|
313
337
|
// Auth-class errors are non-retryable — retrying with the same token
|
|
314
|
-
// would loop. Let the outer catch
|
|
338
|
+
// would loop. Let the outer catch classify the message.
|
|
315
339
|
if (/authtoken|unauthorized|\b401\b|\b403\b/i.test(msg)) {
|
|
340
|
+
trace.emit('connect.giving-up', { reason: 'auth-error' });
|
|
316
341
|
Telemetry.capture(TelemetryEvents.TUNNEL_PROVISION_RETRY, {
|
|
317
342
|
attempt,
|
|
318
343
|
outcome: 'giving-up',
|
|
@@ -327,13 +352,16 @@ class TunnelManager {
|
|
|
327
352
|
outcome: isLastAttempt ? 'giving-up' : 'will-retry',
|
|
328
353
|
stage: 'ngrok_connect',
|
|
329
354
|
});
|
|
330
|
-
if (isLastAttempt)
|
|
355
|
+
if (isLastAttempt) {
|
|
356
|
+
trace.emit('connect.giving-up', { reason: 'max-attempts' });
|
|
331
357
|
throw err;
|
|
358
|
+
}
|
|
332
359
|
// Between attempt 1→2, do an agent-reset (covers the "agent died"
|
|
333
360
|
// failure mode that used to be the only retried case). Between 2→3,
|
|
334
361
|
// just wait — the reset already happened.
|
|
335
362
|
if (attempt === 1) {
|
|
336
363
|
logger.warn(`ngrok.connect() failed (attempt 1/${MAX_ATTEMPTS}), resetting agent: ${msg}`);
|
|
364
|
+
trace.emit('agent.reset');
|
|
337
365
|
resetNgrokModule();
|
|
338
366
|
this.initialized = false;
|
|
339
367
|
await this.ensureInitialized();
|
|
@@ -341,7 +369,9 @@ class TunnelManager {
|
|
|
341
369
|
else {
|
|
342
370
|
logger.warn(`ngrok.connect() failed (attempt ${attempt}/${MAX_ATTEMPTS}), will retry: ${msg}`);
|
|
343
371
|
}
|
|
344
|
-
|
|
372
|
+
const backoffMs = BACKOFF_MS[attempt - 1] ?? BACKOFF_MS[BACKOFF_MS.length - 1];
|
|
373
|
+
trace.emit('connect.backoff', { attempt, backoffMs });
|
|
374
|
+
await sleep(backoffMs);
|
|
345
375
|
}
|
|
346
376
|
}
|
|
347
377
|
// Unreachable (loop always returns or throws), but satisfy TS
|
|
@@ -381,12 +411,18 @@ class TunnelManager {
|
|
|
381
411
|
// best-effort
|
|
382
412
|
}
|
|
383
413
|
this.resetTunnelTimer(tunnelInfo);
|
|
414
|
+
trace.emit('createTunnel.success', { tunnelId, publicUrl });
|
|
384
415
|
logger.info(`Tunnel created: ${publicUrl} → localhost:${port}`);
|
|
385
416
|
Telemetry.capture(TelemetryEvents.TUNNEL_PROVISIONED, { port, how: 'created' });
|
|
386
417
|
return tunnelInfo;
|
|
387
418
|
}
|
|
388
419
|
catch (error) {
|
|
389
420
|
const msg = error instanceof Error ? error.message : 'Unknown error';
|
|
421
|
+
trace.emit('createTunnel.fail', { message: msg.slice(0, 200) });
|
|
422
|
+
// Bead 42g: when the trace captured meaningful timing info, log it at
|
|
423
|
+
// WARN so operators can post-mortem. Keeping it out of the thrown error
|
|
424
|
+
// text so we don't leak internals to users.
|
|
425
|
+
logger.warn(`Tunnel lifecycle trace (fail path):\n${trace.format()}`);
|
|
390
426
|
if (msg.includes('authtoken')) {
|
|
391
427
|
throw new Error(`Failed to create tunnel: invalid auth token. ${msg}`);
|
|
392
428
|
}
|